From 841713e1e487bdb82fd106a52ad998c5f87b59e9 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Thu, 3 Oct 2024 13:00:52 +0300
Subject: [PATCH 001/396] rpc : enable vulkan (#9714)

closes #8536
---
 examples/rpc/rpc-server.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 6342e6488..355125831 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -6,6 +6,10 @@
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_VULKAN
+    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+    backend = ggml_backend_vk_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -92,6 +102,8 @@ static ggml_backend_t create_backend() {
 static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_VULKAN
+    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
 #else
     #ifdef _WIN32
         MEMORYSTATUSEX status;

From e3c355ba654d4164c1c09e5d0dcacecb8b214af8 Mon Sep 17 00:00:00 2001
From: compilade <git@compilade.net>
Date: Thu, 3 Oct 2024 10:22:15 -0400
Subject: [PATCH 002/396] convert : handle tokenizer merges format from
 transformers 4.45 (#9696)

---
 gguf-py/gguf/vocab.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index dc5749913..f2645f921 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -122,8 +122,30 @@ class SpecialVocab:
                 tokenizer = json.load(f)
             if self.load_merges:
                 merges = tokenizer.get('model', {}).get('merges')
-                if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                    self.merges = merges
+                if isinstance(merges, list) and merges:
+                    if isinstance(merges[0], str):
+                        self.merges = merges
+                    elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
+                        # New format since transformers 4.45 to support spaces in merges
+                        # ref: https://github.com/ggerganov/llama.cpp/issues/9692
+                        # TODO: internally store as the new format instead of converting to old
+                        if any(' ' in s for pair in merges for s in pair):
+                            logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
+                        self.merges = [
+                            ' '.join(
+                                [
+                                    # ensure the spaces are properly encoded
+                                    ''.join(
+                                        chr(ord(c) + 256) if c == ' ' else c
+                                        for c in part
+                                    )
+                                    for part in pair
+                                ]
+                            )
+                            for pair in merges
+                        ]
+                    else:
+                        raise ValueError("Unknown tokenizer merges format")
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}

From d6fe7abf04e8ec5240dead6e2773ed1b7e7495d3 Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Thu, 3 Oct 2024 12:39:03 -0300
Subject: [PATCH 003/396] ggml: unify backend logging mechanism (#9709)

* Add scaffolding for ggml logging macros

* Metal backend now uses GGML logging

* Cuda backend now uses GGML logging

* Cann backend now uses GGML logging

* Add enum tag to parameters

* Use C memory allocation funcs

* Fix compile error

* Use GGML_LOG instead of GGML_PRINT

* Rename llama_state to llama_logger_state

* Prevent null format string

* Fix whitespace

* Remove log callbacks from ggml backends

* Remove cuda log statement
---
 ggml/include/ggml-backend.h  |   5 +-
 ggml/include/ggml-cann.h     |  11 ---
 ggml/include/ggml-cuda.h     |   2 -
 ggml/include/ggml-metal.h    |   2 -
 ggml/include/ggml.h          |   4 +
 ggml/src/ggml-backend-impl.h |   3 -
 ggml/src/ggml-backend.cpp    |  14 ---
 ggml/src/ggml-cann.cpp       |  89 +++---------------
 ggml/src/ggml-cuda.cu        | 102 ++++++---------------
 ggml/src/ggml-impl.h         |  15 +++
 ggml/src/ggml-metal.m        | 172 +++++++++++++----------------------
 ggml/src/ggml.c              |  92 ++++++++++++++-----
 src/llama.cpp                |  26 ++----
 13 files changed, 197 insertions(+), 340 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b096aaed6..864bcbded 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -164,7 +164,7 @@ extern "C" {
     GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
     GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
     GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
-    GGML_API void               ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
+
 
     // Functions that may be obtained using ggml_backend_reg_get_proc_address
     typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
@@ -184,9 +184,6 @@ extern "C" {
     GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
     GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
 
-    // Set the log callback for all registered backends
-    GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);
-
     // Direct backend (stream) initialization
     // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h
index ba9ff2292..95bdaf10d 100644
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@@ -116,17 +116,6 @@ GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
                                                   size_t* free,
                                                   size_t* total);
 
-/**
- * @brief Set the logging callback for GGML.
- *
- * This function sets the logging callback and user data for logging.
- *
- * @param log_callback The logging callback to set.
- * @param user_data User data to pass to the logging callback.
- */
-GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
-                                                 void* user_data);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index a8feddc94..f44d8f4e6 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -40,8 +40,6 @@ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, siz
 GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 
 #ifdef  __cplusplus
diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index 55e6ecd84..c3ec572b2 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -39,8 +39,6 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 969be3e94..1b4006b62 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2167,6 +2167,10 @@ extern "C" {
     typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+
     // optimization parameters
     //
     //   see ggml.c (ggml_opt_default_params) for default values
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 470c922fe..ba2e26999 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -215,9 +215,6 @@ extern "C" {
         // (optional) get a pointer to a function in the backend
         // backends can add custom functions that are not part of the standard ggml-backend interface
         void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
-
-        // (optional) set the log callback for the backend
-        void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
     };
 
     struct ggml_backend_reg {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 73a2b24f8..3300ddb52 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -505,12 +505,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
     return reg->iface.get_proc_address(reg, name);
 }
 
-void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
-    if (reg->iface.set_log_callback) {
-        reg->iface.set_log_callback(reg, log_callback, user_data);
-    }
-}
-
 // Backend registry
 
 #ifdef GGML_USE_CUDA
@@ -614,13 +608,6 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
     return NULL;
 }
 
-void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) {
-    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
-        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        ggml_backend_reg_set_log_callback(reg, log_callback, user_data);
-    }
-}
-
 // Convenience functions
 ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
@@ -1161,7 +1148,6 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
     /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
     /* .get_device       = */ ggml_backend_cpu_reg_get_device,
     /* .get_proc_address = */ NULL,
-    /* .set_log_callback = */ NULL,
 };
 
 ggml_backend_reg_t ggml_backend_cpu_reg(void) {
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index 63ad0b878..db5f8f186 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -39,69 +39,6 @@
 
 #include "ggml-common.h"
 
-/**
- * @brief Default logging callback for GGML.
- *
- * This function is the default logging callback that logs messages to stderr.
- *
- * @param level The log level.
- * @param msg The log message.
- * @param user_data User data passed to the callback.
- */
-static void ggml_cann_default_log_callback(enum ggml_log_level level,
-                                           const char* msg, void* user_data) {
-    GGML_UNUSED(level);
-    GGML_UNUSED(user_data);
-    fprintf(stderr, "%s", msg);
-}
-
-ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
-void* ggml_cann_log_user_data = NULL;
-
-GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
-                                                 void* user_data) {
-    ggml_cann_log_callback = log_callback;
-    ggml_cann_log_user_data = user_data;
-}
-
-#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_CANN_LOG_ERROR(...) \
-    ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-
-/**
- * @brief Log a message using the current logging callback.
- *
- * This function formats a log message and passes it to the current logging
- * callback.
- *
- * @param level The log level.
- * @param format The format string for the log message.
- * @param ... The arguments for the format string.
- */
-static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
-    if (ggml_cann_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
-        } else {
-             // vsnprintf adds a null terminator
-            std::vector<char> buffer2(len + 1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(&buffer2[0], buffer2.size(), format, args);
-            ggml_cann_log_callback(level, buffer2.data(),
-                                   ggml_cann_log_user_data);
-        }
-        va_end(args);
-    }
-}
-
 /**
  * @brief Handles CANN errors by printing an error message and aborting.
  *
@@ -116,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
     int32_t id = -1;
     aclrtGetDevice(&id);
 
-    GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
-    GGML_CANN_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func,
+    GGML_LOG_ERROR("CANN error: %s\n", msg);
+    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func,
             file, line);
-    GGML_CANN_LOG_ERROR("  %s\n", stmt);
+    GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ABORT("CANN error");
 }
@@ -165,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
     aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
 
     if (err != ACL_SUCCESS) {
-        GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
+        GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
                 __func__, aclGetRecentErrMsg());
         return info;
     }
@@ -315,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO(
+        GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
             "requested %u MB\n",
             __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
@@ -470,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // add to the pool
             pool_size += reserve_size;
 
-            // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
+            // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
             // reserved %llu MB)\n",
             //       device, (unsigned long long) (pool_size/1024/1024),
             //       (unsigned long long) (reserve_size/1024/1024));
@@ -483,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         pool_used += size;
 
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
+        GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
                (unsigned long long)size, (unsigned long long)ptr);
 #endif
         return ptr;
@@ -497,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     void free(void* ptr, size_t size) override {
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
+        GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
                (unsigned long long)size, (unsigned long long)ptr);
 #endif
 
@@ -1095,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void* dev_ptr;
     aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
     if (err != ACL_SUCCESS) {
-        GGML_CANN_LOG_ERROR(
+        GGML_LOG_ERROR(
             "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
             __func__, size / 1024.0 / 1024.0, buft_ctx->device,
             aclGetRecentErrMsg());
@@ -1280,7 +1217,7 @@ static void * ggml_cann_host_malloc(size_t size) {
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
 
-        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
     }
@@ -1733,7 +1670,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         bool ok = ggml_cann_compute_forward(*cann_ctx, node);
 
         if (!ok) {
-            GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
                     node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
@@ -2043,13 +1980,13 @@ static ggml_guid_t ggml_backend_cann_guid() {
 ggml_backend_t ggml_backend_cann_init(int32_t device) {
     aclInit(nullptr);
     if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
-        GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
+        GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
     if (ctx == nullptr) {
-        GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return nullptr;
     }
     ggml_cann_set_device(ctx->device);
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 43151e235..663e97cd7 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -56,52 +56,14 @@
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
-static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    GGML_UNUSED(level);
-    GGML_UNUSED(user_data);
-    fprintf(stderr, "%s", msg);
-}
-
-ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
-void * ggml_cuda_log_user_data = NULL;
-
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_cuda_log_callback = log_callback;
-    ggml_cuda_log_user_data = user_data;
-}
-
-#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
-    if (ggml_cuda_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
-        } else {
-            std::vector<char> buffer2(len + 1);  // vsnprintf adds a null terminator
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(&buffer2[0], buffer2.size(), format, args);
-            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
-        }
-        va_end(args);
-    }
-}
-
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
-    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
+    GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
+    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ABORT to get a stack trace
     GGML_ABORT(GGML_CUDA_NAME " error");
 }
@@ -166,7 +128,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -174,16 +136,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #ifdef GGML_CUDA_FORCE_MMQ
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    yes\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    yes\n", __func__);
 #else
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    no\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    no\n", __func__);
 #endif // GGML_CUDA_FORCE_MMQ
 #ifdef GGML_CUDA_FORCE_CUBLAS
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
 #else
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
 #endif // GGML_CUDA_FORCE_CUBLAS
-    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -204,7 +166,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -312,7 +274,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+        GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
                            (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
@@ -327,7 +289,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -591,7 +553,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
+        GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1016,7 +978,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2283,7 +2245,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2367,7 +2329,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2436,7 +2398,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 
     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+        GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
 #endif
         return false;
     }
@@ -2552,7 +2514,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2603,14 +2565,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2619,7 +2581,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
@@ -2631,7 +2593,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 if (!ptr) {
                     use_cuda_graph = false;
 #ifndef NDEBUG
-                    GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+                    GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
                 } else {
                     if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2655,7 +2617,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2694,7 +2656,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
                 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
                 if (!ok) {
-                    GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                    GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
                 }
                 GGML_ASSERT(ok);
             }
@@ -2713,7 +2675,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2780,7 +2742,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
+            GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2882,7 +2844,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();
 
-        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
@@ -3305,17 +3267,11 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
     return nullptr;
 }
 
-static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
-    GGML_UNUSED(reg);
-    ggml_backend_cuda_log_set_callback(log_callback, user_data);
-}
-
 static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
     /* .get_name          = */ ggml_backend_cuda_reg_get_name,
     /* .get_device_count  = */ ggml_backend_cuda_reg_get_device_count,
     /* .get_device_get    = */ ggml_backend_cuda_reg_get_device,
     /* .get_proc_address  = */ ggml_backend_cuda_reg_get_proc_address,
-    /* .set_log_callback  = */ ggml_backend_cuda_reg_set_log_callback,
 };
 
 // backend registry
@@ -3361,13 +3317,13 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 
 ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
+        GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 833984190..d3f4bad8c 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -33,6 +33,21 @@ extern "C" {
 #endif
 #endif
 
+//
+// logging
+//
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
+
+#define GGML_LOG(...)       ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define GGML_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define GGML_LOG_WARN(...)  ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define GGML_LOG_CONT(...)  ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
 // bitset
 
 typedef uint32_t ggml_bitset_t;
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 8ff16983e..7ffaaf8d8 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -18,19 +18,6 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#ifdef GGML_METAL_NDEBUG
-#define GGML_METAL_LOG(...)
-#define GGML_METAL_LOG_INFO(...)
-#define GGML_METAL_LOG_WARN(...)
-#define GGML_METAL_LOG_ERROR(...)
-#else
-#define GGML_METAL_LOG(...)       ggml_metal_log(GGML_LOG_LEVEL_NONE,  __VA_ARGS__)
-#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
-#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
-#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
-#endif
-
 #define UNUSED(x) (void)(x)
 
 struct ggml_metal_kernel {
@@ -277,51 +264,19 @@ struct ggml_backend_metal_context {
 @implementation GGMLMetalClass
 @end
 
-static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
-ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
-void * ggml_metal_log_user_data = NULL;
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
-    if (ggml_metal_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
-        } else {
-            char* buffer2 = malloc(len+1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(buffer2, len+1, format, args);
-            buffer2[len] = 0;
-            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
-            free(buffer2);
-        }
-        va_end(args);
-    }
-}
-
 static void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
 
 #if TARGET_OS_OSX
     kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
     if (err != KERN_SUCCESS) {
-        GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
         return NULL;
     }
 #else
     const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
-        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
         return NULL;
     }
 #endif
@@ -330,20 +285,20 @@ static void * ggml_metal_host_malloc(size_t n) {
 }
 
 static struct ggml_backend_metal_context * ggml_metal_init(void) {
-    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
+    GGML_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
     // Show all the Metal device instances in the system
     NSArray * devices = MTLCopyAllDevices();
     for (id<MTLDevice> device in devices) {
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
+        GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
     }
     [devices release]; // since it was created by a *Copy* C method
 #endif
 
     // Pick and show default Metal device
     id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
     struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
@@ -381,28 +336,28 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         if (try_metallib && path_lib != nil) {
             // pre-compiled library found
             NSURL * libURL = [NSURL fileURLWithPath:path_lib];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
 
             metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
-                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
         } else {
 #if GGML_METAL_EMBED_LIBRARY
-            GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
+            GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
 
             extern const char ggml_metallib_start[];
             extern const char ggml_metallib_end[];
 
             NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
-            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+            GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
             NSString * path_source;
             NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
 
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
+            GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
 
             if (path_resource) {
                 path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
@@ -411,15 +366,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
             }
 
             if (path_source == nil) {
-                GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+                GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
                 path_source = @"ggml-metal.metal";
             }
 
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
 
             NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
             if (error) {
-                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
 #endif // GGML_METAL_EMBED_LIBRARY
@@ -435,7 +390,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
                 metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
                 if (error) {
-                    GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                     return NULL;
                 }
             }
@@ -443,7 +398,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     }
 
     // print MTL GPU family:
-    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
+    GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
 
     const NSInteger MTLGPUFamilyMetal3 = 5001;
 
@@ -453,21 +408,21 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     {
         for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
             if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
                 break;
             }
         }
 
         for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
             if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
                 break;
             }
         }
 
         for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
             if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
                 break;
             }
         }
@@ -478,9 +433,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
     ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
 
-    GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
+    GGML_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 
     ctx->capture_next_compute = false;
     ctx->capture_started = false;
@@ -494,13 +449,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
+        GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     }
 #elif TARGET_OS_OSX
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
+        GGML_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
+        GGML_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
     }
 #endif
 
@@ -513,7 +468,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         }
 
         /*
-            GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+            GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
                     (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
                     (int) kernel->pipeline.threadExecutionWidth); \
         */
@@ -524,12 +479,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
             kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
             [metal_function release]; \
             if (error) { \
-                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
                 [metal_library release]; \
                 return NULL; \
             } \
         } else { \
-            GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
+            GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
         }
 
         // simd_sum and simd_max requires MTLGPUFamilyApple7
@@ -726,7 +681,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 }
 
 static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
+    GGML_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
         [ctx->kernels[i].pipeline release];
@@ -764,7 +719,7 @@ struct ggml_backend_metal_buffer_context {
 // Metal buffer based on the host memory pointer
 //
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
-    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+    //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
 
@@ -776,17 +731,17 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
     for (int i = 0; i < buf_ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+        //GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
         if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
-            //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+            //GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
 
             return buf_ctx->buffers[i].metal;
         }
     }
 
-    GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
+    GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
     return nil;
 }
@@ -918,7 +873,7 @@ static void ggml_metal_encode_node(
 
     struct ggml_tensor * node = ggml_graph_node(gf, idx);
 
-    //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
+    //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
 
     struct ggml_tensor * src0 = node->src[0];
     struct ggml_tensor * src1 = node->src[1];
@@ -944,7 +899,7 @@ static void ggml_metal_encode_node(
     }
 
     if (!ggml_metal_supports_op(ctx, dst)) {
-        GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+        GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
         GGML_ABORT("unsupported op");
     }
 
@@ -1002,17 +957,17 @@ static void ggml_metal_encode_node(
     id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
     id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
 
-    //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+    //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
     //if (src0) {
-    //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+    //    GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
     //            ggml_is_contiguous(src0), src0->name);
     //}
     //if (src1) {
-    //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+    //    GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
     //            ggml_is_contiguous(src1), src1->name);
     //}
     //if (dst) {
-    //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+    //    GGML_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
     //            dst->name);
     //}
 
@@ -1404,7 +1359,7 @@ static void ggml_metal_encode_node(
                 } break;
                 default:
                 {
-                    GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                    GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
                     GGML_ABORT("fatal error");
                 }
             } break;
@@ -1956,7 +1911,7 @@ static void ggml_metal_encode_node(
                                     } break;
                                 default:
                                     {
-                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                        GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
                                         GGML_ABORT("not implemented");
                                     }
                             };
@@ -2252,7 +2207,7 @@ static void ggml_metal_encode_node(
                             } break;
                         default:
                             {
-                                GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
                                 GGML_ABORT("not implemented");
                             }
                     };
@@ -2821,8 +2776,8 @@ static void ggml_metal_encode_node(
                                   //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
                         default:
                                   {
-                                      GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
-                                      GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                                      GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                      GGML_LOG_ERROR("add template specialization for this size\n");
                                       GGML_ABORT("add template specialization for this size");
                                   }
                     }
@@ -2834,8 +2789,8 @@ static void ggml_metal_encode_node(
                                   //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
                         default:
                                   {
-                                      GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
-                                      GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                                      GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                      GGML_LOG_ERROR("add template specialization for this size\n");
                                       GGML_ABORT("add template specialization for this size");
                                   }
                     }
@@ -2996,7 +2951,7 @@ static void ggml_metal_encode_node(
             } break;
        default:
             {
-                GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
                 GGML_ABORT("fatal error");
             }
     }
@@ -3041,7 +2996,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
                 NSError * error = nil;
                 if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
-                    GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
+                    GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
                     GGML_ABORT("capture failed");
                 } else {
                     [ctx->capture_scope beginScope];
@@ -3123,9 +3078,9 @@ static enum ggml_status ggml_metal_graph_compute(
 
             MTLCommandBufferStatus status = [command_buffer status];
             if (status != MTLCommandBufferStatusCompleted) {
-                GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
+                GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
                 if (status == MTLCommandBufferStatusError) {
-                    GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                    GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
                 }
 
                 return GGML_STATUS_FAILED;
@@ -3138,9 +3093,9 @@ static enum ggml_status ggml_metal_graph_compute(
 
             MTLCommandBufferStatus status = [command_buffer status];
             if (status != MTLCommandBufferStatusCompleted) {
-                GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+                GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
                 if (status == MTLCommandBufferStatusError) {
-                    GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                    GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
                 }
 
                 return GGML_STATUS_FAILED;
@@ -3157,7 +3112,7 @@ static enum ggml_status ggml_metal_graph_compute(
             }
 
             if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
-                GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+                GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
                 return GGML_STATUS_ABORTED;
             }
 
@@ -3286,17 +3241,17 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 #ifndef GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
+        GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
                 __func__,
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0,
                 device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
 
         if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+            GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
         }
     } else {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
+        GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
                 __func__,
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0);
@@ -3338,7 +3293,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     }
 
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
-        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
         ggml_backend_metal_free_device();
         return NULL;
@@ -3423,7 +3378,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
             ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
                 return false;
             }
         }
@@ -3449,7 +3404,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
                 ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
                 if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
                     return false;
                 }
             }
@@ -3457,7 +3412,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
             ggml_backend_metal_log_allocated_size(device, size_step_aligned);
 
             if (i + size_step < size) {
-                GGML_METAL_LOG_INFO("\n");
+                GGML_LOG_INFO("\n");
             }
 
             ++ctx->n_buffers;
@@ -3514,7 +3469,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
 
         if (ctx->n_cb > 2) {
-            GGML_METAL_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
+            GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
         }
     }
 
@@ -3544,11 +3499,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .event_wait              = */ NULL,
 };
 
-void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_metal_log_callback  = log_callback;
-    ggml_metal_log_user_data = user_data;
-}
-
 static ggml_guid_t ggml_backend_metal_guid(void) {
     static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
     return &guid;
@@ -3557,7 +3507,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
 ggml_backend_t ggml_backend_metal_init(void) {
     struct ggml_backend_metal_context * ctx = ggml_metal_init();
     if (ctx == NULL) {
-        GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index e740e58b2..de500a675 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -319,26 +319,63 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 // logging
 //
 
+struct ggml_logger_state {
+    ggml_log_callback log_callback;
+    void * log_callback_user_data;
+};
+static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
+
+static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
+    if (format == NULL)
+        return;
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = (char *) calloc(len + 1, sizeof(char));
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        free(buffer2);
+    }
+    va_end(args_copy);
+}
+
+void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    ggml_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
 #if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG(...)
 #endif
 
 #if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_5(...)
 #endif
 
 #if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_10(...)
 #endif
 
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
 //
 // end of logging block
 //
@@ -355,7 +392,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
@@ -377,7 +414,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
+        GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         GGML_ABORT("fatal error");
         return NULL;
     }
@@ -393,12 +430,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
         return NULL;
     }
     void * result = malloc(size);
     if (result == NULL) {
-        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
+        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
         GGML_ABORT("fatal error");
     }
     return result;
@@ -407,12 +444,12 @@ inline static void * ggml_malloc(size_t size) {
 // calloc
 inline static void * ggml_calloc(size_t num, size_t size) {
     if (num == 0 || size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
         return NULL;
     }
     void * result = calloc(num, size);
     if (result == NULL) {
-        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
+        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
         GGML_ABORT("fatal error");
     }
     return result;
@@ -3347,7 +3384,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
         if (fptr != NULL) {
             char buf[42];
             if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
-                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+                GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
             }
             fclose(fptr);
         }
@@ -3365,21 +3402,21 @@ bool ggml_is_numa(void) {
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
-    GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+    GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
             obj->type, obj->offs, obj->size, (const void *) obj->next);
 }
 
 void ggml_print_objects(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
-    GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
+    GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);
 
     while (obj != NULL) {
         ggml_print_object(obj);
         obj = obj->next;
     }
 
-    GGML_PRINT("%s: --- end ---\n", __func__);
+    GGML_LOG_INFO("%s: --- end ---\n", __func__);
 }
 
 int64_t ggml_nelements(const struct ggml_tensor * tensor) {
@@ -3962,7 +3999,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+        GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
         assert(false);
         return NULL;
@@ -4026,7 +4063,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
                         __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
                 assert(false);
                 return NULL;
@@ -20013,7 +20050,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     }
 #else
     if (n_threads > threadpool->n_threads_max) {
-        GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
         n_threads = threadpool->n_threads_max;
     }
 
@@ -20552,30 +20589,30 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 }
 
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
-    GGML_PRINT("=== GRAPH ===\n");
+    GGML_LOG_INFO("=== GRAPH ===\n");
 
-    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
+    GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
+        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
     }
 
-    GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
+    GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
+        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                 i,
                 node->ne[0], node->ne[1],
                 ggml_op_name(node->op),
                 ggml_get_name(node));
     }
 
-    GGML_PRINT("========================================\n");
+    GGML_LOG_INFO("========================================\n");
 }
 
 // check if node is part of the graph
@@ -20746,7 +20783,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
     fclose(fp);
 
-    GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
+    GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -23241,4 +23278,9 @@ int ggml_cpu_get_sve_cnt(void) {
     return 0;
 #endif
 }
+
+void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
+    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/llama.cpp b/src/llama.cpp
index 69ba65395..3443b0689 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2266,17 +2266,12 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
 // globals
 //
 
-struct llama_state {
-    llama_state() {
-        llama_log_set(log_callback, log_callback_user_data);
-    }
-
-    // We save the log callback globally
+struct llama_logger_state {
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
 
-static llama_state g_state;
+static llama_logger_state g_logger_state;
 
 // available llama models
 enum e_model {
@@ -21850,16 +21845,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 }
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_state.log_callback_user_data = user_data;
-
-    ggml_backend_set_log_callback(log_callback, user_data);
-
-#ifdef GGML_USE_METAL
-    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#endif
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
 }
 
 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -21868,12 +21856,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
     } else {
         char * buffer2 = new char[len + 1];
         vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
         delete[] buffer2;
     }
     va_end(args_copy);

From a7ad553513a5d70b4ceacd36f64705cf3654dc97 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 3 Oct 2024 17:39:18 +0200
Subject: [PATCH 004/396] ggml-backend : add device description to CPU backend
 (#9720)

---
 ggml/src/ggml-backend.cpp | 88 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 83 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 3300ddb52..0551764fe 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1,5 +1,13 @@
 // Note: porting this file to C++ is a work in progress
 
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -10,9 +18,15 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
+#include <string>
 #include <vector>
 
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+
 // backend buffer type
 
 const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -1008,6 +1022,70 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
 
 ////////////////////////
 
+struct ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                TEXT("ProcessorNameString"),
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    TEXT("ProcessorNameString"),
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
 static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
     return "CPU";
 
@@ -1015,10 +1093,9 @@ static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
 }
 
 static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
-    // TODO
-    return "CPU";
+    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
 
-    GGML_UNUSED(dev);
+    return ctx->description.c_str();
 }
 
 static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -1131,10 +1208,11 @@ static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
 static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
     GGML_ASSERT(index == 0);
 
+    static ggml_backend_cpu_device_context ctx;
     static ggml_backend_device ggml_backend_cpu_device = {
         /* .iface   = */ ggml_backend_cpu_device_i,
         /* .reg     = */ reg,
-        /* .context = */ NULL,
+        /* .context = */ &ctx,
     };
 
     return &ggml_backend_cpu_device;

From 5d5ab1e5cca0d6d63701a1cb85dbe26cb57d2c4e Mon Sep 17 00:00:00 2001
From: Jack Mousseau <jack@software.inc>
Date: Thu, 3 Oct 2024 11:01:46 -0700
Subject: [PATCH 005/396] metal : fix compute pass descriptor autorelease crash
 (#9718)

---
 ggml/src/ggml-metal.m | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 7ffaaf8d8..d10f5af0b 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -217,8 +217,6 @@ struct ggml_backend_metal_context {
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
 
-    MTLComputePassDescriptor * edesc;
-
     dispatch_queue_t d_queue;
 
     struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
@@ -304,8 +302,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
     ctx->device = device;
     ctx->queue  = [ctx->device newCommandQueue];
-    ctx->edesc  = MTLComputePassDescriptor.computePassDescriptor;
-    ctx->edesc.dispatchType = MTLDispatchTypeSerial;
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -3016,7 +3012,7 @@ static enum ggml_status ggml_metal_graph_compute(
             const int n_nodes_per_cb = ctx->n_nodes_per_cb;
 
             id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: ctx->edesc];
+            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
 
             int node_start = 0;
             int node_end   = n_nodes_0;

From eee39bdc96065b69242877fe8f1be05c885fc2aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Wed, 2 Oct 2024 15:32:39 +0200
Subject: [PATCH 006/396] ggml: refactor cross entropy loss CPU impl.
 (ggml/976)

---
 ggml/include/ggml-backend.h |  4 +--
 ggml/src/ggml.c             | 64 ++++++++++++++++++++-----------------
 2 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 864bcbded..4d7d2716e 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -247,7 +247,7 @@ extern "C" {
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
-    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
     GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@@ -262,7 +262,7 @@ extern "C" {
     GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
     // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index de500a675..6e034a087 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4232,9 +4232,13 @@ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, floa
 }
 
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
+    if (ggml_is_empty(tensor)) {
+        return tensor;
+    }
     if (tensor->buffer) {
         ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
     } else {
+        GGML_ASSERT(tensor->data);
         memset(tensor->data, 0, ggml_nbytes(tensor));
     }
     return tensor;
@@ -16851,41 +16855,40 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
     GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    // TODO: handle transposed/permuted matrices
+    const int64_t nc = src0->ne[0];
+    const int64_t nr = ggml_nrows(src0);
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    float * sums = (float *) params->wdata;
-
-    // TODO: handle transposed/permuted matrices
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
+    float * sums =  (float *) params->wdata;
+    float * st   = ((float *) params->wdata) + nth + ith*nc;
+    float sum_thread = 0.0f;
 
     GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
 
-    if (ith == 0) {
-        memset(sums, 0, sizeof(float) * (nth + nth * nc));
-    }
-    ggml_barrier(params->threadpool);
-
     // rows per thread
-    const int dr = (nr + nth - 1)/nth;
+    const int64_t dr = (nr + nth - 1)/nth;
 
     // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
 
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * st = ((float *) params->wdata) + nth + ith*nc;
+    for (int64_t i1 = ir0; i1 < ir1; ++i1) {
+        const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]);
+        const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]);
 
 #ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
+        for (int64_t i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
             assert(!isnan(s0[i]));
             assert(!isnan(s1[i]));
@@ -16894,23 +16897,24 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
 
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, s0);
-        ggml_float sum = ggml_vec_log_soft_max_f32(nc, st, s0, max);
-        assert(sum >= 0.0);
+        const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max);
+        assert(sum_softmax >= 0.0);
 
-        ggml_vec_add1_f32(nc, st, st, -sum);
+        ggml_vec_add1_f32(nc, st, st, -sum_softmax);
         ggml_vec_mul_f32(nc, st, st, s1);
 
-        float st_sum = 0.0f;
-        ggml_vec_sum_f32(nc, &st_sum, st);
-        sums[ith] += st_sum;
+        float sum_st = 0.0f;
+        ggml_vec_sum_f32(nc, &sum_st, st);
+        sum_thread += sum_st;
 
 #ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
+        for (int64_t i = 0; i < nc; ++i) {
             assert(!isnan(st[i]));
             assert(!isinf(st[i]));
         }
 #endif
     }
+    sums[ith] = sum_thread;
     ggml_barrier(params->threadpool);
 
     if (ith == 0) {
@@ -16976,7 +16980,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
         float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
 
 #ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
+        for (int64_t i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
             assert(!isnan(s0[i]));
             assert(!isnan(s1[i]));
@@ -16995,7 +16999,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
         ggml_vec_scale_f32(nc, ds0, d_by_nr);
 
 #ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
+        for (int64_t i = 0; i < nc; ++i) {
             assert(!isnan(ds0[i]));
             assert(!isinf(ds0[i]));
         }

From fabdc3bda396307565c4f3f4ecbc3a751a2eb6d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 3 Oct 2024 17:29:59 +0200
Subject: [PATCH 007/396] ggml/ex: calculate accuracy in graph, adapt MNIST
 (ggml/980)

---
 ggml/include/ggml.h                  |   7 ++
 ggml/src/ggml-cuda.cu                |  17 ++++
 ggml/src/ggml-cuda/argmax.cu         |  79 +++++++++++++++++
 ggml/src/ggml-cuda/argmax.cuh        |   3 +
 ggml/src/ggml-cuda/common.cuh        |  12 +++
 ggml/src/ggml-cuda/count-equal.cu    |  64 ++++++++++++++
 ggml/src/ggml-cuda/count-equal.cuh   |   5 ++
 ggml/src/ggml-cuda/fattn-tile-f16.cu |   2 +-
 ggml/src/ggml-cuda/fattn-vec-f16.cuh |   6 +-
 ggml/src/ggml.c                      | 123 ++++++++++++++++++++++++++-
 tests/test-backend-ops.cpp           |  79 ++++++++++++++++-
 11 files changed, 389 insertions(+), 8 deletions(-)
 create mode 100644 ggml/src/ggml-cuda/argmax.cu
 create mode 100644 ggml/src/ggml-cuda/argmax.cuh
 create mode 100644 ggml/src/ggml-cuda/count-equal.cu
 create mode 100644 ggml/src/ggml-cuda/count-equal.cuh

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1b4006b62..e7678d071 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -456,6 +456,7 @@ extern "C" {
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_ARGMAX,
+        GGML_OP_COUNT_EQUAL,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_CONCAT,
@@ -994,6 +995,12 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // count number of equal elements in a and b
+    GGML_API struct ggml_tensor * ggml_count_equal(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 663e97cd7..bcb39766b 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -5,12 +5,14 @@
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
 #include "ggml-cuda/arange.cuh"
+#include "ggml-cuda/argmax.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/convert.cuh"
+#include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
@@ -2143,6 +2145,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
     }
 
     switch (dst->op) {
+        case GGML_OP_ARGMAX:
+            ggml_cuda_argmax(ctx, dst);
+            break;
+        case GGML_OP_COUNT_EQUAL:
+            ggml_cuda_count_equal(ctx, dst);
+            break;
         case GGML_OP_REPEAT:
             ggml_cuda_op_repeat(ctx, dst);
             break;
@@ -3073,6 +3081,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return false;
             } break;
         case GGML_OP_DUP:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
+        case GGML_OP_ARGMAX:
+        case GGML_OP_COUNT_EQUAL:
+            {
+                return true;
+            } break;
         case GGML_OP_REPEAT:
             {
                 ggml_type src0_type = op->src[0]->type;
diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu
new file mode 100644
index 000000000..aab04eca7
--- /dev/null
+++ b/ggml/src/ggml-cuda/argmax.cu
@@ -0,0 +1,79 @@
+#include "common.cuh"
+#include "argmax.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+static __global__ void argmax_f32(
+    const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) {
+
+    int argmax_thread = 0;
+    const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE;
+
+#pragma unroll
+    for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) {
+        const int64_t row = row0 + row1;
+
+        if (row >= nrows) {
+            break;
+        }
+
+        float maxval = -FLT_MAX;
+        int   argmax = -1;
+
+        for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) {
+            const float val        = x[row*ncols + col];
+            const int   bigger     = val > maxval;
+            const int   not_bigger = bigger ^ 0x00000001;
+
+            maxval = maxval*not_bigger + val*bigger;
+            argmax = argmax*not_bigger + col*bigger;
+        }
+
+#pragma unroll
+        for (int mask = 16; mask > 0; mask >>= 1) {
+            const float val        = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE);
+            const int   col        = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE);
+            const int   bigger     = val > maxval;
+            const int   not_bigger = bigger ^ 0x00000001;
+
+            maxval = maxval*not_bigger + val*bigger;
+            argmax = argmax*not_bigger + col*bigger;
+        }
+
+        const int store = row1 == threadIdx.x;
+        argmax_thread += store*argmax;
+    }
+
+    const int row = row0 + threadIdx.x;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    dst[row] = argmax_thread;
+}
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    int32_t     * dst_d  = (int32_t     *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE;
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(num_blocks, 1, 1);
+
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows);
+}
diff --git a/ggml/src/ggml-cuda/argmax.cuh b/ggml/src/ggml-cuda/argmax.cuh
new file mode 100644
index 000000000..5b7223adc
--- /dev/null
+++ b/ggml/src/ggml-cuda/argmax.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 6a4bcdba0..dd203fcde 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -175,6 +175,18 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+static __device__ __forceinline__ int warp_reduce_sum(int x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+    return __reduce_add_sync(0xffffffff, x);
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu
new file mode 100644
index 000000000..ffb053b10
--- /dev/null
+++ b/ggml/src/ggml-cuda/count-equal.cu
@@ -0,0 +1,64 @@
+#include "common.cuh"
+#include "count-equal.cuh"
+
+#include <cstdint>
+
+template <typename T>
+static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) {
+    const int64_t i0 = (int64_t) blockIdx.x*dk;
+    const int64_t i1 = min(i0 + dk, k);
+
+    int nequal = 0;
+
+    for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) {
+        const T xi = x[i];
+        const T yi = y[i];
+        nequal += xi == yi;
+    }
+
+    nequal = warp_reduce_sum(nequal);
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+
+    atomicAdd((int *) dst, nequal);
+}
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT( dst->type == GGML_TYPE_I64);
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    int64_t * dst_d  = (int64_t *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
+    const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
+
+    CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1);
+
+    switch (src0->type) {
+        case GGML_TYPE_I32: {
+            const int * src0_d = (const int *) src0->data;
+            const int * src1_d = (const int *) src1->data;
+            count_equal<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_d, dne, ne);
+        } break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+}
diff --git a/ggml/src/ggml-cuda/count-equal.cuh b/ggml/src/ggml-cuda/count-equal.cuh
new file mode 100644
index 000000000..8467da79e
--- /dev/null
+++ b/ggml/src/ggml-cuda/count-equal.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
index 342f2eb66..5af02c7ec 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -259,7 +259,7 @@ static __global__ void flash_attn_tile_ext_f16(
         }
 
         half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
-        kqsum_j = warp_reduce_sum(kqsum_j);
+        kqsum_j = warp_reduce_sum((float)kqsum_j);
 
 #pragma unroll
         for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index 448a9a905..2ed6509ac 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -196,7 +196,7 @@ static __global__ void flash_attn_vec_ext_f16(
 #pragma unroll
             for (int j = 0; j < ncols; ++j) {
                 half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
-                sum = warp_reduce_sum(sum);
+                sum = warp_reduce_sum((float)sum);
 
                 if (use_logit_softcap) {
                     sum = logit_softcap*tanhf(sum);
@@ -265,7 +265,7 @@ static __global__ void flash_attn_vec_ext_f16(
 
 #pragma unroll
     for (int j = 0; j < ncols; ++j) {
-        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        kqsum[j] = warp_reduce_sum((float)kqsum[j]);
         if (threadIdx.x == 0) {
             kqsum_shared[j][threadIdx.y] = kqsum[j];
         }
@@ -280,7 +280,7 @@ static __global__ void flash_attn_vec_ext_f16(
         }
 
         kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+        kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
 
         half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
         if (parallel_blocks == 1) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 6e034a087..03b832d0f 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2994,6 +2994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SUM_ROWS",
     "MEAN",
     "ARGMAX",
+    "COUNT_EQUAL",
     "REPEAT",
     "REPEAT_BACK",
     "CONCAT",
@@ -3067,7 +3068,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3088,6 +3089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "Σx_k",
     "Σx/n",
     "argmax(x)",
+    "count_equal(x)",
     "repeat(x)",
     "repeat_back(x)",
     "concat(x, y)",
@@ -3161,7 +3163,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -5222,6 +5224,23 @@ struct ggml_tensor * ggml_argmax(
     return result;
 }
 
+// ggml_count_equal
+
+struct ggml_tensor * ggml_count_equal(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1);
+
+    result->op     = GGML_OP_COUNT_EQUAL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_repeat
 
 struct ggml_tensor * ggml_repeat(
@@ -10809,6 +10828,86 @@ static void ggml_compute_forward_argmax(
     }
 }
 
+// ggml_compute_forward_count_equal
+
+static void ggml_compute_forward_count_equal_i32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    GGML_ASSERT(src0->type == GGML_TYPE_I32);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(dst->type == GGML_TYPE_I64);
+
+    const int64_t nr = ggml_nrows(src0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    int64_t * sums = (int64_t *) params->wdata;
+    int64_t sum_thread = 0;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 =  ir                        / (ne02*ne01);
+        const int64_t i02 = (ir - i03*ne03)            /       ne01;
+        const int64_t i01 =  ir - i03*ne03 - i02*ne02;
+
+        const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
+        const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;
+
+        for (int64_t i00 = 0; i00 < ne00; ++i00) {
+            const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
+            const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));
+
+            sum_thread += val0 == val1;
+        }
+    }
+    if (ith != 0) {
+        sums[ith] = sum_thread;
+    }
+    ggml_barrier(params->threadpool);
+
+    if (ith != 0) {
+        return;
+    }
+
+    for (int ith_other = 1; ith_other < nth; ++ith_other) {
+        sum_thread += sums[ith_other];
+    }
+    *((int64_t *) dst->data) = sum_thread;
+}
+
+static void ggml_compute_forward_count_equal(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_count_equal_i32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_repeat
 
 static void ggml_compute_forward_repeat_f32(
@@ -17187,6 +17286,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_argmax(params, tensor);
             } break;
+        case GGML_OP_COUNT_EQUAL:
+            {
+                ggml_compute_forward_count_equal(params, tensor);
+            } break;
         case GGML_OP_REPEAT:
             {
                 ggml_compute_forward_repeat(params, tensor);
@@ -17937,6 +18040,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_MEAN:
         case GGML_OP_ARGMAX:
+        case GGML_OP_COUNT_EQUAL:
             {
                 GGML_ABORT("fatal error"); // TODO: implement
             }
@@ -18710,6 +18814,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
     for (int i = 0; i < gf->n_nodes; ++i) {
         struct ggml_tensor * node = gf->nodes[i];
 
+        if (node->type == GGML_TYPE_I32) {
+            continue;
+        }
+
         bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
         bool ignore_src[GGML_MAX_SRC] = {false};
         switch (node->op) {
@@ -19113,6 +19221,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_SUM_ROWS:
         case GGML_OP_MEAN:
         case GGML_OP_ARGMAX:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_COUNT_EQUAL:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_REPEAT:
         case GGML_OP_REPEAT_BACK:
         case GGML_OP_LEAKY_RELU:
@@ -19611,6 +19726,10 @@ struct ggml_cplan ggml_graph_plan(
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
                 } break;
+            case GGML_OP_COUNT_EQUAL:
+                {
+                    cur = ggml_type_size(node->type)*n_tasks;
+                } break;
             case GGML_OP_MUL_MAT:
                 {
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 86a0b379b..a10d98e35 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -116,6 +116,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+    } else if (tensor->type == GGML_TYPE_I64) {
+        // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
+        const size_t nbytes_half = ggml_nbytes(tensor)/2;
+        ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
+        ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
     } else {
         GGML_ABORT("fatal error");
     }
@@ -145,6 +150,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                         tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                     } else if (t->type == GGML_TYPE_F32) {
                         tv.push_back(*(float *) &buf[i]);
+                    } else if (t->type == GGML_TYPE_I64) {
+                        tv.push_back((float)*(int64_t *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I32) {
                         tv.push_back((float)*(int32_t *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I16) {
@@ -1116,6 +1123,71 @@ struct test_get_rows : public test_case {
     }
 };
 
+// GGML_OP_ARGMAX
+struct test_argmax : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_argmax(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 100, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_argmax(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
+// GGML_OP_COUNT_EQUAL
+struct test_count_equal : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_count_equal(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {4, 500, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * a_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(a_argmax, "a_argmax");
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        ggml_tensor * b_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(b_argmax, "b_argmax");
+
+        ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
 // GGML_OP_REPEAT
 struct test_repeat : public test_case {
     const ggml_type type;
@@ -3260,6 +3332,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
+    test_cases.emplace_back(new test_argmax());
+    test_cases.emplace_back(new test_count_equal());
+
     for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
@@ -3278,8 +3353,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {1, 2, 0, 3}));
 
     for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
         test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));

From 1bb8a64ebfcbe599dacb4fc8069731b6cba0b5d6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 3 Oct 2024 21:17:49 +0300
Subject: [PATCH 008/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 23c24899e..3d79c9ac9 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-4de6ee8e6a4b2145d6b92162bc87722fecb4ea46
+e5c233e5edbfcfa1d808b9293de9065035c40751

From d5ed2b929d85bbd7dbeecb690880f07d9d7a6077 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 3 Oct 2024 21:18:19 +0300
Subject: [PATCH 009/396] metal : remove abort (skip) (ggml/0)

---
 ggml/src/ggml-metal.m | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index d10f5af0b..c6a7014fc 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -2993,7 +2993,6 @@ static enum ggml_status ggml_metal_graph_compute(
                 NSError * error = nil;
                 if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
                     GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
-                    GGML_ABORT("capture failed");
                 } else {
                     [ctx->capture_scope beginScope];
                     ctx->capture_started = true;

From 133c7b46b3482f7c126c0c4ba14265f684138306 Mon Sep 17 00:00:00 2001
From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Date: Fri, 4 Oct 2024 10:54:44 +0200
Subject: [PATCH 010/396] Fixed RNG seed docs (#9723)

* Update README.md

fixed RNG seed info

* changed print format to unsigned
---
 common/arg.cpp            | 2 +-
 examples/server/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8266a16c2..2a85ad845 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -911,7 +911,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_sparam());
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
-        format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
         [](gpt_params & params, const std::string & value) {
             params.sparams.seed = std::stoul(value);
         }
diff --git a/examples/server/README.md b/examples/server/README.md
index 951c4a44c..6253de43c 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -100,7 +100,7 @@ The project is under active development, and we are [looking for feedback and co
 | Argument | Explanation |
 | -------- | ----------- |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
-| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |

From f3fdcfaa79afa12047def3a8793d4a566e0532d4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 4 Oct 2024 11:47:19 +0200
Subject: [PATCH 011/396] ci : fine-grant permission (#9710)

---
 .github/workflows/build.yml          | 5 +++++
 .github/workflows/close-issue.yml    | 5 +++++
 .github/workflows/nix-ci-aarch64.yml | 7 +++++++
 .github/workflows/nix-ci.yml         | 7 +++++++
 4 files changed, 24 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c71d422e7..423173b97 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,6 +19,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  contents: write # for creating release
+
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index 69c9f4f69..f63860d14 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -3,6 +3,11 @@ on:
   schedule:
     - cron: "42 0 * * *"
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  issues: write
+
 jobs:
   close-issues:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
index 4aa4b2379..0da6acdf1 100644
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -21,6 +21,13 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+  id-token: write
+  contents: read
+
 jobs:
   nix-build-aarch64:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 8955f38d0..8ecbbe53b 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -12,6 +12,13 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+  id-token: write
+  contents: read
+
 jobs:
   nix-eval:
     strategy:

From ff565769f289c6adcc91ed1b8fdabaf9a0d4f6ee Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 4 Oct 2024 08:41:40 +0200
Subject: [PATCH 012/396] ggml : fixes after sync (ggml/983)

ggml : remove test-backend-buffer

ggml : fix CUDA build warnings
---
 ggml/src/ggml-cuda.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index bcb39766b..5b6f605b0 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2448,6 +2448,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+#ifdef USE_CUDA_GRAPH
 static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
     graph_node_properties->node_address = node->data;
     graph_node_properties->node_op = node->op;
@@ -2498,6 +2499,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
 
     return true;
 }
+#endif
 
 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

From 55951c018d7c107b6ef0a4c8561a6e68183d19d9 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Fri, 4 Oct 2024 15:46:18 +0200
Subject: [PATCH 013/396] ggml : fix typo in example usage ggml_gallocr_new
 (ggml/984)

---
 ggml/include/ggml-alloc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 0dff47d65..23600eea9 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -24,7 +24,7 @@ GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
 // Graph allocator
 /*
   Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
 
     // optional: create a worst-case graph and reserve the buffers to avoid reallocations
     ggml_gallocr_reserve(galloc, build_graph(max_batch));

From 17880771ad7dca16cdc969062f2a56f779662835 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 4 Oct 2024 18:50:25 +0300
Subject: [PATCH 014/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 3d79c9ac9..e8e0c69aa 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-e5c233e5edbfcfa1d808b9293de9065035c40751
+b77f48b1efa671e094696b99fbf566aac8c87d74

From 71967c2a6d30da9f61580d3e2d4cb00e0223b6fa Mon Sep 17 00:00:00 2001
From: "Viet-Anh NGUYEN (Andrew)" <vietanh.dev@gmail.com>
Date: Sat, 5 Oct 2024 01:29:35 +0700
Subject: [PATCH 015/396] Add Llama Assistant (#9744)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c56c97231..41e5e5448 100644
--- a/README.md
+++ b/README.md
@@ -169,6 +169,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 

From 905f5485b279518d30b402565c23fb153f822c0d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 5 Oct 2024 14:33:54 +0300
Subject: [PATCH 016/396] metal : zero-init buffer contexts (whisper/0)

---
 ggml/src/ggml-metal.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index c6a7014fc..7baee4174 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3258,7 +3258,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
 
@@ -3340,7 +3340,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 // buffer from ptr
 
 ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
 
     ctx->all_data = data;
     ctx->all_size = size;

From 58b16695e146628481c6b9b8a3b101c0c9bac00f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 5 Oct 2024 15:53:49 +0300
Subject: [PATCH 017/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index e8e0c69aa..5c92cdfd6 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-b77f48b1efa671e094696b99fbf566aac8c87d74
+0d7ecbbe536dc84240f646e0ec0a712251377f34

From 8c475b97b8ba7d678d4c9904b1161bd8811a9b44 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 5 Oct 2024 15:55:04 +0300
Subject: [PATCH 018/396] rerank : use [SEP] token instead of [BOS] (#9737)

* rerank : use [SEP] token instead of [BOS]

ggml-ci

* common : sanity check for non-NULL tokens

ggml-ci

* ci : adjust rank score interval

ggml-ci

* ci : add shebang to run.sh

ggml-ci
---
 ci/run.sh                  |  7 ++++---
 common/common.cpp          | 30 +++++++++++++++++++++++++++++-
 examples/server/server.cpp |  4 ++--
 src/llama-vocab.h          | 18 +++++++++---------
 src/llama.cpp              |  2 +-
 5 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index 7d241ecc0..e06778219 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,4 +1,4 @@
-#/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #
@@ -751,7 +751,8 @@ function gg_run_rerank_tiny {
 
     model_f16="${path_models}/ggml-model-f16.gguf"
 
-    (time ./bin/llama-embedding --model ${model_f16}  -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    # for this model, the SEP token is "</s>"
+    (time ./bin/llama-embedding --model ${model_f16}  -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
 
     # sample output
     # rerank score 0:    0.029
@@ -774,7 +775,7 @@ function gg_run_rerank_tiny {
 
     check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
     check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
 
     set +e
 }
diff --git a/common/common.cpp b/common/common.cpp
index a0611f3d1..29df16c95 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -838,6 +838,31 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         return iparams;
     }
 
+    if (params.reranking) {
+        bool ok = true;
+
+        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: model does not have a  BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: model does not have a  SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free_model(model);
+
+            return iparams;
+        }
+    }
+
     auto cparams = llama_context_params_from_gpt_params(params);
 
     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -855,6 +880,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
+
             return iparams;
         }
 
@@ -867,6 +893,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
+
             return iparams;
         }
     }
@@ -889,7 +916,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
+    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sparams.ignore_eos = false;
     }
@@ -930,6 +957,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     iparams.model   = model;
     iparams.context = lctx;
+
     return iparams;
 }
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f343cc252..13e54e501 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2027,7 +2027,7 @@ struct server_context {
                                 continue;
                             }
 
-                            // prompt: <s>query</s><s>doc</s>
+                            // prompt: [BOS]query[EOS][SEP]doc[EOS]
                             prompt_tokens.clear();
                             prompt_tokens.push_back(llama_token_bos(model));
                             {
@@ -2035,7 +2035,7 @@ struct server_context {
                                 prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                             }
                             prompt_tokens.push_back(llama_token_eos(model));
-                            prompt_tokens.push_back(llama_token_bos(model));
+                            prompt_tokens.push_back(llama_token_sep(model));
                             {
                                 const auto part = tokenize(slot.prompt[1], false);
                                 prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 069bdc423..28bad9135 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -40,17 +40,17 @@ struct llama_vocab {
     id special_bos_id  = 1;
     id special_eos_id  = 2;
     id special_unk_id  = 0;
-    id special_sep_id  = -1;
-    id special_pad_id  = -1;
-    id special_cls_id  = -1;
-    id special_mask_id = -1;
+    id special_sep_id  = LLAMA_TOKEN_NULL;
+    id special_pad_id  = LLAMA_TOKEN_NULL;
+    id special_cls_id  = LLAMA_TOKEN_NULL;
+    id special_mask_id = LLAMA_TOKEN_NULL;
 
     id linefeed_id       = 13;
-    id special_prefix_id = -1;
-    id special_suffix_id = -1;
-    id special_middle_id = -1;
-    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
-    id special_eom_id    = -1;
+    id special_prefix_id = LLAMA_TOKEN_NULL;
+    id special_suffix_id = LLAMA_TOKEN_NULL;
+    id special_middle_id = LLAMA_TOKEN_NULL;
+    id special_eot_id    = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = LLAMA_TOKEN_NULL;
 
     // set of all tokens that cause "end of generation"
     std::set<id> special_eog_ids;
diff --git a/src/llama.cpp b/src/llama.cpp
index 3443b0689..bf6fd9277 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2412,7 +2412,7 @@ struct llama_hparams {
 
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
-    llama_token dec_start_token_id = -1;
+    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
 
     enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;

From b0915d5b51cbaa982ce9bbb9ce302bb9abdca0eb Mon Sep 17 00:00:00 2001
From: SRHMorris <69468379+SRHMorris@users.noreply.github.com>
Date: Sun, 6 Oct 2024 08:34:20 +0100
Subject: [PATCH 019/396] vulkan : retry allocation with fallback flags
 (whisper/2451)

Co-authored-by: Samuel Morris <samuel.morris@artlist.io>
---
 ggml/src/ggml-vulkan.cpp | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 12ad9d810..30bd376da 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1070,10 +1070,25 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
     try {
         buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
     } catch (const vk::SystemError& e) {
-        // Out of Host/Device memory, clean up buffer
-        device->device.destroyBuffer(buf->buffer);
-        buf->size = 0;
-        throw e;
+        if (buf->memory_property_flags != fallback_flags) {
+            // Try again with fallback flags
+            memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+            buf->memory_property_flags = fallback_flags;
+
+            try {
+                buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
+            }
+            catch (const vk::SystemError& e) {
+                device->device.destroyBuffer(buf->buffer);
+                buf->size = 0;
+                throw e;
+            }
+        } else {
+            // Out of Host/Device memory, clean up buffer
+            device->device.destroyBuffer(buf->buffer);
+            buf->size = 0;
+            throw e;
+        }
     }
     buf->ptr = nullptr;
 

From b6d6c5289f1c9c677657c380591201ddb210b649 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 6 Oct 2024 12:53:28 +0300
Subject: [PATCH 020/396] sync : llama.cpp

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 5c92cdfd6..3cca9cc2f 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-0d7ecbbe536dc84240f646e0ec0a712251377f34
+564f42082f858f9674b2a2e06e9e779d9ed2c754

From f4b2dcdf4992ef11a854abc9b662624490e37b4c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 6 Oct 2024 13:49:41 +0300
Subject: [PATCH 021/396] readme : fix typo [no ci]

---
 examples/main/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/README.md b/examples/main/README.md
index 6730effdf..f0c3031ab 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -69,7 +69,7 @@ In this section, we cover the most commonly used options for running the `llama-
 -   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 -   `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
 -   `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
--   -   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 
 ## Input Prompts
 

From d5cb86844f26f600c48bf3643738ea68138f961d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 6 Oct 2024 14:15:27 +0300
Subject: [PATCH 022/396] contrib : simplify + minor edits [no ci]

---
 CONTRIBUTING.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3d7c6f86c..4c882c254 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,24 +1,23 @@
 # Pull requests (for contributors)
 
 - Test your changes:
-  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
-- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
-- Consider allowing write access to your branch for faster review
+- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
+- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 
 # Pull requests (for collaborators)
 
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
-- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
 
 # Coding guidelines
 
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
-- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)

From 96b69121033d2b6b951d1b6b1b43f8b4f97dac99 Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Mon, 7 Oct 2024 13:26:31 +0100
Subject: [PATCH 023/396] metal : single allocation of encode_async block
 (#9747)

* Single allocation of encode_async block with non-ARC capture in ggml-metal.m

* Moving Block_release to the deallocation code

* Release encode block when re-setting encoding buffer count if needed

* Update ggml/src/ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml/src/ggml-metal.m | 92 +++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 7baee4174..08598c28b 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -239,8 +239,6 @@ struct ggml_backend_metal_context {
     struct ggml_cgraph * gf;
 
     // the callback given to the thread pool
-    // TODO: ideally, this should be created once, utilizing the command buffer state above
-    //       for some reason, doing it like this leads to a crash
     void (^encode_async)(size_t ith);
 
     // n_cb command buffers + 1 used by the main thread
@@ -683,6 +681,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
         [ctx->kernels[i].pipeline release];
     }
 
+    Block_release(ctx->encode_async);
+
     [ctx->queue release];
     [ctx->device release];
 
@@ -3000,46 +3000,6 @@ static enum ggml_status ggml_metal_graph_compute(
             }
         }
 
-        // TODO: how to avoid this allocation? I tried initializing it in ggml_backend_metal_set_n_cb but it crashes.
-        ctx->encode_async = ^(size_t iter) {
-            const int cb_idx = iter;
-            const int n_cb_l = ctx->n_cb;
-
-            const int n_nodes_0 = ctx->n_nodes_0;
-            const int n_nodes_1 = ctx->n_nodes_1;
-
-            const int n_nodes_per_cb = ctx->n_nodes_per_cb;
-
-            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
-
-            int node_start = 0;
-            int node_end   = n_nodes_0;
-
-            if (cb_idx < n_cb_l) {
-                node_start = n_nodes_0 + (                                         (cb_idx + 0) * n_nodes_per_cb);
-                node_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
-            }
-
-            for (int idx = node_start; idx < node_end; ++idx) {
-                if (should_capture) {
-                    [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(gf, idx)) encoding:NSUTF8StringEncoding]];
-                }
-
-                ggml_metal_encode_node(ctx, idx, encoder);
-
-                if (should_capture) {
-                    [encoder popDebugGroup];
-                }
-            }
-
-            [encoder endEncoding];
-
-            if (cb_idx < 2 || ctx->abort_callback == NULL) {
-                [command_buffer commit];
-            }
-        };
-
         // the main thread commits the first few commands immediately
         // command_buffer[n_cb]
         {
@@ -3468,10 +3428,50 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         }
     }
 
-    // TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why?
-    //ctx->encode_async = ^(size_t iter) {
-    //    ...
-    //};
+    if (ctx->encode_async) {
+        Block_release(ctx->encode_async);
+    }
+
+    ctx->encode_async = Block_copy(^(size_t iter) {
+        const int cb_idx = iter;
+        const int n_cb_l = ctx->n_cb;
+
+        const int n_nodes_0 = ctx->n_nodes_0;
+        const int n_nodes_1 = ctx->n_nodes_1;
+
+        const int n_nodes_per_cb = ctx->n_nodes_per_cb;
+
+        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+
+        int node_start = 0;
+        int node_end   = n_nodes_0;
+
+        if (cb_idx < n_cb_l) {
+            node_start = n_nodes_0 + (                                         (cb_idx + 0) * n_nodes_per_cb);
+            node_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
+        }
+
+        const bool should_capture = ctx->capture_next_compute;
+
+        for (int idx = node_start; idx < node_end; ++idx) {
+            if (should_capture) {
+                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
+            }
+
+            ggml_metal_encode_node(ctx, idx, encoder);
+
+            if (should_capture) {
+                [encoder popDebugGroup];
+            }
+        }
+
+        [encoder endEncoding];
+
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
+    });
 }
 
 static struct ggml_backend_i ggml_backend_metal_i = {

From d5ac8cf2f2e30459489e6721a17d15b92a0c42a6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 7 Oct 2024 18:27:51 +0300
Subject: [PATCH 024/396] ggml : add metal backend registry / device (#9713)

* ggml : add metal backend registry / device

ggml-ci

* metal : fix names [no ci]

* metal : global registry and device instances

ggml-ci

* cont : alternative initialization of global objects

ggml-ci

* llama : adapt to backend changes

ggml-ci

* fixes

* metal : fix indent

* metal : fix build when MTLGPUFamilyApple3 is not available

ggml-ci

* fix merge

* metal : avoid unnecessary singleton accesses

ggml-ci

* metal : minor fix [no ci]

* metal : g_state -> g_ggml_ctx_dev_main [no ci]

* metal : avoid reference of device context in the backend context

ggml-ci

* metal : minor [no ci]

* metal : fix maxTransferRate check

* metal : remove transfer rate stuff

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 ggml/include/ggml-backend.h |   2 +
 ggml/include/ggml-metal.h   |   6 +-
 ggml/src/ggml-backend.cpp   |  21 +-
 ggml/src/ggml-cuda.cu       |   7 +-
 ggml/src/ggml-metal.m       | 711 ++++++++++++++++++++++++------------
 src/llama.cpp               |  72 ++--
 6 files changed, 535 insertions(+), 284 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 4d7d2716e..152b9adb0 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -127,6 +127,8 @@ extern "C" {
         bool async;
         // pinned host buffer
         bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
         // event synchronization
         bool events;
     };
diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index c3ec572b2..b8d3f678b 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -43,7 +43,9 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_DEPRECATED(
+        GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+        "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
 
 GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
@@ -57,6 +59,8 @@ GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int fam
 // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
 GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
 
+GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 0551764fe..4f3e9374c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -463,6 +463,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
 }
 
 void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    memset(props, 0, sizeof(*props));
     device->iface.get_props(device, props);
 }
 
@@ -479,6 +480,10 @@ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t devic
 }
 
 ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+    if (device->iface.get_host_buffer_type == NULL) {
+        return NULL;
+    }
+
     return device->iface.get_host_buffer_type(device);
 }
 
@@ -525,6 +530,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-cuda.h"
 #endif
 
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -533,10 +542,13 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CUDA
         register_backend(ggml_backend_cuda_reg());
 #endif
+#ifdef GGML_USE_METAL
+        register_backend(ggml_backend_metal_reg());
+#endif
 
         register_backend(ggml_backend_cpu_reg());
 
-        // TODO: sycl, metal, vulkan, kompute, cann
+        // TODO: sycl, vulkan, kompute, cann
     }
 
     void register_backend(ggml_backend_reg_t reg) {
@@ -1118,9 +1130,10 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
     props->type        = ggml_backend_cpu_device_get_type(dev);
     ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
-        /* async       */ false,
-        /* host_buffer */ false,
-        /* events      */ false,
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
     };
 }
 
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 5b6f605b0..edb61abdf 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2920,9 +2920,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 #endif
 
     props->caps = {
-        /* async       */ true,
-        /* host_buffer */ host_buffer,
-        /* events      */ events,
+        /* .async                 = */ true,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ events,
     };
 }
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 08598c28b..172a0f925 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -20,6 +20,69 @@
 
 #define UNUSED(x) (void)(x)
 
+// globals
+
+// overload of MTLGPUFamilyMetal3 (not available in some environments)
+static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+
+// initialized in ggml_backend_metal_reg
+static struct ggml_backend_reg    g_ggml_backend_metal_reg;
+static struct ggml_backend_device g_ggml_backend_metal_device;
+
+// information about a Metal device
+// note: assumes single GPU device - the default one
+// TODO: support multiple GPU devices
+static struct ggml_backend_metal_device_context {
+    id<MTLDevice> mtl_device;
+    int           mtl_device_ref_count;
+
+    bool support_simdgroup_reduction;
+    bool support_simdgroup_mm;
+
+    char name[128];
+} g_ggml_ctx_dev_main = {
+    /*.mtl_device                  =*/ nil,
+    /*.mtl_device_ref_count        =*/ 0,
+    /*.support_simdgroup_reduction =*/ false,
+    /*.support_simdgroup_mm        =*/ false,
+    /*.name                        =*/ "",
+};
+
+// acquire
+static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_device_context * ctx) {
+    assert(ctx != NULL);
+
+    if (ctx->mtl_device == nil) {
+        ctx->mtl_device = MTLCreateSystemDefaultDevice();
+
+        ctx->support_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+        ctx->support_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+
+        ctx->support_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+
+        strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1);
+    }
+
+    ctx->mtl_device_ref_count++;
+
+    return ctx->mtl_device;
+}
+
+// release
+static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_context * ctx) {
+    assert(ctx != NULL);
+    assert(ctx->mtl_device_ref_count > 0);
+
+    ctx->mtl_device_ref_count--;
+
+    if (ctx->mtl_device_ref_count == 0) {
+        [ctx->mtl_device release];
+        ctx->mtl_device = nil;
+    }
+}
+
+// kernels
+
 struct ggml_metal_kernel {
     id<MTLComputePipelineState> pipeline;
 };
@@ -214,16 +277,12 @@ enum ggml_metal_kernel_type {
 };
 
 struct ggml_backend_metal_context {
-    id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
 
     dispatch_queue_t d_queue;
 
     struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
 
-    bool support_simdgroup_reduction;
-    bool support_simdgroup_mm;
-
     // capture state
     bool capture_next_compute;
     bool capture_started;
@@ -280,7 +339,7 @@ static void * ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct ggml_backend_metal_context * ggml_metal_init(void) {
+static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@@ -292,14 +351,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     [devices release]; // since it was created by a *Copy* C method
 #endif
 
-    // Pick and show default Metal device
-    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+    // init context
+    struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
+    struct ggml_backend_metal_device_context * ctx_dev = dev->context;
+
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
-    // Configure context
-    struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
-    ctx->device = device;
-    ctx->queue  = [ctx->device newCommandQueue];
+    ctx->queue  = [device newCommandQueue];
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -332,7 +391,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
             NSURL * libURL = [NSURL fileURLWithPath:path_lib];
             GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
 
-            metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
+            metal_library = [device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
@@ -382,7 +441,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
                 //[options setFastMathEnabled:false];
 
-                metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
+                metal_library = [device newLibraryWithSource:src options:options error:&error];
                 if (error) {
                     GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                     return NULL;
@@ -392,44 +451,37 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     }
 
     // print MTL GPU family:
-    GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
-
-    const NSInteger MTLGPUFamilyMetal3 = 5001;
+    GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, [[device name] UTF8String]);
 
     // determine max supported GPU family
     // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
     // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
     {
         for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
-            if ([ctx->device supportsFamily:i]) {
+            if ([device supportsFamily:i]) {
                 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
                 break;
             }
         }
 
         for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
-            if ([ctx->device supportsFamily:i]) {
+            if ([device supportsFamily:i]) {
                 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
                 break;
             }
         }
 
-        for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
+        for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) {
+            if ([device supportsFamily:i]) {
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i);
                 break;
             }
         }
     }
 
-    ctx->support_simdgroup_reduction  = [ctx->device supportsFamily:MTLGPUFamilyApple7];
-    ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3];
-
-    ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
-
-    GGML_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
-    GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
-    GGML_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup reduction support   = %s\n", __func__, ctx_dev->support_simdgroup_reduction ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx_dev->support_simdgroup_mm ? "true" : "false");
+    GGML_LOG_INFO("%s: hasUnifiedMemory              = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
 
     ctx->capture_next_compute = false;
     ctx->capture_started = false;
@@ -443,13 +495,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
-    }
-#elif TARGET_OS_OSX
-    if (ctx->device.maxTransferRate != 0) {
-        GGML_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
-    } else {
-        GGML_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
+        GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6);
     }
 #endif
 
@@ -470,7 +516,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         if (supported) { \
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
             id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
-            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
+            kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
             [metal_function release]; \
             if (error) { \
                 GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
@@ -481,6 +527,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
             GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
         }
 
+        const bool support_simdgroup_mm        = ctx_dev->support_simdgroup_mm;
+        const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction;
+
         // simd_sum and simd_max requires MTLGPUFamilyApple7
 
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                           add,                            true);
@@ -507,10 +556,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,                  gelu_quick_4,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                          silu,                           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4,                        silu_4,                         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                  soft_max_f16,                   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                soft_max_f16_4,                 ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                  soft_max_f32,                   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,                soft_max_f32_4,                 ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                  soft_max_f16,                   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                soft_max_f16_4,                 support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                  soft_max_f32,                   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,                soft_max_f32_4,                 support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,                 diag_mask_inf,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,               diag_mask_inf_8,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,                  get_rows_f32,                   true);
@@ -535,101 +584,101 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,               get_rows_iq4_nl,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,               get_rows_iq4_xs,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,                  get_rows_i32,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                      rms_norm,                       ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                    group_norm,                     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                      rms_norm,                       support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                    group_norm,                     support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                          norm,                           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                  ssm_conv_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                  ssm_scan_f32,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,                mul_mv_f16_f16,                 ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,           mul_mv_f16_f32_1row,            ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,             mul_mv_f16_f32_l4,              ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,               mul_mv_q4_0_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,               mul_mv_q4_1_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,               mul_mv_q5_0_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,               mul_mv_q5_1_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,               mul_mv_q8_0_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,               mul_mv_q2_K_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,               mul_mv_q3_K_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,               mul_mv_q4_K_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,               mul_mv_q5_K_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,               mul_mv_q6_K_f32,                ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,            mul_mv_iq2_xxs_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,             mul_mv_iq2_xs_f32,              ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,            mul_mv_iq3_xxs_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,              mul_mv_iq3_s_f32,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,              mul_mv_iq2_s_f32,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,              mul_mv_iq1_s_f32,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,              mul_mv_iq1_m_f32,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,             mul_mv_iq4_nl_f32,              ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,             mul_mv_iq4_xs_f32,              ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,             mul_mv_id_f32_f32,              ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,             mul_mv_id_f16_f16,              ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,             mul_mv_id_f16_f32,              ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,        mul_mv_id_f16_f32_1row,         ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,          mul_mv_id_f16_f32_l4,           ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,            mul_mv_id_q4_0_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,            mul_mv_id_q4_1_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,            mul_mv_id_q5_0_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,            mul_mv_id_q5_1_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,            mul_mv_id_q8_0_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,            mul_mv_id_q2_K_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,            mul_mv_id_q3_K_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,            mul_mv_id_q4_K_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,            mul_mv_id_q5_K_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,            mul_mv_id_q6_K_f32,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,         mul_mv_id_iq2_xxs_f32,          ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,          mul_mv_id_iq2_xs_f32,           ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,         mul_mv_id_iq3_xxs_f32,          ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,           mul_mv_id_iq3_s_f32,            ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,           mul_mv_id_iq2_s_f32,            ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,           mul_mv_id_iq1_s_f32,            ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,           mul_mv_id_iq1_m_f32,            ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,          mul_mv_id_iq4_nl_f32,           ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,          mul_mv_id_iq4_xs_f32,           ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                mul_mm_f32_f32,                 ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                mul_mm_f16_f32,                 ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,               mul_mm_q4_0_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,               mul_mm_q4_1_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,               mul_mm_q5_0_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,               mul_mm_q5_1_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,               mul_mm_q8_0_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,               mul_mm_q2_K_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,               mul_mm_q3_K_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,               mul_mm_q4_K_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,               mul_mm_q5_K_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,               mul_mm_q6_K_f32,                ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,            mul_mm_iq2_xxs_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,             mul_mm_iq2_xs_f32,              ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,            mul_mm_iq3_xxs_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,              mul_mm_iq3_s_f32,               ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,              mul_mm_iq2_s_f32,               ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,              mul_mm_iq1_s_f32,               ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,              mul_mm_iq1_m_f32,               ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,             mul_mm_iq4_nl_f32,              ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,             mul_mm_iq4_xs_f32,              ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,             mul_mm_id_f32_f32,              ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,             mul_mm_id_f16_f32,              ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,            mul_mm_id_q4_0_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,            mul_mm_id_q4_1_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,            mul_mm_id_q5_0_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,            mul_mm_id_q5_1_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,            mul_mm_id_q8_0_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,            mul_mm_id_q2_K_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,            mul_mm_id_q3_K_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,            mul_mm_id_q4_K_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,            mul_mm_id_q5_K_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,            mul_mm_id_q6_K_f32,             ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,         mul_mm_id_iq2_xxs_f32,          ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,          mul_mm_id_iq2_xs_f32,           ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,         mul_mm_id_iq3_xxs_f32,          ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,           mul_mm_id_iq3_s_f32,            ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,           mul_mm_id_iq2_s_f32,            ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,           mul_mm_id_iq1_s_f32,            ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,           mul_mm_id_iq1_m_f32,            ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,          mul_mm_id_iq4_nl_f32,           ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,          mul_mm_id_iq4_xs_f32,           ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,                mul_mv_f16_f16,                 support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,           mul_mv_f16_f32_1row,            support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,             mul_mv_f16_f32_l4,              support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,               mul_mv_q4_0_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,               mul_mv_q4_1_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,               mul_mv_q5_0_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,               mul_mv_q5_1_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,               mul_mv_q8_0_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,               mul_mv_q2_K_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,               mul_mv_q3_K_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,               mul_mv_q4_K_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,               mul_mv_q5_K_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,               mul_mv_q6_K_f32,                support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,            mul_mv_iq2_xxs_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,             mul_mv_iq2_xs_f32,              support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,            mul_mv_iq3_xxs_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,              mul_mv_iq3_s_f32,               support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,              mul_mv_iq2_s_f32,               support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,              mul_mv_iq1_s_f32,               support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,              mul_mv_iq1_m_f32,               support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,             mul_mv_iq4_nl_f32,              support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,             mul_mv_iq4_xs_f32,              support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,             mul_mv_id_f32_f32,              support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,             mul_mv_id_f16_f16,              support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,             mul_mv_id_f16_f32,              support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,        mul_mv_id_f16_f32_1row,         support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,          mul_mv_id_f16_f32_l4,           support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,            mul_mv_id_q4_0_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,            mul_mv_id_q4_1_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,            mul_mv_id_q5_0_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,            mul_mv_id_q5_1_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,            mul_mv_id_q8_0_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,            mul_mv_id_q2_K_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,            mul_mv_id_q3_K_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,            mul_mv_id_q4_K_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,            mul_mv_id_q5_K_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,            mul_mv_id_q6_K_f32,             support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,         mul_mv_id_iq2_xxs_f32,          support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,          mul_mv_id_iq2_xs_f32,           support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,         mul_mv_id_iq3_xxs_f32,          support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,           mul_mv_id_iq3_s_f32,            support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,           mul_mv_id_iq2_s_f32,            support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,           mul_mv_id_iq1_s_f32,            support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,           mul_mv_id_iq1_m_f32,            support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,          mul_mv_id_iq4_nl_f32,           support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,          mul_mv_id_iq4_xs_f32,           support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                mul_mm_f32_f32,                 support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                mul_mm_f16_f32,                 support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,               mul_mm_q4_0_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,               mul_mm_q4_1_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,               mul_mm_q5_0_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,               mul_mm_q5_1_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,               mul_mm_q8_0_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,               mul_mm_q2_K_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,               mul_mm_q3_K_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,               mul_mm_q4_K_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,               mul_mm_q5_K_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,               mul_mm_q6_K_f32,                support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,            mul_mm_iq2_xxs_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,             mul_mm_iq2_xs_f32,              support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,            mul_mm_iq3_xxs_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,              mul_mm_iq3_s_f32,               support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,              mul_mm_iq2_s_f32,               support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,              mul_mm_iq1_s_f32,               support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,              mul_mm_iq1_m_f32,               support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,             mul_mm_iq4_nl_f32,              support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,             mul_mm_iq4_xs_f32,              support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,             mul_mm_id_f32_f32,              support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,             mul_mm_id_f16_f32,              support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,            mul_mm_id_q4_0_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,            mul_mm_id_q4_1_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,            mul_mm_id_q5_0_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,            mul_mm_id_q5_1_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,            mul_mm_id_q8_0_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,            mul_mm_id_q2_K_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,            mul_mm_id_q3_K_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,            mul_mm_id_q4_K_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,            mul_mm_id_q5_K_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,            mul_mm_id_q6_K_f32,             support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,         mul_mm_id_iq2_xxs_f32,          support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,          mul_mm_id_iq2_xs_f32,           support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,         mul_mm_id_iq3_xxs_f32,          support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,           mul_mm_id_iq3_s_f32,            support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,           mul_mm_id_iq2_s_f32,            support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,           mul_mm_id_iq1_s_f32,            support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,           mul_mm_id_iq1_m_f32,            support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,          mul_mm_id_iq4_nl_f32,           support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,          mul_mm_id_iq4_xs_f32,           support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                 rope_norm_f32,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
@@ -643,14 +692,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,          argsort_f32_i32_desc,           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,                leaky_relu_f32,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        ctx->support_simdgroup_mm);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,         support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,         support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,         support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        support_simdgroup_mm);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,                   cpy_f32_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,                   cpy_f16_f16,                    true);
@@ -684,7 +733,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     Block_release(ctx->encode_async);
 
     [ctx->queue release];
-    [ctx->device release];
 
     dispatch_release(ctx->d_queue);
 
@@ -742,13 +790,16 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
     return nil;
 }
 
-static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
+static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) {
     for (size_t i = 0, n = 3; i < n; ++i) {
         if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
             return false;
         }
     }
 
+    const bool support_simdgroup_mm        = ctx_dev->support_simdgroup_mm;
+    const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction;
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -786,7 +837,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
-            return ctx->support_simdgroup_reduction;
+            return support_simdgroup_reduction;
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
             return true;
@@ -812,13 +863,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
             if (op->src[0]->ne[0] == 256) {
                 return false;
             }
-            return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
+            return support_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_SSM_CONV:
         case GGML_OP_SSM_SCAN:
             return true;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction &&
+            return support_simdgroup_reduction &&
                 (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
         case GGML_OP_CPY:
         case GGML_OP_DUP:
@@ -862,9 +913,12 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
 }
 
 static void ggml_metal_encode_node(
-     struct ggml_backend_metal_context * ctx,
+                        ggml_backend_t   backend,
                                    int   idx,
           id<MTLComputeCommandEncoder>   encoder) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
     struct ggml_cgraph * gf = ctx->gf;
 
     struct ggml_tensor * node = ggml_graph_node(gf, idx);
@@ -894,7 +948,7 @@ static void ggml_metal_encode_node(
             } break;
     }
 
-    if (!ggml_metal_supports_op(ctx, dst)) {
+    if (!ggml_metal_supports_op(ctx_dev, dst)) {
         GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
         GGML_ABORT("unsupported op");
     }
@@ -967,6 +1021,8 @@ static void ggml_metal_encode_node(
     //            dst->name);
     //}
 
+    id<MTLDevice> device = ctx_dev->mtl_device;
+
     switch (dst->op) {
         case GGML_OP_CONCAT:
             {
@@ -1675,7 +1731,7 @@ static void ggml_metal_encode_node(
                 // the numbers below are measured on M2 Ultra for 7B and 13B models
                 // these numbers do not translate to other devices or model sizes
                 // TODO: need to find a better approach
-                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+                        if ([device.name isEqualToString:@"Apple M2 Ultra"]) {
                             switch (src0t) {
                                 case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
                                 case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
@@ -1695,7 +1751,7 @@ static void ggml_metal_encode_node(
 
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                        if ([device supportsFamily:MTLGPUFamilyApple7] &&
                                 !ggml_is_transposed(src0) &&
                                 !ggml_is_transposed(src1) &&
                                 src1t == GGML_TYPE_F32 &&
@@ -1990,7 +2046,7 @@ static void ggml_metal_encode_node(
                 // ne21 = n_rows
                 const int dst_rows = ne20*ne21;
                 const int dst_rows_min = n_as;
-                const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
+                const int dst_rows_max = (device.maxThreadgroupMemoryLength - 32 - 8192)/4;
 
                 // max size of the rowids array in the kernel shared buffer
                 GGML_ASSERT(dst_rows <= dst_rows_max);
@@ -2001,7 +2057,7 @@ static void ggml_metal_encode_node(
                 // TODO: for now, always use mat-vec kernels until we figure out how to improve the
                 //       indirect matrix multiplication
                 // !!!
-                if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                if ([device supportsFamily:MTLGPUFamilyApple7] &&
                         ne00 % 32 == 0 && ne00 >= 64 &&
                         dst_rows > dst_rows_min) {
 
@@ -2840,7 +2896,7 @@ static void ggml_metal_encode_node(
 
                     while (true) {
                         const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2);
-                        if (smem > ctx->device.maxThreadgroupMemoryLength) {
+                        if (smem > device.maxThreadgroupMemoryLength) {
                             break;
                         }
                         nsgmax *= 2;
@@ -2852,8 +2908,8 @@ static void ggml_metal_encode_node(
 
                     const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2);
 
-                    //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
-                    GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
+                    //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength);
+                    GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength);
 
                     [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
 
@@ -2878,8 +2934,8 @@ static void ggml_metal_encode_node(
 
                     const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2);
 
-                    //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
-                    GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
+                    //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength);
+                    GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength);
                     [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
 
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
@@ -2954,8 +3010,11 @@ static void ggml_metal_encode_node(
 }
 
 static enum ggml_status ggml_metal_graph_compute(
-        struct ggml_backend_metal_context * ctx,
-                       struct ggml_cgraph * gf) {
+            ggml_backend_t   backend,
+        struct ggml_cgraph * gf) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
     // number of nodes encoded by the main thread (empirically determined)
     const int n_main = 128;
 
@@ -2983,7 +3042,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
             if (!ctx->capture_started) {
                 // create capture scope
-                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx_dev->mtl_device];
 
                 MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
                 descriptor.captureObject = ctx->capture_scope;
@@ -3087,31 +3146,6 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // backend interface
 
-// default buffer
-static id<MTLDevice> g_backend_device = nil;
-static int g_backend_device_ref_count = 0;
-
-static id<MTLDevice> ggml_backend_metal_get_device(void) {
-    if (g_backend_device == nil) {
-        g_backend_device = MTLCreateSystemDefaultDevice();
-    }
-
-    g_backend_device_ref_count++;
-
-    return g_backend_device;
-}
-
-static void ggml_backend_metal_free_device(void) {
-    assert(g_backend_device_ref_count > 0);
-
-    g_backend_device_ref_count--;
-
-    if (g_backend_device_ref_count == 0) {
-        [g_backend_device release];
-        g_backend_device = nil;
-    }
-}
-
 static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "Metal";
 
@@ -3124,7 +3158,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
-    ggml_backend_metal_free_device();
+    ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
 #if TARGET_OS_OSX
@@ -3227,7 +3261,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_get_device();
+    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
 
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -3241,16 +3275,16 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 
         if (size_aligned > 0) {
             ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                            length:size_aligned
-                            options:MTLResourceStorageModeShared
-                            deallocator:nil];
+                                            length:size_aligned
+                                            options:MTLResourceStorageModeShared
+                                            deallocator:nil];
         }
     }
 
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_free_device();
+        ggml_backend_metal_device_rel(buft->device->context);
         return NULL;
     }
 
@@ -3265,9 +3299,9 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 }
 
 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    id<MTLDevice> device = ggml_backend_metal_get_device();
-    size_t max_size = device.maxBufferLength;
-    ggml_backend_metal_free_device();
+    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    const size_t max_size = device.maxBufferLength;
+    ggml_backend_metal_device_rel(buft->device->context);
 
     return max_size;
 
@@ -3290,15 +3324,14 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
         },
-        /* .device  = */ NULL,
+        /* .device  = */ &g_ggml_backend_metal_device,
         /* .context = */ NULL,
     };
 
     return &ggml_backend_buffer_type_metal;
 }
 
-// buffer from ptr
-
+// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
 ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
     struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -3321,7 +3354,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_get_device();
+    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -3386,8 +3419,12 @@ static const char * ggml_backend_metal_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    ggml_backend_metal_device_rel(ctx_dev);
     ggml_metal_free(ctx);
+
     free(backend);
 }
 
@@ -3398,21 +3435,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
 }
 
 static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
-
-    return ggml_metal_graph_compute(metal_ctx, cgraph);
-}
-
-static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
-
-    return ggml_metal_supports_op(metal_ctx, op);
-}
-
-static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
-
-    UNUSED(backend);
+    return ggml_metal_graph_compute(backend, cgraph);
 }
 
 static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
@@ -3459,7 +3482,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
                 [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
             }
 
-            ggml_metal_encode_node(ctx, idx, encoder);
+            ggml_metal_encode_node(backend, idx, encoder);
 
             if (should_capture) {
                 [encoder popDebugGroup];
@@ -3487,8 +3510,8 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
-    /* .supports_op             = */ ggml_backend_metal_supports_op,
-    /* .supports_buft           = */ ggml_backend_metal_supports_buft,
+    /* .supports_op             = */ NULL,
+    /* .supports_buft           = */ NULL,
     /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
@@ -3499,8 +3522,11 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
     return &guid;
 }
 
+// TODO: remove in the future
 ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_backend_metal_context * ctx = ggml_metal_init();
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), 0);
+
+    struct ggml_backend_metal_context * ctx = ggml_metal_init(dev);
     if (ctx == NULL) {
         GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
@@ -3511,7 +3537,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
     *backend = (struct ggml_backend) {
         /* .guid      = */ ggml_backend_metal_guid(),
         /* .interface = */ ggml_backend_metal_i,
-        /* .device    = */ NULL,
+        /* .device    = */ dev,
         /* .context   = */ ctx,
     };
 
@@ -3536,9 +3562,9 @@ void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_ca
 bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     GGML_ASSERT(ggml_backend_is_metal(backend));
 
-    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+    return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 
 void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
@@ -3548,11 +3574,246 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
     ctx->capture_next_compute = true;
 }
 
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
+// backend device
 
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
-    return ggml_backend_metal_init();
+static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
+    return "Metal";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
+    // acq/rel just to populate ctx->name in case it hasn't been done yet
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
+    ggml_backend_metal_device_acq(ctx_dev);
+    ggml_backend_metal_device_rel(ctx_dev);
+
+    return ctx_dev->name;
+}
+
+static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
+        id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+        *total = device.recommendedMaxWorkingSetSize;
+        *free  = *total - device.currentAllocatedSize;
+
+        ggml_backend_metal_device_rel(ctx_dev);
+    } else {
+        *free = 1;
+        *total = 1;
+    }
+}
+
+static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_metal_device_get_name(dev);
+    props->description = ggml_backend_metal_device_get_description(dev);
+    props->type        = ggml_backend_metal_device_get_type(dev);
+    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = (struct ggml_backend_dev_caps) {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
+    struct ggml_backend_metal_context * ctx = ggml_metal_init(dev);
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
+
+    *backend = (struct ggml_backend) {
+        /* .guid      = */ ggml_backend_metal_guid(),
+        /* .interface = */ ggml_backend_metal_i,
+        /* .device    = */ dev,
+        /* .context   = */ ctx,
+    };
+
+    ggml_backend_metal_set_n_cb(backend, 1);
+
+    return backend;
 
     GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_metal_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = ptr;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    // page-align the data ptr
+    {
+        const uintptr_t offs = (uintptr_t) ptr % size_page;
+        ptr  = (void *) ((char *) ptr - offs);
+        size += offs;
+    }
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data  = ptr;
+        ctx->buffers[ctx->n_buffers].size  = size;
+        ctx->buffers[ctx->n_buffers].metal = nil;
+
+        if (size_aligned > 0) {
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+        }
+
+        ggml_backend_metal_log_allocated_size(device, size_aligned);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data  = (void *) ((uint8_t *) ptr + i);
+            ctx->buffers[ctx->n_buffers].size  = size_step_aligned;
+            ctx->buffers[ctx->n_buffers].metal = nil;
+
+            if (size_step_aligned > 0) {
+                ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
+            }
+
+            ggml_backend_metal_log_allocated_size(device, size_step_aligned);
+
+            if (i + size_step < size) {
+                GGML_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    struct ggml_backend_metal_device_context * ctx_dev = dev->context;
+
+    return ggml_metal_supports_op(ctx_dev, op);
+}
+
+static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+    UNUSED(dev);
+}
+
+static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    return false;
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
+}
+
+static struct ggml_backend_device_i ggml_backend_metal_device_i = {
+    /* .get_name             = */ ggml_backend_metal_device_get_name,
+    /* .get_description      = */ ggml_backend_metal_device_get_description,
+    /* .get_memory           = */ ggml_backend_metal_device_get_memory,
+    /* .get_type             = */ ggml_backend_metal_device_get_type,
+    /* .get_props            = */ ggml_backend_metal_device_get_props,
+    /* .init_backend         = */ ggml_backend_metal_device_init,
+    /* .get_buffer_type      = */ ggml_backend_metal_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_metal_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_metal_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_metal_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend registry
+
+static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
+    return "Metal";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    return &g_ggml_backend_metal_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
+    /* .get_name         = */ ggml_backend_metal_reg_get_name,
+    /* .device_count     = */ ggml_backend_metal_reg_device_count,
+    /* .device_get       = */ ggml_backend_metal_reg_device_get,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_metal_reg(void) {
+    // TODO: make this thread-safe somehow?
+    {
+        g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
+            /* .iface   = */ ggml_backend_metal_reg_i,
+            /* .context = */ NULL,
+        };
+
+        g_ggml_backend_metal_device = (struct ggml_backend_device) {
+            /* .iface   = */ ggml_backend_metal_device_i,
+            /* .reg     = */ &g_ggml_backend_metal_reg,
+            /* .context = */ &g_ggml_ctx_dev_main,
+        };
+    }
+
+    return &g_ggml_backend_metal_reg;
 }
diff --git a/src/llama.cpp b/src/llama.cpp
index bf6fd9277..77df74723 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -26,10 +26,6 @@
 #  include "ggml-blas.h"
 #endif
 
-#ifdef GGML_USE_METAL
-#  include "ggml-metal.h"
-#endif
-
 // TODO: replace with ggml API call
 #define QK_K 256
 
@@ -3292,9 +3288,6 @@ struct llama_context {
     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
     std::vector<ggml_backend_t> backends;
-#ifdef GGML_USE_METAL
-    ggml_backend_t backend_metal = nullptr;
-#endif
 #ifdef GGML_USE_BLAS
     ggml_backend_t backend_blas = nullptr;
 #endif
@@ -3420,9 +3413,7 @@ static int llama_get_device_count(const llama_model & model) {
     count += (int) model.rpc_servers.size();
 #endif
 
-#if defined(GGML_USE_METAL)
-    count += 1;
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
     count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     count += ggml_backend_vk_get_device_count();
@@ -3488,9 +3479,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
     device -= (int)model.devices.size();
 
-#if defined(GGML_USE_METAL)
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(device);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(device);
@@ -8918,48 +8907,39 @@ static bool llm_load_tensors(
         llama_buf_map bufs;
         bufs.reserve(n_max_backend_buffer);
 
-        // only the mmap region containing the tensors in the model is mapped to the backend buffer
-        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
-        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
+        // check if this backend device supports buffer_from_host_ptr
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        bool buffer_from_host_ptr_supported = false;
+        if (dev) {
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+        }
+
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                // only the mmap region containing the tensors in the model is mapped to the backend buffer
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
-                size_t first, last;
+                size_t first, last; // NOLINT
                 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                 if (first >= last) {
                     continue;
                 }
-                ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
-                if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend CPU buffer");
-                }
-                model.bufs.push_back(buf);
-                bufs.emplace(idx, buf);
-            }
-        }
-#ifdef GGML_USE_METAL
-        else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
-            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                void * addr = nullptr;
-                size_t first, last;
-                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-                if (first >= last) {
-                    continue;
-                }
-                ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend metal buffer");
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
                 model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
-#endif
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
-                throw std::runtime_error("unable to allocate backend buffer");
+                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
             model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
@@ -19041,7 +19021,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_METAL)   || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
@@ -19344,17 +19324,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
 
-#if defined(GGML_USE_METAL)
-        if (model->n_gpu_layers > 0) {
-            ctx->backend_metal = ggml_backend_metal_init();
-            if (ctx->backend_metal == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(ctx->backend_metal);
-        }
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
         if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
             llama_free(ctx);

From 6279dac039ddeb6d5ebd125a6274fd3c37a77ba8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 7 Oct 2024 19:35:42 +0300
Subject: [PATCH 025/396] flake.lock: Update (#9753)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'flake-parts':
    'github:hercules-ci/flake-parts/bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a?narHash=sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U%3D' (2024-09-12)
  → 'github:hercules-ci/flake-parts/3d04084d54bedc3d6b8b736c70ef449225c361b1?narHash=sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0%3D' (2024-10-01)
• Updated input 'flake-parts/nixpkgs-lib':
    'https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz?narHash=sha256-Ss8QWLXdr2JCBPcYChJhz4xJm%2Bh/xjl4G0c0XlP6a74%3D' (2024-09-01)
  → 'https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz?narHash=sha256-0xHYkMkeLVQAMa7gvkddbPqpxph%2BhDzdu1XdGPJR%2BOs%3D' (2024-10-01)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/1925c603f17fc89f4c8f6bf6f631a802ad85d784?narHash=sha256-J%2BPeFKSDV%2BpHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI%3D' (2024-09-26)
  → 'github:NixOS/nixpkgs/bc947f541ae55e999ffdb4013441347d83b00feb?narHash=sha256-NOiTvBbRLIOe5F6RbHaAh6%2B%2BBNjsb149fGZd1T4%2BKBg%3D' (2024-10-04)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/flake.lock b/flake.lock
index dde1ab527..3fb6ced51 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1726153070,
-        "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=",
+        "lastModified": 1727826117,
+        "narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a",
+        "rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1727348695,
-        "narHash": "sha256-J+PeFKSDV+pHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI=",
+        "lastModified": 1728018373,
+        "narHash": "sha256-NOiTvBbRLIOe5F6RbHaAh6++BNjsb149fGZd1T4+KBg=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "1925c603f17fc89f4c8f6bf6f631a802ad85d784",
+        "rev": "bc947f541ae55e999ffdb4013441347d83b00feb",
         "type": "github"
       },
       "original": {
@@ -36,14 +36,14 @@
     },
     "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1725233747,
-        "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
+        "lastModified": 1727825735,
+        "narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=",
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
       },
       "original": {
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
       }
     },
     "root": {

From f1af42fa8c925096407c61ff0a3d5d5d669cc535 Mon Sep 17 00:00:00 2001
From: Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Date: Mon, 7 Oct 2024 09:37:31 -0700
Subject: [PATCH 026/396] Update building for Android (#9672)

* docs : clarify building Android on Termux

* docs : update building Android on Termux

* docs : add cross-compiling for Android

* cmake : link dl explicitly for Android
---
 docs/android.md         | 109 +++++++++++++++++++++++++---------------
 ggml/src/CMakeLists.txt |   4 ++
 2 files changed, 72 insertions(+), 41 deletions(-)

diff --git a/docs/android.md b/docs/android.md
index cec4358d9..320b62240 100644
--- a/docs/android.md
+++ b/docs/android.md
@@ -2,55 +2,82 @@
 # Android
 
 ## Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
 
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
+[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
 
-[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
-## Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=<your_ndk_directory>
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
 
 ```
-$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$ apt update && apt upgrade -y
+$ apt install git cmake
 ```
 
-Now, you can start chatting:
+Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
+
+Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
+
 ```
-$cd /data/data/com.termux/files/home/bin
-$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+$ curl -L {model-url} -o ~/{model}.gguf
 ```
 
-Here's a demo of an interactive session running on Pixel 5 phone:
+Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
+
+```
+$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+```
+
+Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+
+To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
 
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
+
+## Cross-compile using Android NDK
+It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
+
+Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
+
+```
+$ cmake \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DCMAKE_C_FLAGS="-march=armv8.7a" \
+  -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
+  -DGGML_OPENMP=OFF \
+  -DGGML_LLAMAFILE=OFF \
+  -B build-android
+```
+
+Notes:
+  - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
+  - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
+
+The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
+
+Feel free to adjust the Android ABI for your target. Once the project is configured:
+
+```
+$ cmake --build build-android --config Release -j{n}
+$ cmake --install build-android --prefix {install-dir} --config Release
+```
+
+After installing, go ahead and download the model of your choice to your host system. Then:
+
+```
+$ adb shell "mkdir /data/local/tmp/llama.cpp"
+$ adb push {install-dir} /data/local/tmp/llama.cpp/
+$ adb push {model}.gguf /data/local/tmp/llama.cpp/
+$ adb shell
+```
+
+In the `adb shell`:
+
+```
+$ cd /data/local/tmp/llama.cpp
+$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
+```
+
+That's it!
+
+Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`.
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 286bec255..03cff4a99 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1361,6 +1361,10 @@ if (MATH_LIBRARY)
     endif()
 endif()
 
+if (CMAKE_SYSTEM_NAME MATCHES "Android")
+    list(APPEND GGML_EXTRA_LIBS_PRIVATE dl) # Must be linked explicitly
+endif()
+
 list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
 list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
 target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})

From 6374743747b14db4eb73ce82ae449a2978bc3b47 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 7 Oct 2024 21:55:08 +0200
Subject: [PATCH 027/396] ggml : add backend registry / device interfaces to
 BLAS backend (#9752)

* ggml : add backend registry / device interfaces to BLAS backend

* fix mmap usage when using host buffers
---
 ggml/include/ggml-backend.h  |   1 +
 ggml/include/ggml-blas.h     |   2 +
 ggml/src/CMakeLists.txt      |  14 +-
 ggml/src/ggml-backend-impl.h |  14 +-
 ggml/src/ggml-backend.cpp    |  27 +++-
 ggml/src/ggml-blas.cpp       | 256 ++++++++++++++++++++++++++++-------
 src/llama.cpp                |  72 ++++++----
 tests/test-backend-ops.cpp   |   6 +-
 8 files changed, 293 insertions(+), 99 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 152b9adb0..5933b8e8f 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -170,6 +170,7 @@ extern "C" {
 
     // Functions that may be obtained using ggml_backend_reg_get_proc_address
     typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
+    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
 
     //
     // Backend registry
diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h
index dd612860d..25b2e637f 100644
--- a/ggml/include/ggml-blas.h
+++ b/ggml/include/ggml-blas.h
@@ -17,6 +17,8 @@ GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
 // for openblas and blis, this will also set the number of threads used for blas operations
 GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
+GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 03cff4a99..f126ebf7e 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -190,22 +190,24 @@ if (GGML_BLAS)
             # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
             find_package(PkgConfig REQUIRED)
             if (${GGML_BLAS_VENDOR} MATCHES "Generic")
-                pkg_check_modules(DepBLAS REQUIRED blas)
+                pkg_check_modules(DepBLAS blas)
             elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
                 # As of openblas v0.3.22, the 64-bit is named openblas64.pc
                 pkg_check_modules(DepBLAS openblas64)
                 if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
+                    pkg_check_modules(DepBLAS openblas)
                 endif()
             elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
-                pkg_check_modules(DepBLAS REQUIRED blis)
+                add_compile_definitions(GGML_BLAS_USE_BLIS)
+                pkg_check_modules(DepBLAS blis)
             elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
-                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
+                pkg_check_modules(DepBLAS blas-atlas)
             elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
-                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
+                pkg_check_modules(DepBLAS flexiblas_api)
             elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+                add_compile_definitions(GGML_BLAS_USE_MKL)
                 # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+                pkg_check_modules(DepBLAS mkl-sdl)
             elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
                 # this doesn't provide pkg-config
                 # suggest to assign BLAS_INCLUDE_DIRS on your own
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index ba2e26999..fd3deae00 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -88,6 +88,7 @@ extern "C" {
 
         void (*free)(ggml_backend_t backend);
 
+        // Will be moved to the device interface
         // buffer allocation
         ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
 
@@ -112,17 +113,9 @@ extern "C" {
 
         // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
         //            new backends should implement the device interface instead
-
         // These functions are being moved to the device interface
-        // check if the backend can compute an operation
         bool (*supports_op)  (ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend can use tensors allocated in a buffer type
         bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
         bool (*offload_op)   (ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
@@ -184,9 +177,8 @@ extern "C" {
         // check if the backend can use tensors allocated in a buffer type
         bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
 
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
+        // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
+        // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
         bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
 
         // (optional) event synchronization
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4f3e9374c..fbd49d13d 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -500,7 +500,11 @@ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buff
 }
 
 bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
-    return device->iface.offload_op(device, op);
+    if (device->iface.offload_op != NULL) {
+        return device->iface.offload_op(device, op);
+    }
+
+    return false;
 }
 
 // Backend (reg)
@@ -534,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+#include "ggml-blas.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -545,10 +553,13 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_METAL
         register_backend(ggml_backend_metal_reg());
 #endif
-
-        register_backend(ggml_backend_cpu_reg());
+#ifdef GGML_USE_BLAS
+        register_backend(ggml_backend_blas_reg());
+#endif
 
         // TODO: sycl, vulkan, kompute, cann
+
+        register_backend(ggml_backend_cpu_reg());
     }
 
     void register_backend(ggml_backend_reg_t reg) {
@@ -1229,16 +1240,22 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
     };
 
     return &ggml_backend_cpu_device;
+}
+
+static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_cpu_set_n_threads;
+    }
+    return NULL;
 
     GGML_UNUSED(reg);
-    GGML_UNUSED(index);
 }
 
 static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
     /* .get_name         = */ ggml_backend_cpu_reg_get_name,
     /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
     /* .get_device       = */ ggml_backend_cpu_reg_get_device,
-    /* .get_proc_address = */ NULL,
+    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
 };
 
 ggml_backend_reg_t ggml_backend_cpu_reg(void) {
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index b850e6a8d..0c6574de5 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -4,6 +4,7 @@
 
 #include <future>
 #include <vector>
+#include <cstring>
 
 #if defined(GGML_USE_ACCELERATE)
 #   include <Accelerate/Accelerate.h>
@@ -26,30 +27,6 @@ struct ggml_backend_blas_context {
 #endif
 };
 
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-
 static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -235,7 +212,7 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
 
 // backend interface
 
-static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
     return "BLAS";
 
     GGML_UNUSED(backend);
@@ -285,29 +262,8 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-
-    return (op->op == GGML_OP_MUL_MAT  && ggml_backend_blas_use_blas(op)) ||
-           (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
-                                          op->src[1]->type == GGML_TYPE_F32 &&
-                                          ggml_is_matrix(src0) &&
-                                          ggml_is_matrix(src1) &&
-                                          ggml_is_contiguous(src0) &&
-                                          (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(backend);
-}
-
 static struct ggml_backend_i blas_backend_i = {
-    /* .get_name                = */ ggml_backend_blas_name,
+    /* .get_name                = */ ggml_backend_blas_get_name,
     /* .free                    = */ ggml_backend_blas_free,
     /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
@@ -319,8 +275,8 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
-    /* .supports_op             = */ ggml_backend_blas_supports_op,
-    /* .supports_buft           = */ ggml_backend_blas_supports_buft,
+    /* .supports_op             = */ NULL,
+    /* .supports_buft           = */ NULL,
     /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
@@ -337,7 +293,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_blas_guid(),
         /* .interface = */ blas_backend_i,
-        /* .device    = */ nullptr,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
         /* .context   = */ ctx,
     };
 
@@ -364,3 +320,203 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
     ctx->n_threads = n_threads;
 }
+
+// device interface
+
+static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
+    return "BLAS";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
+    #if defined(GGML_USE_ACCELERATE)
+        return "Accelerate";
+    #elif defined(GGML_BLAS_USE_MKL)
+        return "MKL";
+    #elif defined(GGML_BLAS_USE_BLIS)
+        return "BLIS";
+    #elif defined(GGML_BLAS_USE_NVPL)
+        return "NVPL";
+    #elif defined(OPENBLAS_VERSION)
+        return "OpenBLAS";
+    #else
+        return "BLAS";
+    #endif
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_blas_device_get_name(dev);
+    props->description = ggml_backend_blas_device_get_description(dev);
+    props->type        = ggml_backend_blas_device_get_type(dev);
+    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_blas_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT:
+        {
+            // BLAS usually is only faster for large matrices
+            const struct ggml_tensor * src0 = op->src[0];
+            const struct ggml_tensor * src1 = op->src[1];
+
+            const int64_t ne10 = src1->ne[0];
+
+            const int64_t ne0 = op->ne[0];
+            const int64_t ne1 = op->ne[1];
+
+            // TODO: find the optimal value
+            const int64_t min_batch = 32;
+
+            return (ggml_is_contiguous(src0) &&
+                    ggml_is_contiguous(src1) &&
+                    src1->type == GGML_TYPE_F32 &&
+                    (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
+        }
+
+        case GGML_OP_OUT_PROD:
+            return (op->src[0]->type == GGML_TYPE_F32 &&
+                    op->src[1]->type == GGML_TYPE_F32 &&
+                    ggml_is_matrix(src0) &&
+                    ggml_is_matrix(src1) &&
+                    ggml_is_contiguous(src0) &&
+                    (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
+
+        default:
+            return false;
+
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
+    /* .get_name             = */ ggml_backend_blas_device_get_name,
+    /* .get_description      = */ ggml_backend_blas_device_get_description,
+    /* .get_memory           = */ ggml_backend_blas_device_get_memory,
+    /* .get_type             = */ ggml_backend_blas_device_get_type,
+    /* .get_props            = */ ggml_backend_blas_device_get_props,
+    /* .init_backend         = */ ggml_backend_blas_device_init,
+    /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_blas_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
+    return "BLAS";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device ggml_backend_blas_device = {
+        /* .iface   = */ ggml_backend_blas_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_blas_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_blas_set_n_threads;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
+    /* .get_name         = */ ggml_backend_blas_reg_get_name,
+    /* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_blas_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_blas_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_blas_reg(void) {
+    static struct ggml_backend_reg ggml_backend_blas_reg = {
+        /* .iface   = */ ggml_backend_blas_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_blas_reg;
+}
diff --git a/src/llama.cpp b/src/llama.cpp
index 77df74723..3fb8132f0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22,10 +22,6 @@
 #   include "ggml-cann.h"
 #endif
 
-#ifdef GGML_USE_BLAS
-#  include "ggml-blas.h"
-#endif
-
 // TODO: replace with ggml API call
 #define QK_K 256
 
@@ -3288,9 +3284,8 @@ struct llama_context {
     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
     std::vector<ggml_backend_t> backends;
-#ifdef GGML_USE_BLAS
-    ggml_backend_t backend_blas = nullptr;
-#endif
+    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+
     ggml_backend_t backend_cpu = nullptr;
 
     ggml_threadpool_t threadpool       = nullptr;
@@ -8908,7 +8903,8 @@ static bool llm_load_tensors(
         bufs.reserve(n_max_backend_buffer);
 
         // check if this backend device supports buffer_from_host_ptr
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        // when using a host buffer as the CPU bakcend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? ggml_backend_cpu_buffer_type() : buft);
         bool buffer_from_host_ptr_supported = false;
         if (dev) {
             ggml_backend_dev_props props;
@@ -17048,17 +17044,19 @@ static void llama_graph_compute(
                     int   n_threads,
         ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
-#ifdef GGML_USE_BLAS
-    if (lctx.backend_blas != nullptr) {
-        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
-    }
-#endif
 
-    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    // set the number of threads for all the backends
+    for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
+        set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
+    }
+
+    auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    if (err != GGML_STATUS_SUCCESS) {
+        LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
+    }
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 }
@@ -19110,9 +19108,16 @@ struct llama_model * llama_load_model_from_file(
     // TODO: rework API to give user more control over device selection
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        // skip the CPU backend since it is handled separately
-        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
-            model->devices.push_back(dev);
+        switch (ggml_backend_dev_type(dev)) {
+            case GGML_BACKEND_DEVICE_TYPE_CPU:
+            case GGML_BACKEND_DEVICE_TYPE_CPU_FULL:
+                // skip CPU backends since they are `handled separately
+                break;
+
+            case GGML_BACKEND_DEVICE_TYPE_GPU:
+            case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+                model->devices.push_back(dev);
+                break;
         }
     }
 
@@ -19407,14 +19412,19 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
 
-#ifdef GGML_USE_BLAS
-        ctx->backend_blas = ggml_backend_blas_init();
-        if (ctx->backend_blas == nullptr) {
-            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
-        } else {
-            ctx->backends.push_back(ctx->backend_blas);
+        // add other backends (such as BLAS)
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-#endif
 
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
@@ -19424,6 +19434,18 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_cpu);
 
+        // create a list of the set_n_threads functions in the backends
+        for (auto * backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                if (ggml_backend_set_n_threads_fn) {
+                    ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn);
+                }
+            }
+        }
+
         if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a10d98e35..fa26cc653 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3820,9 +3820,11 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
             // TODO: better value for n_threads
-            ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
+            ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
         }
 
         printf("  Device description: %s\n", ggml_backend_dev_description(dev));

From fa42aa6d8902cc4eaf31866b3b3b7b61b69da930 Mon Sep 17 00:00:00 2001
From: standby24x7 <standby24x7@gmail.com>
Date: Tue, 8 Oct 2024 15:19:53 +0900
Subject: [PATCH 028/396] scripts : fix spelling typo in messages and comments
 (#9782)

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
---
 scripts/debug-test.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh
index 91946c514..c6c1e988a 100755
--- a/scripts/debug-test.sh
+++ b/scripts/debug-test.sh
@@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
 ###########################################################
 
 # Note: test-eval-callback requires -DLLAMA_CURL
-cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
+cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment"
 pushd "$build_dir"
 make -j || abort "Failed to compile"
 popd > /dev/null || exit 1
@@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
 pushd "$build_dir"
 tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
 if [ ${#tests[@]} -eq 0 ]; then
-    abort "No tests avaliable... check your compliation process..."
+    abort "No tests available... check your compilation process..."
 fi
 popd > /dev/null || exit 1
 
@@ -137,7 +137,7 @@ popd > /dev/null || exit 1
 
 # Select test number
 if [ -z $test_number ]; then
-    # List out avaliable tests
+    # List out available tests
     printf "Which test would you like to debug?\n"
     id=0
     for s in "${tests[@]}"

From 458367a90606448a9c0262b276947c9e536086e0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 8 Oct 2024 13:27:04 +0200
Subject: [PATCH 029/396] server : better security control for public
 deployments (#9776)

* server : more explicit endpoint access settings

* protect /props endpoint

* fix tests

* update server docs

* fix typo

* fix tests
---
 common/arg.cpp                                |  16 ++-
 common/common.h                               |   5 +-
 examples/server/README.md                     |  55 +++-----
 examples/server/server.cpp                    | 123 ++++++++----------
 .../server/tests/features/security.feature    |  22 ++--
 examples/server/tests/features/steps/steps.py |   4 +-
 examples/server/utils.hpp                     |  13 ++
 src/unicode-data.cpp                          |   4 +-
 8 files changed, 126 insertions(+), 116 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 2a85ad845..7f5c05a34 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1838,9 +1838,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+    add_opt(llama_arg(
+        {"--slots"},
+        format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(llama_arg(
+        {"--props"},
+        format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(llama_arg(
         {"--no-slots"},
-        format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        "disables slots monitoring endpoint",
         [](gpt_params & params) {
             params.endpoint_slots = false;
         }
diff --git a/common/common.h b/common/common.h
index 8b84cf9ad..65add1f30 100644
--- a/common/common.h
+++ b/common/common.h
@@ -290,7 +290,10 @@ struct gpt_params {
     std::string ssl_file_key  = "";                                                                         // NOLINT
     std::string ssl_file_cert = "";                                                                         // NOLINT
 
-    bool endpoint_slots   = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
+    bool endpoint_slots   = false;
+    bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
     bool log_json = false;
diff --git a/examples/server/README.md b/examples/server/README.md
index 6253de43c..09d1cf097 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co
 
 ## Usage
 
+<!-- Note for contributors: The list below is generated by llama-gen-docs -->
+
 **Common params**
 
 | Argument | Explanation |
@@ -149,7 +151,9 @@ The project is under active development, and we are [looking for feedback and co
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
@@ -380,8 +384,6 @@ node index.js
 
     `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
 
-    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-
     `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
 
 **Response format**
@@ -519,34 +521,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B
 
 Takes a prefix and a suffix and returns the predicted completion as stream.
 
-    *Options:*
+*Options:*
 
-    `input_prefix`: Set the prefix of the code to infill.
+- `input_prefix`: Set the prefix of the code to infill.
+- `input_suffix`: Set the suffix of the code to infill.
 
-    `input_suffix`: Set the suffix of the code to infill.
+It also accepts all the options of `/completion` except `stream` and `prompt`.
 
-    It also accepts all the options of `/completion` except `stream` and `prompt`.
+### **GET** `/props`: Get server global properties.
 
-- **GET** `/props`: Return current server settings.
+This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
 
 **Response format**
 
 ```json
 {
-  "assistant_name": "",
-  "user_name": "",
+  "system_prompt": "",
   "default_generation_settings": { ... },
   "total_slots": 1,
   "chat_template": ""
 }
 ```
 
-- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
-- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template
 
+### POST `/props`: Change server global properties.
+
+To use this endpoint with POST method, you need to start server with `--props`
+
+*Options:*
+
+- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
+
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
@@ -813,28 +822,6 @@ To know the `id` of the adapter, use GET `/lora-adapters`
 
 ## More examples
 
-### Change system prompt on runtime
-
-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
-
-`prompt`: Specify a context that you want all connecting clients to respect.
-
-`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-
-`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
-
-```json
-{
-    "system_prompt": {
-        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
-        "anti_prompt": "User:",
-        "assistant_name": "Assistant:"
-    }
-}
-```
-
-**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
-
 ### Interactive mode
 
 Check the sample in [chat.mjs](chat.mjs).
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 13e54e501..aedfca0d6 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1106,12 +1106,7 @@ struct server_context {
         SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
 
         system_prompt = sys_prompt;
-
-        // release all slots
-        for (server_slot & slot : slots) {
-            slot.release();
-        }
-
+        // update system_tokens and KV cache as soon as all slots are idle
         system_need_update = true;
         return true;
     }
@@ -1627,16 +1622,6 @@ struct server_context {
                         break;
                     }
 
-                    if (task.data.contains("system_prompt")) {
-                        std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
-                        system_prompt_set(sys_prompt);
-
-                        for (server_slot & slot : slots) {
-                            slot.n_past    = 0;
-                            slot.n_past_se = 0;
-                        }
-                    }
-
                     slot->reset();
 
                     slot->id_task   = task.id;
@@ -1862,10 +1847,6 @@ struct server_context {
     }
 
     void update_slots() {
-        if (system_need_update) {
-            system_prompt_update();
-        }
-
         // check if all slots are idle
         {
             bool all_idle = true;
@@ -1878,6 +1859,10 @@ struct server_context {
             }
 
             if (all_idle) {
+                if (system_need_update) {
+                    system_prompt_update();
+                }
+
                 SRV_INF("%s", "all slots are idle\n");
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
@@ -2536,20 +2521,10 @@ int main(int argc, char ** argv) {
     //
 
     auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
-        // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
-        static const std::unordered_set<std::string> protected_endpoints = {
-            "/props",
-            "/completion",
-            "/completions",
-            "/v1/completions",
-            "/chat/completions",
-            "/v1/chat/completions",
-            "/infill",
-            "/tokenize",
-            "/detokenize",
-            "/embedding",
-            "/embeddings",
-            "/v1/embeddings",
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/models",
+            "/v1/models",
         };
 
         // If API key is not set, skip validation
@@ -2557,8 +2532,8 @@ int main(int argc, char ** argv) {
             return true;
         }
 
-        // If path is not in protected_endpoints list, skip validation
-        if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+        // If path is public, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end()) {
             return true;
         }
 
@@ -2620,7 +2595,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
         if (!params.endpoint_slots) {
-            res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
@@ -2869,24 +2844,31 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
-        std::string template_key = "tokenizer.chat_template", curr_tmpl;
-        int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
-        if (tlen > 0) {
-            std::vector<char> curr_tmpl_buf(tlen + 1, 0);
-            if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
-                curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
-            }
-        }
         json data = {
-            { "system_prompt",               ctx_server.system_prompt.c_str() },
+            { "system_prompt",               ctx_server.system_prompt },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params.n_parallel },
-            { "chat_template",               curr_tmpl.c_str() },
+            { "chat_template",               llama_get_chat_template(ctx_server.model) },
         };
 
         res_ok(res, data);
     };
 
+    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params.endpoint_props) {
+            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        json data = json::parse(req.body);
+        if (data.contains("system_prompt")) {
+            std::string system_prompt = data.at("system_prompt");
+            ctx_server.system_prompt_set(system_prompt);
+        }
+
+        res_ok(res, {{ "success", true }});
+    };
+
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
         if (ctx_server.params.embedding || ctx_server.params.reranking) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
@@ -3265,30 +3247,39 @@ int main(int argc, char ** argv) {
         svr->set_base_dir(params.public_path);
     }
 
-    // using embedded static files
-    svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-    svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+    if (!params.api_keys.empty()) {
+        // for now, if API key is set, web UI is unusable
+        svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
+            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
+        });
+    } else {
+        // using embedded static files
+        svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+        svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
 
-    // add new-ui files
-    svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
-    svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
-    svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
-    svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+        // add new-ui files
+        svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+        svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+        svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+        svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    }
 
     // register API routes
-    svr->Get ("/health",              handle_health);
+    svr->Get ("/health",              handle_health); // public endpoint (no API key check)
     svr->Get ("/metrics",             handle_metrics);
     svr->Get ("/props",               handle_props);
-    svr->Get ("/v1/models",           handle_models);
+    svr->Post("/props",               handle_props_change);
+    svr->Get ("/models",              handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check)
     svr->Post("/completion",          handle_completions); // legacy
     svr->Post("/completions",         handle_completions);
     svr->Post("/v1/completions",      handle_completions);
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
index eb82e7aca..0a3c5cc77 100644
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -5,7 +5,7 @@ Feature: Security
   Background: Server startup with an api key defined
     Given a server listening on localhost:8080
     And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a server api key llama.cpp
+    And   a server api key THIS_IS_THE_KEY
     Then  the server is starting
     Then  the server is healthy
 
@@ -16,11 +16,11 @@ Feature: Security
     And   a completion request with <api_error> api error
 
     Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackeme   | raised    |
-      |           | raised    |
+      | api_key         | api_error |
+      | THIS_IS_THE_KEY | no        |
+      | THIS_IS_THE_KEY | no        |
+      | hackeme         | raised    |
+      |                 | raised    |
 
   Scenario Outline: OAI Compatibility
     Given a system prompt test
@@ -32,10 +32,10 @@ Feature: Security
     Given an OAI compatible chat completions request with <api_error> api error
 
     Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackme    | raised    |
+      | api_key         | api_error |
+      | THIS_IS_THE_KEY | no        |
+      | THIS_IS_THE_KEY | no        |
+      | hackme          | raised    |
 
   Scenario Outline: OAI Compatibility (invalid response formats)
     Given a system prompt test
@@ -55,7 +55,7 @@ Feature: Security
 
 
   Scenario Outline: CORS Options
-    Given a user api key llama.cpp
+    Given a user api key THIS_IS_THE_KEY
     When  an OPTIONS request is sent from <origin>
     Then  CORS header <cors_header> is set to <cors_header_value>
 
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 2611614ba..540a2ecd5 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1299,7 +1299,8 @@ async def wait_for_slots_status(context,
 
     async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         while True:
-            async with await session.get(f'{base_url}/slots', params=params) as slots_response:
+            headers = {'Authorization': f'Bearer {context.server_api_key}'}
+            async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response:
                 status_code = slots_response.status
                 slots = await slots_response.json()
                 if context.debug:
@@ -1387,6 +1388,7 @@ def start_server_background(context):
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_listen_addr = context.server_fqdn
     server_args = [
+        '--slots', # requires to get slot status via /slots endpoint
         '--host', server_listen_addr,
         '--port', context.server_port,
     ]
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 47dfdfde5..452606cca 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -90,6 +90,19 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
     return formatted_chat;
 }
 
+static std::string llama_get_chat_template(const struct llama_model * model) {
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    } else {
+        std::vector<char> model_template(res, 0);
+        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size());
+    }
+}
+
 //
 // base64 utils (TODO: move to common in the future)
 //
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 07424bbab..04dcd7fcf 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };
 
-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
 const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
@@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
 {0x01E921, 0x01E943},
 };
 
-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
 const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},

From dca1d4b58a7f1acf1bd253be84e50d6367f492fd Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 8 Oct 2024 14:21:43 +0200
Subject: [PATCH 030/396] ggml : fix BLAS with unsupported types (#9775)

* ggml : do not use BLAS with types without to_float

* ggml : return pointer from ggml_internal_get_type_traits to avoid unnecessary copies

* ggml : rename ggml_internal_get_type_traits -> ggml_get_type_traits

it's not really internal if everybody uses it
---
 examples/export-lora/export-lora.cpp       |  4 +--
 examples/quantize-stats/quantize-stats.cpp | 10 +++----
 ggml/include/ggml.h                        |  6 ++--
 ggml/src/ggml-backend.cpp                  |  2 +-
 ggml/src/ggml-blas.cpp                     | 26 ++++++++++--------
 ggml/src/ggml-vulkan.cpp                   |  4 +--
 ggml/src/ggml.c                            |  6 ++--
 pocs/vdot/q8dot.cpp                        |  6 ++--
 pocs/vdot/vdot.cpp                         | 14 +++++-----
 src/llama.cpp                              |  9 +++---
 tests/test-backend-ops.cpp                 |  4 +--
 tests/test-quantize-fns.cpp                | 32 +++++++++++-----------
 tests/test-quantize-perf.cpp               | 26 +++++++++---------
 13 files changed, 75 insertions(+), 74 deletions(-)

diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 0051a5eb6..644d46a62 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -314,9 +314,9 @@ struct lora_merge_ctx {
             // optionally dequantize it
             printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
             auto nels = ggml_nelements(inp_base);
-            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            const auto * qtype = ggml_get_type_traits(base->type);
             std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
             ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
         } else {
             ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 498cbbe3c..e372856c6 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (qfns.from_float && qfns.to_float) {
+        const auto *  qfns = ggml_get_type_traits(type);
+        if (qfns->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n",  ggml_type_name(type));
             }
@@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
                 test_roundtrip_on_layer(
                         layer_name,
                         params.per_layer_stats,
-                        qfns,
+                        *qfns,
                         params.reference,
                         kv_tensor.second,
                         input_scratch,
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e7678d071..4508da4fb 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2535,7 +2535,7 @@ extern "C" {
     typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
                                        const void * GGML_RESTRICT y, int nr, int nc);
 
-    typedef struct {
+    struct ggml_type_traits {
         const char             * type_name;
         int64_t                  blck_size;
         int64_t                  blck_size_interleave; // interleave elements in blocks
@@ -2551,9 +2551,9 @@ extern "C" {
         int64_t                  ncols; // number of columns to process simultaneously
         ggml_gemv_t              gemv;
         ggml_gemm_t              gemm;
-    } ggml_type_traits_t;
+    };
 
-    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fbd49d13d..627b4dbc7 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
                 op->type != GGML_TYPE_IQ1_S   &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
         case GGML_OP_ROPE_BACK:
             return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
         case GGML_OP_IM2COL_BACK:
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index 0c6574de5..55f724586 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 
     // convert src0 to float
     if (type != GGML_TYPE_F32) {
-        ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
-        ggml_to_float_t const to_float = type_traits.to_float;
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;
 
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s
             // TODO: find the optimal value
             const int64_t min_batch = 32;
 
-            return (ggml_is_contiguous(src0) &&
-                    ggml_is_contiguous(src1) &&
-                    src1->type == GGML_TYPE_F32 &&
-                    (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
+            return ggml_is_contiguous(src0) &&
+                   ggml_is_contiguous(src1) &&
+                   src1->type == GGML_TYPE_F32 &&
+                   (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
         }
 
         case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 &&
-                    op->src[1]->type == GGML_TYPE_F32 &&
-                    ggml_is_matrix(src0) &&
-                    ggml_is_matrix(src1) &&
-                    ggml_is_contiguous(src0) &&
-                    (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   ggml_is_matrix(src0) &&
+                   ggml_is_matrix(src1) &&
+                   ggml_is_contiguous(src0) &&
+                   (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
 
         default:
             return false;
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 30bd376da..374c6ecd7 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
         return;
     }
 
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
+    const auto * tt = ggml_get_type_traits(quant);
 
-    ggml_to_float_t dequant_fn = tt.to_float;
+    ggml_to_float_t dequant_fn = tt->to_float;
 
     dequant_fn(from, to, ne);
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 03b832d0f..3f01092d9 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
 static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
 
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name                = "i8",
         .blck_size                = 1,
@@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
 };
 
 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
     GGML_ASSERT(type < GGML_TYPE_COUNT);
-    return type_traits[type];
+    return &type_traits[type];
 }
 
 //
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
index 1a52ff5e9..131d7c177 100644
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -136,7 +136,7 @@ int main(int argc, char** argv) {
 
     auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
 
-    auto funcs = ggml_internal_get_type_traits(ggml_type);
+    const auto * funcs = ggml_get_type_traits(ggml_type);
 
     Stat simple, ggml;
 
@@ -156,8 +156,8 @@ int main(int argc, char** argv) {
 
         t1 = std::chrono::high_resolution_clock::now();
         float fs;
-        if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
-        else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
+        if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
+        else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
         t2 = std::chrono::high_resolution_clock::now();
         t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
         if (iloop > 3) ggml.addResult(fs, t);
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index 17e9e4482..88e66ea13 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -236,7 +236,7 @@ int main(int argc, char** argv) {
     int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
     int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
 
-    auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
+    const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
 
     std::vector<block_q4_0> q40;
     std::vector<block_q4_1> q41;
@@ -261,9 +261,9 @@ int main(int argc, char** argv) {
         // Note, we do not include this in the timing as in practical application
         // we already have the quantized model weights.
         if (useQ4_1) {
-            funcs.from_float(x1.data(), q41.data(), kVecSize);
+            funcs->from_float(x1.data(), q41.data(), kVecSize);
         } else {
-            funcs.from_float(x1.data(), q40.data(), kVecSize);
+            funcs->from_float(x1.data(), q40.data(), kVecSize);
         }
 
         // Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +282,10 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
-            vdot.from_float(y1.data(), q8.data(), kVecSize);
-            if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-            else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+            const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
+            vdot->from_float(y1.data(), q8.data(), kVecSize);
+            if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+            else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
         }
         sumq += result;
         t2 = std::chrono::high_resolution_clock::now();
diff --git a/src/llama.cpp b/src/llama.cpp
index 3fb8132f0..01cdf17dc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17872,10 +17872,9 @@ static void llama_tensor_dequantize_internal(
     }
     float * f32_output = (float *) output.data();
 
-    ggml_type_traits_t qtype;
+    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
     if (ggml_is_quantized(tensor->type)) {
-        qtype = ggml_internal_get_type_traits(tensor->type);
-        if (qtype.to_float == NULL) {
+        if (qtype->to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
     } else if (tensor->type != GGML_TYPE_F16 &&
@@ -17889,7 +17888,7 @@ static void llama_tensor_dequantize_internal(
         } else if (tensor->type == GGML_TYPE_BF16) {
             ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
-            qtype.to_float(tensor->data, f32_output, nelements);
+            qtype->to_float(tensor->data, f32_output, nelements);
         } else {
             GGML_ABORT("fatal error"); // unreachable
         }
@@ -17925,7 +17924,7 @@ static void llama_tensor_dequantize_internal(
             } else if (typ == GGML_TYPE_BF16) {
                 ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
             } else {
-                qtype.to_float(inbuf, outbuf, nels);
+                qtype->to_float(inbuf, outbuf, nels);
             }
         };
         workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index fa26cc653..ee1a8877e 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -133,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     std::vector<uint8_t> buf(ggml_nbytes(t));
     ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
 
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+    const auto * tt = ggml_get_type_traits(t->type);
     size_t bs = ggml_blck_size(t->type);
     std::vector<float> vq(ggml_blck_size(t->type));
     bool quantized = ggml_is_quantized(t->type);
@@ -159,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     } else if (t->type == GGML_TYPE_I8) {
                         tv.push_back((float)*(int8_t *) &buf[i]);
                     } else if (quantized) {
-                        tt.to_float(&buf[i], vq.data(), bs);
+                        tt->to_float(&buf[i], vq.data(), bs);
                         tv.insert(tv.end(), vq.begin(), vq.end());
                     } else {
                         GGML_ABORT("fatal error");
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index ccf5721a3..d50417ba0 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 
 // Total dot product error
 static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+    const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
 ) {
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+    const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
 
-    qfns.from_float(test_data1, tmp_q1.data(), test_size);
-    vdot.from_float(test_data2, tmp_q2.data(), test_size);
+    qfns->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+    qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
 
         // deprecated - skip
-        if (qfns.blck_size == 0) {
+        if (qfns->blck_size == 0) {
             continue;
         }
 
@@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns->from_float && qfns->to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 24e066053..bdbdd90a8 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -122,9 +122,9 @@ static void usage(char * argv[]) {
     printf("  --type TYPE           set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns.from_float && qfns.to_float) {
+            if (qfns->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float_ref(test_data1, test_q1, size);
+                        qfns->from_float_ref(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float(test_data1, test_q1, size);
+                        qfns->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf("  dequantize_row_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
+                qfns->from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.to_float(test_q1, test_out, size);
+                        qfns->to_float(test_q1, test_out, size);
                         return test_out[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                        vdot.from_float(test_data1, test_q1, size);
+                        const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+                        vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf("  vec_dot_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
-                qfns.from_float(test_data2, test_q2, largest);
+                qfns->from_float(test_data1, test_q1, largest);
+                qfns->from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
                         float result;
-                        qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                        qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                         return result;
                     };
                     size_t quantized_size = ggml_row_size(type, size);

From 3dc48fe75ad48f8856118520a267c96f74df8e90 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 9 Oct 2024 10:55:42 +0300
Subject: [PATCH 031/396] examples : remove llama.vim

An updated version will be added in #9787
---
 examples/llama.vim | 135 ---------------------------------------------
 1 file changed, 135 deletions(-)
 delete mode 100644 examples/llama.vim

diff --git a/examples/llama.vim b/examples/llama.vim
deleted file mode 100644
index 1b5ad6ba0..000000000
--- a/examples/llama.vim
+++ /dev/null
@@ -1,135 +0,0 @@
-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
-"
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
-"
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-    let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-   let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}
-
-func s:callbackHandler(bufn, channel, msg)
-   if len(a:msg) < 3
-      return
-   elseif a:msg[0] == "d"
-      let l:msg = a:msg[6:-1]
-   else
-      let l:msg = a:msg
-   endif
-   let l:decoded_msg = json_decode(l:msg)
-   let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-   if len(l:newtext) > 0
-      call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-   else
-      echo "nothing genned"
-   endif
-   if len(newtext) > 1
-      let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-      let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-   endif
-   if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-       echo "Finished generation"
-   endif
-endfunction
-
-func llama#doLlamaGen()
-   if exists("b:job")
-      if job_status(b:job) == "run"
-         call job_stop(b:job)
-         return
-      endif
-   endif
-
-   let l:cbuffer = bufnr("%")
-   let s:linedict[l:cbuffer] = line('$')
-   let l:buflines = getbufline(l:cbuffer, 1, 1000)
-   let l:querydata = copy(s:querydata)
-   call extend(l:querydata, g:llama_overrides)
-   if exists("w:llama_overrides")
-      call extend(l:querydata, w:llama_overrides)
-   endif
-   if exists("b:llama_overrides")
-      call extend(l:querydata, b:llama_overrides)
-   endif
-   if l:buflines[0][0:1] == '!*'
-      let l:userdata = json_decode(l:buflines[0][2:-1])
-      call extend(l:querydata, l:userdata)
-      let l:buflines = l:buflines[1:-1]
-   endif
-   let l:querydata.prompt = join(l:buflines, "\n")
-   let l:curlcommand = copy(s:curlcommand)
-   if exists("g:llama_api_key")
-       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-   endif
-   let l:curlcommand[2] = json_encode(l:querydata)
-   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
-
-" Echos the tokkenization of the provided string , or cursor to end of word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-    if (a:0 > 0)
-        let l:input = a:1
-    else
-        exe "normal \"*ye"
-        let l:input = @*
-    endif
-    let l:querydata = {"content": l:input}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
-
-func s:tokenizeWordCallback(plaintext, channel, msg)
-    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
-
-
-" Echos the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-    if (a:0 > 0)
-        let l:buflines = a:1
-    else
-        let l:buflines = getline(1,1000)
-        if l:buflines[0][0:1] == '!*'
-            let l:buflines = l:buflines[1:-1]
-        endif
-        let l:buflines = join(l:buflines, "\n")
-    endif
-    let l:querydata = {"content": l:buflines}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
-endfunction
-
-func s:tokenCountCallback(channel, msg)
-    let resp = json_decode(a:msg)
-    echo len(resp.tokens)
-endfunction

From e7022064ab637ccb5f37867196f1802c4a453c91 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 9 Oct 2024 17:00:18 +0300
Subject: [PATCH 032/396] perplexity : fix integer overflow (#9783)

* perplexity : fix integer overflow

ggml-ci

* perplexity : keep n_vocab as int and make appropriate casts

ggml-ci
---
 examples/perplexity/perplexity.cpp | 85 +++++++++++++++++-------------
 1 file changed, 49 insertions(+), 36 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 87347135e..40bc29f7a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -169,7 +169,7 @@ static void process_logits(
                 break;
             }
             lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
             const double v = -results.log_softmax;
             local_nll += v;
             local_nll2 += v*v;
@@ -203,7 +203,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
                 break;
             }
             lock.unlock();
-            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
             local_nll += v;
             local_nll2 += v*v;
         }
@@ -281,7 +281,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
     kld.sum_kld  += sum;
     kld.sum_kld2 += sum*sum;
     ++kld.count;
-    if (imax == imax_base) ++kld.n_same_top;
+    if (imax == imax_base) {
+        ++kld.n_same_top;
+    }
 
     const float p_base = expf(-nll_base);
     const float p = expf(-nll);
@@ -323,7 +325,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
                 break;
             }
             lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
             kld_values[i]    = (float)v.first;
             p_diff_values[i] = v.second;
         }
@@ -383,9 +385,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1)  / params.ppl_stride;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
 
@@ -424,8 +427,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
-            const auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            const auto * batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
 
             if (j == 0) {
                 tokens[batch_start] = token_org;
@@ -447,11 +450,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
         //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
         for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
-
             // Calculate probability of next token, given the previous ones.
             const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+                logits.begin() + size_t(j + 0) * n_vocab,
+                logits.begin() + size_t(j + 1) * n_vocab);
 
             const float prob = softmax(tok_logits)[tokens[start + j + 1]];
             logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
@@ -521,9 +523,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;
@@ -538,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }
 
     LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
@@ -620,7 +623,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
             if (num_batches > 1 && n_outputs > 0) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
             }
         }
 
@@ -661,7 +664,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             } else {
                 double av = nll/count;
                 double av2 = nll2/count - av*av;
-                if (av2 > 0) av2 = sqrt(av2/(count-1));
+                if (av2 > 0) {
+                    av2 = sqrt(av2/(count-1));
+                }
                 LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
             }
         }
@@ -686,10 +691,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     return {tokens, ppl, logit_history, prob_history};
 }
 
-static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
     int prev_outputs = 0;
-    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+    for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+        const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
 
         llama_batch batch_view = {
             n_tokens,
@@ -713,7 +718,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
             n_outputs += batch_view.logits[i] != 0;
         }
 
-        memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+        memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
 
         prev_outputs += n_outputs;
     }
@@ -728,7 +733,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     if (eval_results.size() != eval_pairs.size()) {
         eval_results.resize(eval_pairs.size());
     }
-    if (eval_pairs.empty()) return;
+    if (eval_pairs.empty()) {
+        return;
+    }
 
     size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
 
@@ -736,11 +743,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
         float local_logprobs[K_TOKEN_CHUNK];
         while (true) {
-            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
-            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+            const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+            if (first >= eval_results.size()) {
+                break;
+            }
+            const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
             for (size_t i = first; i < last; ++i) {
-                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+                const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
                 float max_logit = logits[0];
                 for (int j = 1; j < n_vocab; ++j) {
                     max_logit = std::max(max_logit, logits[j]);
@@ -877,10 +886,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
     double acc = 0.0f;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
@@ -888,7 +898,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
     std::vector<float> tok_logits(n_vocab);
     // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -975,7 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             auto & hs_cur = hs_data[i];
 
             // get the logits of the last token of the common prefix
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
+            std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));
 
             const auto first_probs = softmax(tok_logits);
 
@@ -1158,10 +1168,11 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
     LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 128;
     const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
@@ -1169,7 +1180,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
     std::vector<float> tok_logits(n_vocab);
     // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -1509,17 +1520,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
     LOG("\ntask\tacc_norm\n");
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
 
     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -1627,7 +1639,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             //LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);
 
             // get the logits of the last token of the common prefix
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
+            std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));
 
             const auto first_probs = softmax(tok_logits);
 
@@ -1709,7 +1721,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
                 __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }
 
-    int n_vocab, n_chunk;
+    int n_vocab;
+    int n_chunk;
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {
@@ -1720,7 +1733,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
     }
 
-    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
     if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
         LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
         return;
@@ -1737,7 +1750,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -1801,7 +1814,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
             if (num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
             }
         }
 
@@ -1822,7 +1835,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                 workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
         p_diff_ptr += n_ctx - 1 - first;
         kld_ptr    += n_ctx - 1 - first;

From c81f3bbb051f8b736e117dfc78c99d7c4e0450f6 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 9 Oct 2024 18:49:52 +0200
Subject: [PATCH 033/396] cmake : do not build common library by default when
 standalone (#9804)

---
 CMakeLists.txt                                | 6 +++---
 examples/llama.android/llama/build.gradle.kts | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 415743c2a..64a335378 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
 
 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
@@ -201,12 +201,12 @@ if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
 endif()
 
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
 endif()
 
-if (LLAMA_BUILD_EXAMPLES)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
 endif()
diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts
index 0a3806172..2d1dfba20 100644
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -18,6 +18,7 @@ android {
         }
         externalNativeBuild {
             cmake {
+                arguments += "-DLLAMA_BUILD_COMMON=ON"
                 arguments += "-DCMAKE_BUILD_TYPE=Release"
                 cppFlags += listOf()
                 arguments += listOf()

From c7499c557cc1efafaf0a6bc12963c39826299703 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 10 Oct 2024 19:50:49 +0200
Subject: [PATCH 034/396] examples : do not use common library in simple
 example (#9803)

* examples : do not use common library in simple example

* add command line parser, simplify code
---
 examples/simple/CMakeLists.txt |   2 +-
 examples/simple/simple.cpp     | 222 +++++++++++++++++++--------------
 2 files changed, 128 insertions(+), 96 deletions(-)

diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 070cfbe7a..b63afbb8b 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index c2b7267c8..be91b2891 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,50 +1,112 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
 #include "llama.h"
-
+#include <cstdio>
+#include <cstring>
+#include <string>
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
-    LOG("\n");
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
 
-    params.prompt = "Hello my name is";
-    params.n_predict = 32;
+    // parse command line arguments
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
-        return 1;
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
     }
 
-    gpt_init();
-
-    // total length of the sequence including the prompt
-    const int n_predict = params.n_predict;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
 
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    // tokenize the prompt
+
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
@@ -53,117 +115,87 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // initialize the sampler
+
     auto sparams = llama_sampler_chain_default_params();
-
     sparams.no_perf = false;
-
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int n_ctx    = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
-
-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s:        either reduce n_predict or increase n_ctx\n", __func__);
-        return 1;
-    }
-
     // print the prompt token-by-token
 
-    LOG("\n");
-
-    for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }
 
-    // create a llama_batch with size 512
-    // we use this object to submit token data for decoding
+    // prepare a batch for the prompt
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-    }
-
-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
-
-    if (llama_decode(ctx, batch) != 0) {
-        LOG("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0);
 
     // main loop
 
-    int n_cur    = batch.n_tokens;
-    int n_decode = 0;
-
     const auto t_main_start = ggml_time_us();
+    int n_decode = 0;
+    llama_token new_token_id;
+
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+
+        n_pos += batch.n_tokens;
 
-    while (n_cur <= n_predict) {
         // sample the next token
         {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
 
             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
-                LOG("\n");
-
+            if (llama_token_is_eog(model, new_token_id)) {
                 break;
             }
 
-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            char buf[128];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
             fflush(stdout);
 
-            // prepare the next batch
-            llama_batch_clear(batch);
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0);
 
             n_decode += 1;
         }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
     }
 
-    LOG("\n");
+    printf("\n");
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    LOG("\n");
+    fprintf(stderr, "\n");
     llama_perf_sampler_print(smpl);
     llama_perf_context_print(ctx);
+    fprintf(stderr, "\n");
 
-    LOG("\n");
-
-    llama_batch_free(batch);
     llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
 
-    llama_backend_free();
-
     return 0;
 }

From cf8e0a3bb9c0e93e371773b282054cdbbb231038 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Fri, 11 Oct 2024 02:10:37 +0800
Subject: [PATCH 035/396] musa: add docker image support (#9685)

* mtgpu: add docker image support

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* mtgpu: enable docker workflow

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 .devops/full-musa.Dockerfile         | 26 +++++++++++++++++++
 .devops/llama-cli-musa.Dockerfile    | 30 +++++++++++++++++++++
 .devops/llama-server-musa.Dockerfile | 35 +++++++++++++++++++++++++
 .github/workflows/docker.yml         |  3 +++
 docs/docker.md                       | 39 +++++++++++++++++++++++++++-
 ggml/src/CMakeLists.txt              |  4 +--
 6 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100644 .devops/full-musa.Dockerfile
 create mode 100644 .devops/llama-cli-musa.Dockerfile
 create mode 100644 .devops/llama-server-musa.Dockerfile

diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile
new file mode 100644
index 000000000..34ba856d3
--- /dev/null
+++ b/.devops/full-musa.Dockerfile
@@ -0,0 +1,26 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
new file mode 100644
index 000000000..b5696794f
--- /dev/null
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -0,0 +1,30 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
new file mode 100644
index 000000000..193a6d77c
--- /dev/null
+++ b/.devops/llama-server-musa.Dockerfile
@@ -0,0 +1,35 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index a4ac9b217..a953cdac9 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -43,6 +43,9 @@ jobs:
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
diff --git a/docs/docker.md b/docs/docker.md
index e8a084173..8d90e6ded 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -19,8 +19,11 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
 
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
 
 ## Usage
 
@@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
+
+## Docker With MUSA
+
+Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container.
+
+## Building Docker locally
+
+```bash
+docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
+docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
+docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `MUSA_VERSION` set to `rc3.1.0`
+
+The resulting images, are essentially the same as the non-MUSA images:
+
+1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
+3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+
+## Usage
+
+After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+```
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f126ebf7e..676f85a36 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -163,8 +163,8 @@ if (GGML_OPENMP)
         list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
 
         if (GGML_MUSA)
-            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-10/include/openmp")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
+            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
         endif()
     else()
         message(WARNING "OpenMP not found")

From 0e9f760eb12546704ef8fa72577bc1a3ffe1bc04 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 10 Oct 2024 20:14:55 +0200
Subject: [PATCH 036/396] rpc : add backend registry / device interfaces
 (#9812)

* rpc : add backend registry / device interfaces

* llama : add llama_supports_rpc API

* ggml_backend_rpc_start_rpc_server -> ggml_backend_rpc_start_server
---
 common/arg.cpp                       |  18 +--
 examples/llama-bench/llama-bench.cpp |  10 +-
 examples/rpc/rpc-server.cpp          |   2 +-
 ggml/include/ggml-rpc.h              |   6 +-
 ggml/src/ggml-backend.cpp            |   7 +
 ggml/src/ggml-rpc.cpp                | 204 ++++++++++++++++++++++++---
 include/llama.h                      |   1 +
 src/llama.cpp                        |  87 +++++-------
 8 files changed, 247 insertions(+), 88 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 7f5c05a34..4d2527c58 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1353,15 +1353,15 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
-#ifdef GGML_USE_RPC
-    add_opt(llama_arg(
-        {"--rpc"}, "SERVERS",
-        "comma separated list of RPC servers",
-        [](gpt_params & params, const std::string & value) {
-            params.rpc_servers = value;
-        }
-    ).set_env("LLAMA_ARG_RPC"));
-#endif
+    if (llama_supports_rpc()) {
+        add_opt(llama_arg(
+            {"--rpc"}, "SERVERS",
+            "comma separated list of RPC servers",
+            [](gpt_params & params, const std::string & value) {
+                params.rpc_servers = value;
+            }
+        ).set_env("LLAMA_ARG_RPC"));
+    }
     add_opt(llama_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index fb1d387b2..c22bdedcf 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -304,9 +304,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  --cpu-strict <0|1>                        (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
     printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-#ifdef GGML_USE_RPC
-    printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-#endif
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+    }
     printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -497,14 +497,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-#ifdef GGML_USE_RPC
-        } else if (arg == "-rpc" || arg == "--rpc") {
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rpc_servers.push_back(argv[i]);
-#endif
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 355125831..8354e37e5 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -151,7 +151,7 @@ int main(int argc, char * argv[]) {
         get_backend_memory(&free_mem, &total_mem);
     }
     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
index 64cde7f13..d57967368 100644
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -17,7 +17,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en
 
 GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
 
-GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+
+GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 627b4dbc7..fb1d3ead3 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -542,6 +542,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-blas.h"
 #endif
 
+#ifdef GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -556,6 +560,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif
+#ifdef GGML_USE_RPC
+        register_backend(ggml_backend_rpc_reg());
+#endif
 
         // TODO: sycl, vulkan, kompute, cann
 
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index ab7298cba..13c7dd436 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -25,7 +25,7 @@
 #  include <netdb.h>
 #  include <unistd.h>
 #endif
-#include <string.h>
+#include <cstring>
 
 #define UNUSED GGML_UNUSED
 
@@ -630,22 +630,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
     return (enum ggml_status)output[0];
 }
 
-static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    UNUSED(backend);
-    UNUSED(op);
-    //TODO: call the remote backend and cache the results
-    return true;
-}
-
-static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
-        return false;
-    }
-    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
-    return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name                = */ ggml_backend_rpc_name,
     /* .free                    = */ ggml_backend_rpc_free,
@@ -659,8 +643,8 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_rpc_graph_compute,
-    /* .supports_op             = */ ggml_backend_rpc_supports_op,
-    /* .supports_buft           = */ ggml_backend_rpc_supports_buft,
+    /* .supports_op             = */ NULL,
+    /* .supports_buft           = */ NULL,
     /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
@@ -691,7 +675,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en
 
     ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
         /* .iface   = */ ggml_backend_rpc_buffer_type_interface,
-        /* .device  = */ nullptr,
+        /* .device  = */ ggml_backend_rpc_add_device(endpoint),
         /* .context = */ buft_ctx
     };
     buft_map[endpoint] = buft;
@@ -707,7 +691,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_rpc_guid(),
         /* .interface = */ ggml_backend_rpc_interface,
-        /* .device    = */ nullptr,
+        /* .device    = */ ggml_backend_rpc_add_device(endpoint),
         /* .context   = */ ctx
     };
     return backend;
@@ -1189,7 +1173,7 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
     }
 }
 
-void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
+void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
@@ -1226,3 +1210,179 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
     WSACleanup();
 #endif
 }
+
+// device interface
+
+struct ggml_backend_rpc_device_context {
+    std::string endpoint;
+    std::string name;
+};
+
+static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
+
+    UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
+    // TODO: obtain value from the server
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+
+    UNUSED(dev);
+}
+
+static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_rpc_device_get_name(dev);
+    props->description = ggml_backend_rpc_device_get_description(dev);
+    props->type        = ggml_backend_rpc_device_get_type(dev);
+    ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ggml_backend_rpc_init(ctx->endpoint.c_str());
+
+    UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
+
+    UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    UNUSED(dev);
+    UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    UNUSED(dev);
+    UNUSED(op);
+    //TODO: call the remote backend and cache the results
+    return true;
+}
+
+static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
+    return buft_ctx->endpoint == dev_ctx->endpoint;
+}
+
+static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
+    /* .get_name             = */ ggml_backend_rpc_device_get_name,
+    /* .get_description      = */ ggml_backend_rpc_device_get_description,
+    /* .get_memory           = */ ggml_backend_rpc_device_get_memory,
+    /* .get_type             = */ ggml_backend_rpc_device_get_type,
+    /* .get_props            = */ ggml_backend_rpc_device_get_props,
+    /* .init_backend         = */ ggml_backend_rpc_device_init,
+    /* .get_buffer_type      = */ ggml_backend_rpc_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_rpc_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_rpc_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
+    return "RPC";
+
+    UNUSED(reg);
+}
+
+static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 0;
+
+    UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
+
+    UNUSED(reg);
+    UNUSED(index);
+}
+
+static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
+        return (void *)ggml_backend_rpc_add_device;
+    }
+    return NULL;
+
+    UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
+    /* .get_name         = */ ggml_backend_rpc_reg_get_name,
+    /* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_rpc_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_rpc_reg(void) {
+    static struct ggml_backend_reg ggml_backend_rpc_reg = {
+        /* .iface   = */ ggml_backend_rpc_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_rpc_reg;
+}
+
+ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+    static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
+
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (dev_map.find(endpoint) != dev_map.end()) {
+        return dev_map[endpoint];
+    }
+
+    ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
+        /* .endpoint = */ endpoint,
+        /* .name     = */ "RPC[" + std::string(endpoint) + "]",
+    };
+
+    ggml_backend_dev_t dev = new ggml_backend_device {
+        /* .iface   = */ ggml_backend_rpc_device_i,
+        /* .reg     = */ ggml_backend_rpc_reg(),
+        /* .context = */ ctx,
+    };
+
+    dev_map[endpoint] = dev;
+
+    return dev;
+}
diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe2..4f8f6d23d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -433,6 +433,7 @@ extern "C" {
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
+    LLAMA_API bool llama_supports_rpc        (void);
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index 01cdf17dc..da7afb1ee 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8,10 +8,6 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_RPC
-#  include "ggml-rpc.h"
-#endif
-
 #if defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
@@ -3404,10 +3400,6 @@ struct llama_lora_adapter {
 static int llama_get_device_count(const llama_model & model) {
     int count = (int) model.devices.size();
 
-#if defined(GGML_USE_RPC)
-    count += (int) model.rpc_servers.size();
-#endif
-
 #if defined(GGML_USE_SYCL)
     count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
@@ -3460,15 +3452,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(GGML_USE_RPC)
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device < rpc_count) {
-        const char * endpoint = model.rpc_servers[device].c_str();
-        return ggml_backend_rpc_buffer_type(endpoint);
-    }
-    device -= rpc_count;
-#endif
-
     if (device < (int)model.devices.size()) {
         return ggml_backend_dev_buffer_type(model.devices[device]);
     }
@@ -3523,18 +3506,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }
 
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device < rpc_count) {
-        size_t total;
-        size_t free;
-        const char * endpoint = model.rpc_servers[device].c_str();
-        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
-        return free;
-    }
-    device = device - rpc_count;
-#endif
-
     if (device < (int)model.devices.size()) {
         ggml_backend_dev_t dev = model.devices[device];
         size_t total;
@@ -19019,15 +18990,20 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
-        ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+           llama_supports_rpc();
 #endif
 }
 
+bool llama_supports_rpc(void) {
+    return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
 void llama_backend_init(void) {
     ggml_time_init();
 
@@ -19102,6 +19078,36 @@ struct llama_model * llama_load_model_from_file(
         model->rpc_servers.push_back(servers);
     }
 
+    // add RPC devices
+    if (!model->rpc_servers.empty()) {
+        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+        if (!rpc_reg) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+        using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+        ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+        if (!ggml_backend_rpc_add_device_fn) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        for (const std::string & server : model->rpc_servers) {
+            ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (dev) {
+                model->devices.push_back(dev);
+            } else {
+                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                llama_free_model(model);
+                return nullptr;
+            }
+        }
+    }
+
     // create list of devices to use with this model
     // currently, we use all available devices
     // TODO: rework API to give user more control over device selection
@@ -19128,7 +19134,7 @@ struct llama_model * llama_load_model_from_file(
         } else if (status == -2) {
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
-        delete model;
+        llama_free_model(model);
         return nullptr;
     }
 
@@ -19311,23 +19317,6 @@ struct llama_context * llama_new_context_with_model(
             main_gpu -= (int)model->devices.size();
         }
 
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-        if (main_gpu >= (int)model->rpc_servers.size()) {
-            main_gpu -= (int)model->rpc_servers.size();
-        }
-#endif
-
 #if defined(GGML_USE_VULKAN)
         if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);

From 7eee341bee09957139789c2d828995953f0fc7ff Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 10 Oct 2024 22:57:42 +0200
Subject: [PATCH 037/396] common : use common_ prefix for common library
 functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/arg.cpp                                | 868 +++++++++---------
 common/arg.h                                  |  44 +-
 common/common.cpp                             | 116 +--
 common/common.h                               | 110 +--
 common/log.cpp                                | 100 +-
 common/log.h                                  |  36 +-
 common/ngram-cache.cpp                        |  72 +-
 common/ngram-cache.h                          |  38 +-
 common/sampling.cpp                           | 136 +--
 common/sampling.h                             |  40 +-
 examples/batched-bench/batched-bench.cpp      |  20 +-
 examples/batched/batched.cpp                  |  28 +-
 .../convert-llama2c-to-ggml.cpp               |   2 +-
 .../cvector-generator/cvector-generator.cpp   |  16 +-
 examples/embedding/embedding.cpp              |  24 +-
 examples/eval-callback/eval-callback.cpp      |  14 +-
 examples/export-lora/export-lora.cpp          |   6 +-
 examples/gen-docs/gen-docs.cpp                |  14 +-
 examples/gritlm/gritlm.cpp                    |  36 +-
 examples/imatrix/imatrix.cpp                  |  18 +-
 examples/infill/infill.cpp                    |  66 +-
 .../llama/src/main/cpp/llama-android.cpp      |  22 +-
 examples/llava/llava-cli.cpp                  |  44 +-
 examples/llava/minicpmv-cli.cpp               |  40 +-
 examples/lookahead/lookahead.cpp              |  48 +-
 examples/lookup/lookup-create.cpp             |  14 +-
 examples/lookup/lookup-merge.cpp              |   8 +-
 examples/lookup/lookup-stats.cpp              |  28 +-
 examples/lookup/lookup.cpp                    |  54 +-
 examples/main/main.cpp                        |  92 +-
 examples/parallel/parallel.cpp                |  36 +-
 examples/passkey/passkey.cpp                  |  28 +-
 examples/perplexity/perplexity.cpp            |  58 +-
 examples/retrieval/retrieval.cpp              |  26 +-
 examples/save-load-state/save-load-state.cpp  |  18 +-
 examples/server/server.cpp                    |  80 +-
 examples/server/utils.hpp                     |   8 +-
 examples/speculative/speculative.cpp          |  80 +-
 examples/tokenize/tokenize.cpp                |   4 +-
 tests/test-arg-parser.cpp                     |  28 +-
 tests/test-chat-template.cpp                  |  10 +-
 tests/test-log.cpp                            |   4 +-
 tests/test-tokenizer-0.cpp                    |  14 +-
 tests/test-tokenizer-1-bpe.cpp                |  10 +-
 tests/test-tokenizer-1-spm.cpp                |  10 +-
 45 files changed, 1284 insertions(+), 1284 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 4d2527c58..6014f5d8a 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -17,27 +17,27 @@
 
 using json = nlohmann::ordered_json;
 
-llama_arg & llama_arg::set_examples(std::initializer_list<enum llama_example> examples) {
+common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
 }
 
-llama_arg & llama_arg::set_env(const char * env) {
+common_arg & common_arg::set_env(const char * env) {
     help = help + "\n(env: " + env + ")";
     this->env = env;
     return *this;
 }
 
-llama_arg & llama_arg::set_sparam() {
+common_arg & common_arg::set_sparam() {
     is_sparam = true;
     return *this;
 }
 
-bool llama_arg::in_example(enum llama_example ex) {
+bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
 
-bool llama_arg::get_value_from_env(std::string & output) {
+bool common_arg::get_value_from_env(std::string & output) {
     if (env == nullptr) return false;
     char * value = std::getenv(env);
     if (value) {
@@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) {
     return false;
 }
 
-bool llama_arg::has_value_from_env() {
+bool common_arg::has_value_from_env() {
     return env != nullptr && std::getenv(env);
 }
 
@@ -78,7 +78,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
     return result;
 }
 
-std::string llama_arg::to_string() {
+std::string common_arg::to_string() {
     // params for printing to console
     const static int n_leading_spaces = 40;
     const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@@ -145,7 +145,7 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }
 
-static void gpt_params_handle_model_default(gpt_params & params) {
+static void common_params_handle_model_default(common_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (params.hf_file.empty()) {
@@ -171,12 +171,12 @@ static void gpt_params_handle_model_default(gpt_params & params) {
 // CLI argument parsing functions
 //
 
-static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) {
+static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     std::string arg;
     const std::string arg_prefix = "--";
-    gpt_params & params = ctx_arg.params;
+    common_params & params = ctx_arg.params;
 
-    std::unordered_map<std::string, llama_arg *> arg_to_options;
+    std::unordered_map<std::string, common_arg *> arg_to_options;
     for (auto & opt : ctx_arg.options) {
         for (const auto & arg : opt.args) {
             arg_to_options[arg] = &opt;
@@ -268,7 +268,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    gpt_params_handle_model_default(params);
+    common_params_handle_model_default(params);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -291,16 +291,16 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
     return true;
 }
 
-static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
-    auto print_options = [](std::vector<llama_arg *> & options) {
-        for (llama_arg * opt : options) {
+static void common_params_print_usage(common_params_context & ctx_arg) {
+    auto print_options = [](std::vector<common_arg *> & options) {
+        for (common_arg * opt : options) {
             printf("%s", opt->to_string().c_str());
         }
     };
 
-    std::vector<llama_arg *> common_options;
-    std::vector<llama_arg *> sparam_options;
-    std::vector<llama_arg *> specific_options;
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
         // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
         if (opt.is_sparam) {
@@ -320,17 +320,17 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-    auto ctx_arg = gpt_params_parser_init(params, ex, print_usage);
-    const gpt_params params_org = ctx_arg.params; // the example can modify the default params
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
+    const common_params params_org = ctx_arg.params; // the example can modify the default params
 
     try {
-        if (!gpt_params_parse_ex(argc, argv, ctx_arg)) {
+        if (!common_params_parse_ex(argc, argv, ctx_arg)) {
             ctx_arg.params = params_org;
             return false;
         }
         if (ctx_arg.params.usage) {
-            gpt_params_print_usage(ctx_arg);
+            common_params_print_usage(ctx_arg);
             if (ctx_arg.print_usage) {
                 ctx_arg.print_usage(argc, argv);
             }
@@ -345,16 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example
     return true;
 }
 
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-    gpt_params_context ctx_arg(params);
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex          = ex;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
     for (const auto & sampler : params.sparams.samplers) {
-        sampler_type_chars += gpt_sampler_type_to_chr(sampler);
-        sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
+        sampler_type_chars += common_sampler_type_to_chr(sampler);
+        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
     sampler_type_names.pop_back();
 
@@ -366,371 +366,371 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
      * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
      * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
      */
-    auto add_opt = [&](llama_arg arg) {
+    auto add_opt = [&](common_arg arg) {
         if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
             ctx_arg.options.push_back(std::move(arg));
         }
     };
 
 
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-h", "--help", "--usage"},
         "print usage and exit",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.usage = true;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--version"},
         "show version and build info",
-        [](gpt_params &) {
+        [](common_params &) {
             fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
             fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
             exit(0);
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--verbose-prompt"},
         format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.verbose_prompt = true;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.display_prompt = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-co", "--color"},
         format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.use_color = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-t", "--threads"}, "N",
         format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.cpuparams.n_threads = value;
             if (params.cpuparams.n_threads <= 0) {
                 params.cpuparams.n_threads = std::thread::hardware_concurrency();
             }
         }
     ).set_env("LLAMA_ARG_THREADS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-tb", "--threads-batch"}, "N",
         "number of threads to use during batch and prompt processing (default: same as --threads)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.cpuparams_batch.n_threads = value;
             if (params.cpuparams_batch.n_threads <= 0) {
                 params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-td", "--threads-draft"}, "N",
         "number of threads to use during generation (default: same as --threads)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.draft_cpuparams.n_threads = value;
             if (params.draft_cpuparams.n_threads <= 0) {
                 params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-tbd", "--threads-batch-draft"}, "N",
         "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.draft_cpuparams_batch.n_threads = value;
             if (params.draft_cpuparams_batch.n_threads <= 0) {
                 params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-C", "--cpu-mask"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
-        [](gpt_params & params, const std::string & mask) {
+        [](common_params & params, const std::string & mask) {
             params.cpuparams.mask_valid = true;
             if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Cr", "--cpu-range"}, "lo-hi",
         "range of CPUs for affinity. Complements --cpu-mask",
-        [](gpt_params & params, const std::string & range) {
+        [](common_params & params, const std::string & range) {
             params.cpuparams.mask_valid = true;
             if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid range");
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--cpu-strict"}, "<0|1>",
         format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.cpuparams.strict_cpu = std::stoul(value);
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prio"}, "N",
         format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
-        [](gpt_params & params, int prio) {
+        [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--poll"}, "<0...100>",
         format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.cpuparams.poll = std::stoul(value);
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Cb", "--cpu-mask-batch"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
-        [](gpt_params & params, const std::string & mask) {
+        [](common_params & params, const std::string & mask) {
             params.cpuparams_batch.mask_valid = true;
             if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Crb", "--cpu-range-batch"}, "lo-hi",
         "ranges of CPUs for affinity. Complements --cpu-mask-batch",
-        [](gpt_params & params, const std::string & range) {
+        [](common_params & params, const std::string & range) {
             params.cpuparams_batch.mask_valid = true;
             if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid range");
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--cpu-strict-batch"}, "<0|1>",
         "use strict CPU placement (default: same as --cpu-strict)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.cpuparams_batch.strict_cpu = value;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prio-batch"}, "N",
         format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
-        [](gpt_params & params, int prio) {
+        [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--poll-batch"}, "<0|1>",
         "use polling to wait for work (default: same as --poll)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.cpuparams_batch.poll = value;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Cd", "--cpu-mask-draft"}, "M",
         "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](gpt_params & params, const std::string & mask) {
+        [](common_params & params, const std::string & mask) {
             params.draft_cpuparams.mask_valid = true;
             if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Crd", "--cpu-range-draft"}, "lo-hi",
         "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](gpt_params & params, const std::string & range) {
+        [](common_params & params, const std::string & range) {
             params.draft_cpuparams.mask_valid = true;
             if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid range");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--cpu-strict-draft"}, "<0|1>",
         "Use strict CPU placement for draft model (default: same as --cpu-strict)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.draft_cpuparams.strict_cpu = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prio-draft"}, "N",
         format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
-        [](gpt_params & params, int prio) {
+        [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
             }
             params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--poll-draft"}, "<0|1>",
         "Use polling to wait for draft model work (default: same as --poll])",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.draft_cpuparams.poll = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Cbd", "--cpu-mask-batch-draft"}, "M",
         "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](gpt_params & params, const std::string & mask) {
+        [](common_params & params, const std::string & mask) {
             params.draft_cpuparams_batch.mask_valid = true;
             if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
         "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](gpt_params & params, const std::string & range) {
+        [](common_params & params, const std::string & range) {
             params.draft_cpuparams_batch.mask_valid = true;
             if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--cpu-strict-batch-draft"}, "<0|1>",
         "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.draft_cpuparams_batch.strict_cpu = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prio-batch-draft"}, "N",
         format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
-        [](gpt_params & params, int prio) {
+        [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
             }
             params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--poll-batch-draft"}, "<0|1>",
         "Use polling to wait for draft model work (default: --poll-draft)",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.draft_cpuparams_batch.poll = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--draft"}, "N",
         format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_draft = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ps", "--p-split"}, "N",
         format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.p_split = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-lcs", "--lookup-cache-static"}, "FNAME",
         "path to static lookup cache to use for lookup decoding (not updated by generation)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.lookup_cache_static = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
         "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.lookup_cache_dynamic = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-c", "--ctx-size"}, "N",
         format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_ctx = value;
         }
     ).set_env("LLAMA_ARG_CTX_SIZE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_predict = value;
         }
     ).set_env("LLAMA_ARG_N_PREDICT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-b", "--batch-size"}, "N",
         format("logical maximum batch size (default: %d)", params.n_batch),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_batch = value;
         }
     ).set_env("LLAMA_ARG_BATCH"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ub", "--ubatch-size"}, "N",
         format("physical maximum batch size (default: %d)", params.n_ubatch),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_ubatch = value;
         }
     ).set_env("LLAMA_ARG_UBATCH"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--keep"}, "N",
         format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_keep = value;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-context-shift"},
         format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.ctx_shift = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_chunks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-fa", "--flash-attn"},
         format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.flash_attn = true;
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
         ex == LLAMA_EXAMPLE_MAIN
             ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
             : "prompt to start generation with",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-perf"},
         format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.no_perf = true;
             params.sparams.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -743,10 +743,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -754,10 +754,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.in_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-bf", "--binary-file"}, "FNAME",
         "binary file containing the prompt (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value, std::ios::binary);
             if (!file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -770,63 +770,63 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-e", "--escape"},
         format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.escape = true;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-escape"},
         "do not process escape sequences",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.escape = false;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ptc", "--print-token-count"}, "N",
         format("print token count every N tokens (default: %d)", params.n_print),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_print = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prompt-cache"}, "FNAME",
         "file to cache prompt state for faster startup (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.path_prompt_cache = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prompt-cache-all"},
         "if specified, saves user input and generations to cache as well\n",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.prompt_cache_all = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--prompt-cache-ro"},
         "if specified, uses the prompt cache but does not update it",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.prompt_cache_ro = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-r", "--reverse-prompt"}, "PROMPT",
         "halt generation at PROMPT, return control in interactive mode\n",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-sp", "--special"},
         format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.special = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-cnv", "--conversation"},
         format(
             "run in conversation mode:\n"
@@ -835,222 +835,222 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             "(default: %s)",
             params.conversation ? "true" : "false"
         ),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.conversation = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-i", "--interactive"},
         format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.interactive = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-if", "--interactive-first"},
         format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.interactive_first = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-mli", "--multiline-input"},
         "allows you to write or paste multiple lines without ending each in '\\'",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.multiline_input = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--in-prefix-bos"},
         "prefix BOS to user inputs, preceding the `--in-prefix` string",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.input_prefix_bos = true;
             params.enable_chat_template = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--in-prefix"}, "STRING",
         "string to prefix user inputs with (default: empty)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.warmup = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--spm-infill"},
         format(
             "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
             params.spm_infill ? "enabled" : "disabled"
         ),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.spm_infill = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
         format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split(value, ';');
-            params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
+            params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
         format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.seed = std::stoul(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--sampling-seq"}, "SEQUENCE",
         format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
-        [](gpt_params & params, const std::string & value) {
-            params.sparams.samplers = gpt_sampler_types_from_chars(value);
+        [](common_params & params, const std::string & value) {
+            params.sparams.samplers = common_sampler_types_from_chars(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ignore-eos"},
         "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.sparams.ignore_eos = true;
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--penalize-nl"},
         format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.sparams.penalize_nl = true;
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--temp"}, "N",
         format("temperature (default: %.1f)", (double)params.sparams.temp),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.temp = std::stof(value);
             params.sparams.temp = std::max(params.sparams.temp, 0.0f);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--top-k"}, "N",
         format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.sparams.top_k = value;
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--top-p"}, "N",
         format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.top_p = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--min-p"}, "N",
         format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.min_p = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--tfs"}, "N",
         format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.tfs_z = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--typical"}, "N",
         format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.typ_p = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--repeat-last-n"}, "N",
         format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.sparams.penalty_last_n = value;
             params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--repeat-penalty"}, "N",
         format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--presence-penalty"}, "N",
         format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.penalty_present = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--frequency-penalty"}, "N",
         format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.penalty_freq = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--dynatemp-range"}, "N",
         format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
         format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--mirostat"}, "N",
         format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
         "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.sparams.mirostat = value;
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--mirostat-lr"}, "N",
         format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--mirostat-ent"}, "N",
         format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
         "modifies the likelihood of token appearing in the completion,\n"
         "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
         "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::stringstream ss(value);
             llama_token key;
             char sign;
@@ -1067,17 +1067,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
         format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.grammar = value;
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--grammar-file"}, "FNAME",
         "file to read grammar from",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -1089,17 +1089,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             );
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.sparams.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
             else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
             else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
@@ -1108,275 +1108,275 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--attention"}, "{causal,non,causal}",
         "attention type for embeddings, use model default if unspecified",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
             else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--rope-scaling"}, "{none,linear,yarn}",
         "RoPE frequency scaling method, defaults to linear unless specified by the model",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
             else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_SCALE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
     ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_N"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.dump_kv_cache = true;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_k = value;
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_v = value;
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.logits_all = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.hellaswag = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--hellaswag-tasks"}, "N",
         format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.hellaswag_tasks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--winogrande"},
         "compute Winogrande score over random tasks from datafile supplied with -f",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.winogrande = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--winogrande-tasks"}, "N",
         format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.winogrande_tasks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--multiple-choice"},
         "compute multiple choice score over random tasks from datafile supplied with -f",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.multiple_choice = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--multiple-choice-tasks"}, "N",
         format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.multiple_choice_tasks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--kl-divergence"},
         "computes KL-divergence to logits provided via --kl-divergence-base",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.kl_divergence = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
         "set logits file",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.logits_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ppl-stride"}, "N",
         format("stride for perplexity calculation (default: %d)", params.ppl_stride),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.ppl_stride = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ppl-output-type"}, "<0|1>",
         format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.ppl_output_type = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
         format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.defrag_thold = std::stof(value);
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-np", "--parallel"}, "N",
         format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_parallel = value;
         }
     ).set_env("LLAMA_ARG_N_PARALLEL"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         format("number of sequences to decode (default: %d)", params.n_sequences),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_sequences = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-cb", "--cont-batching"},
         format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.cont_batching = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-nocb", "--no-cont-batching"},
         "disable continuous batching",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.cont_batching = false;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--mmproj"}, "FILE",
         "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.mmproj = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     if (llama_supports_rpc()) {
-        add_opt(llama_arg(
+        add_opt(common_arg(
             {"--rpc"}, "SERVERS",
             "comma separated list of RPC servers",
-            [](gpt_params & params, const std::string & value) {
+            [](common_params & params, const std::string & value) {
                 params.rpc_servers = value;
             }
         ).set_env("LLAMA_ARG_RPC"));
     }
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.use_mlock = true;
         }
     ).set_env("LLAMA_ARG_MLOCK"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.use_mmap = false;
         }
     ).set_env("LLAMA_ARG_NO_MMAP"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
         "- distribute: spread execution evenly over all nodes\n"
@@ -1384,17 +1384,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         "- numactl: use the CPU map provided by numactl\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
         "see https://github.com/ggerganov/llama.cpp/issues/1437",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
@@ -1402,10 +1402,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_gpu_layers_draft = value;
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
@@ -1413,13 +1413,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
         "- none: use one GPU only\n"
         "- layer (default): split layers and KV across GPUs\n"
         "- row: split rows across GPUs",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::string arg_next = value;
             if (arg_next == "none") {
                 params.split_mode = LLAMA_SPLIT_MODE_NONE;
@@ -1439,10 +1439,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_env("LLAMA_ARG_SPLIT_MODE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::string arg_next = value;
 
             // split string by , and /
@@ -1466,80 +1466,80 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.main_gpu = value;
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
     ).set_env("LLAMA_ARG_MAIN_GPU"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.check_tensors = true;
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--override-kv"}, "KEY=TYPE:VALUE",
         "advanced option to override model metadata by key. may be specified multiple times.\n"
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
                 throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.lora_adapters.push_back({ std::string(value), 1.0 });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
-        [](gpt_params & params, const std::string & fname, const std::string & scale) {
+        [](common_params & params, const std::string & fname, const std::string & scale) {
             params.lora_adapters.push_back({ fname, std::stof(scale) });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--control-vector"}, "FNAME",
         "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.control_vectors.push_back({ 1.0f, value, });
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--control-vector-scaled"}, "FNAME", "SCALE",
         "add a control vector with user defined scaling SCALE\n"
         "note: this argument can be repeated to add multiple scaled control vectors",
-        [](gpt_params & params, const std::string & fname, const std::string & scale) {
+        [](common_params & params, const std::string & fname, const std::string & scale) {
             params.control_vectors.push_back({ std::stof(scale), fname });
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--control-vector-layer-range"}, "START", "END",
         "layer range to apply the control vector(s) to, start and end inclusive",
-        [](gpt_params & params, const std::string & start, const std::string & end) {
+        [](common_params & params, const std::string & start, const std::string & end) {
             params.control_vector_layer_start = std::stoi(start);
             params.control_vector_layer_end = std::stoi(end);
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-a", "--alias"}, "STRING",
         "set alias for model name (to be used by REST API)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.model_alias = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
             ? std::string("model path from which to load base model")
@@ -1547,49 +1547,49 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 "model path (default: `models/$filename` with filename from `--hf-file` "
                 "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
             ),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.model_draft = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.model_url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-hfr", "--hf-repo"}, "REPO",
         "Hugging Face model repository (default: unused)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
         "Hugging Face model file (default: unused)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-hft", "--hf-token"}, "TOKEN",
         "Hugging Face access token (default: value from HF_TOKEN environment variable)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.hf_token = value;
         }
     ).set_env("HF_TOKEN"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--context-file"}, "FNAME",
         "file to load context from (repeat to specify multiple files)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value, std::ios::binary);
             if (!file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -1597,35 +1597,35 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.context_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--chunk-size"}, "N",
         format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.chunk_size = value;
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--chunk-separator"}, "STRING",
         format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.chunk_separator = value;
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--junk"}, "N",
         format("number of times to repeat the junk text (default: %d)", params.n_junk),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_junk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--pos"}, "N",
         format("position of the passkey in the junk text (default: %d)", params.i_pos),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.i_pos = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
         format("output file (default: '%s')",
             ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1633,145 +1633,145 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
                     ? params.cvector_outfile.c_str()
                     : params.out_file.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.out_file = value;
             params.cvector_outfile = value;
             params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_out_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--save-frequency"}, "N",
         format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_save_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--process-output"},
         format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.process_output = true;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-ppl"},
         format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.compute_ppl = false;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--chunk", "--from-chunk"}, "N",
         format("start processing the input from chunk N (default: %d)", params.i_chunk),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-pps"},
         format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.is_pp_shared = true;
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             auto p = string_split<int>(value, ',');
             params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ntg"}, "n0,n1,...",
         "number of text generation tokens",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             auto p = string_split<int>(value, ',');
             params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-npl"}, "n0,n1,...",
         "number of parallel prompts",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             auto p = string_split<int>(value, ',');
             params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--embd-normalize"}, "N",
         format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.embd_normalize = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
         "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.embd_out = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--embd-separator"}, "STRING",
         "separator of embendings (default \\n) for example \"<#sep#>\"",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--host"}, "HOST",
         format("ip address to listen (default: %s)", params.hostname.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--port"}, "PORT",
         format("port to listen (default: %d)", params.port),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.port = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--path"}, "PATH",
         format("path to serve static files from (default: %s)", params.public_path.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.embedding = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--reranking", "--rerank"},
         format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.reranking = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--api-key"}, "KEY",
         "API key to use for authentication (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.api_keys.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--api-key-file"}, "FNAME",
         "path to file containing API keys (default: none)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream key_file(value);
             if (!key_file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -1785,39 +1785,39 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             key_file.close();
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ssl-key-file"}, "FNAME",
         "path to file a PEM-encoded SSL private key",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.timeout_read  = value;
             params.timeout_write = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-spf", "--system-prompt-file"}, "FNAME",
         "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
@@ -1831,38 +1831,38 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.system_prompt = system_prompt;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--metrics"},
         format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--slots"},
         format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.endpoint_slots = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--props"},
         format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.endpoint_props = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--no-slots"},
         "disables slots monitoring endpoint",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.endpoint_slots = false;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.slot_save_path = value;
             // if doesn't end with DIRECTORY_SEPARATOR, add it
             if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
@@ -1870,13 +1870,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         "set custom jinja chat template (default: template taken from model's metadata)\n"
         "if suffix/prefix are specified, template will be disabled\n"
         "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
-        [](gpt_params & params, const std::string & value) {
-            if (!llama_chat_verify_template(value)) {
+        [](common_params & params, const std::string & value) {
+            if (!common_chat_verify_template(value)) {
                 throw std::runtime_error(format(
                     "error: the supplied chat template is not supported: %s\n"
                     "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
@@ -1886,31 +1886,31 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.chat_template = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.slot_prompt_similarity = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--lora-init-without-apply"},
         format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.simple_io = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-ld", "--logdir"}, "LOGDIR",
         "path under which to save YAML logs (no logging if unset)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.logdir = value;
 
             if (params.logdir.back() != DIRECTORY_SEPARATOR) {
@@ -1918,101 +1918,101 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.cvector_positive_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--negative-file"}, "FNAME",
         format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             params.cvector_negative_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--pca-batch"}, "N",
         format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_pca_batch = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--pca-iter"}, "N",
         format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.n_pca_iterations = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--method"}, "{pca, mean}",
         "dimensionality reduction method to be used (default: pca)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
             else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--output-format"}, "{md,jsonl}",
         "output format for batched-bench results (default: md)",
-        [](gpt_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
             else { std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-disable"},
         "Log disable",
-        [](gpt_params &) {
-            gpt_log_pause(gpt_log_main());
+        [](common_params &) {
+            common_log_pause(common_log_main());
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-file"}, "FNAME",
         "Log to file",
-        [](gpt_params &, const std::string & value) {
-            gpt_log_set_file(gpt_log_main(), value.c_str());
+        [](common_params &, const std::string & value) {
+            common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-colors"},
         "Enable colored logging",
-        [](gpt_params &) {
-            gpt_log_set_colors(gpt_log_main(), true);
+        [](common_params &) {
+            common_log_set_colors(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_COLORS"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
-        [](gpt_params & params) {
+        [](common_params & params) {
             params.verbosity = INT_MAX;
-            gpt_log_set_verbosity_thold(INT_MAX);
+            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
-        [](gpt_params & params, int value) {
+        [](common_params & params, int value) {
             params.verbosity = value;
-            gpt_log_set_verbosity_thold(value);
+            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-prefix"},
         "Enable prefx in log messages",
-        [](gpt_params &) {
-            gpt_log_set_prefix(gpt_log_main(), true);
+        [](common_params &) {
+            common_log_set_prefix(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_PREFIX"));
-    add_opt(llama_arg(
+    add_opt(common_arg(
         {"--log-timestamps"},
         "Enable timestamps in log messages",
-        [](gpt_params &) {
-            gpt_log_set_timestamps(gpt_log_main(), true);
+        [](common_params &) {
+            common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
 
diff --git a/common/arg.h b/common/arg.h
index 413de2c88..a6700d323 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -10,7 +10,7 @@
 // CLI argument parsing
 //
 
-struct llama_arg {
+struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
     std::vector<const char *> args;
     const char * value_hint   = nullptr; // help text or example for arg value
@@ -18,60 +18,60 @@ struct llama_arg {
     const char * env          = nullptr;
     std::string help;
     bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
+    void (*handler_void)   (common_params & params) = nullptr;
+    void (*handler_string) (common_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (common_params & params, int) = nullptr;
 
-    llama_arg(
+    common_arg(
         const std::initializer_list<const char *> & args,
         const char * value_hint,
         const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
+        void (*handler)(common_params & params, const std::string &)
     ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
 
-    llama_arg(
+    common_arg(
         const std::initializer_list<const char *> & args,
         const char * value_hint,
         const std::string & help,
-        void (*handler)(gpt_params & params, int)
+        void (*handler)(common_params & params, int)
     ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
 
-    llama_arg(
+    common_arg(
         const std::initializer_list<const char *> & args,
         const std::string & help,
-        void (*handler)(gpt_params & params)
+        void (*handler)(common_params & params)
     ) : args(args), help(help), handler_void(handler) {}
 
     // support 2 values for arg
-    llama_arg(
+    common_arg(
         const std::initializer_list<const char *> & args,
         const char * value_hint,
         const char * value_hint_2,
         const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+        void (*handler)(common_params & params, const std::string &, const std::string &)
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
 
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    llama_arg & set_env(const char * env);
-    llama_arg & set_sparam();
+    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_env(const char * env);
+    common_arg & set_sparam();
     bool in_example(enum llama_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();
 };
 
-struct gpt_params_context {
+struct common_params_context {
     enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    gpt_params & params;
-    std::vector<llama_arg> options;
+    common_params & params;
+    std::vector<common_arg> options;
     void(*print_usage)(int, char **) = nullptr;
-    gpt_params_context(gpt_params & params) : params(params) {}
+    common_params_context(common_params & params) : params(params) {}
 };
 
 // parse input arguments from CLI
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 // function to be used by test-arg-parser
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/common.cpp b/common/common.cpp
index 29df16c95..d1b92250a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -362,10 +362,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
-void gpt_init() {
+void common_init() {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
-            gpt_log_add(gpt_log_main(), level, "%s", text);
+        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+            common_log_add(common_log_main(), level, "%s", text);
         }
     }, NULL);
 
@@ -378,7 +378,7 @@ void gpt_init() {
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
-std::string gpt_params_get_system_info(const gpt_params & params) {
+std::string common_params_get_system_info(const common_params & params) {
     std::ostringstream os;
 
     os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -493,7 +493,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
             first = false;
         }
 
-        auto detokenized = llama_token_to_piece(ctx, token);
+        auto detokenized = common_token_to_piece(ctx, token);
 
         detokenized.erase(
             std::remove_if(
@@ -524,7 +524,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
             first = false;
         }
 
-        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+        auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
         detokenized.erase(
                 std::remove_if(
@@ -819,16 +819,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
-    llama_init_result iparams;
-    auto mparams = llama_model_params_from_gpt_params(params);
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
 
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -863,7 +863,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    auto cparams = llama_context_params_from_gpt_params(params);
+    auto cparams = common_context_params_to_llama(params);
 
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
@@ -876,7 +876,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
 
-        const auto cvec = llama_control_vector_load(params.control_vectors);
+        const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
@@ -900,7 +900,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_container loaded_la;
+        common_lora_adapter_container loaded_la;
         loaded_la.path = la.path;
         loaded_la.scale = la.scale;
         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -913,7 +913,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
     }
     if (!params.lora_init_without_apply) {
-        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -961,7 +961,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }
 
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
     llama_lora_adapter_clear(ctx);
     for (auto & la : lora_adapters) {
         if (la.scale != 0.0f) {
@@ -970,7 +970,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
     }
 }
 
-struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
     auto mparams = llama_model_default_params();
 
     if (params.n_gpu_layers != -1) {
@@ -1022,7 +1022,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     throw std::runtime_error("Invalid cache type: " + s);
 }
 
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
     cparams.n_ctx             = params.n_ctx;
@@ -1112,7 +1112,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
     return false;
 }
 
-static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1182,15 +1182,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     }
 
     // Send a HEAD request to retrieve the etag and last-modified headers
-    struct llama_load_model_from_url_headers {
+    struct common_load_model_from_url_headers {
         std::string etag;
         std::string last_modified;
     };
-    llama_load_model_from_url_headers headers;
+    common_load_model_from_url_headers headers;
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1326,7 +1326,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     return true;
 }
 
-struct llama_model * llama_load_model_from_url(
+struct llama_model * common_load_model_from_url(
         const char * model_url,
         const char * path_model,
         const char * hf_token,
@@ -1337,7 +1337,7 @@ struct llama_model * llama_load_model_from_url(
         return NULL;
     }
 
-    if (!llama_download_file(model_url, path_model, hf_token)) {
+    if (!common_download_file(model_url, path_model, hf_token)) {
         return NULL;
     }
 
@@ -1390,7 +1390,7 @@ struct llama_model * llama_load_model_from_url(
                 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-                return llama_download_file(split_url, split_path, hf_token);
+                return common_download_file(split_url, split_path, hf_token);
             }, idx));
         }
 
@@ -1405,7 +1405,7 @@ struct llama_model * llama_load_model_from_url(
     return llama_load_model_from_file(path_model, params);
 }
 
-struct llama_model * llama_load_model_from_hf(
+struct llama_model * common_load_model_from_hf(
         const char * repo,
         const char * model,
         const char * path_model,
@@ -1425,12 +1425,12 @@ struct llama_model * llama_load_model_from_hf(
     model_url += "/resolve/main/";
     model_url += model;
 
-    return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 
 #else
 
-struct llama_model * llama_load_model_from_url(
+struct llama_model * common_load_model_from_url(
         const char * /*model_url*/,
         const char * /*path_model*/,
         const char * /*hf_token*/,
@@ -1439,7 +1439,7 @@ struct llama_model * llama_load_model_from_url(
     return nullptr;
 }
 
-struct llama_model * llama_load_model_from_hf(
+struct llama_model * common_load_model_from_hf(
         const char * /*repo*/,
         const char * /*model*/,
         const char * /*path_model*/,
@@ -1455,11 +1455,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void llama_batch_clear(struct llama_batch & batch) {
+void common_batch_clear(struct llama_batch & batch) {
     batch.n_tokens = 0;
 }
 
-void llama_batch_add(
+void common_batch_add(
                  struct llama_batch & batch,
                         llama_token   id,
                           llama_pos   pos,
@@ -1482,15 +1482,15 @@ void llama_batch_add(
 // Vocab utils
 //
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
                         bool   add_special,
                         bool   parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
            const std::string & text,
                         bool   add_special,
@@ -1509,7 +1509,7 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
     const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1525,7 +1525,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
     return piece;
 }
 
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
     int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1545,15 +1545,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //
 
-bool llama_chat_verify_template(const std::string & tmpl) {
+bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & msgs,
+        const std::vector<common_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
@@ -1595,42 +1595,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     return formatted_chat;
 }
 
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
-    std::vector<llama_chat_msg> chat_new(past_msg);
+    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<common_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
         ss << "\n";
     };
     // format chat with new_msg
     chat_new.push_back(new_msg);
-    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+    auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
     // get the diff part
     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return ss.str();
 }
 
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl) {
-    std::vector<llama_chat_msg> msgs = {
+    std::vector<common_chat_msg> msgs = {
         {"system",    "You are a helpful assistant"},
         {"user",      "Hello"},
         {"assistant", "Hi there"},
         {"user",      "How are you?"},
     };
-    return llama_chat_apply_template(model, tmpl, msgs, true);
+    return common_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
 // KV cache utils
 //
 
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1653,7 +1653,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1705,7 +1705,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
     double sum = 0.0;
 
     switch (embd_norm) {
@@ -1739,7 +1739,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
     }
 }
 
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
     double sum  = 0.0;
     double sum1 = 0.0;
     double sum2 = 0.0;
@@ -1765,8 +1765,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    llama_control_vector_data result = { -1, {} };
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+    common_control_vector_data result = { -1, {} };
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
@@ -1850,11 +1850,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
     return result;
 }
 
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
-    llama_control_vector_data result = { -1, {} };
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+    common_control_vector_data result = { -1, {} };
 
     for (const auto & info : load_infos) {
-        auto cur = llama_control_vector_load_one(info);
+        auto cur = common_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
             result.n_embd = -1;
@@ -1946,7 +1946,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
     }
 }
 
-void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
     const auto & sparams = params.sparams;
 
diff --git a/common/common.h b/common/common.h
index 65add1f30..ea2719e4b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -24,12 +24,12 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };
 
-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;
 
-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;
 
 //
 // CPU utils
@@ -82,14 +82,14 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_TOP_K       = 1,
+    COMMON_SAMPLER_TYPE_TOP_P       = 2,
+    COMMON_SAMPLER_TYPE_MIN_P       = 3,
+    COMMON_SAMPLER_TYPE_TFS_Z       = 4,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 5,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -99,7 +99,7 @@ enum dimre_method {
 };
 
 // sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev            = 64;    // number of previous tokens to remember
@@ -124,13 +124,13 @@ struct gpt_sampler_params {
     bool    ignore_eos        = false;
     bool    no_perf           = false; // disable performance metrics
 
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TFS_Z,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_TEMPERATURE
     };
 
     std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -141,7 +141,7 @@ struct gpt_sampler_params {
     std::string print() const;
 };
 
-struct gpt_params {
+struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =     0; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -183,7 +183,7 @@ struct gpt_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct gpt_sampler_params sparams;
+    struct common_sampler_params sparams;
 
     std::string model                = ""; // model path                                                    // NOLINT
     std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
@@ -208,9 +208,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
     int32_t verbosity                  = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -348,9 +348,9 @@ struct gpt_params {
 
 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();
 
-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
 
 bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
@@ -404,29 +404,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-struct llama_init_result {
+struct common_init_result {
     struct llama_model   * model   = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
+    std::vector<common_lora_adapter_container> lora_adapters;
 };
 
-struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result     common_init_from_params(common_params & params);
 
-struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
-struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
+struct llama_model_params     common_model_params_to_llama  (const common_params & params);
+struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
 // Batch utils
 
-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);
 
-void llama_batch_add(
+void common_batch_add(
                  struct llama_batch & batch,
                         llama_token   id,
                           llama_pos   pos,
@@ -439,13 +439,13 @@ void llama_batch_add(
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
                         bool   add_special,
                         bool   parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
            const std::string & text,
                         bool   add_special,
@@ -453,7 +453,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
         const struct llama_context * ctx,
                        llama_token   token,
                        bool          special = true);
@@ -461,7 +461,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
                          llama_context * ctx,
         const std::vector<llama_token> & tokens,
                                   bool   special = true);
@@ -471,31 +471,31 @@ std::string llama_detokenize(
 //
 
 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
     std::string role;
     std::string content;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
         bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass);
 
 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);
 
 //
@@ -503,31 +503,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
 //
 // Control vector utils
 //
 
-struct llama_control_vector_data {
+struct common_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
     float strength;
 
     std::string fname;
@@ -535,7 +535,7 @@ struct llama_control_vector_load_info {
 
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
 //
 // Split utils
@@ -554,5 +554,5 @@ void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
 
 void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    FILE * stream, const common_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
diff --git a/common/log.cpp b/common/log.cpp
index 5a844ed59..04c7c0ed1 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -8,10 +8,10 @@
 #include <thread>
 #include <vector>
 
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
 }
 
 #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
 }
 
 // colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
 };
 
 // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
     "",
 };
 
-struct gpt_log_entry {
+struct common_log_entry {
     enum ggml_log_level level;
 
     bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
         if (!fcur) {
             // stderr displays DBG messages only when their verbosity level is not higher than the threshold
             // these messages will still be logged to a file
-            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+            if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                 return;
             }
 
@@ -86,19 +86,19 @@ struct gpt_log_entry {
             if (timestamp) {
                 // [M.s.ms.us]
                 fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                        g_col[GPT_LOG_COL_BLUE],
+                        g_col[COMMON_LOG_COL_BLUE],
                         (int) (timestamp / 1000000 / 60),
                         (int) (timestamp / 1000000 % 60),
                         (int) (timestamp / 1000 % 1000),
                         (int) (timestamp % 1000),
-                        g_col[GPT_LOG_COL_DEFAULT]);
+                        g_col[COMMON_LOG_COL_DEFAULT]);
             }
 
             switch (level) {
-                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
+                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                        ); break;
+                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                        ); break;
+                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                        ); break;
                 default:
                     break;
             }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
         fprintf(fcur, "%s", msg.data());
 
         if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
         }
 
         fflush(fcur);
     }
 };
 
-struct gpt_log {
+struct common_log {
     // default capacity - will be expanded if needed
-    gpt_log() : gpt_log(256) {}
+    common_log() : common_log(256) {}
 
-    gpt_log(size_t capacity) {
+    common_log(size_t capacity) {
         file = nullptr;
         prefix = false;
         timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
         resume();
     }
 
-    ~gpt_log() {
+    ~common_log() {
         pause();
         if (file) {
             fclose(file);
@@ -158,12 +158,12 @@ private:
     int64_t t_start;
 
     // ring buffer of entries
-    std::vector<gpt_log_entry> entries;
+    std::vector<common_log_entry> entries;
     size_t head;
     size_t tail;
 
     // worker thread copies into this
-    gpt_log_entry cur;
+    common_log_entry cur;
 
 public:
     void add(enum ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
         tail = (tail + 1) % entries.size();
         if (tail == head) {
             // expand the buffer
-            std::vector<gpt_log_entry> new_entries(2*entries.size());
+            std::vector<common_log_entry> new_entries(2*entries.size());
 
             size_t new_tail = 0;
 
@@ -320,15 +320,15 @@ public:
         pause();
 
         if (colors) {
-            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
+            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
         } else {
             for (size_t i = 0; i < g_col.size(); i++) {
                 g_col[i] = "";
@@ -355,47 +355,47 @@ public:
 // public API
 //
 
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
+struct common_log * common_log_init() {
+    return new common_log;
 }
 
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
+struct common_log * common_log_main() {
+    static struct common_log log;
 
     return &log;
 }
 
-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
     log->pause();
 }
 
-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
     log->resume();
 }
 
-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
     delete log;
 }
 
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
     va_list args;
     va_start(args, fmt);
     log->add(level, fmt, args);
     va_end(args);
 }
 
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
     log->set_colors(colors);
 }
 
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
     log->set_prefix(prefix);
 }
 
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
     log->set_timestamps(timestamps);
 }
diff --git a/common/log.h b/common/log.h
index 84f9b3ed7..66605cc69 100644
--- a/common/log.h
+++ b/common/log.h
@@ -14,23 +14,23 @@
 #define LOG_DEFAULT_LLAMA 0
 
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;
 
-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe
 
-// the gpt_log uses an internal worker thread to print/write log messages
+// the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+struct common_log;
 
-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
-void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void             gpt_log_free  (struct gpt_log * log);
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
+void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void                common_log_free  (struct common_log * log);
 
 LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
 
 // defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f
 // D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
+void common_log_set_colors    (struct common_log * log,       bool   colors);     // not thread-safe
+void common_log_set_prefix    (struct common_log * log,       bool   prefix);     // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log,       bool   timestamps); // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // w
 //
 //   LOG_DBG("this is a debug message: %d\n", expensive_function());
 //
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
 //
 
 #define LOG_TMPL(level, verbosity, ...) \
     do { \
-        if ((verbosity) <= gpt_log_verbosity_thold) { \
-            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+        if ((verbosity) <= common_log_verbosity_thold) { \
+            common_log_add(common_log_main(), (level), __VA_ARGS__); \
         } \
     } while (0)
 
diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp
index 7953c723e..a9dfb6714 100644
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -8,7 +8,7 @@
 #include <fstream>
 #include <thread>
 
-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                               std::vector<llama_token> & inp, int nnew, bool print_progress) {
     const int64_t t_start_ms = ggml_time_ms();
     const int64_t inp_size = inp.size();
@@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
         const int64_t i_start = std::max(inp_size - nnew, ngram_size);
         for (int64_t i = i_start; i < inp_size; ++i) {
             const int64_t ngram_start = i - ngram_size;
-            llama_ngram ngram(&inp[ngram_start], ngram_size);
+            common_ngram ngram(&inp[ngram_start], ngram_size);
             const llama_token token = inp[i];
 
-            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
             if (part_it == ngram_cache.end()) {
-                llama_ngram_cache_part part;
+                common_ngram_cache_part part;
                 part.emplace(token, 1);
                 ngram_cache.emplace(ngram, part);
             } else {
-                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                 if (token_count_it == part_it->second.end()) {
                     part_it->second.emplace(token, 1);
                 } else {
@@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
 constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 
 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
         return -1;
     }
-    const llama_ngram_cache_part part_static = part_static_it->second;
+    const common_ngram_cache_part part_static = part_static_it->second;
 
     int max_count_static  = 0;
     int sum_count_static  = 0;
@@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
 
 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {
 
     llama_token drafted_token = -1;
 
     for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
+        const common_ngram ngram_primary = ngrams_primary[i];
 
-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
         if (part_primary_it == nc_primary.end()) {
             continue;
         }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
+        const common_ngram_cache_part part_primary = part_primary_it->second;
 
         int max_count_primary = 0;
         int max_count_static  = 0;
@@ -117,7 +117,7 @@ static llama_token try_draft(
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
 
-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
 
             const int32_t count_primary = token_count_primary.second;
             const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -142,9 +142,9 @@ static llama_token try_draft(
     return drafted_token;
 }
 
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
 ) {
     GGML_ASSERT(draft.size() == 1);
     const int inp_size = inp.size();
@@ -157,21 +157,21 @@ void llama_ngram_cache_draft(
         llama_token drafted_token = -1;
 
         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static;
+        common_ngram ngram_static;
         for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
             ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
         }
-        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        llama_ngram_cache_part part_static;
+        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        common_ngram_cache_part part_static;
         if (part_static_it != nc_static.end()) {
             part_static = part_static_it->second;
         }
 
         // cd = context + dynamic
-        std::vector<llama_ngram> ngrams_cd;
+        std::vector<common_ngram> ngrams_cd;
         for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
             const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd;
+            common_ngram ngram_cd;
             for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                 ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
             }
@@ -196,16 +196,16 @@ void llama_ngram_cache_draft(
     }
 }
 
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
     std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram      ngram        = item.first;
-        llama_ngram_cache_part token_counts = item.second;
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram      ngram        = item.first;
+        common_ngram_cache_part token_counts = item.second;
         GGML_ASSERT(!token_counts.empty());
         const int32_t ntokens = token_counts.size();
         GGML_ASSERT(ntokens > 0);
 
-        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(common_ngram));
         file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
         for (std::pair<llama_token, int32_t> item2 : token_counts) {
             const llama_token token = item2.first;
@@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 
 }
 
-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
         throw std::ifstream::failure("Unable to open file " + filename);
     }
-    llama_ngram_cache ngram_cache;
+    common_ngram_cache ngram_cache;
 
-    llama_ngram ngram;
+    common_ngram ngram;
     int32_t     ntokens;
     llama_token token;
     int32_t     count;
@@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     char * ntokensc = reinterpret_cast<char*>(&ntokens);
     char * tokenc   = reinterpret_cast<char*>(&token);
     char * countc   = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
         GGML_ASSERT(!hashmap_file.eof());
         GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
         GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
+        common_ngram_cache_part token_counts;
 
         for (int i = 0; i < ntokens; ++i) {
             GGML_ASSERT(!hashmap_file.eof());
@@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     return ngram_cache;
 }
 
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram      ngram = ngram_part.first;
-        llama_ngram_cache_part  part = ngram_part.second;
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram      ngram = ngram_part.first;
+        common_ngram_cache_part  part = ngram_part.second;
 
-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
         if (part_merged_it == ngram_cache_target.end()) {
             ngram_cache_target.emplace(ngram, part);
             continue;
@@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
             const int32_t     count = token_count.second;
             GGML_ASSERT(count > 0);
 
-            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
             if (token_count_merged_it == part_merged_it->second.end()) {
                 part_merged_it->second.emplace(token, count);
                 continue;
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
index ab4c9b376..09c2b0319 100644
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -12,22 +12,22 @@
 
 // Data structures to map n-grams to empirical token probabilities:
 
-struct llama_ngram {
+struct common_ngram {
     llama_token tokens[LLAMA_NGRAM_MAX];
 
-    llama_ngram() {
+    common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = -1;
         }
     }
 
-    llama_ngram(const llama_token * input, const int ngram_size) {
+    common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = i < ngram_size ? input[i] : -1;
         }
     }
 
-    bool operator==(const llama_ngram & other) const {
+    bool operator==(const common_ngram & other) const {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             if (tokens[i] != other.tokens[i]) {
                 return false;
@@ -37,28 +37,28 @@ struct llama_ngram {
     }
 };
 
-struct llama_token_hash_function {
+struct common_token_hash_function {
     size_t operator()(const llama_token token) const {
         // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
         return token * 11400714819323198485llu;
     }
 };
 
-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
         for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
         }
         return hash;
     }
 };
 
 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
 
 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
 
 
 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
 
 // Try to draft tokens from ngram caches.
 // inp:                the tokens generated so far.
@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
 // nc_context:         ngram cache based on current context.
 // nc_dynamic:         ngram cache based on previous user generations.
 // nc_static:          ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
 
 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename:    the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
 
-// Load an ngram cache saved with llama_ngram_cache_save.
+// Load an ngram cache saved with common_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns:  an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+common_ngram_cache common_ngram_cache_load(std::string & filename);
 
 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add:    the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 3dc7f1120..cd49ade69 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -98,8 +98,8 @@ struct ring_buffer {
     std::vector<T> data;
 };
 
-struct gpt_sampler {
-    gpt_sampler_params params;
+struct common_sampler {
+    common_sampler_params params;
 
     struct llama_sampler * grmr;
     struct llama_sampler * chain;
@@ -125,7 +125,7 @@ struct gpt_sampler {
     }
 };
 
-std::string gpt_sampler_params::print() const {
+std::string common_sampler_params::print() const {
     char result[1024];
 
     snprintf(result, sizeof(result),
@@ -139,12 +139,12 @@ std::string gpt_sampler_params::print() const {
     return std::string(result);
 }
 
-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
 
-    auto * result = new gpt_sampler {
+    auto * result = new common_sampler {
         /* .params = */ params,
         /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
         /* .chain  = */ llama_sampler_chain_init(lparams),
@@ -175,22 +175,22 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
         if (params.mirostat == 0) {
             for (const auto & cnstr : params.samplers) {
                 switch (cnstr) {
-                    case GPT_SAMPLER_TYPE_TOP_K:
+                    case COMMON_SAMPLER_TYPE_TOP_K:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                         break;
-                    case GPT_SAMPLER_TYPE_TOP_P:
+                    case COMMON_SAMPLER_TYPE_TOP_P:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
                         break;
-                    case GPT_SAMPLER_TYPE_MIN_P:
+                    case COMMON_SAMPLER_TYPE_MIN_P:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                         break;
-                    case GPT_SAMPLER_TYPE_TFS_Z:
+                    case COMMON_SAMPLER_TYPE_TFS_Z:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                         break;
-                    case GPT_SAMPLER_TYPE_TYPICAL_P:
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                         break;
-                    case GPT_SAMPLER_TYPE_TEMPERATURE:
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                         break;
                     default:
@@ -224,7 +224,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
     return result;
 }
 
-void gpt_sampler_free(struct gpt_sampler * gsmpl) {
+void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
         llama_sampler_free(gsmpl->grmr);
 
@@ -234,7 +234,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
     }
 }
 
-void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -244,14 +244,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
     gsmpl->prev.push_back(token);
 }
 
-void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
+void common_sampler_reset(struct common_sampler * gsmpl) {
     llama_sampler_reset(gsmpl->grmr);
 
     llama_sampler_reset(gsmpl->chain);
 }
 
-struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
+struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+    return new common_sampler {
         /* .params = */ gsmpl->params,
         /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
         /* .chain  = */ llama_sampler_clone(gsmpl->chain),
@@ -261,7 +261,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
     };
 }
 
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance
 
     if (gsmpl) {
@@ -272,7 +272,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     }
 }
 
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     gsmpl->set_logits(ctx, idx);
 
     auto & grmr  = gsmpl->grmr;
@@ -318,21 +318,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
     return llama_sampler_get_seed(gsmpl->chain);
 }
 
 // helpers
 
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
     return &gsmpl->cur_p;
 }
 
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
+llama_token common_sampler_last(const struct common_sampler * gsmpl) {
     return gsmpl->prev.rat(0);
 }
 
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
+std::string common_sampler_print(const struct common_sampler * gsmpl) {
     std::string result = "logits ";
 
     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@@ -343,7 +343,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
     return result;
 }
 
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
     n = std::min(n, (int) gsmpl->prev.size());
 
     if (n <= 0) {
@@ -358,63 +358,63 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
 
         GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
 
-        result += llama_token_to_piece(ctx_main, id);
+        result += common_token_to_piece(ctx_main, id);
     }
 
     return result;
 }
 
-char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
+char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
     switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
-        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
-        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
+        case COMMON_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         default : return '?';
     }
 }
 
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
     switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case COMMON_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         default : return "";
     }
 }
 
-std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
-        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       COMMON_SAMPLER_TYPE_TFS_Z },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
     };
 
     // since samplers names are written multiple ways
     // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
-        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
-        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",       COMMON_SAMPLER_TYPE_TFS_Z },
+        { "tfs",         COMMON_SAMPLER_TYPE_TFS_Z },
+        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
     };
 
-    std::vector<gpt_sampler_type> samplers;
+    std::vector<common_sampler_type> samplers;
     samplers.reserve(names.size());
 
     for (const auto & name : names) {
@@ -434,17 +434,17 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
     return samplers;
 }
 
-std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
+std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z),       COMMON_SAMPLER_TYPE_TFS_Z },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
     };
 
-    std::vector<gpt_sampler_type> samplers;
+    std::vector<common_sampler_type> samplers;
     samplers.reserve(chars.size());
 
     for (const auto & c : chars) {
diff --git a/common/sampling.h b/common/sampling.h
index d0e1a9203..d37f25ad3 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -7,7 +7,7 @@
 #include <string>
 #include <vector>
 
-// gpt_sampler extends llama_sampler with additional functionality:
+// common_sampler extends llama_sampler with additional functionality:
 //
 //  - grammar support
 //  - custom sampler logic based on the parameters
@@ -23,30 +23,30 @@
 // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
 // grammar constraints are applied to the full vocabulary and the token is resampled.
 //
-// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
 // be moved into the core llama library.
 //
-// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
 // This can be used to access the probabilities of the rest of the non-sampled tokens.
 //
 // TODO: measure grammar performance
 //
 
-struct gpt_sampler;
+struct common_sampler;
 
 // llama_sampler API overloads
 
-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
 
-void gpt_sampler_free(struct gpt_sampler * gsmpl);
+void common_sampler_free(struct common_sampler * gsmpl);
 
 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
-struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                    common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
 // arguments can be nullptr to skip printing
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
 // extended sampling implementation:
 //
@@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 // if grammar_first is true, the grammar is applied before the samplers (slower)
 // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
 //
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
 
 // get the last accepted token
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
+llama_token common_sampler_last(const struct common_sampler * gsmpl);
 
 // print the sampler chain into a string
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
+std::string common_sampler_print(const struct common_sampler * gsmpl);
 
 // get a string representation of the last accepted tokens
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
 
-char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
 
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 4a15941f1..81c3220ad 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     int is_pp_shared = params.is_pp_shared;
 
@@ -36,7 +36,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     // ensure enough sequences are available
     ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
     // warm up
     {
         for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
+            common_batch_add(batch, 0, i, { 0 }, false);
         }
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@@ -122,11 +122,11 @@ int main(int argc, char ** argv) {
                     continue;
                 }
 
-                llama_batch_clear(batch);
+                common_batch_clear(batch);
 
                 for (int i = 0; i < pp; ++i) {
                     for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-                        llama_batch_add(batch, 0, i, { j }, false);
+                        common_batch_add(batch, 0, i, { j }, false);
                     }
                 }
                 batch.logits[batch.n_tokens - 1] = true;
@@ -151,10 +151,10 @@ int main(int argc, char ** argv) {
                 const auto t_tg_start = ggml_time_us();
 
                 for (int i = 0; i < tg; ++i) {
-                    llama_batch_clear(batch);
+                    common_batch_clear(batch);
 
                     for (int j = 0; j < pl; ++j) {
-                        llama_batch_add(batch, 0, pp + i, { j }, true);
+                        common_batch_add(batch, 0, pp + i, { j }, true);
                     }
 
                     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 7887a43d6..3b554033e 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // number of parallel batches
     int n_parallel = params.n_parallel;
@@ -39,7 +39,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -51,13 +51,13 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = common_tokenize(model, params.prompt, true);
 
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     ctx_params.n_ctx   = n_kv_req;
     ctx_params.n_batch = std::max(n_predict, n_parallel);
@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
     LOG("\n");
 
     for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     // create a llama_batch
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
 
     // evaluate the initial prompt
     for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+        common_batch_add(batch, tokens_list[i], i, seq_ids, false);
     }
     GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
 
@@ -123,8 +123,8 @@ int main(int argc, char ** argv) {
             decoder_start_token_id = llama_token_bos(model);
         }
 
-        llama_batch_clear(batch);
-        llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+        common_batch_clear(batch);
+        common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
     }
 
     // llama_decode will output logits only for the last token of the prompt
@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
 
     while (n_cur <= n_predict) {
         // prepare the next batch
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // sample the next token for each parallel sequence / stream
         for (int32_t i = 0; i < n_parallel; ++i) {
@@ -185,15 +185,15 @@ int main(int argc, char ** argv) {
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
             }
 
-            streams[i] += llama_token_to_piece(ctx, new_token_id);
+            streams[i] += common_token_to_piece(ctx, new_token_id);
 
             i_batch[i] = batch.n_tokens;
 
             // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+            common_batch_add(batch, new_token_id, n_cur, { i }, true);
 
             n_decode += 1;
         }
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index c140daed3..988a584c9 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -872,7 +872,7 @@ static std::string basename(const std::string &path) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_init();
+    common_init();
 
     struct train_params params = get_default_train_params();
     if (!params_parse(argc, argv, &params)) {
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 41bf4eb2a..69e141ecb 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -31,7 +31,7 @@ template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     std::string ret;
     for (; begin != end; ++begin) {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
     }
 
     return ret;
@@ -272,8 +272,8 @@ struct tokenized_prompt {
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
         const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
+        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
         padding_seq(ctx, tokens_pos, max_seq_len);
         padding_seq(ctx, tokens_neg, max_seq_len);
@@ -281,7 +281,7 @@ struct tokenized_prompt {
 
     void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
         // TODO: customize padding token
-        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
         llama_token pad_tok = pad_tokens.back();
         while (tokens.size() < len) {
             tokens.push_back(pad_tok);
@@ -370,7 +370,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
  * Load prompt files and completion file.
  * Then format each pair of prompt + completion to make an entry.
  */
-static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+static int prepare_entries(common_params & params, train_context & ctx_train) {
     // load prompts
     std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
     std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
@@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
 
@@ -413,7 +413,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model to get hparams
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 734926822..3f18fc6a7 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -28,7 +28,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
@@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + embd_pos * n_embd;
-        llama_embd_normalize(embd, out, n_embd, embd_norm);
+        common_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     params.embedding = true;
     // For non-causal models, batch size must be equal to ubatch size
@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, true);
+        auto inp = common_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
             LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
             LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
             }
             LOG("\n\n");
         }
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
         }
 
         // add to batch
@@ -263,7 +263,7 @@ int main(int argc, char ** argv) {
                 LOG("\n");
                 for (int i = 0; i < n_prompts; i++) {
                     for (int j = 0; j < n_prompts; j++) {
-                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                         LOG("%6.2f ", sim);
                     }
                     LOG("%1.10s", prompts[i].c_str());
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
             for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
                 LOG("    [");
                 for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                     LOG("%6.2f", sim);
                     j++;
                     if (j < n_embd_count) LOG(", "); else break;
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 6d629fe4e..fb52db4e1 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -126,10 +126,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     return true;
 }
 
-static bool run(llama_context * ctx, const gpt_params & params) {
+static bool run(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
         LOG_ERR("%s : failed to eval\n", __func__);
@@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
     callback_data cb_data;
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
         LOG_INF("\n");
     }
 
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 644d46a62..67662313d 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -128,7 +128,7 @@ struct lora_merge_ctx {
 
     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<llama_lora_adapter_info> & lora_files,
+            std::vector<common_lora_adapter_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp
index 4b19a9dc2..77c59a836 100644
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) {
     file << "| -------- | ----------- |\n";
 }
 
-static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+static void write_table_entry(std::ofstream & file, const common_arg & opt) {
     file << "| `";
     // args
     for (const auto & arg : opt.args) {
@@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
     file << "` | " << md_help << " |\n";
 }
 
-static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
     write_table_header(file);
     for (const auto & opt : opts) {
         write_table_entry(file, *opt);
@@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
 static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
-    gpt_params params;
-    auto ctx_arg = gpt_params_parser_init(params, ex);
+    common_params params;
+    auto ctx_arg = common_params_parser_init(params, ex);
 
-    std::vector<llama_arg *> common_options;
-    std::vector<llama_arg *> sparam_options;
-    std::vector<llama_arg *> specific_options;
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
         // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
         if (opt.is_sparam) {
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 20b99a4fd..6e42fa073 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -15,11 +15,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
     for (uint64_t i = 0; i < sentences.size(); i++) {
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         const std::string input_string = instruction + sentences[i];
 
-        std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
+        std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
 
         const int32_t n_toks = inputs.size();
 
@@ -28,7 +28,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         // inputs.push_back(llama_token_eos(model));
 
         // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
+        const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
 
 #ifdef GRIT_DEBUG
         // debug tokens - should be matching as referenced in the GritLM sample
@@ -40,7 +40,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         // add input to batch (this increments n_tokens)
         for (int32_t j = 0; j < n_toks; j++) {
-            llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+            common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
         }
 
         // clear previous kv_cache values (irrelevant for embeddings)
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         }
 
         std::vector<float> emb_norm(emb_unorm.size());
-        llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
         result.push_back(emb_norm);
 
 #ifdef GRIT_DEBUG
@@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
-    std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
+    std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
     int32_t i_current_token = 0;
 
     while (true) {
-        llama_batch_clear(bat);
+        common_batch_clear(bat);
         {
             const int32_t n_inputs = inputs.size();
 
             for (int32_t i = 0; i < n_inputs; i++) {
-                llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+                common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
             }
         }
         inputs.clear();
@@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
             break;
         }
 
-        std::string piece = llama_token_to_piece(ctx, token);
+        std::string piece = common_token_to_piece(ctx, token);
         if (stream) {
             std::printf("%s", piece.c_str());
             std::fflush(stdout);
@@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) {
 }
 
 int main(int argc, char * argv[]) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+    llama_model_params mparams = common_model_params_to_llama(params);
+    llama_context_params cparams = common_context_params_to_llama(params);
 
     llama_backend_init();
 
@@ -199,10 +199,10 @@ int main(int argc, char * argv[]) {
 
         const int n_embd = llama_n_embd(model);
 
-        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
-        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
-        const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
-        const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
 
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index c8e273529..d1ff3e8bc 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -37,13 +37,13 @@ struct Stats {
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_params(gpt_params params) { m_params = std::move(params); }
+    void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    gpt_params                             m_params;
+    common_params                          m_params;
     std::mutex                             m_mutex;
     int                                    m_last_call = 0;
     std::vector<float>                     m_src1_data;
@@ -428,7 +428,7 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);
@@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -568,17 +568,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
@@ -607,7 +607,7 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -625,7 +625,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     if (!compute_imatrix(ctx, params)) {
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index d52425ae6..3d0f71fda 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -35,8 +35,8 @@
 
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
@@ -44,7 +44,7 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
 static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const llama_context * ctx, const common_params & params, const llama_model * model,
     const std::vector<llama_token> & input_tokens, const std::string & output,
     const std::vector<llama_token> & output_tokens
 ) {
@@ -95,12 +95,12 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             LOG("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
+            common_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
 
             // make sure all logs are flushed
             LOG("Interrupted by user\n");
-            gpt_log_pause(gpt_log_main());
+            common_log_pause(common_log_main());
 
             _exit(130);
         }
@@ -109,14 +109,14 @@ static void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     g_params = &params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     auto & sparams = params.sparams;
 
@@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
 
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    gpt_sampler  * smpl = nullptr;
+    common_sampler * smpl = nullptr;
 
     g_model = &model;
     g_ctx = &ctx;
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
 
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     model = llama_init.model;
     ctx = llama_init.context;
@@ -195,15 +195,15 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
     const bool add_bos = llama_add_bos_token(model);
     GGML_ASSERT(!llama_add_eos_token(model));
 
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
     GGML_ASSERT(llama_token_prefix(model) >= 0);
     GGML_ASSERT(llama_token_suffix(model) >= 0);
@@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > 0) {
         LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
             }
             LOG_CNT("'\n");
         }
@@ -298,11 +298,11 @@ int main(int argc, char ** argv) {
             LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    smpl = gpt_sampler_init(model, sparams);
+    smpl = common_sampler_init(model, sparams);
 
-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
     LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
 
     LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
@@ -411,9 +411,9 @@ int main(int argc, char ** argv) {
         embd.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-            gpt_sampler_accept(smpl, id, true);
+            common_sampler_accept(smpl, id, true);
 
             // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
@@ -434,7 +434,7 @@ int main(int argc, char ** argv) {
 
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], false);
 
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -446,7 +446,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
                 LOG("%s", token_str.c_str());
 
                 if (embd.size() > 1) {
@@ -465,10 +465,10 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
                 LOG("\n");
                 console::set_display(console::user_input);
@@ -505,8 +505,8 @@ int main(int argc, char ** argv) {
                 }
 
                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
                 inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
                 inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
@@ -529,7 +529,7 @@ int main(int argc, char ** argv) {
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
                 LOG_DBG("found EOS token\n");
 
                 if (params.interactive) {
@@ -579,7 +579,7 @@ int main(int argc, char ** argv) {
 
                     const size_t original_size = embd_inp.size();
 
-                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                    const auto line_inp = common_tokenize(ctx, buffer, false);
                     LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@@ -587,7 +587,7 @@ int main(int argc, char ** argv) {
                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
                         const llama_token token = embd_inp[i];
                         output_tokens.push_back(token);
-                        output_ss << llama_token_to_piece(ctx, token);
+                        output_ss << common_token_to_piece(ctx, token);
                     }
 
                     n_remain -= line_inp.size();
@@ -601,7 +601,7 @@ int main(int argc, char ** argv) {
 
             if (n_past > 0) {
                 if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    common_sampler_reset(smpl);
                 }
                 is_interacting = false;
             }
@@ -620,17 +620,17 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-        LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
     }
 
     LOG("\n");
-    gpt_perf_print(ctx, smpl);
+    common_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     llama_free(ctx);
     llama_free_model(model);
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
     llama_backend_free();
 
     return 0;
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
index f611809c6..f5ffd063f 100644
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     for (nri = 0; nri < nr; nri++) {
         LOGi("Benchmark prompt processing (pp)");
 
-        llama_batch_clear(*batch);
+        common_batch_clear(*batch);
 
         const int n_tokens = pp;
         for (i = 0; i < n_tokens; i++) {
-            llama_batch_add(*batch, 0, i, { 0 }, false);
+            common_batch_add(*batch, 0, i, { 0 }, false);
         }
 
         batch->logits[batch->n_tokens - 1] = true;
@@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
         const auto t_tg_start = ggml_time_us();
         for (i = 0; i < tg; i++) {
 
-            llama_batch_clear(*batch);
+            common_batch_clear(*batch);
             for (j = 0; j < pl; j++) {
-                llama_batch_add(*batch, 0, i, { j }, true);
+                common_batch_add(*batch, 0, i, { j }, true);
             }
 
             LOGi("llama_decode() text generation: %d", i);
@@ -357,7 +357,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
 
-    const auto tokens_list = llama_tokenize(context, text, 1);
+    const auto tokens_list = common_tokenize(context, text, 1);
 
     auto n_ctx = llama_n_ctx(context);
     auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -369,14 +369,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     }
 
     for (auto id : tokens_list) {
-        LOGi("%s", llama_token_to_piece(context, id).c_str());
+        LOGi("%s", common_token_to_piece(context, id).c_str());
     }
 
-    llama_batch_clear(*batch);
+    common_batch_clear(*batch);
 
     // evaluate the initial prompt
     for (auto i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
+        common_batch_add(*batch, tokens_list[i], i, { 0 }, false);
     }
 
     // llama_decode will output logits only for the last token of the prompt
@@ -419,7 +419,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         return nullptr;
     }
 
-    auto new_token_chars = llama_token_to_piece(context, new_token_id);
+    auto new_token_chars = common_token_to_piece(context, new_token_id);
     cached_token_chars += new_token_chars;
 
     jstring new_token = nullptr;
@@ -431,8 +431,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         new_token = env->NewStringUTF("");
     }
 
-    llama_batch_clear(*batch);
-    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
+    common_batch_clear(*batch);
+    common_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
 
     env->CallVoidMethod(intvar_ncur, la_int_var_inc);
 
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8f437863f..5f9abe2b6 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -37,21 +37,21 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
 
-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct common_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-    gpt_sampler_accept(smpl, id, true);
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_piece(ctx_llama, id);
+        ret = common_token_to_piece(ctx_llama, id);
     }
     eval_id(ctx_llama, id, n_past);
     return ret.c_str();
@@ -120,7 +120,7 @@ static void print_usage(int, char ** argv) {
     LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
 
     // load and preprocess the image
     llava_image_embed * embed = NULL;
@@ -146,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     return embed;
 }
 
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
@@ -159,16 +159,16 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
         LOG_INF("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
         LOG_INF("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -176,9 +176,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     LOG("\n");
 
-    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
     if (!smpl) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
@@ -211,15 +211,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         fflush(stdout);
     }
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
     LOG("\n");
 }
 
-static struct llama_model * llava_init(gpt_params * params) {
+static struct llama_model * llava_init(common_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+    llama_model_params model_params = common_model_params_to_llama(*params);
 
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
@@ -229,7 +229,7 @@ static struct llama_model * llava_init(gpt_params * params) {
     return model;
 }
 
-static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
     const char * clip_path = params->mmproj.c_str();
 
     auto prompt = params->prompt;
@@ -240,7 +240,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@@ -272,13 +272,13 @@ static void llava_free(struct llava_context * ctx_llava) {
 int main(int argc, char ** argv) {
     ggml_time_init();
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index c5156c35b..6b666de1b 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -25,11 +25,11 @@ static void show_additional_info(int /*argc*/, char ** argv) {
     LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
-static struct llama_model * llava_init(gpt_params * params) {
+static struct llama_model * llava_init(common_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+    llama_model_params model_params = common_model_params_to_llama(*params);
 
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
@@ -39,13 +39,13 @@ static struct llama_model * llava_init(gpt_params * params) {
     return model;
 }
 
-static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
     auto prompt = params->prompt;
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
     if (params->n_ctx < 2048) {
         // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
         LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
@@ -79,7 +79,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-static struct clip_ctx * clip_init_context(gpt_params * params) {
+static struct clip_ctx * clip_init_context(common_params * params) {
     const char * clip_path = params->mmproj.c_str();
 
     auto prompt = params->prompt;
@@ -114,7 +114,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
     return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
 }
 
@@ -129,7 +129,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
     llava_image_embed_free(slice_embed);
 }
 
-static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
     std::string system_prompt;
     int idx = 0;
     int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
@@ -162,22 +162,22 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }
 
-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct common_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-    gpt_sampler_accept(smpl, id, true);
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_piece(ctx_llama, id);
+        ret = common_token_to_piece(ctx_llama, id);
     }
     eval_id(ctx_llama, id, n_past);
     return ret.c_str();
 }
 
-static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
+static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
     auto * ctx_clip = clip_init_context(params);
     auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
     if (!embeds) {
@@ -213,7 +213,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
     return ctx_llava;
 }
 
-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
     std::string user_prompt = prompt;
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (!is_first) {
@@ -237,11 +237,11 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
 
     LOG_INF("\n");
 
-    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
     return smpl;
 }
 
-static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
 
     const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
     return tmp;
@@ -250,13 +250,13 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl
 int main(int argc, char ** argv) {
     ggml_time_init();
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     if (params.mmproj.empty() || (params.image.empty())) {
         show_additional_info(argc, argv);
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
 
                 fflush(stdout);
             }
-            gpt_sampler_free(smpl);
+            common_sampler_free(smpl);
         }else {
             while (true) {
                 LOG("<user>");
@@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
                     if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                     fflush(stdout);
                 }
-                gpt_sampler_free(smpl);
+                common_sampler_free(smpl);
             }
         }
         printf("\n");
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 49870b4a4..f9e4aba81 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -37,13 +37,13 @@ struct ngram_container {
 };
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     const int W = 15; // lookahead window
     const int N = 5;  // n-gram size
@@ -56,7 +56,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the target model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
     all = inp;
 
     const int max_context_size     = llama_n_ctx(ctx);
@@ -79,7 +79,7 @@ int main(int argc, char ** argv) {
     LOG("\n\n");
 
     for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
@@ -156,12 +156,12 @@ int main(int argc, char ** argv) {
 
     // sample first token
     {
-        id = gpt_sampler_sample(smpl, ctx, 0);
+        id = common_sampler_sample(smpl, ctx, 0);
 
-        gpt_sampler_accept(smpl, id, true);
+        common_sampler_accept(smpl, id, true);
 
         {
-            const std::string token_str = llama_token_to_piece(ctx, id);
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             LOG("%s", token_str.c_str());
             fflush(stdout);
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -201,10 +201,10 @@ int main(int argc, char ** argv) {
         //                                                      V  V  V  V  V  V
         //                                                             id
         {
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
             // current token - first token of the first level
-            llama_batch_add(batch, id, n_past, seq_id_all, true);
+            common_batch_add(batch, id, n_past, seq_id_all, true);
 
             // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
             {
@@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
                         ngrams_cur[g].tokens [j + 1] = t;
                         ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
 
-                        llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+                        common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
                     }
                 }
             }
@@ -241,13 +241,13 @@ int main(int argc, char ** argv) {
                     seq_id_look[j] = i + j + 1;
                 }
 
-                llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+                common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
             }
 
             // fill the rest of the levels
             for (int j = 1; j < N - 1; j++) {
                 for (int i = 0; i < W; i++) {
-                    llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+                    common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
                 }
             }
         }
@@ -281,13 +281,13 @@ int main(int argc, char ** argv) {
             }
 
             // sample the next token
-            id = gpt_sampler_sample(smpl, ctx, i_batch);
+            id = common_sampler_sample(smpl, ctx, i_batch);
 
-            gpt_sampler_accept(smpl, id, true);
+            common_sampler_accept(smpl, id, true);
 
             // print
             {
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
 
                 if (v == 0) {
                     LOG("%s", token_str.c_str());
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
             // print known n-grams starting with token id (debug)
             if (0 && v == 0) {
                 if (ngrams_observed.cnt[id] > 0) {
-                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
                 }
 
                 for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
                     const int idx = id*(N - 1)*G + i*(N - 1);
 
                     for (int j = 0; j < N - 1; j++) {
-                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
+                        const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
                         LOG("%s", token_str.c_str());
                     }
@@ -358,7 +358,7 @@ int main(int argc, char ** argv) {
                 if (v == 0) {
                     // sample from the last level
                     for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                     }
                 } else {
                     for (int i = 0; i < W; i++) {
@@ -466,9 +466,9 @@ int main(int argc, char ** argv) {
     LOG_INF("n_accept  = %d\n", n_accept);
 
     LOG_INF("\n");
-    gpt_perf_print(ctx, smpl);
+    common_perf_print(ctx, smpl);
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
 
     llama_kv_cache_view_free(&kvc_view);
 
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 33287c02c..7ced0aa97 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -12,9 +12,9 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -23,7 +23,7 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -31,15 +31,15 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 
 
-    llama_ngram_cache ngram_cache;
-    llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+    common_ngram_cache ngram_cache;
+    common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
-    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+    common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
 
     return 0;
 }
diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp
index 81e2b0436..6871c0f5f 100644
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@@ -33,15 +33,15 @@ int main(int argc, char ** argv){
     }
 
     fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-    llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+    common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
 
     for (size_t i = 1; i < args.size()-1; ++i) {
         fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-        llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+        common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
 
-        llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
+        common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
     }
 
     fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
-    llama_ngram_cache_save(ngram_cache_merged, args.back());
+    common_ngram_cache_save(ngram_cache_merged, args.back());
 }
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 6d1e1ceb9..7faebe7ba 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -13,13 +13,13 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     const int n_draft = params.n_draft;
 
@@ -28,18 +28,18 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
 
-    llama_ngram_cache ngram_cache_context;
-    llama_ngram_cache ngram_cache_dynamic;
-    llama_ngram_cache ngram_cache_static;
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
@@ -48,7 +48,7 @@ int main(int argc, char ** argv){
 
         if (!params.lookup_cache_static.empty()) {
             try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
                 LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
@@ -57,7 +57,7 @@ int main(int argc, char ** argv){
 
         if (!params.lookup_cache_dynamic.empty()) {
             try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
             } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
         }
 
@@ -86,7 +86,7 @@ int main(int argc, char ** argv){
 
             {
                 const int64_t t_start_draft_us = ggml_time_us();
-                llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+                common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
 
@@ -105,7 +105,7 @@ int main(int argc, char ** argv){
 
                 {
                     const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
                     t_draft_us += ggml_time_us() - t_start_draft_us;
                 }
             }
@@ -115,7 +115,7 @@ int main(int argc, char ** argv){
                 pseudo_output.push_back(inp_slice[pseudo_output.size()]);
                 {
                     const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
                     t_draft_us += ggml_time_us() - t_start_draft_us;
                 }
             }
@@ -133,7 +133,7 @@ int main(int argc, char ** argv){
         }
 
         // After each chunk, update the dynamic ngram cache with the context ngram cache:
-        llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+        common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
         ngram_cache_context.clear();
     }
 
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 2ccd0e6c1..82fc7d466 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -13,13 +13,13 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.n_draft;
@@ -31,29 +31,29 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
 
-    llama_ngram_cache ngram_cache_context;
-    llama_ngram_cache ngram_cache_dynamic;
-    llama_ngram_cache ngram_cache_static;
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
     {
         // Fill up context ngram cache with tokens from user input:
         const int64_t t_start_draft_us = ggml_time_us();
-        llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
         if (!params.lookup_cache_static.empty()) {
             try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
                 LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
@@ -62,7 +62,7 @@ int main(int argc, char ** argv){
 
         if (!params.lookup_cache_dynamic.empty()) {
             try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
             } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
         }
 
@@ -80,7 +80,7 @@ int main(int argc, char ** argv){
     LOG("\n\n");
 
     for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
 
     std::vector<llama_token> draft;
 
@@ -117,7 +117,7 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // print current draft sequence
@@ -126,11 +126,11 @@ int main(int argc, char ** argv){
         int i_dft = 0;
         while (true) {
             // sample from the target model
-            llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
+            llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
-            gpt_sampler_accept(smpl, id, true);
+            common_sampler_accept(smpl, id, true);
 
-            const std::string token_str = llama_token_to_piece(ctx, id);
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             if (!params.use_color) {
                 LOG("%s", token_str.c_str());
@@ -152,7 +152,7 @@ int main(int argc, char ** argv){
                 {
                     // Update context ngram cache with the newly accepted token:
                     const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                     t_draft_us += ggml_time_us() - t_start_draft_us;
                 }
 
@@ -178,7 +178,7 @@ int main(int argc, char ** argv){
             {
                 // Update context ngram cache with the newly accepted token:
                 const int64_t t_start_draft_us = ggml_time_us();
-                llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
             break;
@@ -192,18 +192,18 @@ int main(int argc, char ** argv){
         // clean the cache of draft tokens that weren't accepted
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-        llama_batch_clear(batch_tgt);
-        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+        common_batch_clear(batch_tgt);
+        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
         // Draft already contains a single token sampled from the model:
         GGML_ASSERT(draft.size() == 1);
         GGML_ASSERT(draft[0] == inp.back());
         const int64_t t_start_draft_us = ggml_time_us();
 
-        llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+        common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
         for (size_t i = 1; i < draft.size(); ++i) {
-            llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
         }
 
         t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -218,8 +218,8 @@ int main(int argc, char ** argv){
     auto t_dec_end = ggml_time_us();
 
     // Update dynamic ngram cache with context ngram cache and save it to disk:
-    llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-    llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
+    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+    common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
     LOG("\n\n");
 
@@ -237,9 +237,9 @@ int main(int argc, char ** argv){
     LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     LOG_INF("\ntarget:\n\n");
-    gpt_perf_print(ctx, smpl);
+    common_perf_print(ctx, smpl);
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
 
     llama_batch_free(batch_tgt);
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 6bbb1e13e..fb10c20c5 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -33,8 +33,8 @@
 
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
@@ -63,7 +63,7 @@ static bool file_is_empty(const std::string & path) {
 }
 
 static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const llama_context * ctx, const common_params & params, const llama_model * model,
     const std::vector<llama_token> & input_tokens, const std::string & output,
     const std::vector<llama_token> & output_tokens
 ) {
@@ -114,12 +114,12 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             LOG("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
+            common_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
 
             // make sure all logs are flushed
             LOG("Interrupted by user\n");
-            gpt_log_pause(gpt_log_main());
+            common_log_pause(common_log_main());
 
             _exit(130);
         }
@@ -127,22 +127,22 @@ static void sigint_handler(int signo) {
 }
 #endif
 
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
-    llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+    common_chat_msg new_msg{role, content};
+    auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
     LOG_DBG("formatted: '%s'\n", formatted.c_str());
     return formatted;
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     g_params = &params;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     auto & sparams = params.sparams;
 
@@ -187,9 +187,9 @@ int main(int argc, char ** argv) {
 
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    gpt_sampler * smpl = nullptr;
+    common_sampler * smpl = nullptr;
 
-    std::vector<llama_chat_msg> chat_msgs;
+    std::vector<common_chat_msg> chat_msgs;
 
     g_model = &model;
     g_ctx = &ctx;
@@ -197,7 +197,7 @@ int main(int argc, char ** argv) {
 
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     model = llama_init.model;
     ctx = llama_init.context;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation) {
         if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
         } else {
             LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
         LOG_INF("\n");
     }
 
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
             LOG_DBG("tokenize the prompt\n");
-            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+            embd_inp = common_tokenize(ctx, prompt, true, true);
         } else {
             LOG_DBG("use session tokens\n");
             embd_inp = session_tokens;
@@ -379,13 +379,13 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > add_bos) {
             LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
             }
             LOG_CNT("'\n");
         }
@@ -415,9 +415,9 @@ int main(int argc, char ** argv) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
-                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    auto tmp = common_tokenize(ctx, antiprompt, false, true);
                     for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                     }
                 }
             }
@@ -430,9 +430,9 @@ int main(int argc, char ** argv) {
         if (!params.input_prefix.empty()) {
             LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
@@ -440,23 +440,23 @@ int main(int argc, char ** argv) {
         if (!params.input_suffix.empty()) {
             LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
     }
 
-    smpl = gpt_sampler_init(model, sparams);
+    smpl = common_sampler_init(model, sparams);
     if (!smpl) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         return 1;
     }
 
-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
     LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
 
     LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
@@ -521,7 +521,7 @@ int main(int argc, char ** argv) {
 
     antiprompt_ids.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+        antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
     }
 
     if (llama_model_has_encoder(model)) {
@@ -679,9 +679,9 @@ int main(int argc, char ** argv) {
                 LOG_DBG("saved session to %s\n", path_session.c_str());
             }
 
-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-            gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
+            common_sampler_accept(smpl, id, /* accept_grammar= */ true);
 
             // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
@@ -702,7 +702,7 @@ int main(int argc, char ** argv) {
 
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
 
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -714,7 +714,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+                const std::string token_str = common_token_to_piece(ctx, id, params.special);
 
                 // Console/Stream Output
                 LOG("%s", token_str.c_str());
@@ -743,7 +743,7 @@ int main(int argc, char ** argv) {
             // check for reverse prompt in the last n_prev tokens
             if (!params.antiprompt.empty()) {
                 const int n_prev = 32;
-                const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
+                const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
 
                 is_antiprompt = false;
                 // Check if each of the reverse prompts appears at the end of the output.
@@ -765,7 +765,7 @@ int main(int argc, char ** argv) {
                 }
 
                 // check for reverse prompt using special tokens
-                llama_token last_token = gpt_sampler_last(smpl);
+                llama_token last_token = common_sampler_last(smpl);
                 for (std::vector<llama_token> ids : antiprompt_ids) {
                     if (ids.size() == 1 && last_token == ids[0]) {
                         if (params.interactive) {
@@ -782,13 +782,13 @@ int main(int argc, char ** argv) {
             }
 
             // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            if (llama_token_is_eog(model, common_sampler_last(smpl))) {
                 LOG_DBG("found an EOG token\n");
 
                 if (params.interactive) {
                     if (!params.antiprompt.empty()) {
                         // tokenize and inject first reverse prompt
-                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+                        const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
                         embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                         is_antiprompt = true;
                     }
@@ -803,8 +803,8 @@ int main(int argc, char ** argv) {
 
             // if current token is not EOG, we add it to current assistant message
             if (params.conversation) {
-                const auto id = gpt_sampler_last(smpl);
-                assistant_ss << llama_token_to_piece(ctx, id, false);
+                const auto id = common_sampler_last(smpl);
+                assistant_ss << common_token_to_piece(ctx, id, false);
             }
 
             if (n_past > 0 && is_interacting) {
@@ -862,9 +862,9 @@ int main(int argc, char ** argv) {
                         ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                         : std::move(buffer);
                     // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
-                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
-                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                    const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+                    const auto line_inp = common_tokenize(ctx, user_inp,            false, format_chat);
+                    const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
 
                     LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
@@ -882,7 +882,7 @@ int main(int argc, char ** argv) {
                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
                         const llama_token token = embd_inp[i];
                         output_tokens.push_back(token);
-                        output_ss << llama_token_to_piece(ctx, token);
+                        output_ss << common_token_to_piece(ctx, token);
                     }
 
                     // reset assistant message
@@ -899,7 +899,7 @@ int main(int argc, char ** argv) {
 
             if (n_past > 0) {
                 if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    common_sampler_reset(smpl);
                 }
                 is_interacting = false;
             }
@@ -925,10 +925,10 @@ int main(int argc, char ** argv) {
     }
 
     LOG("\n\n");
-    gpt_perf_print(ctx, smpl);
+    common_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 81e2f7ed7..20274c147 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -54,7 +54,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
     ~client() {
         if (smpl) {
-            gpt_sampler_free(smpl);
+            common_sampler_free(smpl);
         }
     }
 
@@ -75,7 +75,7 @@ struct client {
     std::string prompt;
     std::string response;
 
-    struct gpt_sampler * smpl = nullptr;
+    struct common_sampler * smpl = nullptr;
 };
 
 static void print_date_time() {
@@ -103,13 +103,13 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 int main(int argc, char ** argv) {
     srand(1234);
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // number of simultaneous "clients" to simulate
     const int32_t n_clients = params.n_parallel;
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the target model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -160,11 +160,11 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.smpl = gpt_sampler_init(model, params.sparams);
+        client.smpl = common_sampler_init(model, params.sparams);
     }
 
     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = common_tokenize(ctx, k_system, true);
     const int32_t n_tokens_system = tokens_system.size();
 
     llama_seq_id g_seq_id = 0;
@@ -189,7 +189,7 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
         for (int32_t i = 0; i < n_tokens_system; ++i) {
-            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
+            common_batch_add(batch, tokens_system[i], i, { 0 }, false);
         }
 
         if (llama_decode(ctx, batch) != 0) {
@@ -210,10 +210,10 @@ int main(int argc, char ** argv) {
     while (true) {
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // decode any currently ongoing sequences
         for (auto & client : clients) {
@@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
 
             client.i_batch = batch.n_tokens;
 
-            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
 
             client.n_decoded += 1;
         }
@@ -252,14 +252,14 @@ int main(int argc, char ** argv) {
                     client.prompt   = client.input + "\nAssistant:";
                     client.response = "";
 
-                    gpt_sampler_reset(client.smpl);
+                    common_sampler_reset(client.smpl);
 
                     // do not prepend BOS because we have a system prompt!
                     std::vector<llama_token> tokens_prompt;
-                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+                    tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
                     for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                     }
 
                     // extract the logits only for the last token
@@ -340,9 +340,9 @@ int main(int argc, char ** argv) {
                 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                 //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-                const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
+                const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
 
-                gpt_sampler_accept(client.smpl, id, true);
+                common_sampler_accept(client.smpl, id, true);
 
                 if (client.n_decoded == 1) {
                     // start measuring generation time after the first token to make sure all concurrent clients
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
                     client.t_start_gen = ggml_time_us();
                 }
 
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
 
                 client.response += token_str;
                 client.sampled = id;
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 7ef8d14f3..09bba708f 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.n_junk = 250;
     params.n_keep = 32;
     params.i_pos  = -1;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     int n_junk = params.n_junk;
     int n_keep = params.n_keep;
@@ -61,7 +61,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -72,7 +72,7 @@ int main(int argc, char ** argv) {
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
 
@@ -92,10 +92,10 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = common_tokenize(ctx, params.prompt, true);
 
     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size();
 
     const int n_tokens_all = tokens_list.size();
 
@@ -137,10 +137,10 @@ int main(int argc, char ** argv) {
             n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
         }
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+            common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
         }
 
         if (i + n_batch >= n_tokens_all) {
@@ -171,10 +171,10 @@ int main(int argc, char ** argv) {
 
         n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+            common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
         }
 
         if (i + n_batch >= n_tokens_all) {
@@ -229,15 +229,15 @@ int main(int argc, char ** argv) {
                 break;
             }
 
-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
 
             n_decode += 1;
 
             // prepare the next batch
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
             // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
+            common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
         }
 
         n_cur += 1;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 40bc29f7a..efb41b80a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -35,7 +35,7 @@ struct results_log_softmax {
 };
 
 static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const llama_context * ctx, const common_params & params, const llama_model * model,
     const struct results_perplexity & results
 ) {
     if (params.logdir.empty()) {
@@ -339,7 +339,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
     }
 }
 
-static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
     // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
@@ -350,7 +350,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
 
@@ -474,7 +474,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
 
-static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) {
+static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
     if (params.ppl_stride > 0) {
         return perplexity_v2(ctx, params);
     }
@@ -502,7 +502,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -772,7 +772,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     }
 }
 
-static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+static void hellaswag_score(llama_context * ctx, const common_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -853,7 +853,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+            hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
         }
 
         // determine the common prefix of the endings
@@ -910,7 +910,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         size_t i1 = i0;
         size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // batch as much tasks as possible into the available context
         // each task has 4 unique sequence ids - one for each ending
@@ -926,7 +926,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             }
 
             for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
-                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
+                common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
             }
             batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
             n_logits += 1;
@@ -936,7 +936,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 // TODO: don't evaluate the last token of each sequence
                 for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
                     const bool needs_logits = i < seq_tokens_size - 1;
-                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
                     n_logits += needs_logits;
                 }
             }
@@ -1112,7 +1112,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  *    0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
  *
  */
-static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+static void winogrande_score(llama_context * ctx, const common_params & params) {
 
     constexpr int k_min_trailing_ctx = 3;
 
@@ -1146,8 +1146,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     LOG_INF("%s : tokenizing selected tasks\n", __func__);
 
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+        task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
 
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1162,8 +1162,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
 
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+        task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
     }
 
     LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1195,7 +1195,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         size_t i1 = i0;
         size_t i_logits = 0;
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
             int n_logits = 0;
@@ -1205,7 +1205,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             }
 
             for (size_t i = 0; i < data[i1].common_prefix; ++i) {
-                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
+                common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
             }
             batch.logits[batch.n_tokens - 1] = true;
             n_logits += 1;
@@ -1213,7 +1213,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             for (int s = 0; s < 2; ++s) {
                 // TODO: end before the last token, no need to predict past the end of the sequences
                 for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+                    common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
                     n_logits += 1;
                 }
             }
@@ -1370,7 +1370,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+        task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {
@@ -1414,7 +1414,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
 //     git@hf.co:datasets/Stevross/mmlu
 //     https://huggingface.co/datasets/truthful_qa
 //
-static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
+static void multiple_choice_score(llama_context * ctx, const common_params & params) {
 
     std::istringstream strstream(params.prompt);
     uint32_t n_task;
@@ -1548,7 +1548,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         size_t i1 = i0;
         size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // batch as much tasks as possible into the available context
         // each task has 4 unique sequence ids - one for each ending
@@ -1571,7 +1571,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
             for (size_t i = 0; i < cur_task.common_prefix; ++i) {
                 //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
-                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
+                common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
             }
             batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
             n_logits += 1;
@@ -1581,7 +1581,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 // TODO: don't evaluate the last token of each sequence
                 for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
                     const bool needs_logits = i < seq_tokens_size - 1;
-                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
                     n_logits += needs_logits;
                 }
             }
@@ -1695,7 +1695,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     LOG_INF("\n");
 }
 
-static void kl_divergence(llama_context * ctx, const gpt_params & params) {
+static void kl_divergence(llama_context * ctx, const common_params & params) {
     if (params.logits_file.empty()) {
         LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
         return;
@@ -1968,17 +1968,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     const int32_t n_ctx = params.n_ctx;
 
@@ -2017,7 +2017,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model and apply lora adapter, if any
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -2036,7 +2036,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     struct results_perplexity results;
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 5971690f1..1768aae51 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -77,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
@@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-        llama_embd_normalize(embd, out, n_embd);
+        common_embd_normalize(embd, out, n_embd);
     }
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
@@ -149,7 +149,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     // max batch size
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompts and trim
     for (auto & chunk : chunks) {
-        auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+        auto inp = common_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
             LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {
             LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
             LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
             }
             LOG_INF("\n\n");
         }
@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
             batch_decode(ctx, batch, out, s, n_embd);
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
             p += s;
             s = 0;
         }
@@ -260,20 +260,20 @@ int main(int argc, char ** argv) {
     while (true) {
         LOG("Enter query: ");
         std::getline(std::cin, query);
-        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+        std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);
 
         batch_add_seq(query_batch, query_tokens, 0);
 
         std::vector<float> query_emb(n_embd, 0);
         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
 
-        llama_batch_clear(query_batch);
+        common_batch_clear(query_batch);
 
         // compute cosine similarities
         {
             std::vector<std::pair<int, float>> similarities;
             for (int i = 0; i < n_chunks; i++) {
-                float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
                 similarities.push_back(std::make_pair(i, sim));
             }
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 0117d9357..3866cfa27 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -6,12 +6,12 @@
 #include <cstdio>
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.prompt = "The quick brown fox";
     params.sparams.seed = 1234;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
     std::string result2;
 
     // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
 
     // tokenize prompt
-    auto tokens = llama_tokenize(ctx, params.prompt, true);
+    auto tokens = common_tokenize(ctx, params.prompt, true);
 
     // evaluate prompt
     llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
@@ -72,7 +72,7 @@ int main(int argc, char ** argv) {
 
     for (auto i = 0; i < params.n_predict; i++) {
         auto next_token     = llama_sampler_sample(smpl, ctx, -1);
-        auto next_token_str = llama_token_to_piece(ctx, next_token);
+        auto next_token_str = common_token_to_piece(ctx, next_token);
 
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
 
     // make new context
-    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+    auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
     // second run
     for (auto i = 0; i < params.n_predict; i++) {
         auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
-        auto next_token_str = llama_token_to_piece(ctx2, next_token);
+        auto next_token_str = common_token_to_piece(ctx2, next_token);
 
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
     }
 
     // make new context
-    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+    auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
@@ -216,7 +216,7 @@ int main(int argc, char ** argv) {
     // third run with seq 1 instead of 0
     for (auto i = 0; i < params.n_predict; i++) {
         auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
-        auto next_token_str = llama_token_to_piece(ctx3, next_token);
+        auto next_token_str = common_token_to_piece(ctx3, next_token);
 
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index aedfca0d6..2e1d24189 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -188,8 +188,8 @@ struct server_slot {
     // sampling
     json json_schema;
 
-    struct gpt_sampler_params sparams;
-    struct gpt_sampler * smpl = nullptr;
+    struct common_sampler_params sparams;
+    struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
 
@@ -231,7 +231,7 @@ struct server_slot {
         generated_token_probs.clear();
     }
 
-    bool has_budget(gpt_params &global_params) {
+    bool has_budget(common_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -611,9 +611,9 @@ struct server_response {
 struct server_context {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    std::vector<llama_lora_adapter_container> loras;
+    std::vector<common_lora_adapter_container> loras;
 
-    gpt_params params;
+    common_params params;
 
     llama_batch batch = {};
 
@@ -655,20 +655,20 @@ struct server_context {
         // Clear any sampling context
         for (server_slot & slot : slots) {
             if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+                common_sampler_free(slot.smpl);
             }
         }
 
         llama_batch_free(batch);
     }
 
-    bool load_model(const gpt_params & params_) {
+    bool load_model(const common_params & params_) {
         params = params_;
 
         // dedicate one sequence to the system prompt
         params.n_parallel += 1;
 
-        llama_init_result llama_init = llama_init_from_gpt_params(params);
+        common_init_result llama_init = common_init_from_params(params);
 
         model = llama_init.model;
         ctx   = llama_init.context;
@@ -771,10 +771,10 @@ struct server_context {
 
                     std::vector<llama_token> p;
                     if (first) {
-                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                         first = false;
                     } else {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
 
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -788,7 +788,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -999,7 +999,7 @@ struct server_context {
                                 slot.sparams.logit_bias.push_back({tok, bias});
                             }
                         } else if (el[0].is_string()) {
-                            auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                            auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                             for (auto tok : toks) {
                                 slot.sparams.logit_bias.push_back({tok, bias});
                             }
@@ -1031,7 +1031,7 @@ struct server_context {
                         sampler_names.emplace_back(name);
                     }
                 }
-                slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+                slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
             } else {
                 slot.sparams.samplers = default_sparams.samplers;
             }
@@ -1039,10 +1039,10 @@ struct server_context {
 
         {
             if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+                common_sampler_free(slot.smpl);
             }
 
-            slot.smpl = gpt_sampler_init(model, slot.sparams);
+            slot.smpl = common_sampler_init(model, slot.sparams);
             if (slot.smpl == nullptr) {
                 // for now, the only error that may happen here is invalid grammar
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@@ -1073,7 +1073,7 @@ struct server_context {
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+            system_tokens = common_tokenize(ctx, system_prompt, true);
 
             const int32_t n_batch = llama_n_batch(ctx);
             const int32_t n_tokens_prompt = system_tokens.size();
@@ -1081,10 +1081,10 @@ struct server_context {
             for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
                 const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
 
-                llama_batch_clear(batch);
+                common_batch_clear(batch);
 
                 for (int32_t j = 0; j < n_tokens; ++j) {
-                    llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
+                    common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
                 }
 
                 if (llama_decode(ctx, batch) != 0) {
@@ -1113,7 +1113,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
         slot.sampled = result.tok;
 
         // search stop word and delete it
@@ -1224,7 +1224,7 @@ struct server_context {
         std::vector<std::string> samplers;
         samplers.reserve(slot.sparams.samplers.size());
         for (const auto & sampler : slot.sparams.samplers) {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
         }
 
         return json {
@@ -1232,7 +1232,7 @@ struct server_context {
             {"n_predict",                 slot.n_predict},     // Server configured n_predict
             {"model",                     params.model_alias},
             {"seed",                      slot.sparams.seed},
-            {"seed_cur",                  slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
+            {"seed_cur",                  slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
             {"temperature",               slot.sparams.temp},
             {"dynatemp_range",            slot.sparams.dynatemp_range},
             {"dynatemp_exponent",         slot.sparams.dynatemp_exponent},
@@ -1297,7 +1297,7 @@ struct server_context {
         };
 
         if (slot.sparams.n_probs > 0) {
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
 
@@ -1347,7 +1347,7 @@ struct server_context {
         if (slot.sparams.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
 
                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(
@@ -1401,7 +1401,7 @@ struct server_context {
                 continue;
             }
 
-            llama_embd_normalize(embd, embd_res.data(), n_embd);
+            common_embd_normalize(embd, embd_res.data(), n_embd);
 
             res.data = json {
                 {"embedding", embd_res},
@@ -1835,7 +1835,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    llama_lora_adapters_apply(ctx, loras);
+                    common_lora_adapters_apply(ctx, loras);
                     server_task_result result;
                     result.id = task.id;
                     result.stop = true;
@@ -1921,7 +1921,7 @@ struct server_context {
         }
 
         // start populating the batch for this iteration
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // frist, add sampled tokens from any ongoing sequences
         for (auto & slot : slots) {
@@ -1935,7 +1935,7 @@ struct server_context {
 
             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
 
             slot.n_past += 1;
 
@@ -2092,7 +2092,7 @@ struct server_context {
                                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                             }
 
-                            gpt_sampler_reset(slot.smpl);
+                            common_sampler_reset(slot.smpl);
 
                             if (!slot.params.cache_prompt) {
                                 slot.n_past_se = 0;
@@ -2105,7 +2105,7 @@ struct server_context {
 
                                 // push the prompt into the sampling context (do not apply grammar)
                                 for (int i = 0; i < slot.n_past; ++i) {
-                                    gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
+                                    common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
                                 }
                             }
                         }
@@ -2159,7 +2159,7 @@ struct server_context {
                         slot.n_past_se = 0;
                         slot.ga_i = 0;
                         // TODO: is the system prompt ever in the sampling context?
-                        gpt_sampler_reset(slot.smpl);
+                        common_sampler_reset(slot.smpl);
                     }
 
                     // remove the non-common part from the cache
@@ -2184,7 +2184,7 @@ struct server_context {
                             }
                         }
 
-                        llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                        common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2322,9 +2322,9 @@ struct server_context {
                 }
 
                 completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
-                gpt_sampler_accept(slot.smpl, id, true);
+                common_sampler_accept(slot.smpl, id, true);
 
                 slot.n_decoded += 1;
                 if (slot.n_decoded == 1) {
@@ -2335,7 +2335,7 @@ struct server_context {
 
                 result.tok = id;
 
-                const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
 
                 for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                     result.probs.push_back({
@@ -2399,13 +2399,13 @@ inline void signal_handler(int signal) {
 
 int main(int argc, char ** argv) {
     // own arguments required by this example
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // enabling this will output extra debug information in the HTTP responses from the server
     // see format_final_response_oaicompat()
@@ -2427,7 +2427,7 @@ int main(int argc, char ** argv) {
 
     LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
     LOG_INF("\n");
-    LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     LOG_INF("\n");
 
     std::unique_ptr<httplib::Server> svr;
@@ -3014,7 +3014,7 @@ int main(int argc, char ** argv) {
 
             if (with_pieces) {
                 for (const auto& token : tokens) {
-                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    std::string piece = common_token_to_piece(ctx_server.ctx, token);
                     json piece_json;
 
                     // Check if the piece is valid UTF-8
@@ -3357,7 +3357,7 @@ int main(int argc, char ** argv) {
     }
 
     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
     ctx_server.queue_tasks.on_new_task(std::bind(
                 &server_context::process_single_task, &ctx_server, std::placeholders::_1));
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 452606cca..ad99e9574 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -57,7 +57,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    std::vector<llama_chat_msg> chat;
+    std::vector<common_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
@@ -84,7 +84,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
         chat.push_back({role, content});
     }
 
-    const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
     LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
 
     return formatted_chat;
@@ -246,7 +246,7 @@ template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     std::string ret;
     for (; begin != end; ++begin) {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
     }
 
     return ret;
@@ -254,7 +254,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
 
     // if the size is 1 and first bit is 1, meaning it's a partial character
     //   (size > 1 meaning it's already a known token)
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index adf6255e1..5a7b3084f 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -26,20 +26,20 @@ struct seq_draft {
     std::vector<llama_token> tokens;
     std::vector<std::vector<llama_token_data>> dists;
 
-    struct gpt_sampler * smpl = nullptr;
+    struct common_sampler * smpl = nullptr;
 };
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     // needed to get candidate probs even for temp <= 0.0
     params.sparams.n_probs = 128;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     if (params.model_draft.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;
 
     // load the target model
-    llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+    common_init_result llama_init_tgt = common_init_from_params(params);
     model_tgt = llama_init_tgt.model;
     ctx_tgt = llama_init_tgt.context;
 
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
     }
 
     params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
-    llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+    common_init_result llama_init_dft = common_init_from_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;
 
@@ -124,8 +124,8 @@ int main(int argc, char ** argv) {
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
                 LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
                 LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
-                        llama_token_to_piece(ctx_tgt, i).c_str(),
-                        llama_token_to_piece(ctx_dft, i).c_str());
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
                 return 1;
             }
         }
@@ -134,7 +134,7 @@ int main(int argc, char ** argv) {
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
 
     const int max_context_size     = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
     LOG("\n\n");
 
     for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
+        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
     }
 
     const int n_input = inp.size();
@@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
     bool has_eos = false;
 
     // target model sampling context (reuse the llama_context's sampling instance)
-    struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
 
     struct llama_sampler * softmax = llama_sampler_init_softmax();
 
@@ -186,8 +186,8 @@ int main(int argc, char ** argv) {
     std::vector<seq_draft> drafts(n_seq_dft);
 
     for (int s = 0; s < n_seq_dft; ++s) {
-        // allocate gpt_sampler for each draft sequence
-        drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
+        // allocate llama_sampler for each draft sequence
+        drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
     }
 
     llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -229,9 +229,9 @@ int main(int argc, char ** argv) {
                 bool accept = false;
                 if (params.sparams.temp > 0) {
                     // stochastic verification
-                    gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
+                    common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
 
-                    auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
+                    auto & dist_tgt = *common_sampler_get_candidates(smpl);
 
                     float p_tgt = 0.0f;
                     float p_dft = 0.0f;
@@ -277,13 +277,13 @@ int main(int argc, char ** argv) {
                             s_keep = s;
                             accept = true;
                             token_id = drafts[s].tokens[i_dft];
-                            token_str = llama_token_to_piece(ctx_tgt, token_id);
-                            gpt_sampler_accept(smpl, token_id, true);
+                            token_str = common_token_to_piece(ctx_tgt, token_id);
+                            common_sampler_accept(smpl, token_id, true);
 
                             LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                             break;
                         } else {
-                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
                             drafts[s].active = false;
 
                             // calculate residual probability
@@ -349,19 +349,19 @@ int main(int argc, char ** argv) {
                         const int idx = dist(rng);
 
                         token_id = dist_tgt.data[idx].id;
-                        gpt_sampler_accept(smpl, token_id, true);
-                        token_str = llama_token_to_piece(ctx_tgt, token_id);
+                        common_sampler_accept(smpl, token_id, true);
+                        token_str = common_token_to_piece(ctx_tgt, token_id);
                     }
                 } else {
                     // greedy verification
 
                     // sample from the target model
                     LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-                    token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
+                    token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
 
-                    gpt_sampler_accept(smpl, token_id, true);
+                    common_sampler_accept(smpl, token_id, true);
 
-                    token_str = llama_token_to_piece(ctx_tgt, token_id);
+                    token_str = common_token_to_piece(ctx_tgt, token_id);
 
                     for (int s = 0; s < n_seq_dft; ++s) {
                         if (!drafts[s].active) {
@@ -431,8 +431,8 @@ int main(int argc, char ** argv) {
             drafts[0].dists.push_back(std::vector<llama_token_data>());
             drafts[0].i_batch_tgt.push_back(0);
 
-            llama_batch_clear(batch_dft);
-            llama_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
+            common_batch_clear(batch_dft);
+            common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
 
             llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
             // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
@@ -446,9 +446,9 @@ int main(int argc, char ** argv) {
         }
 
         if (drafts[0].smpl) {
-            gpt_sampler_free(drafts[0].smpl);
+            common_sampler_free(drafts[0].smpl);
         }
-        drafts[0].smpl = gpt_sampler_clone(smpl);
+        drafts[0].smpl = common_sampler_clone(smpl);
 
         int n_seq_cur  = 1;
         int n_past_cur = n_past_dft;
@@ -461,8 +461,8 @@ int main(int argc, char ** argv) {
         drafts[0].drafting    = true;
         drafts[0].i_batch_dft = 0;
 
-        llama_batch_clear(batch_tgt);
-        llama_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
+        common_batch_clear(batch_tgt);
+        common_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
 
         // sample n_draft tokens from the draft model using tree-based sampling
         for (int i = 0; i < n_draft; ++i) {
@@ -477,13 +477,13 @@ int main(int argc, char ** argv) {
                     continue;
                 }
 
-                gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+                common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
 
-                const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
+                const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
 
                 for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
                     LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                 }
 
                 std::vector<int> sa(1, s);
@@ -518,9 +518,9 @@ int main(int argc, char ** argv) {
                         drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
 
                         if (drafts[n_seq_cur].smpl) {
-                            gpt_sampler_free(drafts[n_seq_cur].smpl);
+                            common_sampler_free(drafts[n_seq_cur].smpl);
                         }
-                        drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
+                        drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);
 
                         sa.push_back(n_seq_cur);
 
@@ -536,7 +536,7 @@ int main(int argc, char ** argv) {
 
                     const int s = sa[is];
 
-                    gpt_sampler_accept(drafts[s].smpl, id, true);
+                    common_sampler_accept(drafts[s].smpl, id, true);
 
                     drafts[s].tokens.push_back(id);
                     // save cur_p.data into drafts[s].dists
@@ -545,12 +545,12 @@ int main(int argc, char ** argv) {
                     // add unique drafted tokens to the target batch
                     drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
 
-                    llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+                    common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
 
                     // add the token to the batch for batched decoding with the draft model
                     drafts[s].i_batch_dft = batch_dft.n_tokens;
 
-                    llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
+                    common_batch_add(batch_dft, id, n_past_cur, { s }, true);
 
                     if (batch_tgt.n_tokens > n_draft) {
                         drafts[s].drafting = false;
@@ -617,11 +617,11 @@ int main(int argc, char ** argv) {
 
     LOG_INF("\n");
     LOG_INF("target:\n\n");
-    gpt_perf_print(ctx_tgt, smpl);
+    common_perf_print(ctx_tgt, smpl);
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
-        gpt_sampler_free(drafts[s].smpl);
+        common_sampler_free(drafts[s].smpl);
     }
 
     llama_sampler_free(softmax);
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index a9af6471f..12ad54256 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
     const bool parse_special = !no_parse_special;
 
     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
+    tokens = common_tokenize(model, prompt, add_bos, parse_special);
 
     if (printing_ids) {
         printf("[");
@@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
         } else {
             bool invalid_utf8 = false;
             printf("%6d -> '", tokens[i]);
-            write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+            write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
             if (invalid_utf8) {
                 printf("' (utf-8 decode failure)\n");
             } else {
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index e07d09733..3665238b5 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -10,12 +10,12 @@
 #include <cassert>
 
 int main(void) {
-    gpt_params params;
+    common_params params;
 
     printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
-            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
             std::unordered_set<std::string> seen_args;
             std::unordered_set<std::string> seen_env_vars;
             for (const auto & opt : ctx_arg.options) {
@@ -58,44 +58,44 @@ int main(void) {
 
     // missing value
     argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // wrong value (int)
     argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // wrong value (enum)
     argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
     argv = {"binary_name", "--draft", "123"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
 
 
     printf("test-arg-parser: test valid usage\n\n");
 
     argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.cpuparams.n_threads == 1234);
 
     argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.verbosity > 1);
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
     // --draft cannot be used outside llama-speculative
     argv = {"binary_name", "--draft", "123"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
     assert(params.n_draft == 123);
 
 // skip this part on windows, because setenv is not supported
@@ -106,12 +106,12 @@ int main(void) {
 
     setenv("LLAMA_ARG_THREADS", "blah", true);
     argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 
@@ -121,7 +121,7 @@ int main(void) {
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index a8222caee..6f046249f 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -140,11 +140,11 @@ int main(void) {
 
     // test llama_chat_format_single for system message
     printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
-    std::vector<llama_chat_msg> chat2;
-    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
+    std::vector<common_chat_msg> chat2;
+    common_chat_msg sys_msg{"system", "You are a helpful assistant"};
 
     auto fmt_sys = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
+        auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
         printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
         printf("-------------------------\n");
         return output;
@@ -160,10 +160,10 @@ int main(void) {
     chat2.push_back({"system", "You are a helpful assistant"});
     chat2.push_back({"user", "Hello"});
     chat2.push_back({"assistant", "I am assistant"});
-    llama_chat_msg new_msg{"user", "How are you"};
+    common_chat_msg new_msg{"user", "How are you"};
 
     auto fmt_single = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
+        auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
         printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
         printf("-------------------------\n");
         return output;
diff --git a/tests/test-log.cpp b/tests/test-log.cpp
index 211222369..306f28c61 100644
--- a/tests/test-log.cpp
+++ b/tests/test-log.cpp
@@ -24,8 +24,8 @@ int main() {
                 }
 
                 if (rand () % 10 < 5) {
-                    gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
-                    gpt_log_set_prefix    (gpt_log_main(), rand() % 2);
+                    common_log_set_timestamps(common_log_main(), rand() % 2);
+                    common_log_set_prefix    (common_log_main(), rand() % 2);
                 }
             }
         });
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 4d49850c9..0af85f002 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -202,7 +202,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < nthread; i++) {
         threads[i] = std::thread([&, i]() {
             for (const auto & test_kv : k_tests) {
-                const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+                const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
 
                 // here only print the result of the first thread
                 // because the other threads are running the same tests
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
 
                 printf("\n");
                 printf("src: '%s'\n", test_kv.first.c_str());
-                printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+                printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
                 printf("tok: ");
                 for (const auto & tok : res) {
                     printf("%d ", tok);
@@ -229,16 +229,16 @@ int main(int argc, char **argv) {
                 if (!correct) {
                     fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
                     fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                        llama_detokenize(ctx, res).c_str(),
-                        llama_detokenize(ctx, test_kv.second).c_str());
+                        common_detokenize(ctx, res).c_str(),
+                        common_detokenize(ctx, test_kv.second).c_str());
                     fprintf(stderr, "%s : expected tokens: ", __func__);
                     for (const auto & t : test_kv.second) {
-                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
                     }
                     fprintf(stderr, "\n");
                     fprintf(stderr, "%s : got tokens:      ", __func__);
                     for (const auto & t : res) {
-                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
                     }
                     fprintf(stderr, "\n");
 
@@ -273,7 +273,7 @@ int main(int argc, char **argv) {
         {
             const auto t_start = ggml_time_us();
 
-            res = llama_tokenize(ctx, text, add_special, false);
+            res = common_tokenize(ctx, text, add_special, false);
 
             const auto t_end = ggml_time_us();
 
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 9498387e0..0ff7fc833 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -78,10 +78,10 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
             if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                         "%s : error: token %d detokenizes to '%s'(%zu) but "
@@ -94,7 +94,7 @@ int main(int argc, char **argv) {
                 fprintf(stderr, "]\n");
                 return 2;
             }
-            std::string check = llama_detokenize(ctx, tokens);
+            std::string check = common_detokenize(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                     __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -123,8 +123,8 @@ int main(int argc, char **argv) {
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize(ctx, tokens);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
                         fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());
diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp
index 7ca9e2ca6..9b0716a43 100644
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-        std::string check = llama_detokenize(ctx, tokens);
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-                    std::string check = llama_detokenize(ctx, tokens);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
                         fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());

From 96776405a17034dcfd53d3ddf5d142d34bdbb657 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 11 Oct 2024 15:34:45 +0200
Subject: [PATCH 038/396] ggml : move more prints to the ggml log system
 (#9839)

* ggml : move more prints to the ggml log system

* show BLAS OpenMP warnings in all builds using debug print
---
 ggml/src/ggml-alloc.c     | 34 +++++++++++++++++-----------------
 ggml/src/ggml-backend.cpp | 32 ++++++++++++++++----------------
 ggml/src/ggml-blas.cpp    |  8 ++++----
 ggml/src/ggml-cuda.cu     | 22 +++++++++++-----------
 4 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 70187b9b6..28548fbbb 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -14,7 +14,7 @@
 
 //#define GGML_ALLOCATOR_DEBUG
 
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)
 
 
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     size = GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         GGML_ABORT("not enough space in the buffer");
     }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
             // this should never happen
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
             GGML_ABORT("not enough space in the buffer");
         }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
                 }
             }
         }
-        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                     alloc->allocated_tensors[i].offset,
                     alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                     ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
-        fprintf(stderr, "\n");
+        GGML_LOG_DEBUG("\n");
     }
 #endif
 
@@ -768,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
             ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +825,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
 static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
         return true;
     }
 
     if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
         return true;
     }
@@ -843,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
 
         if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
             return true;
         }
@@ -855,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
                 return true;
             }
@@ -869,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
             if (!ggml_gallocr_reserve(galloc, graph)) {
                 return false;
             }
         } else {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
             return false;
         }
@@ -940,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+        GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
             ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
         }
 
         if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                     __func__, t->name,
                     ggml_backend_buft_name(buft),
                     this_size, max_size);
@@ -1022,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     if (n_buffers == 0) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
         return NULL;
     }
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fb1d3ead3..15d650150 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
     } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -571,7 +571,7 @@ struct ggml_backend_registry {
 
     void register_backend(ggml_backend_reg_t reg) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
         backends.push_back(reg);
@@ -582,7 +582,7 @@ struct ggml_backend_registry {
 
     void register_device(ggml_backend_dev_t device) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
         devices.push_back(device);
     }
@@ -773,7 +773,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
     if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
 
@@ -836,7 +836,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {
-        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
         return NULL;
     }
 
@@ -1459,7 +1459,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     }
 
 #ifndef NDEBUG
-    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
 #endif
 
@@ -1548,13 +1548,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
-            fprintf(stderr, "\n");
+            GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct ggml_tensor * node = graph->nodes[i];
@@ -1562,7 +1562,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             continue;
         }
         ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+        GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1570,10 +1570,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
                 continue;
             }
             ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+            GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
-        fprintf(stderr, "\n");
+        GGML_LOG_DEBUG("\n");
     }
 }
 
@@ -2087,11 +2087,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -2485,7 +2485,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_unallocated = ggml_init(params);
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-        fprintf(stderr, "failed to allocate context for graph copy\n");
+        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2508,7 +2508,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index 55f724586..7875ec86d 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) {
         /* .context   = */ ctx,
     };
 
-#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
     if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-        fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
     }
 #endif
 
-#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
-    fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
 #endif
 
     return backend;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index edb61abdf..1338bd458 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 
     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-        GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif
         return false;
     }
@@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
@@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 if (!ptr) {
                     use_cuda_graph = false;
 #ifndef NDEBUG
-                    GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+                    GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
                 } else {
                     if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();
 
-        GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }

From 943d20b4111c746bcd9dbc7e4771de313b08b50c Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Sat, 12 Oct 2024 13:09:53 +0800
Subject: [PATCH 039/396] musa : update doc (#9856)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 README.md     | 4 ++--
 docs/build.md | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 41e5e5448..dd4927b04 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ variety of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
@@ -413,7 +413,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
diff --git a/docs/build.md b/docs/build.md
index faa0ecfa4..4e362ebc7 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance:
 
 ### MUSA
 
+This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
 - Using `make`:
   ```bash
   make GGML_MUSA=1
@@ -209,6 +211,12 @@ The following compilation options are also available to tweak performance:
   cmake --build build --config Release
   ```
 
+The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
 ### hipBLAS
 
 This provides BLAS acceleration on HIP-supported AMD GPUs.

From 11ac9800aff532715a5bc7991062c68ba3472e6e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 12 Oct 2024 08:21:51 +0300
Subject: [PATCH 040/396] llama : improve infill support and special token
 detection (#9798)

* llama : improve infill support

ggml-ci

* llama : add more FIM token strings

ggml-ci

* server : update prompt on slot restore (#9800)

* gguf : deprecate old FIM token KVs
---
 common/arg.cpp              | 248 +++++++++-----------
 common/common.cpp           |  18 +-
 common/common.h             |  19 +-
 examples/infill/infill.cpp  |  14 +-
 examples/server/README.md   |   2 +-
 examples/server/server.cpp  | 150 ++++++------
 gguf-py/gguf/constants.py   |  26 ++-
 gguf-py/gguf/gguf_writer.py |   9 -
 include/llama.h             |  17 +-
 src/llama-vocab.cpp         |  38 ++-
 src/llama-vocab.h           |  35 ++-
 src/llama.cpp               | 452 ++++++++++++++++++++++--------------
 12 files changed, 601 insertions(+), 427 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 6014f5d8a..c4229a3a4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -119,32 +119,6 @@ std::string common_arg::to_string() {
 // utils
 //
 
-#ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
-#endif
-
-LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
-static std::string format(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 static void common_params_handle_model_default(common_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -199,7 +173,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
                     continue;
                 }
             } catch (std::exception & e) {
-                throw std::invalid_argument(format(
+                throw std::invalid_argument(string_format(
                     "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
             }
         }
@@ -220,7 +194,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
         if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
+            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
         auto opt = *arg_to_options[arg];
         if (opt.has_value_from_env()) {
@@ -252,7 +226,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
                 continue;
             }
         } catch (std::exception & e) {
-            throw std::invalid_argument(format(
+            throw std::invalid_argument(string_format(
                 "error while handling argument \"%s\": %s\n\n"
                 "usage:\n%s\n\nto show complete usage, run with -h",
                 arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
@@ -391,28 +365,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--verbose-prompt"},
-        format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
+        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
         [](common_params & params) {
             params.verbose_prompt = true;
         }
     ));
     add_opt(common_arg(
         {"--no-display-prompt"},
-        format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
+        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
         [](common_params & params) {
             params.display_prompt = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-co", "--color"},
-        format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
+        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
         [](common_params & params) {
             params.use_color = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
-        format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+        string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
         [](common_params & params, int value) {
             params.cpuparams.n_threads = value;
             if (params.cpuparams.n_threads <= 0) {
@@ -472,14 +446,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--cpu-strict"}, "<0|1>",
-        format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
+        string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
         [](common_params & params, const std::string & value) {
             params.cpuparams.strict_cpu = std::stoul(value);
         }
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
@@ -489,7 +463,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--poll"}, "<0...100>",
-        format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
+        string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
         [](common_params & params, const std::string & value) {
             params.cpuparams.poll = std::stoul(value);
         }
@@ -523,7 +497,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio-batch"}, "N",
-        format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
         [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
@@ -567,7 +541,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"--prio-draft"}, "N",
-        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
         [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
@@ -611,7 +585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"--prio-batch-draft"}, "N",
-        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
         [](common_params & params, int prio) {
             if (prio < 0 || prio > 3) {
                 throw std::invalid_argument("invalid value");
@@ -628,14 +602,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"--draft"}, "N",
-        format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
+        string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
         [](common_params & params, int value) {
             params.n_draft = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-ps", "--p-split"}, "N",
-        format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
+        string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
         [](common_params & params, const std::string & value) {
             params.p_split = std::stof(value);
         }
@@ -656,56 +630,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-c", "--ctx-size"}, "N",
-        format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
         [](common_params & params, int value) {
             params.n_ctx = value;
         }
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }
     ).set_env("LLAMA_ARG_N_PREDICT"));
     add_opt(common_arg(
         {"-b", "--batch-size"}, "N",
-        format("logical maximum batch size (default: %d)", params.n_batch),
+        string_format("logical maximum batch size (default: %d)", params.n_batch),
         [](common_params & params, int value) {
             params.n_batch = value;
         }
     ).set_env("LLAMA_ARG_BATCH"));
     add_opt(common_arg(
         {"-ub", "--ubatch-size"}, "N",
-        format("physical maximum batch size (default: %d)", params.n_ubatch),
+        string_format("physical maximum batch size (default: %d)", params.n_ubatch),
         [](common_params & params, int value) {
             params.n_ubatch = value;
         }
     ).set_env("LLAMA_ARG_UBATCH"));
     add_opt(common_arg(
         {"--keep"}, "N",
-        format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
+        string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
         [](common_params & params, int value) {
             params.n_keep = value;
         }
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
-        format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
+        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
         [](common_params & params, int value) {
             params.n_chunks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"-fa", "--flash-attn"},
-        format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
+        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
         [](common_params & params) {
             params.flash_attn = true;
         }
@@ -721,7 +695,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-perf"},
-        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
         [](common_params & params) {
             params.no_perf = true;
             params.sparams.no_perf = true;
@@ -733,7 +707,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             // store the external file name in params
             params.prompt_file = value;
@@ -749,7 +723,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             params.in_files.push_back(value);
         }
@@ -760,7 +734,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream file(value, std::ios::binary);
             if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             // store the external file name in params
             params.prompt_file = value;
@@ -772,7 +746,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"-e", "--escape"},
-        format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
         [](common_params & params) {
             params.escape = true;
         }
@@ -786,7 +760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"-ptc", "--print-token-count"}, "N",
-        format("print token count every N tokens (default: %d)", params.n_print),
+        string_format("print token count every N tokens (default: %d)", params.n_print),
         [](common_params & params, int value) {
             params.n_print = value;
         }
@@ -821,14 +795,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-sp", "--special"},
-        format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
+        string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
         [](common_params & params) {
             params.special = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        format(
+        string_format(
             "run in conversation mode:\n"
             "- does not print special tokens and suffix/prefix\n"
             "- interactive mode is also enabled\n"
@@ -841,14 +815,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
-        format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
+        string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
         [](common_params & params) {
             params.interactive = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-if", "--interactive-first"},
-        format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
+        string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
         [](common_params & params) {
             params.interactive_first = true;
         }
@@ -893,7 +867,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--spm-infill"},
-        format(
+        string_format(
             "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
             params.spm_infill ? "enabled" : "disabled"
         ),
@@ -903,7 +877,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
     add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
-        format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split(value, ';');
             params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
@@ -911,14 +885,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
-        format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
         [](common_params & params, const std::string & value) {
             params.sparams.seed = std::stoul(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--sampling-seq"}, "SEQUENCE",
-        format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+        string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sparams.samplers = common_sampler_types_from_chars(value);
         }
@@ -932,14 +906,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--penalize-nl"},
-        format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
+        string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
         [](common_params & params) {
             params.sparams.penalize_nl = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
-        format("temperature (default: %.1f)", (double)params.sparams.temp),
+        string_format("temperature (default: %.1f)", (double)params.sparams.temp),
         [](common_params & params, const std::string & value) {
             params.sparams.temp = std::stof(value);
             params.sparams.temp = std::max(params.sparams.temp, 0.0f);
@@ -947,42 +921,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--top-k"}, "N",
-        format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
         [](common_params & params, int value) {
             params.sparams.top_k = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-p"}, "N",
-        format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
         [](common_params & params, const std::string & value) {
             params.sparams.top_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
-        format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
         [](common_params & params, const std::string & value) {
             params.sparams.min_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--tfs"}, "N",
-        format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
+        string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
         [](common_params & params, const std::string & value) {
             params.sparams.tfs_z = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
-        format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
         [](common_params & params, const std::string & value) {
             params.sparams.typ_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-last-n"}, "N",
-        format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
         [](common_params & params, int value) {
             params.sparams.penalty_last_n = value;
             params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
@@ -990,42 +964,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-        format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
         [](common_params & params, const std::string & value) {
             params.sparams.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--presence-penalty"}, "N",
-        format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
         [](common_params & params, const std::string & value) {
             params.sparams.penalty_present = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-        format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
         [](common_params & params, const std::string & value) {
             params.sparams.penalty_freq = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-        format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
         [](common_params & params, const std::string & value) {
             params.sparams.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-        format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
         [](common_params & params, const std::string & value) {
             params.sparams.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
-        format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+        string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
         "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
         [](common_params & params, int value) {
             params.sparams.mirostat = value;
@@ -1033,14 +1007,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-        format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
         [](common_params & params, const std::string & value) {
             params.sparams.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-        format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
         [](common_params & params, const std::string & value) {
             params.sparams.mirostat_tau = std::stof(value);
         }
@@ -1069,7 +1043,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
-        format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
         [](common_params & params, const std::string & value) {
             params.sparams.grammar = value;
         }
@@ -1080,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             std::copy(
                 std::istreambuf_iterator<char>(file),
@@ -1150,49 +1124,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(common_arg(
         {"--yarn-orig-ctx"}, "N",
-        format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+        string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](common_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
     ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(common_arg(
         {"--yarn-ext-factor"}, "N",
-        format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](common_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(common_arg(
         {"--yarn-attn-factor"}, "N",
-        format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](common_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(common_arg(
         {"--yarn-beta-slow"}, "N",
-        format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](common_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(common_arg(
         {"--yarn-beta-fast"}, "N",
-        format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+        string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](common_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(common_arg(
         {"-gan", "--grp-attn-n"}, "N",
-        format("group-attention factor (default: %d)", params.grp_attn_n),
+        string_format("group-attention factor (default: %d)", params.grp_attn_n),
         [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
-        format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
+        string_format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
@@ -1213,7 +1187,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
         [](common_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_k = value;
@@ -1221,7 +1195,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
         [](common_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_v = value;
@@ -1229,7 +1203,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
         {"--perplexity", "--all-logits"},
-        format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
+        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
         [](common_params & params) {
             params.logits_all = true;
         }
@@ -1243,7 +1217,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag-tasks"}, "N",
-        format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
+        string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
         [](common_params & params, int value) {
             params.hellaswag_tasks = value;
         }
@@ -1257,7 +1231,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--winogrande-tasks"}, "N",
-        format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
+        string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
         [](common_params & params, int value) {
             params.winogrande_tasks = value;
         }
@@ -1271,7 +1245,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--multiple-choice-tasks"}, "N",
-        format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
+        string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
         [](common_params & params, int value) {
             params.multiple_choice_tasks = value;
         }
@@ -1292,42 +1266,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--ppl-stride"}, "N",
-        format("stride for perplexity calculation (default: %d)", params.ppl_stride),
+        string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
         [](common_params & params, int value) {
             params.ppl_stride = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--ppl-output-type"}, "<0|1>",
-        format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
+        string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
         [](common_params & params, int value) {
             params.ppl_output_type = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
         [](common_params & params, const std::string & value) {
             params.defrag_thold = std::stof(value);
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
         {"-np", "--parallel"}, "N",
-        format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+        string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
         [](common_params & params, int value) {
             params.n_parallel = value;
         }
     ).set_env("LLAMA_ARG_N_PARALLEL"));
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
-        format("number of sequences to decode (default: %d)", params.n_sequences),
+        string_format("number of sequences to decode (default: %d)", params.n_sequences),
         [](common_params & params, int value) {
             params.n_sequences = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-cb", "--cont-batching"},
-        format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
         [](common_params & params) {
             params.cont_batching = true;
         }
@@ -1451,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::vector<std::string> split_arg{ it, {} };
             if (split_arg.size() >= llama_max_devices()) {
                 throw std::invalid_argument(
-                    format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
                 );
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -1468,7 +1442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(common_arg(
         {"-mg", "--main-gpu"}, "INDEX",
-        format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
+        string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
         [](common_params & params, int value) {
             params.main_gpu = value;
             if (!llama_supports_gpu_offload()) {
@@ -1478,7 +1452,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(common_arg(
         {"--check-tensors"},
-        format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
+        string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
         [](common_params & params) {
             params.check_tensors = true;
         }
@@ -1489,7 +1463,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
         [](common_params & params, const std::string & value) {
             if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
-                throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
+                throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
             }
         }
     ));
@@ -1543,7 +1517,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
             ? std::string("model path from which to load base model")
-            : format(
+            : string_format(
                 "model path (default: `models/$filename` with filename from `--hf-file` "
                 "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
             ),
@@ -1592,42 +1566,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream file(value, std::ios::binary);
             if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             params.context_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--chunk-size"}, "N",
-        format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
+        string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
         [](common_params & params, int value) {
             params.chunk_size = value;
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--chunk-separator"}, "STRING",
-        format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
+        string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
         [](common_params & params, const std::string & value) {
             params.chunk_separator = value;
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--junk"}, "N",
-        format("number of times to repeat the junk text (default: %d)", params.n_junk),
+        string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
         [](common_params & params, int value) {
             params.n_junk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"--pos"}, "N",
-        format("position of the passkey in the junk text (default: %d)", params.i_pos),
+        string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
         [](common_params & params, int value) {
             params.i_pos = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        format("output file (default: '%s')",
+        string_format("output file (default: '%s')",
             ex == LLAMA_EXAMPLE_EXPORT_LORA
                 ? params.lora_outfile.c_str()
                 : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
@@ -1641,42 +1615,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
-        format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
+        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
         [](common_params & params, int value) {
             params.n_out_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--save-frequency"}, "N",
-        format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
+        string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
         [](common_params & params, int value) {
             params.n_save_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--process-output"},
-        format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
+        string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
         [](common_params & params) {
             params.process_output = true;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--no-ppl"},
-        format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
         [](common_params & params) {
             params.compute_ppl = false;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--chunk", "--from-chunk"}, "N",
-        format("start processing the input from chunk N (default: %d)", params.i_chunk),
+        string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
         [](common_params & params, int value) {
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
-        format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
+        string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
         [](common_params & params) {
             params.is_pp_shared = true;
         }
@@ -1707,7 +1681,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
         {"--embd-normalize"}, "N",
-        format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
@@ -1728,35 +1702,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
-        format("ip address to listen (default: %s)", params.hostname.c_str()),
+        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
         [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
     add_opt(common_arg(
         {"--port"}, "PORT",
-        format("port to listen (default: %d)", params.port),
+        string_format("port to listen (default: %d)", params.port),
         [](common_params & params, int value) {
             params.port = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
     add_opt(common_arg(
         {"--path"}, "PATH",
-        format("path to serve static files from (default: %s)", params.public_path.c_str()),
+        string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
         [](common_params & params, const std::string & value) {
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
-        format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
         [](common_params & params) {
             params.embedding = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
         [](common_params & params) {
             params.reranking = true;
         }
@@ -1774,7 +1748,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream key_file(value);
             if (!key_file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             std::string key;
             while (std::getline(key_file, key)) {
@@ -1801,7 +1775,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
-        format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
         [](common_params & params, int value) {
             params.timeout_read  = value;
             params.timeout_write = value;
@@ -1809,7 +1783,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(common_arg(
         {"--threads-http"}, "N",
-        format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+        string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
         [](common_params & params, int value) {
             params.n_threads_http = value;
         }
@@ -1820,7 +1794,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
-                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
             std::string system_prompt;
             std::copy(
@@ -1833,21 +1807,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--metrics"},
-        format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+        string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
         [](common_params & params) {
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
     add_opt(common_arg(
         {"--slots"},
-        format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
         [](common_params & params) {
             params.endpoint_slots = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--props"},
-        format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
         [](common_params & params) {
             params.endpoint_props = true;
         }
@@ -1877,7 +1851,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
         [](common_params & params, const std::string & value) {
             if (!common_chat_verify_template(value)) {
-                throw std::runtime_error(format(
+                throw std::runtime_error(string_format(
                     "error: the supplied chat template is not supported: %s\n"
                     "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
                     value.c_str()
@@ -1888,14 +1862,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
-        format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
         [](common_params & params, const std::string & value) {
             params.slot_prompt_similarity = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--lora-init-without-apply"},
-        format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+        string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
         [](common_params & params) {
             params.lora_init_without_apply = true;
         }
@@ -1920,28 +1894,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
-        format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.cvector_positive_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
     add_opt(common_arg(
         {"--negative-file"}, "FNAME",
-        format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+        string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.cvector_negative_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
     add_opt(common_arg(
         {"--pca-batch"}, "N",
-        format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+        string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
         [](common_params & params, int value) {
             params.n_pca_batch = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
     add_opt(common_arg(
         {"--pca-iter"}, "N",
-        format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+        string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
         [](common_params & params, int value) {
             params.n_pca_iterations = value;
         }
diff --git a/common/common.cpp b/common/common.cpp
index d1b92250a..451307b55 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -400,6 +401,21 @@ std::string common_params_get_system_info(const common_params & params) {
 // String utils
 //
 
+std::string string_format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 std::vector<std::string> string_split(std::string input, char separator) {
     std::vector<std::string> parts;
     size_t separator_pos = input.find(separator);
diff --git a/common/common.h b/common/common.h
index ea2719e4b..5beec4bde 100644
--- a/common/common.h
+++ b/common/common.h
@@ -352,15 +352,28 @@ void common_init();
 
 std::string common_params_get_system_info(const common_params & params);
 
-bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);
 
 //
 // String utils
 //
 
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
+
 std::vector<std::string> string_split(std::string input, char separator);
 
 std::string string_strip(const std::string & str);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 3d0f71fda..f82c614f5 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -205,11 +205,11 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-    GGML_ASSERT(llama_token_prefix(model) >= 0);
-    GGML_ASSERT(llama_token_suffix(model) >= 0);
+    GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+    GGML_ASSERT(llama_token_fim_suf(model) >= 0);
 
-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
 
     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -218,7 +218,7 @@ int main(int argc, char ** argv) {
     }
     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
-    const llama_token middle_token = llama_token_middle(model);
+    const llama_token middle_token = llama_token_fim_mid(model);
     if (middle_token >= 0) {
         embd_inp.push_back(middle_token);
     }
@@ -508,8 +508,8 @@ int main(int argc, char ** argv) {
                 std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
                 std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+                inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
 
                 embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                 embd_end = params.spm_infill ? inp_pfx : inp_sfx;
diff --git a/examples/server/README.md b/examples/server/README.md
index 09d1cf097..3da0130ac 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -526,7 +526,7 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
 - `input_prefix`: Set the prefix of the code to infill.
 - `input_suffix`: Set the suffix of the code to infill.
 
-It also accepts all the options of `/completion` except `stream` and `prompt`.
+It also accepts all the options of `/completion`.
 
 ### **GET** `/props`: Get server global properties.
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2e1d24189..314a506a1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -753,12 +753,7 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
-        // TODO: currently, we tokenize using special tokens by default
-        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
-        //       but it's better compared to completely ignoring ChatML and other chat templates
-        const bool TMP_FORCE_SPECIAL = true;
-
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -771,10 +766,10 @@ struct server_context {
 
                     std::vector<llama_token> p;
                     if (first) {
-                        p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, add_special, parse_special);
                         first = false;
                     } else {
-                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, false, parse_special);
                     }
 
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -788,7 +783,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
         }
 
         return prompt_tokens;
@@ -1215,7 +1210,7 @@ struct server_context {
                     slot.params.n_predict, n_ctx_train);
         }
 
-        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 
         return slot.has_next_token; // continue
     }
@@ -1483,9 +1478,8 @@ struct server_context {
         if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
             data["index"] = 0;
             create_task(data, false, nullptr);
-        }
-        // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-        else if (prompt.is_array()) {
+        } else if (prompt.is_array()) {
+            // otherwise, it's a multiple-prompt task, we break it into smaller tasks
             std::vector<json> prompts = prompt;
             if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                 // prompts[0] is the question
@@ -1510,9 +1504,8 @@ struct server_context {
                     }
                 }
             }
-        }
-        // invalid case
-        else {
+        } else {
+            // invalid case
             throw std::runtime_error(error_msg);
         }
 
@@ -1785,6 +1778,9 @@ struct server_context {
                     }
                     slot->cache_tokens.resize(token_count);
 
+                    // TODO: maybe detokenize the slot->cache_tokens instead?
+                    slot->prompt = string_format("[restored %d tokens from file]", (int) token_count);
+
                     const int64_t t_end = ggml_time_us();
                     const double t_restore_ms = (t_end - t_start) / 1000.0;
 
@@ -1971,63 +1967,57 @@ struct server_context {
                         slot.t_start_process_prompt = ggml_time_us();
                         slot.t_start_generation = 0;
 
-                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
-                            const bool add_bos = llama_add_bos_token(model);
-                            bool suff_rm_leading_spc = true;
-                            if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                                params.input_suffix.erase(0, 1);
-                                suff_rm_leading_spc = false;
-                            }
+                        switch (slot.cmpl_type) {
+                            case SERVER_TASK_CMPL_TYPE_NORMAL:
+                            case SERVER_TASK_CMPL_TYPE_EMBEDDING:
+                                {
+                                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                                } break;
+                            case SERVER_TASK_CMPL_TYPE_RERANK:
+                                {
+                                    // require slot.prompt to be array of 2 strings
+                                    if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
+                                        SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
+                                        slot.release();
+                                        send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
+                                        continue;
+                                    }
 
-                            auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                            auto suffix_tokens = tokenize(slot.params.input_suffix, false);
+                                    // prompt: [BOS]query[EOS][SEP]doc[EOS]
+                                    prompt_tokens.clear();
+                                    prompt_tokens.push_back(llama_token_bos(model));
+                                    {
+                                        const auto part = tokenize(slot.prompt[0], false, false);
+                                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                                    }
+                                    prompt_tokens.push_back(llama_token_eos(model));
+                                    prompt_tokens.push_back(llama_token_sep(model));
+                                    {
+                                        const auto part = tokenize(slot.prompt[1], false, false);
+                                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                                    }
+                                    prompt_tokens.push_back(llama_token_eos(model));
+                                } break;
+                            case SERVER_TASK_CMPL_TYPE_INFILL:
+                                {
+                                    auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
+                                    auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
 
-                            const int space_token = 29871; // TODO: this should not be hardcoded
-                            if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                                suffix_tokens.erase(suffix_tokens.begin());
-                            }
+                                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
+                                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
 
-                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+                                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
 
-                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
-                            if (add_bos) {
-                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                            }
-                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                                    if (llama_add_bos_token(model)) {
+                                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                                    }
 
-                            const llama_token middle_token = llama_token_middle(model);
-                            if (middle_token >= 0) {
-                                embd_inp.push_back(middle_token);
-                            }
+                                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                                    embd_inp.push_back(llama_token_fim_mid(model));
 
-                            prompt_tokens = embd_inp;
-                        } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                            // require slot.prompt to be array of 2 strings
-                            if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
-                                SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
-                                slot.release();
-                                send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
-                                continue;
-                            }
-
-                            // prompt: [BOS]query[EOS][SEP]doc[EOS]
-                            prompt_tokens.clear();
-                            prompt_tokens.push_back(llama_token_bos(model));
-                            {
-                                const auto part = tokenize(slot.prompt[0], false);
-                                prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                            }
-                            prompt_tokens.push_back(llama_token_eos(model));
-                            prompt_tokens.push_back(llama_token_sep(model));
-                            {
-                                const auto part = tokenize(slot.prompt[1], false);
-                                prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                            }
-                            prompt_tokens.push_back(llama_token_eos(model));
-                        } else {
-                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                                    prompt_tokens = std::move(embd_inp);
+                                } break;
                         }
 
                         slot.n_past = 0;
@@ -2035,6 +2025,11 @@ struct server_context {
 
                         SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
+                        // print prompt tokens:
+                        for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                        }
+
                         // empty prompt passed -> release the slot and send empty response
                         if (prompt_tokens.empty()) {
                             SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
@@ -2924,7 +2919,23 @@ int main(int argc, char ** argv) {
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
     };
 
-    const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+        std::string err;
+        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "prefix token is missing. ";
+        }
+        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "suffix token is missing. ";
+        }
+        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "middle token is missing. ";
+        }
+
+        if (!err.empty()) {
+            res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         json data = json::parse(req.body);
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
     };
@@ -3010,7 +3021,8 @@ int main(int argc, char ** argv) {
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
             const bool with_pieces = json_value(body, "with_pieces", false);
-            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
 
             if (with_pieces) {
                 for (const auto& token : tokens) {
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index e08617ba2..7ab08b036 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -152,6 +152,8 @@ class Keys:
         MERGES               = "tokenizer.ggml.merges"
         BOS_ID               = "tokenizer.ggml.bos_token_id"
         EOS_ID               = "tokenizer.ggml.eos_token_id"
+        EOT_ID               = "tokenizer.ggml.eot_token_id"
+        EOM_ID               = "tokenizer.ggml.eom_token_id"
         UNK_ID               = "tokenizer.ggml.unknown_token_id"
         SEP_ID               = "tokenizer.ggml.seperator_token_id"
         PAD_ID               = "tokenizer.ggml.padding_token_id"
@@ -168,11 +170,16 @@ class Keys:
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
         # FIM/Infill special tokens constants
+        FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID           = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID           = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID           = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID           = "tokenizer.ggml.fim_sep_token_id"
+        # deprecated:
         PREFIX_ID            = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID            = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID            = "tokenizer.ggml.middle_token_id"
-        EOT_ID               = "tokenizer.ggml.eot_token_id"
-        EOM_ID               = "tokenizer.ggml.eom_token_id"
 
     class Adapter:
         TYPE       = "adapter.type"
@@ -1579,6 +1586,8 @@ KEY_TOKENIZER_SCORES     = Keys.Tokenizer.SCORES
 KEY_TOKENIZER_MERGES     = Keys.Tokenizer.MERGES
 KEY_TOKENIZER_BOS_ID     = Keys.Tokenizer.BOS_ID
 KEY_TOKENIZER_EOS_ID     = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID     = Keys.Tokenizer.EOM_ID
 KEY_TOKENIZER_UNK_ID     = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID     = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID     = Keys.Tokenizer.PAD_ID
@@ -1586,8 +1595,15 @@ KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
-KEY_TOKENIZER_PRIFIX_ID  = Keys.Tokenizer.PREFIX_ID
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
+# deprecated
+KEY_TOKENIZER_PREFIX_ID  = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID  = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID  = Keys.Tokenizer.MIDDLE_ID
-KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
-KEY_TOKENIZER_EOM_ID     = Keys.Tokenizer.EOM_ID
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 5c460ef1b..0d8d8a0b0 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -843,15 +843,6 @@ class GGUFWriter:
 
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
-    def add_prefix_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
-
-    def add_suffix_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
-
-    def add_middle_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
-
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
diff --git a/include/llama.h b/include/llama.h
index 4f8f6d23d..9110b5956 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -897,6 +897,7 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
     LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -905,11 +906,17 @@ extern "C" {
     LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
     LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
-    // Codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+    // infill tokens
+    DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+    LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
 
     //
     // Tokenization
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d2f34ddd6..a27394a37 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1663,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
     return vocab.special_eos_id;
 }
 
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eot_id;
+}
+
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
     return vocab.special_cls_id;
 }
@@ -1688,23 +1696,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
 }
 
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
-    return vocab.special_prefix_id;
+    return vocab.special_fim_pre_id;
 }
 
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
-    return vocab.special_middle_id;
+    return vocab.special_fim_mid_id;
 }
 
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
-    return vocab.special_suffix_id;
+    return vocab.special_fim_suf_id;
 }
 
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
-    return vocab.special_eot_id;
+llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_pre_id;
 }
 
-llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
-    return vocab.special_eom_id;
+llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_suf_id;
+}
+
+llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_mid_id;
+}
+
+llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_pad_id;
+}
+
+llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_rep_id;
+}
+
+llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_sep_id;
 }
 
 int32_t llama_tokenize_impl(
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 28bad9135..17e14488a 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -37,20 +37,26 @@ struct llama_vocab {
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
     // default LLaMA special tokens
+    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
     id special_bos_id  = 1;
     id special_eos_id  = 2;
+    id special_eot_id  = LLAMA_TOKEN_NULL;
+    id special_eom_id  = LLAMA_TOKEN_NULL;
     id special_unk_id  = 0;
     id special_sep_id  = LLAMA_TOKEN_NULL;
     id special_pad_id  = LLAMA_TOKEN_NULL;
     id special_cls_id  = LLAMA_TOKEN_NULL;
     id special_mask_id = LLAMA_TOKEN_NULL;
 
-    id linefeed_id       = 13;
-    id special_prefix_id = LLAMA_TOKEN_NULL;
-    id special_suffix_id = LLAMA_TOKEN_NULL;
-    id special_middle_id = LLAMA_TOKEN_NULL;
-    id special_eot_id    = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token
-    id special_eom_id    = LLAMA_TOKEN_NULL;
+    id linefeed_id    = 13;
+
+    // fim tokens
+    id special_fim_pre_id = LLAMA_TOKEN_NULL;
+    id special_fim_suf_id = LLAMA_TOKEN_NULL;
+    id special_fim_mid_id = LLAMA_TOKEN_NULL;
+    id special_fim_pad_id = LLAMA_TOKEN_NULL;
+    id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+    id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
 
     // set of all tokens that cause "end of generation"
     std::set<id> special_eog_ids;
@@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
 llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
 llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
 
-bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
-bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
-
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
-llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
+
+llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
+
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
diff --git a/src/llama.cpp b/src/llama.cpp
index da7afb1ee..f68024f5b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -345,6 +345,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,
     LLM_KV_TOKENIZER_EOS_ID,
+    LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
@@ -357,14 +359,20 @@ enum llm_kv {
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
-    LLM_KV_TOKENIZER_PREFIX_ID,
-    LLM_KV_TOKENIZER_SUFFIX_ID,
-    LLM_KV_TOKENIZER_MIDDLE_ID,
-    LLM_KV_TOKENIZER_EOT_ID,
-    LLM_KV_TOKENIZER_EOM_ID,
+    LLM_KV_TOKENIZER_FIM_PRE_ID,
+    LLM_KV_TOKENIZER_FIM_SUF_ID,
+    LLM_KV_TOKENIZER_FIM_MID_ID,
+    LLM_KV_TOKENIZER_FIM_PAD_ID,
+    LLM_KV_TOKENIZER_FIM_REP_ID,
+    LLM_KV_TOKENIZER_FIM_SEP_ID,
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
+
+    // deprecated:
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -422,57 +430,65 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
 
-    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
-    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
-    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
-    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,      "%s.rope.scaling.attn_factor"             },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned"               },
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL,     "%s.rope.scaling.yarn_log_multiplier"     },
+    { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"                 },
+    { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"                       },
+    { LLM_KV_ROPE_SCALE_LINEAR,                "%s.rope.scale_linear"                    },
+    { LLM_KV_ROPE_SCALING_TYPE,                "%s.rope.scaling.type"                    },
+    { LLM_KV_ROPE_SCALING_FACTOR,              "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,         "%s.rope.scaling.attn_factor"             },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,        "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED,           "%s.rope.scaling.finetuned"               },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL,        "%s.rope.scaling.yarn_log_multiplier"     },
 
-    { LLM_KV_SPLIT_NO,                      "split.no"            },
-    { LLM_KV_SPLIT_COUNT,                   "split.count"         },
-    { LLM_KV_SPLIT_TENSORS_COUNT,           "split.tensors.count" },
+    { LLM_KV_SPLIT_NO,                         "split.no"            },
+    { LLM_KV_SPLIT_COUNT,                      "split.count"         },
+    { LLM_KV_SPLIT_TENSORS_COUNT,              "split.tensors.count" },
 
-    { LLM_KV_SSM_CONV_KERNEL,               "%s.ssm.conv_kernel"    },
-    { LLM_KV_SSM_INNER_SIZE,                "%s.ssm.inner_size"     },
-    { LLM_KV_SSM_STATE_SIZE,                "%s.ssm.state_size"     },
-    { LLM_KV_SSM_TIME_STEP_RANK,            "%s.ssm.time_step_rank" },
-    { LLM_KV_SSM_DT_B_C_RMS,                "%s.ssm.dt_b_c_rms" },
+    { LLM_KV_SSM_CONV_KERNEL,                  "%s.ssm.conv_kernel"    },
+    { LLM_KV_SSM_INNER_SIZE,                   "%s.ssm.inner_size"     },
+    { LLM_KV_SSM_STATE_SIZE,                   "%s.ssm.state_size"     },
+    { LLM_KV_SSM_TIME_STEP_RANK,               "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS,                   "%s.ssm.dt_b_c_rms"     },
 
-    { LLM_KV_WKV_HEAD_SIZE,                 "%s.wkv.head_size" },
+    { LLM_KV_WKV_HEAD_SIZE,                    "%s.wkv.head_size" },
 
-    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
-    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
-    { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE,           "tokenizer.ggml.token_type"               },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,     "tokenizer.ggml.token_type_count"         },
-    { LLM_KV_TOKENIZER_SCORES,               "tokenizer.ggml.scores"                   },
-    { LLM_KV_TOKENIZER_MERGES,               "tokenizer.ggml.merges"                   },
-    { LLM_KV_TOKENIZER_BOS_ID,               "tokenizer.ggml.bos_token_id"             },
-    { LLM_KV_TOKENIZER_EOS_ID,               "tokenizer.ggml.eos_token_id"             },
-    { LLM_KV_TOKENIZER_UNK_ID,               "tokenizer.ggml.unknown_token_id"         },
-    { LLM_KV_TOKENIZER_SEP_ID,               "tokenizer.ggml.seperator_token_id"       },
-    { LLM_KV_TOKENIZER_PAD_ID,               "tokenizer.ggml.padding_token_id"         },
-    { LLM_KV_TOKENIZER_CLS_ID,               "tokenizer.ggml.cls_token_id"             },
-    { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"            },
-    { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"            },
-    { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"            },
-    { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"         },
-    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
-    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap"     },
-    { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
-    { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
-    { LLM_KV_TOKENIZER_PREFIX_ID,            "tokenizer.ggml.prefix_token_id"          },
-    { LLM_KV_TOKENIZER_SUFFIX_ID,            "tokenizer.ggml.suffix_token_id"          },
-    { LLM_KV_TOKENIZER_MIDDLE_ID,            "tokenizer.ggml.middle_token_id"          },
-    { LLM_KV_TOKENIZER_EOT_ID,               "tokenizer.ggml.eot_token_id"             },
-    { LLM_KV_TOKENIZER_EOM_ID,               "tokenizer.ggml.eom_token_id"             },
+    { LLM_KV_TOKENIZER_MODEL,                  "tokenizer.ggml.model"                    },
+    { LLM_KV_TOKENIZER_PRE,                    "tokenizer.ggml.pre"                      },
+    { LLM_KV_TOKENIZER_LIST,                   "tokenizer.ggml.tokens"                   },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE,             "tokenizer.ggml.token_type"               },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,       "tokenizer.ggml.token_type_count"         },
+    { LLM_KV_TOKENIZER_SCORES,                 "tokenizer.ggml.scores"                   },
+    { LLM_KV_TOKENIZER_MERGES,                 "tokenizer.ggml.merges"                   },
+    { LLM_KV_TOKENIZER_BOS_ID,                 "tokenizer.ggml.bos_token_id"             },
+    { LLM_KV_TOKENIZER_EOS_ID,                 "tokenizer.ggml.eos_token_id"             },
+    { LLM_KV_TOKENIZER_EOT_ID,                 "tokenizer.ggml.eot_token_id"             },
+    { LLM_KV_TOKENIZER_EOM_ID,                 "tokenizer.ggml.eom_token_id"             },
+    { LLM_KV_TOKENIZER_UNK_ID,                 "tokenizer.ggml.unknown_token_id"         },
+    { LLM_KV_TOKENIZER_SEP_ID,                 "tokenizer.ggml.seperator_token_id"       },
+    { LLM_KV_TOKENIZER_PAD_ID,                 "tokenizer.ggml.padding_token_id"         },
+    { LLM_KV_TOKENIZER_CLS_ID,                 "tokenizer.ggml.cls_token_id"             },
+    { LLM_KV_TOKENIZER_MASK_ID,                "tokenizer.ggml.mask_token_id"            },
+    { LLM_KV_TOKENIZER_ADD_BOS,                "tokenizer.ggml.add_bos_token"            },
+    { LLM_KV_TOKENIZER_ADD_EOS,                "tokenizer.ggml.add_eos_token"            },
+    { LLM_KV_TOKENIZER_ADD_PREFIX,             "tokenizer.ggml.add_space_prefix"         },
+    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,        "tokenizer.ggml.remove_extra_whitespaces" },
+    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,   "tokenizer.ggml.precompiled_charsmap"     },
+    { LLM_KV_TOKENIZER_HF_JSON,                "tokenizer.huggingface.json"              },
+    { LLM_KV_TOKENIZER_RWKV,                   "tokenizer.rwkv.world"                    },
+    { LLM_KV_TOKENIZER_FIM_PRE_ID,             "tokenizer.ggml.fim_pre_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_SUF_ID,             "tokenizer.ggml.fim_suf_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_MID_ID,             "tokenizer.ggml.fim_mid_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_PAD_ID,             "tokenizer.ggml.fim_pad_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_REP_ID,             "tokenizer.ggml.fim_rep_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_SEP_ID,             "tokenizer.ggml.fim_sep_token_id"         },
 
-    { LLM_KV_ADAPTER_TYPE,                  "adapter.type"       },
-    { LLM_KV_ADAPTER_LORA_ALPHA,            "adapter.lora.alpha" },
+    { LLM_KV_ADAPTER_TYPE,                     "adapter.type"       },
+    { LLM_KV_ADAPTER_LORA_ALPHA,               "adapter.lora.alpha" },
+
+    // deprecated
+    { LLM_KV_TOKENIZER_PREFIX_ID,              "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID,              "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID,              "tokenizer.ggml.middle_token_id" },
 };
 
 struct LLM_KV {
@@ -6164,14 +6180,14 @@ static void llm_load_vocab(
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
-            vocab.special_bos_id  = -1;
-            vocab.special_eos_id  = -1;
-            vocab.special_unk_id  = -1;
-            vocab.special_sep_id  = -1;
-            vocab.special_pad_id  = -1;
-            vocab.special_cls_id  = -1;
-            vocab.special_mask_id = -1;
-            vocab.linefeed_id     = -1;
+            vocab.special_bos_id  = LLAMA_TOKEN_NULL;
+            vocab.special_eos_id  = LLAMA_TOKEN_NULL;
+            vocab.special_unk_id  = LLAMA_TOKEN_NULL;
+            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id  = LLAMA_TOKEN_NULL;
+            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;
+            vocab.linefeed_id     = LLAMA_TOKEN_NULL;
 
             // read vocab size from metadata
             if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
@@ -6188,16 +6204,16 @@ static void llm_load_vocab(
             vocab.special_bos_id  = 1;
             vocab.special_eos_id  = 2;
             vocab.special_unk_id  = 0;
-            vocab.special_sep_id  = -1;
-            vocab.special_pad_id  = -1;
-            vocab.special_cls_id  = -1;
-            vocab.special_mask_id = -1;
+            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id  = LLAMA_TOKEN_NULL;
+            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;
         } else if (tokenizer_model == "bert") {
             vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
             // default special tokens
-            vocab.special_bos_id  = -1;
-            vocab.special_eos_id  = -1;
+            vocab.special_bos_id  = LLAMA_TOKEN_NULL;
+            vocab.special_eos_id  = LLAMA_TOKEN_NULL;
             vocab.special_unk_id  = 100;
             vocab.special_sep_id  = 102;
             vocab.special_pad_id  = 0;
@@ -6233,22 +6249,22 @@ static void llm_load_vocab(
             // default special tokens
             vocab.special_bos_id  = 11;
             vocab.special_eos_id  = 11;
-            vocab.special_unk_id  = -1;
-            vocab.special_sep_id  = -1;
-            vocab.special_pad_id  = -1;
-            vocab.special_cls_id  = -1;
-            vocab.special_mask_id = -1;
+            vocab.special_unk_id  = LLAMA_TOKEN_NULL;
+            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id  = LLAMA_TOKEN_NULL;
+            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;
         } else if (tokenizer_model == "t5") {
             vocab.type = LLAMA_VOCAB_TYPE_UGM;
 
             // default special tokens
-            vocab.special_bos_id  = -1;
+            vocab.special_bos_id  = LLAMA_TOKEN_NULL;
             vocab.special_eos_id  = 1;
             vocab.special_unk_id  = 2;
-            vocab.special_sep_id  = -1;
+            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
             vocab.special_pad_id  = 0;
-            vocab.special_cls_id  = -1;
-            vocab.special_mask_id = -1;
+            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;
 
             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
             if (precompiled_charsmap_keyidx != -1) {
@@ -6271,11 +6287,11 @@ static void llm_load_vocab(
             vocab.type = LLAMA_VOCAB_TYPE_RWKV;
 
             // default special tokens
-            vocab.special_bos_id = -1;
-            vocab.special_eos_id = -1;
-            vocab.special_unk_id = -1;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
+            vocab.special_bos_id = LLAMA_TOKEN_NULL;
+            vocab.special_eos_id = LLAMA_TOKEN_NULL;
+            vocab.special_unk_id = LLAMA_TOKEN_NULL;
+            vocab.special_sep_id = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id = LLAMA_TOKEN_NULL;
         } else {
             throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
@@ -6359,7 +6375,7 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "chatglm-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
-                vocab.special_bos_id  = -1;
+                vocab.special_bos_id = LLAMA_TOKEN_NULL;
             } else if (
                 tokenizer_pre == "viking") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
@@ -6485,44 +6501,6 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        // For Fill-In-the-Middle (FIM)/infill models which where converted
-        // prior to support of FIM special tokens in GGUF, the following
-        // will allow those models to continue to work. The general names
-        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-        // new versions of these models have been published.
-        std::string gen_name;
-        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-            [](unsigned char c){ return std::tolower(c); });
-
-        if (gen_name.find("code") != std::string::npos) {
-            if (model.arch == LLM_ARCH_LLAMA
-              && 32010 < vocab.id_to_token.size()
-              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
-              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
-              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
-              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA
-              && 107 < vocab.id_to_token.size()
-              && vocab.id_to_token[67].text == "<|fim_prefix|>"
-              && vocab.id_to_token[69].text == "<|fim_suffix|>"
-              && vocab.id_to_token[68].text == "<|fim_middle|>"
-              && vocab.id_to_token[107].text == "<end_of_turn>") {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
         try {
             vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
         } catch (const std::exception & e) {
@@ -6550,18 +6528,26 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
-            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
+            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
+            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -6592,22 +6578,21 @@ static void llm_load_vocab(
             }
         }
 
-        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
-        //       for now, we apply this workaround to find the EOT token based on its text
-        if (vocab.special_eot_id == -1) {
-            for (const auto & t : vocab.token_to_id) {
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : vocab.token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
-                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
-                        //       need to fix convert script
-                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                         || t.first == "<|eot_id|>"
                         || t.first == "<|im_end|>"
                         || t.first == "<|end|>"
                         || t.first == "<end_of_turn>"
                         || t.first == "<|endoftext|>"
                         || t.first == "<EOT>"
+                        || t.first == "<｜end▁of▁sentence｜>" // DeepSeek
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -6615,23 +6600,118 @@ static void llm_load_vocab(
                                 __func__, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
-                    break;
                 }
             }
-        }
 
-        // find EOM token: "<|eom_id|>"
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
-        //       for now, we apply this workaround to find the EOM token based on its text
-        if (vocab.special_eom_id == -1) {
-            const auto & t = vocab.token_to_id.find("<|eom_id|>");
-            if (t != vocab.token_to_id.end()) {
-                vocab.special_eom_id = t->second;
-                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                        __func__, t->first.c_str());
-                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            // find EOM token: "<|eom_id|>"
+            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                        ) {
+                    vocab.special_eom_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"  // Qwen
+                        || t.first == "<fim-prefix>"
+                        || t.first == "<｜fim▁begin｜>" // DeepSeek
+                        || t.first == "<PRE>"
+                        ) {
+                    vocab.special_fim_pre_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == "<fim-suffix>"
+                        || t.first == "<｜fim▁hole｜>" // DeepSeek
+                        || t.first == "<SUF>"
+                        ) {
+                    vocab.special_fim_suf_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == "<fim-middle>"
+                        || t.first == "<｜fim▁end｜>"  // DeepSeek
+                        || t.first == "<MID>"
+                        ) {
+                    vocab.special_fim_mid_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == "<fim-pad>"
+                        || t.first == "<PAD>"
+                        ) {
+                    vocab.special_fim_pad_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"  // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == "<fim-repo>"
+                        || t.first == "<REPO>"
+                        ) {
+                    vocab.special_fim_rep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                        ) {
+                    vocab.special_fim_sep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
             }
         }
@@ -6659,17 +6739,17 @@ static void llm_load_vocab(
             }
         }
 
-        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eos_id);
             LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eot_id);
             LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
@@ -6863,20 +6943,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id    != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
-    if (vocab.special_eos_id    != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
-    if (vocab.special_unk_id    != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
-    if (vocab.special_sep_id    != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
-    if (vocab.special_pad_id    != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
-    if (vocab.special_cls_id    != -1) { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
-    if (vocab.special_mask_id   != -1) { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.special_bos_id  != -1)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,     vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
+    if (vocab.special_eos_id  != -1)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,     vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
+    if (vocab.special_eot_id  != -1)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,     vocab.id_to_token[vocab.special_eot_id].text.c_str() );  }
+    if (vocab.special_eom_id  != -1)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,     vocab.id_to_token[vocab.special_eom_id].text.c_str() );  }
+    if (vocab.special_unk_id  != -1)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,     vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
+    if (vocab.special_sep_id  != -1)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,     vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
+    if (vocab.special_pad_id  != -1)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,     vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
+    if (vocab.special_cls_id  != -1)    { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,     vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
+    if (vocab.special_mask_id != -1)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id,    vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
 
-    if (vocab.linefeed_id       != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,       vocab.id_to_token[vocab.linefeed_id].text.c_str() );       }
-    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token        = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
-    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
-    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
-    if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
-    if (vocab.special_eom_id    != -1) { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,    vocab.id_to_token[vocab.special_eom_id].text.c_str() );    }
+    if (vocab.linefeed_id != -1)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,        vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
 
     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -19453,7 +19537,7 @@ struct llama_context * llama_new_context_with_model(
             }
 
             LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                      (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
@@ -21307,6 +21391,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return llama_token_eos_impl(model->vocab);
 }
 
+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
 llama_token llama_token_cls(const struct llama_model * model) {
     return llama_token_cls_impl(model->vocab);
 }
@@ -21343,8 +21431,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
     return llama_token_suffix_impl(model->vocab);
 }
 
-llama_token llama_token_eot(const struct llama_model * model) {
-    return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+    return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+    return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+    return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+    return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+    return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+    return llama_token_fim_sep_impl(model->vocab);
 }
 
 //

From 95c76e8e92ecc93f784b185eafae36a0e7ad2fa3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 12 Oct 2024 14:51:54 +0300
Subject: [PATCH 041/396] server : remove legacy system_prompt feature (#9857)

* server : remove legacy system_prompt feature

ggml-ci

* readme : update [no ci]

* server : fix non-transformer logic + remove response from /props
---
 common/arg.cpp             |  17 ------
 common/common.h            |   1 -
 examples/server/README.md  |   6 +--
 examples/server/server.cpp | 103 +++++++------------------------------
 4 files changed, 19 insertions(+), 108 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index c4229a3a4..78cf6ab30 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1788,23 +1788,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(common_arg(
-        {"-spf", "--system-prompt-file"}, "FNAME",
-        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
-        [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::string system_prompt;
-            std::copy(
-                        std::istreambuf_iterator<char>(file),
-                        std::istreambuf_iterator<char>(),
-                        std::back_inserter(system_prompt)
-                        );
-            params.system_prompt = system_prompt;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
diff --git a/common/common.h b/common/common.h
index 5beec4bde..71e686156 100644
--- a/common/common.h
+++ b/common/common.h
@@ -282,7 +282,6 @@ struct common_params {
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";                                                                         // NOLINT
     std::string chat_template = "";                                                                         // NOLINT
-    std::string system_prompt = "";                                                                         // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
diff --git a/examples/server/README.md b/examples/server/README.md
index 3da0130ac..52ccd9f5e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -149,7 +149,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
@@ -320,7 +319,6 @@ node index.js
 
       - The prompt is a string or an array with the first element given as a string
       - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
-      - The system prompt is empty
 
     `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
@@ -536,14 +534,12 @@ This endpoint is public (no API key check). By default, it is read-only. To make
 
 ```json
 {
-  "system_prompt": "",
   "default_generation_settings": { ... },
   "total_slots": 1,
   "chat_template": ""
 }
 ```
 
-- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template
@@ -554,7 +550,7 @@ To use this endpoint with POST method, you need to start server with `--props`
 
 *Options:*
 
-- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
+- None yet
 
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 314a506a1..42b57d9c4 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -623,12 +623,6 @@ struct server_context {
 
     int32_t n_ctx; // total context for all clients / slots
 
-    // system prompt
-    bool system_need_update = false;
-
-    std::string              system_prompt;
-    std::vector<llama_token> system_tokens;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
@@ -665,7 +659,7 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // dedicate one sequence to the system prompt
+        // reserve one extra sequence (seq_id == 0) for extra features
         params.n_parallel += 1;
 
         common_init_result llama_init = common_init_from_params(params);
@@ -1061,51 +1055,6 @@ struct server_context {
         clean_kv_cache = false;
     }
 
-    void system_prompt_update() {
-        SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
-
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, true);
-
-            const int32_t n_batch = llama_n_batch(ctx);
-            const int32_t n_tokens_prompt = system_tokens.size();
-
-            for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
-                const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
-
-                common_batch_clear(batch);
-
-                for (int32_t j = 0; j < n_tokens; ++j) {
-                    common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
-                }
-
-                if (llama_decode(ctx, batch) != 0) {
-                    SRV_ERR("%s", "llama_decode() failed\n");
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i <= params.n_parallel; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-            }
-        }
-
-        system_need_update = false;
-    }
-
-    bool system_prompt_set(const std::string & sys_prompt) {
-        SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
-
-        system_prompt = sys_prompt;
-        // update system_tokens and KV cache as soon as all slots are idle
-        system_need_update = true;
-        return true;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
@@ -1855,12 +1804,8 @@ struct server_context {
             }
 
             if (all_idle) {
-                if (system_need_update) {
-                    system_prompt_update();
-                }
-
                 SRV_INF("%s", "all slots are idle\n");
-                if (system_prompt.empty() && clean_kv_cache) {
+                if (clean_kv_cache) {
                     kv_cache_clear();
                 }
 
@@ -1882,7 +1827,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
-                if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
                     if (!params.ctx_shift) {
                         // this check is redundant (for good)
                         // we should never get here, because generation should already stopped in process_token()
@@ -1893,13 +1838,13 @@ struct server_context {
 
                     // Shift context
                     const int n_keep    = slot.params.n_keep + add_bos_token;
-                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
+                    const int n_left    = slot.n_past - n_keep;
                     const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
 
                     SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past,        -n_discard);
 
                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1929,9 +1874,7 @@ struct server_context {
 
             const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
 
-            // TODO: we always have to take into account the "system_tokens"
-            //       this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot_npast, { slot.id + 1 }, true);
 
             slot.n_past += 1;
 
@@ -1939,8 +1882,8 @@ struct server_context {
                 slot.cache_tokens.push_back(slot.sampled);
             }
 
-            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
-                    slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
         }
 
         // process in chunks of params.n_batch
@@ -1971,7 +1914,7 @@ struct server_context {
                             case SERVER_TASK_CMPL_TYPE_NORMAL:
                             case SERVER_TASK_CMPL_TYPE_EMBEDDING:
                                 {
-                                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                                    prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
                                 } break;
                             case SERVER_TASK_CMPL_TYPE_RERANK:
                                 {
@@ -2050,7 +1993,7 @@ struct server_context {
                         } else {
                             if (!params.ctx_shift) {
                                 // if context shift is disabled, we make sure prompt size is smaller than KV size
-                                if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                                if (slot.n_prompt_tokens >= slot.n_ctx) {
                                     slot.release();
                                     send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
                                     continue;
@@ -2138,22 +2081,19 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    int p0 = (int) system_tokens.size() + slot.n_past;
+                    int p0 = slot.n_past;
+
                     if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
                         llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
 
-                        p0 = (int) system_tokens.size();
-                        if (p0 != 0) {
-                            // copy over the system prompt when there is one
-                            llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
-                        }
+                        p0 = 0;
 
-                        // there is no common part left (except for the system prompt)
+                        // there is no common part left
                         slot.n_past = 0;
                         slot.n_past_se = 0;
                         slot.ga_i = 0;
-                        // TODO: is the system prompt ever in the sampling context?
+
                         common_sampler_reset(slot.smpl);
                     }
 
@@ -2179,7 +2119,7 @@ struct server_context {
                             }
                         }
 
-                        common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot_npast, { slot.id + 1 }, false);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2409,10 +2349,6 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;
 
-    if (!params.system_prompt.empty()) {
-        ctx_server.system_prompt_set(params.system_prompt);
-    }
-
     if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }
@@ -2840,7 +2776,6 @@ int main(int argc, char ** argv) {
 
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
-            { "system_prompt",               ctx_server.system_prompt },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params.n_parallel },
             { "chat_template",               llama_get_chat_template(ctx_server.model) },
@@ -2856,10 +2791,8 @@ int main(int argc, char ** argv) {
         }
 
         json data = json::parse(req.body);
-        if (data.contains("system_prompt")) {
-            std::string system_prompt = data.at("system_prompt");
-            ctx_server.system_prompt_set(system_prompt);
-        }
+
+        // update any props here
 
         res_ok(res, {{ "success", true }});
     };

From 1bde94dd024b632f98428f4bf2ce483295130779 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 12 Oct 2024 16:06:31 +0300
Subject: [PATCH 042/396] server : remove self-extend features (#9860)

* server : remove self-extend

ggml-ci

* server : fix context limit check to use slot.n_past

ggml-ci
---
 common/arg.cpp                                |   6 +-
 examples/server/README.md                     |   2 -
 examples/server/server.cpp                    | 187 +++++-------------
 .../server/tests/features/ctx_shift.feature   |   4 +
 4 files changed, 57 insertions(+), 142 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 78cf6ab30..205177d46 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1163,14 +1163,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
-        string_format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
+        string_format("group-attention width (default: %d)", params.grp_attn_w),
         [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
diff --git a/examples/server/README.md b/examples/server/README.md
index 52ccd9f5e..caffbac52 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -60,8 +60,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
 | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
 | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
-| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
-| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
 | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 42b57d9c4..0dd2fc8b2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -193,21 +193,15 @@ struct server_slot {
 
     llama_token sampled;
 
-    int32_t ga_i = 0;   // group-attention state
-    int32_t ga_n = 1;   // group-attention factor
-    int32_t ga_w = 512; // group-attention width
-
-    int32_t n_past_se = 0; // self-extend
-
     // stats
-    size_t n_sent_text = 0; // number of sent text character
+    size_t n_sent_text        = 0; // number of sent text character
     size_t n_sent_token_probs = 0;
 
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
 
     double t_prompt_processing; // ms
-    double t_token_generation; // ms
+    double t_token_generation;  // ms
 
     std::function<void(int)> callback_on_release;
 
@@ -225,8 +219,6 @@ struct server_slot {
         n_sent_text        = 0;
         n_sent_token_probs = 0;
         cmpl_type          = SERVER_TASK_CMPL_TYPE_NORMAL;
-        ga_i               = 0;
-        n_past_se          = 0;
 
         generated_token_probs.clear();
     }
@@ -705,22 +697,6 @@ struct server_context {
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
-            const int ga_n = params.grp_attn_n;
-            const int ga_w = params.grp_attn_w;
-
-            if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0                    && "ga_n must be positive");                       // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");             // NOLINT
-                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
-                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-
-                SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
-            }
-
-            slot.ga_i = 0;
-            slot.ga_n = ga_n;
-            slot.ga_w = ga_w;
-
             slot.sparams = params.sparams;
 
             slot.callback_on_release = [this](int) {
@@ -906,19 +882,14 @@ struct server_context {
         }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
-                auto schema                = json_value(data, "json_schema", json::object());
-                slot.sparams.grammar       = json_schema_to_grammar(schema);
+                auto schema          = json_value(data, "json_schema", json::object());
+                slot.sparams.grammar = json_schema_to_grammar(schema);
             } catch (const std::exception & e) {
                 send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         } else {
-            slot.sparams.grammar       = json_value(data, "grammar",           default_sparams.grammar);
-        }
-
-        if (slot.params.cache_prompt && slot.ga_n != 1) {
-            slot.params.cache_prompt = false;
-            SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
+            slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         }
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -1131,12 +1102,13 @@ struct server_context {
         }
 
         // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_decoded >= slot.n_ctx) {
+        if (slot.n_past >= slot.n_ctx) {
             slot.truncated      = true;
             slot.stopped_limit  = true;
             slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
         }
 
         if (llama_token_is_eog(model, result.tok)) {
@@ -1148,13 +1120,13 @@ struct server_context {
 
         const auto n_ctx_train = llama_n_ctx_train(model);
 
-        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
             slot.truncated      = true;
             slot.stopped_limit  = true;
             slot.has_next_token = false; // stop prediction
 
             SLT_WRN(slot,
-                    "n_predict (%d) is not set and self-context extend is disabled. "
+                    "n_predict (%d) is set for infinite generation. "
                     "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
                     slot.params.n_predict, n_ctx_train);
         }
@@ -1826,38 +1798,36 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.ga_n == 1) {
-                if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
-                    if (!params.ctx_shift) {
-                        // this check is redundant (for good)
-                        // we should never get here, because generation should already stopped in process_token()
-                        slot.release();
-                        send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
-                        continue;
-                    }
-
-                    // Shift context
-                    const int n_keep    = slot.params.n_keep + add_bos_token;
-                    const int n_left    = slot.n_past - n_keep;
-                    const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
-
-                    SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
-
-                    llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past,        -n_discard);
-
-                    if (slot.params.cache_prompt) {
-                        for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
-                            slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                        }
-
-                        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-                    }
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
+            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
+                if (!params.ctx_shift) {
+                    // this check is redundant (for good)
+                    // we should never get here, because generation should already stopped in process_token()
+                    slot.release();
+                    send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
+                    continue;
                 }
+
+                // Shift context
+                const int n_keep    = slot.params.n_keep + add_bos_token;
+                const int n_left    = slot.n_past - n_keep;
+                const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
+
+                SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
+
+                llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past,        -n_discard);
+
+                if (slot.params.cache_prompt) {
+                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
+                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+                    }
+
+                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                }
+
+                slot.n_past -= n_discard;
+
+                slot.truncated = true;
             }
         }
 
@@ -1872,9 +1842,7 @@ struct server_context {
 
             slot.i_batch = batch.n_tokens;
 
-            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-            common_batch_add(batch, slot.sampled, slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
 
             slot.n_past += 1;
 
@@ -1993,6 +1961,8 @@ struct server_context {
                         } else {
                             if (!params.ctx_shift) {
                                 // if context shift is disabled, we make sure prompt size is smaller than KV size
+                                // TODO: there should be a separate parameter that control prompt truncation
+                                //       context shift should be applied only during the generation phase
                                 if (slot.n_prompt_tokens >= slot.n_ctx) {
                                     slot.release();
                                     send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
@@ -2005,7 +1975,7 @@ struct server_context {
                             slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
                             // if input prompt is too big, truncate it (if group attention self-extend is disabled)
-                            if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
+                            if (slot.n_prompt_tokens >= slot.n_ctx) {
                                 const int n_left = slot.n_ctx - slot.params.n_keep;
 
                                 const int n_block_size = n_left / 2;
@@ -2032,12 +2002,7 @@ struct server_context {
 
                             common_sampler_reset(slot.smpl);
 
-                            if (!slot.params.cache_prompt) {
-                                slot.n_past_se = 0;
-                                slot.ga_i      = 0;
-                            } else {
-                                GGML_ASSERT(slot.ga_n == 1);
-
+                            if (slot.params.cache_prompt) {
                                 // reuse any previously computed tokens that are common with the new prompt
                                 slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
 
@@ -2053,9 +2018,6 @@ struct server_context {
                             SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
 
                             slot.n_past--;
-                            if (slot.ga_i > 0) {
-                                slot.n_past_se--;
-                            }
                         }
 
                         slot.n_prompt_tokens_processed = 0;
@@ -2081,52 +2043,31 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    int p0 = slot.n_past;
-
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
                         llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
 
-                        p0 = 0;
-
                         // there is no common part left
                         slot.n_past = 0;
-                        slot.n_past_se = 0;
-                        slot.ga_i = 0;
 
                         common_sampler_reset(slot.smpl);
                     }
 
+                    SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
+
                     // remove the non-common part from the cache
                     slot.cache_tokens.resize(slot.n_past);
 
-                    SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
-
-                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-                    int32_t ga_i = slot.ga_i;
-                    int32_t ga_n = slot.ga_n;
-                    int32_t ga_w = slot.ga_w;
-
                     // add prompt tokens for processing in the current batch
-                    // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow
-                    for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
-                        if (slot.ga_n != 1) {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
-                            }
-                        }
-
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot_npast, { slot.id + 1 }, false);
+                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
                         }
 
                         slot.n_prompt_tokens_processed++;
-                        slot_npast++;
+                        slot.n_past++;
                     }
 
                     SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
@@ -2167,34 +2108,6 @@ struct server_context {
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
-            for (auto & slot : slots) {
-                if (slot.ga_n != 1) {
-                    // context extension via Self-Extend
-                    // TODO: simplify and/or abstract this
-                    while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
-                        const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
-                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
-                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-
-                        SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        SLT_DBG(slot, "div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
-
-                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
-
-                        slot.n_past_se -= bd;
-
-                        slot.ga_i += slot.ga_w / slot.ga_n;
-
-                        SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
-                    }
-
-                    slot.n_past_se += n_tokens;
-                }
-            }
-
             llama_batch batch_view = {
                 n_tokens,
                 batch.token    + i,
diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature
index ba3afcf06..ae6c6b01b 100644
--- a/examples/server/tests/features/ctx_shift.feature
+++ b/examples/server/tests/features/ctx_shift.feature
@@ -13,6 +13,10 @@ Feature: llama.cpp server
     And   32 as batch size
     And   2 slots
 
+    # the prompt is 301 tokens
+    # the slot context is 256/2 = 128 tokens
+    # the prompt is truncated to keep the last 109 tokens
+    # 64 tokens are generated thanks to shifting the context when it gets full
   Scenario: Inference with context shift
     And   64 server max tokens to predict
     Then  the server is starting

From edc265661cd707327297b6ec4d83423c43cb50a5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 12 Oct 2024 16:14:27 +0300
Subject: [PATCH 043/396] server : add option to time limit the generation
 phase (#9865)

ggml-ci
---
 examples/server/README.md  |  2 ++
 examples/server/server.cpp | 50 +++++++++++++++++++++++++++++++++-----
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index caffbac52..b5feeb77b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -374,6 +374,8 @@ node index.js
 
     `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
 
+    `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
+
     `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
     `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0dd2fc8b2..f809c46d5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -128,9 +128,12 @@ struct slot_params {
     bool stream       = true;
     bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
-    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
-    int32_t  n_predict = -1; // new tokens to predict
+    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
+    int32_t n_predict = -1; // new tokens to predict
+
+    int64_t t_max_prompt_ms  = -1; // TODO: implement
+    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
     std::vector<std::string> antiprompt;
 
@@ -175,6 +178,7 @@ struct server_slot {
     server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
 
     bool has_next_token = true;
+    bool has_new_line   = false;
     bool truncated      = false;
     bool stopped_eos    = false;
     bool stopped_word   = false;
@@ -210,6 +214,7 @@ struct server_slot {
 
         n_prompt_tokens    = 0;
         generated_text     = "";
+        has_new_line       = false;
         truncated          = false;
         stopped_eos        = false;
         stopped_word       = false;
@@ -874,6 +879,8 @@ struct server_context {
         slot.sparams.seed              = json_value(data, "seed",              default_sparams.seed);
         slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
         slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
+      //slot.params.t_max_prompt_ms    = json_value(data, "t_max_prompt_ms",   default_params.t_max_prompt_ms); // TODO: implement
+        slot.params.t_max_predict_ms   = json_value(data, "t_max_predict_ms",  default_params.t_max_predict_ms);
 
         // process "json_schema" and "grammar"
         if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
@@ -1101,6 +1108,20 @@ struct server_context {
             SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }
 
+        // if we have already seen a new line, we stop after a certain time limit
+        if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
+            (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+            slot.stopped_limit  = true;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+        }
+
+        // check if there is a new line in the generated text
+        if (result.text_to_send.find('\n') != std::string::npos) {
+            slot.has_new_line = true;
+        }
+
         // if context shift is disabled, we stop when it reaches the context limit
         if (slot.n_past >= slot.n_ctx) {
             slot.truncated      = true;
@@ -1250,6 +1271,7 @@ struct server_context {
             {"tokens_evaluated",    slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
             {"prompt",              slot.prompt},
+            {"has_new_line",        slot.has_new_line},
             {"truncated",           slot.truncated},
             {"stopped_eos",         slot.stopped_eos},
             {"stopped_word",        slot.stopped_word},
@@ -1576,6 +1598,7 @@ struct server_context {
                         slot_data["prompt"]     = slot.prompt;
                         slot_data["next_token"] = {
                             {"has_next_token", slot.has_next_token},
+                            {"has_new_line",   slot.has_new_line},
                             {"n_remain",       slot.n_remaining},
                             {"n_decoded",      slot.n_decoded},
                             {"stopped_eos",    slot.stopped_eos},
@@ -1914,6 +1937,13 @@ struct server_context {
                                     auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
                                     auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
 
+                                    // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
+                                    const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
+
+                                    prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
+                                    suffix_tokens.resize(n_suffix_take);
+
                                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
                                     suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
 
@@ -1936,9 +1966,17 @@ struct server_context {
 
                         SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
-                        // print prompt tokens:
-                        for (int i = 0; i < (int) prompt_tokens.size(); i++) {
-                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                        // print prompt tokens (for debugging)
+                        if (1) {
+                            // first 16 tokens (avoid flooding logs)
+                            for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
+                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            }
+                        } else {
+                            // all
+                            for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            }
                         }
 
                         // empty prompt passed -> release the slot and send empty response

From 92be9f12164f18ce845a5bab60cefa5f7fec6836 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 13 Oct 2024 06:11:26 +0300
Subject: [PATCH 044/396] flake.lock: Update (#9870)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/bc947f541ae55e999ffdb4013441347d83b00feb?narHash=sha256-NOiTvBbRLIOe5F6RbHaAh6%2B%2BBNjsb149fGZd1T4%2BKBg%3D' (2024-10-04)
  → 'github:NixOS/nixpkgs/5633bcff0c6162b9e4b5f1264264611e950c8ec7?narHash=sha256-9UTxR8eukdg%2BXZeHgxW5hQA9fIKHsKCdOIUycTryeVw%3D' (2024-10-09)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 3fb6ced51..702527028 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1728018373,
-        "narHash": "sha256-NOiTvBbRLIOe5F6RbHaAh6++BNjsb149fGZd1T4+KBg=",
+        "lastModified": 1728492678,
+        "narHash": "sha256-9UTxR8eukdg+XZeHgxW5hQA9fIKHsKCdOIUycTryeVw=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "bc947f541ae55e999ffdb4013441347d83b00feb",
+        "rev": "5633bcff0c6162b9e4b5f1264264611e950c8ec7",
         "type": "github"
       },
       "original": {

From c7181bd294757dd80a7904e3dd0fea2d0be914e7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 13 Oct 2024 18:52:48 +0300
Subject: [PATCH 045/396] server : reuse cached context chunks (#9866)

ggml-ci
---
 common/arg.cpp             |  7 ++++
 common/common.h            |  3 +-
 examples/server/README.md  |  1 +
 examples/server/server.cpp | 69 ++++++++++++++++++++++++++++++++++++--
 examples/server/utils.hpp  |  4 +--
 5 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 205177d46..8969fc107 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1788,6 +1788,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(common_arg(
+        {"--cache-reuse"}, "N",
+        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        [](common_params & params, int value) {
+            params.n_cache_reuse = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
diff --git a/common/common.h b/common/common.h
index 71e686156..5507b1c59 100644
--- a/common/common.h
+++ b/common/common.h
@@ -277,7 +277,8 @@ struct common_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";                                                                         // NOLINT
diff --git a/examples/server/README.md b/examples/server/README.md
index b5feeb77b..cd0eaf847 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -147,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
+| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f809c46d5..015b3b2c5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -800,7 +800,7 @@ struct server_context {
                 int slot_prompt_len = slot_prompt.size();
 
                 // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-                int lcp_len = common_part(slot_prompt, prompt);
+                int lcp_len = longest_common_prefix(slot_prompt, prompt);
 
                 // fraction of the common substring length compared to the current slot's prompt length
                 similarity = static_cast<float>(lcp_len) / slot_prompt_len;
@@ -2012,7 +2012,7 @@ struct server_context {
                             }
                             slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
-                            // if input prompt is too big, truncate it (if group attention self-extend is disabled)
+                            // if input prompt is too big, truncate it
                             if (slot.n_prompt_tokens >= slot.n_ctx) {
                                 const int n_left = slot.n_ctx - slot.params.n_keep;
 
@@ -2042,12 +2042,74 @@ struct server_context {
 
                             if (slot.params.cache_prompt) {
                                 // reuse any previously computed tokens that are common with the new prompt
-                                slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+                                slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
 
                                 // push the prompt into the sampling context (do not apply grammar)
                                 for (int i = 0; i < slot.n_past; ++i) {
                                     common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
                                 }
+
+                                // reuse chunks from the cached prompt by shifting their KV cache in the new position
+                                if (params.n_cache_reuse > 0) {
+                                    size_t head_c = slot.n_past; // cache
+                                    size_t head_p = slot.n_past; // current prompt
+
+                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+
+                                    while (head_c < slot.cache_tokens.size() &&
+                                           head_p < prompt_tokens.size()) {
+                                        if (llama_token_is_control(model, slot.cache_tokens[head_c])) {
+                                            break;
+                                        }
+
+                                        if (llama_token_is_control(model, prompt_tokens[head_p])) {
+                                            break;
+                                        }
+
+                                        size_t n_match = 0;
+
+                                        while (head_c + n_match < slot.cache_tokens.size() &&
+                                               head_p + n_match < prompt_tokens.size()     &&
+                                               slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
+                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match])) {
+                                                break;
+                                            }
+
+                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match])) {
+                                                break;
+                                            }
+
+                                            n_match++;
+                                        }
+
+                                        if (n_match >= (size_t) params.n_cache_reuse) {
+                                            SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                            //for (size_t i = head_p; i < head_p + n_match; i++) {
+                                            //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                                            //}
+
+                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
+
+                                            llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
+                                            llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1,     kv_shift);
+
+                                            for (size_t i = 0; i < n_match; i++) {
+                                                slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+
+                                                common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
+
+                                                slot.n_past++;
+                                            }
+
+                                            head_c += n_match;
+                                            head_p += n_match;
+                                        } else {
+                                            head_c += 1;
+                                        }
+                                    }
+
+                                    SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
+                                }
                             }
                         }
 
@@ -3257,6 +3319,7 @@ int main(int argc, char ** argv) {
 
     ctx_server.queue_tasks.on_new_task(std::bind(
                 &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+
     ctx_server.queue_tasks.on_update_slots(std::bind(
                 &server_context::update_slots, &ctx_server));
 
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index ad99e9574..37999604d 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -195,14 +195,14 @@ static std::string gen_chatcmplid() {
 // other common utils
 //
 
-static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
     size_t i;
     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 
     return i;
 }
 
-static size_t common_part(const std::string & a, const std::string & b) {
+static size_t longest_common_prefix(const std::string & a, const std::string & b) {
     size_t i;
     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 

From d4c19c0f5cdb1e512573e8c86c79e8d0238c73c4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 13 Oct 2024 21:31:35 +0300
Subject: [PATCH 046/396] server : accept extra_context for the infill endpoint
 (#9874)

* server : accept extra_context for the infill endpoint

ggml-ci

* server : update readme [no ci]

* server : use repo-level FIM pattern if possible

ggml-ci
---
 examples/server/README.md  |  21 ++++++++
 examples/server/server.cpp | 102 ++++++++++++++++++++++++++++++++++---
 src/llama.cpp              |  56 +++++++++++++-------
 3 files changed, 153 insertions(+), 26 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index cd0eaf847..eb0a7b32e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -524,9 +524,30 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
 
 - `input_prefix`: Set the prefix of the code to infill.
 - `input_suffix`: Set the suffix of the code to infill.
+- `prompt`: Added after the `FIM_MID` token
+- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874
 
 It also accepts all the options of `/completion`.
 
+If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
+
+```txt
+<FIM_REP>myproject
+<FIM_SEP>{chunk 0 filename}
+{chunk 0 text}
+<FIM_SEP>{chunk 1 filename}
+{chunk 1 text}
+...
+<FIM_SEP>filename
+<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+```
+
+If the tokens are missing, then the extra context is simply prefixed at the start:
+
+```txt
+[extra_context]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+```
+
 ### **GET** `/props`: Get server global properties.
 
 This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 015b3b2c5..18bcad3f0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -139,6 +139,7 @@ struct slot_params {
 
     json input_prefix;
     json input_suffix;
+    json extra_context;
 };
 
 struct server_slot {
@@ -170,6 +171,7 @@ struct server_slot {
 
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
+    std::vector<llama_token> extra_tokens;
 
     std::string generated_text;
     std::vector<llama_token> cache_tokens;
@@ -906,8 +908,26 @@ struct server_context {
         }
 
         // infill
-        slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
-        slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
+        slot.params.input_prefix  = json_value(data, "input_prefix",  default_params.input_prefix);
+        slot.params.input_suffix  = json_value(data, "input_suffix",  default_params.input_suffix);
+        slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
+
+        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
+        for (const auto & chunk : slot.params.extra_context) {
+            // { "text": string, "filename": string }
+            if (!chunk.contains("text") || !chunk["text"].is_string()) {
+                send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+
+            // filename is optional
+            if (chunk.contains("filename") && !chunk["filename"].is_string()) {
+                send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+
+            SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
+        }
 
         // get prompt
         if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
@@ -1934,13 +1954,66 @@ struct server_context {
                                 } break;
                             case SERVER_TASK_CMPL_TYPE_INFILL:
                                 {
+                                    // use FIM repo-level pattern:
+                                    // ref: https://arxiv.org/pdf/2409.12186
+                                    //
+                                    // [FIM_REP]myproject
+                                    // [FIM_SEP]filename0
+                                    // extra chunk 0
+                                    // [FIM_SEP]filename1
+                                    // extra chunk 1
+                                    // ...
+                                    // [FIM_SEP]filename
+                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
+                                    //
                                     auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
                                     auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
 
-                                    // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
+                                    slot.extra_tokens.clear();
+                                    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+                                        static const auto k_fim_repo = tokenize("myproject\n", false, false);
+
+                                        slot.extra_tokens.push_back(llama_token_fim_rep(model));
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+                                    }
+
+                                    for (const auto & chunk : slot.params.extra_context) {
+                                        // { "text": string, "filename": string }
+                                        const std::string text     = chunk.value("text", "");
+                                        const std::string filename = chunk.value("filename", "tmp");
+
+                                        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+                                            const auto k_fim_file = tokenize(filename + "\n", false, false);
+
+                                            slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
+                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+                                        } else {
+                                            // chunk separator in binary form to avoid confusing the AI
+                                            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+                                            static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
+
+                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+                                        }
+
+                                        const auto chunk_tokens = tokenize(text, false, false);
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+                                    }
+
+                                    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+                                        // TODO: current filename
+                                        static const auto k_fim_file = tokenize("filename\n", false, false);
+
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+                                    }
+
+                                    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
                                     const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
 
+                                    // fill the rest of the context with extra chunks
+                                    const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
+
                                     prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
                                     suffix_tokens.resize(n_suffix_take);
 
@@ -1954,6 +2027,11 @@ struct server_context {
                                         embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                                     }
 
+                                    SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
+
+                                    // put the extra context before the FIM prefix
+                                    embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
+
                                     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
                                     embd_inp.push_back(llama_token_fim_mid(model));
 
@@ -2058,11 +2136,15 @@ struct server_context {
 
                                     while (head_c < slot.cache_tokens.size() &&
                                            head_p < prompt_tokens.size()) {
-                                        if (llama_token_is_control(model, slot.cache_tokens[head_c])) {
+                                        if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
+                                            slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
+                                            slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
                                             break;
                                         }
 
-                                        if (llama_token_is_control(model, prompt_tokens[head_p])) {
+                                        if (llama_token_is_control(model, prompt_tokens[head_p]) &&
+                                            prompt_tokens[head_p] != llama_token_fim_rep(model) &&
+                                            prompt_tokens[head_p] != llama_token_fim_sep(model)) {
                                             break;
                                         }
 
@@ -2071,11 +2153,15 @@ struct server_context {
                                         while (head_c + n_match < slot.cache_tokens.size() &&
                                                head_p + n_match < prompt_tokens.size()     &&
                                                slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
-                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match])) {
+                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
+                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
+                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
                                                 break;
                                             }
 
-                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match])) {
+                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
+                                                prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
+                                                prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
                                                 break;
                                             }
 
diff --git a/src/llama.cpp b/src/llama.cpp
index f68024f5b..511f91802 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6596,8 +6596,8 @@ static void llm_load_vocab(
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6610,8 +6610,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_eom_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6627,8 +6627,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_fim_pre_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6644,8 +6644,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_fim_suf_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6661,8 +6661,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_fim_mid_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6677,8 +6677,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_fim_pad_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6694,8 +6694,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_fim_rep_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6708,8 +6708,8 @@ static void llm_load_vocab(
                         ) {
                     vocab.special_fim_sep_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
@@ -6720,6 +6720,19 @@ static void llm_load_vocab(
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606
         vocab.special_eog_ids.clear();
+
+        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+        }
+
+        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+        }
+
+        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+        }
+
         for (const auto & t : vocab.token_to_id) {
             if (false
                     || t.first == "<|eot_id|>"
@@ -6732,13 +6745,20 @@ static void llm_load_vocab(
                ) {
                 vocab.special_eog_ids.insert(t.second);
                 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
                     vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
+            } else {
+                // token is control, but not marked as EOG -> print a warning
+                if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                }
             }
         }
 
+        // sanity checks
         if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eos_id);
             LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);

From 13dca2a54a394757d56fdd652b9f0df08f44ea22 Mon Sep 17 00:00:00 2001
From: agray3 <agray3@users.noreply.github.com>
Date: Mon, 14 Oct 2024 01:49:08 +0100
Subject: [PATCH 047/396] Vectorize load instructions in dmmv f16 CUDA kernel
 (#9816)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Vectorize load instructions in dmmv f16 CUDA kernel

Replaces scalar with vector load instructions, which substantially
improves performance on NVIDIA HBM GPUs, e.g. gives a 1.27X overall
speedup for Meta-Llama-3-8B-Instruct-F16 BS1 inference evaluation on
H100 SXM 80GB HBM3. On GDDR GPUs, there is a slight (1.01X) speedup.

* addressed comment

* Update ggml/src/ggml-cuda/dmmv.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/dmmv.cu | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
index 96a5adef5..00e21b5d7 100644
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
 static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const half * x = (const half *) vx;
-
+    // load 2 halfs into register in a single instruction
+    const half2 x_reg = *((half2 *) &(x[ib + iqs]));
     // automatic half -> float type cast if dfloat == float
-    v.x = x[ib + iqs + 0];
-    v.y = x[ib + iqs + 1];
+    v.x = __low2float(x_reg);
+    v.y = __high2float(x_reg);
 }
 
 static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
@@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
             // matrix multiplication
             // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
 #ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
+            if ( y_offset == 1 ) {
+                // load 2 dfloats into register in a single instruction
+                const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
+                tmp += __hmul2(v, y_reg);
+            }
+            else {
+                tmp += __hmul2(v, {
+                        y[iybs + iqs + j/qr + 0],
+                        y[iybs + iqs + j/qr + y_offset]
+                    });
+            }
 #else
-            tmp += v.x * y[iybs + iqs + j/qr + 0];
-            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+            if ( y_offset == 1 ) {
+                // load 2 dfloats into register in a single instruction
+                const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
+                tmp += v.x * y_reg.x;
+                tmp += v.y * y_reg.y;
+            }
+            else {
+                tmp += v.x * y[iybs + iqs + j/qr + 0];
+                tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+            }
 #endif // GGML_CUDA_F16
         }
     }

From a89f75e1b7b90cb2d4d4c52ca53ef9e9b466aa45 Mon Sep 17 00:00:00 2001
From: VoidIsVoid <343750470@qq.com>
Date: Mon, 14 Oct 2024 15:04:36 +0800
Subject: [PATCH 048/396] server : handle "logprobs" field with false value
 (#9871)

Co-authored-by: Gimling <huangjl@ruyi.ai>
---
 examples/server/utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 37999604d..69519ef95 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse(
 
     // Handle "logprobs" field
     // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
-    if (body.contains("logprobs")) {
+    if (json_value(body, "logprobs", false)) {
         llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
-    } else if (body.contains("top_logprobs")) {
+    } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
         throw std::runtime_error("top_logprobs requires logprobs to be set to true");
     }
 

From 4c42f93b22146c83b763d8cbee5fafc512746649 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Tuszy=C5=84ski?= <srgtuszy@gmail.com>
Date: Tue, 15 Oct 2024 10:20:34 +0200
Subject: [PATCH 049/396] readme : update bindings list (#9889)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index dd4927b04..08fe8cc92 100644
--- a/README.md
+++ b/README.md
@@ -130,6 +130,7 @@ Typically finetunes of the base models below are supported as well.
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
+- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 
 **UI:**
 

From dcdd535302fc9702a4709be25f56540d65163a44 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 15 Oct 2024 12:48:44 +0300
Subject: [PATCH 050/396] server : update preact (#9895)

---
 examples/server/public/index.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/public/index.js b/examples/server/public/index.js
index fe615ca25..32ec6e9e1 100644
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
@@ -1 +1 @@
-const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function f(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function s(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}s.prototype.brand=t;s.prototype.h=function(){return!0};s.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};s.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};s.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};s.prototype.valueOf=function(){return this.value};s.prototype.toString=function(){return this.value+""};s.prototype.toJSON=function(){return this.value};s.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(s.prototype,"value",{get(){const t=f(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new s(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){s.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new s).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}s.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){s.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=f(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){var n=t.parentNode;n&&n.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e)return e.__e;return"function"==typeof t.type?B(t):null}function z(t){var n,e;if(null!=(t=t.__)&&null!=t.__c){for(t.__e=t.__c.base=null,n=0;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e){t.__e=t.__c.base=e.__e;break}return z(t)}}function G(t){(!t.__d&&(t.__d=!0)&&U.push(t)&&!J.__r++||E!==S.debounceRendering)&&((E=S.debounceRendering)||H)(J)}function J(){var t,n,e,_,i,o,r,u;for(U.sort(P);t=U.shift();)t.__d&&(n=U.length,_=void 0,o=(i=(e=t).__v).__e,r=[],u=[],e.__P&&((_=L({},i)).__v=i.__v+1,S.vnode&&S.vnode(_),_t(e.__P,_,i,e.__n,e.__P.namespaceURI,32&i.__u?[o]:null,r,null==o?B(i):o,!!(32&i.__u),u),_.__v=i.__v,_.__.__k[_.__i]=_,it(r,_,u),_.__e!=o&&z(_)),U.length>n&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,f,s){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c<y;c++)null!=(a=e.__k[c])&&"boolean"!=typeof a&&"function"!=typeof a&&(h=-1===a.__i?M:v[a.__i]||M,a.__i=c,_t(t,a,h,i,o,r,u,l,f,s),p=a.__e,a.ref&&h.ref!=a.ref&&(h.ref&&rt(h.ref,null,a),s.push(a.ref,a.__c||p,a)),null==d&&null!=p&&(d=p),65536&a.__u||h.__k===a.__k?l=X(a,l,t):"function"==typeof a.type&&void 0!==a.__d?l=a.__d:p&&(l=p.nextSibling),a.__d=void 0,a.__u&=-196609);e.__d=l,e.__e=d}function Q(t,n,e){var _,i,o,r,u,l=n.length,f=e.length,s=f,c=0;for(t.__k=[],_=0;_<l;_++)r=_+c,null!=(i=t.__k[_]=null==(i=n[_])||"boolean"==typeof i||"function"==typeof i?null:"string"==typeof i||"number"==typeof i||"bigint"==typeof i||i.constructor==String?I(null,i,null,null,null):W(i)?I(j,{children:i},null,null,null):void 0===i.constructor&&i.__b>0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,s),i.__i=u,o=null,-1!==u&&(s--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:u>r?s>l-r?c+=u-r:c--:u<r&&(u==r-c?c-=u-r:c++),u!==_+c&&(i.__u|=65536))):(o=e[r])&&null==o.key&&o.__e&&0==(131072&o.__u)&&(o.__e==t.__d&&(t.__d=B(o)),ut(o,o,!1),e[r]=null,s--);if(s)for(_=0;_<f;_++)null!=(o=e[_])&&0==(131072&o.__u)&&(o.__e==t.__d&&(t.__d=B(o)),ut(o,o))}function X(t,n,e){var _,i;if("function"==typeof t.type){for(_=t.__k,i=0;_&&i<_.length;i++)_[i]&&(_[i].__=t,n=X(_[i],n,e));return n}t.__e!=n&&(n&&t.type&&!e.contains(n)&&(n=B(t)),e.insertBefore(t.__e,n||null),n=t.__e);do{n=n&&n.nextSibling}while(null!=n&&8===n.nodeType);return n}function Y(t,n){return n=n||[],null==t||"boolean"==typeof t||(W(t)?t.some((function(t){Y(t,n)})):n.push(t)),n}function Z(t,n,e,_){var i=t.key,o=t.type,r=e-1,u=e+1,l=n[e];if(null===l||l&&i==l.key&&o===l.type&&0==(131072&l.__u))return e;if(_>(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u<n.length;){if(r>=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u<n.length){if((l=n[u])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return u;u++}}return-1}function tt(t,n,e){"-"===n[0]?t.setProperty(n,null==e?"":e):t[n]=null==e?"":"number"!=typeof e||A.test(n)?e:e+"px"}function nt(t,n,e,_,i){var o;t:if("style"===n)if("string"==typeof e)t.style.cssText=e;else{if("string"==typeof _&&(t.style.cssText=_=""),_)for(n in _)e&&n in e||tt(t.style,n,"");if(e)for(n in e)_&&e[n]===_[n]||tt(t.style,n,e[n])}else if("o"===n[0]&&"n"===n[1])o=n!==(n=n.replace(/(PointerCapture)$|Capture$/i,"$1")),n=n.toLowerCase()in t||"onFocusOut"===n||"onFocusIn"===n?n.toLowerCase().slice(2):n.slice(2),t.l||(t.l={}),t.l[n+o]=e,e?_?e.u=_.u:(e.u=N,t.addEventListener(n,o?T:$,o)):t.removeEventListener(n,o?T:$,o);else{if("http://www.w3.org/2000/svg"==i)n=n.replace(/xlink(H|:h)/,"h").replace(/sName$/,"s");else if("width"!=n&&"height"!=n&&"href"!=n&&"list"!=n&&"form"!=n&&"tabIndex"!=n&&"download"!=n&&"rowSpan"!=n&&"colSpan"!=n&&"role"!=n&&"popover"!=n&&n in t)try{t[n]=null==e?"":e;break t}catch(t){}"function"==typeof e||(null==e||!1===e&&"-"!==n[4]?t.removeAttribute(n):t.setAttribute(n,"popover"==n&&1==e?"":e))}}function et(t){return function(n){if(this.l){var e=this.l[n.type+t];if(null==n.t)n.t=N++;else if(n.t<e.u)return;return e(S.event?S.event(n):n)}}}function _t(t,n,e,_,i,o,r,u,l,f){var s,c,h,a,p,d,v,y,m,g,b,k,w,x,C,U,E=n.type;if(void 0!==n.constructor)return null;128&e.__u&&(l=!!(32&e.__u),o=[u=n.__e=e.__e]),(s=S.__b)&&s(n);t:if("function"==typeof E)try{if(y=n.props,m="prototype"in E&&E.prototype.render,g=(s=E.contextType)&&_[s.__c],b=s?g?g.props.value:s.__:_,e.__c?v=(c=n.__c=e.__c).__=c.__E:(m?n.__c=c=new E(y,b):(n.__c=c=new q(y,b),c.constructor=E,c.render=lt),g&&g.sub(c),c.props=y,c.state||(c.state={}),c.context=b,c.__n=_,h=c.__d=!0,c.__h=[],c._sb=[]),m&&null==c.__s&&(c.__s=c.state),m&&null!=E.getDerivedStateFromProps&&(c.__s==c.state&&(c.__s=L({},c.__s)),L(c.__s,E.getDerivedStateFromProps(y,c.__s))),a=c.props,p=c.state,c.__v=n,h)m&&null==E.getDerivedStateFromProps&&null!=c.componentWillMount&&c.componentWillMount(),m&&null!=c.componentDidMount&&c.__h.push(c.componentDidMount);else{if(m&&null==E.getDerivedStateFromProps&&y!==a&&null!=c.componentWillReceiveProps&&c.componentWillReceiveProps(y,b),!c.__e&&(null!=c.shouldComponentUpdate&&!1===c.shouldComponentUpdate(y,c.__s,b)||n.__v===e.__v)){for(n.__v!==e.__v&&(c.props=y,c.state=c.__s,c.__d=!1),n.__e=e.__e,n.__k=e.__k,n.__k.forEach((function(t){t&&(t.__=n)})),k=0;k<c._sb.length;k++)c.__h.push(c._sb[k]);c._sb=[],c.__h.length&&r.push(c);break t}null!=c.componentWillUpdate&&c.componentWillUpdate(y,c.__s,b),m&&null!=c.componentDidUpdate&&c.__h.push((function(){c.componentDidUpdate(a,p,d)}))}if(c.context=b,c.props=y,c.__P=t,c.__e=!1,w=S.__r,x=0,m){for(c.state=c.__s,c.__d=!1,w&&w(n),s=c.render(c.props,c.state,c.context),C=0;C<c._sb.length;C++)c.__h.push(c._sb[C]);c._sb=[]}else do{c.__d=!1,w&&w(n),s=c.render(c.props,c.state,c.context),c.state=c.__s}while(c.__d&&++x<25);c.state=c.__s,null!=c.getChildContext&&(_=L(L({},_),c.getChildContext())),m&&!h&&null!=c.getSnapshotBeforeUpdate&&(d=c.getSnapshotBeforeUpdate(a,p)),K(t,W(U=null!=s&&s.type===j&&null==s.key?s.props.children:s)?U:[U],n,e,_,i,o,r,u,l,f),c.base=n.__e,n.__u&=-161,c.__h.length&&r.push(c),v&&(c.__E=c.__=null)}catch(t){if(n.__v=null,l||null!=o){for(n.__u|=l?160:32;u&&8===u.nodeType&&u.nextSibling;)u=u.nextSibling;o[o.indexOf(u)]=null,n.__e=u}else n.__e=e.__e,n.__k=e.__k;S.__e(t,n,e)}else null==o&&n.__v===e.__v?(n.__k=e.__k,n.__e=e.__e):n.__e=ot(e.__e,n,e,_,i,o,r,l,f);(s=S.diffed)&&s(n)}function it(t,n,e){n.__d=void 0;for(var _=0;_<e.length;_++)rt(e[_],e[++_],e[++_]);S.__c&&S.__c(n,t),t.some((function(n){try{t=n.__h,n.__h=[],t.some((function(t){t.call(n)}))}catch(t){S.__e(t,n.__v)}}))}function ot(t,n,e,_,i,o,r,u,l){var f,s,c,h,a,p,d,v=e.props,y=n.props,m=n.type;if("svg"===m?i="http://www.w3.org/2000/svg":"math"===m?i="http://www.w3.org/1998/Math/MathML":i||(i="http://www.w3.org/1999/xhtml"),null!=o)for(f=0;f<o.length;f++)if((a=o[f])&&"setAttribute"in a==!!m&&(m?a.localName===m:3===a.nodeType)){t=a,o[f]=null;break}if(null==t){if(null===m)return document.createTextNode(y);t=document.createElementNS(i,m,y.is&&y),o=null,u=!1}if(null===m)v===y||u&&t.data===y||(t.data=y);else{if(o=o&&w.call(t.childNodes),v=e.props||M,!u&&null!=o)for(v={},f=0;f<t.attributes.length;f++)v[(a=t.attributes[f]).name]=a.value;for(f in v)if(a=v[f],"children"==f);else if("dangerouslySetInnerHTML"==f)c=a;else if("key"!==f&&!(f in y)){if("value"==f&&"defaultValue"in y||"checked"==f&&"defaultChecked"in y)continue;nt(t,f,null,a,i)}for(f in y)a=y[f],"children"==f?h=a:"dangerouslySetInnerHTML"==f?s=a:"value"==f?p=a:"checked"==f?d=a:"key"===f||u&&"function"!=typeof a||v[f]===a||nt(t,f,a,v[f],i);if(s)u||c&&(s.__html===c.__html||s.__html===t.innerHTML)||(t.innerHTML=s.__html),n.__k=[];else if(c&&(t.innerHTML=""),K(t,W(h)?h:[h],n,e,_,"foreignObject"===m?"http://www.w3.org/1999/xhtml":i,o,r,o?o[0]:e.__k&&B(e,0),u,l),null!=o)for(f=o.length;f--;)null!=o[f]&&O(o[f]);u||(f="value",void 0!==p&&(p!==t[f]||"progress"===m&&!p||"option"===m&&p!==v[f])&&nt(t,f,p,v[f],i),f="checked",void 0!==d&&d!==t[f]&&nt(t,f,d,v[f],i))}return t}function rt(t,n,e){try{if("function"==typeof t){var _="function"==typeof t.__u;_&&t.__u(),_&&null==n||(t.__u=t(n))}else t.current=n}catch(t){S.__e(t,e)}}function ut(t,n,e){var _,i;if(S.unmount&&S.unmount(t),(_=t.ref)&&(_.current&&_.current!==t.__e||rt(_,null,n)),null!=(_=t.__c)){if(_.componentWillUnmount)try{_.componentWillUnmount()}catch(t){S.__e(t,n)}_.base=_.__P=null}if(_=t.__k)for(i=0;i<_.length;i++)_[i]&&ut(_[i],n,e||"function"!=typeof t.type);e||null==t.__e||O(t.__e),t.__c=t.__=t.__e=t.__d=void 0}function lt(t,n,e){return this.constructor(t,e)}function ft(t,n,e){var _,i,o,r;S.__&&S.__(t,n),i=(_="function"==typeof e)?null:e&&e.__k||n.__k,o=[],r=[],_t(n,t=(!_&&e||n).__k=R(j,null,[t]),i||M,M,n.namespaceURI,!_&&e?[e]:i?null:n.firstChild?w.call(n.childNodes):null,o,!_&&e?e:i?i.__e:n.firstChild,_,r),it(o,t,r)}function st(t,n){ft(t,n,st)}function ct(t,n,e){var _,i,o,r,u=L({},t.props);for(o in t.type&&t.type.defaultProps&&(r=t.type.defaultProps),n)"key"==o?_=n[o]:"ref"==o?i=n[o]:u[o]=void 0===n[o]&&void 0!==r?r[o]:n[o];return arguments.length>2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&&gt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function Ft(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){gt.useDebugValue&&gt.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&&gt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(s.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof s){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o<n.length;o++){var r=n[o++],u=n[o]?(n[0]|=r?1:2,e[n[o++]]):n[++o];3===r?_[0]=u:4===r?_[1]=Object.assign(_[1]||{},u):5===r?(_[1]=_[1]||{})[n[++o]]=u:6===r?_[1][n[++o]]+=u+"":r?(i=t.apply(u,nn(t,u,e,["",null])),_.push(i),u[0]?n[0]|=2:(n[o-2]=0,n[o]=i)):_.push(u)}return _},en=new Map;function _n(t){var n=en.get(this);return n||(n=new Map,en.set(this,n)),(n=nn(this,n.get(t)||(n.set(t,n=function(t){for(var n,e,_=1,i="",o="",r=[0],u=function(t){1===_&&(t||(i=i.replace(/^\s*\n\s*|\s*\n\s*$/g,"")))?r.push(0,t,i):3===_&&(t||i)?(r.push(3,t,i),_=2):2===_&&"..."===i&&t?r.push(4,t,0):2===_&&i&&!t?r.push(5,0,!0,i):_>=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l<t.length;l++){l&&(1===_&&u(),u(l));for(var f=0;f<t[l].length;f++)n=t[l][f],1===_?"<"===n?(u(),r=[r],_=3):i+=n:4===_?"--"===i&&">"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][f+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,s as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,st as hydrate,C as isValidElement,S as options,ft as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,Ft as useContext,At as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState};
+const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e)return e.__e;return"function"==typeof t.type?B(t):null}function z(t){var n,e;if(null!=(t=t.__)&&null!=t.__c){for(t.__e=t.__c.base=null,n=0;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e){t.__e=t.__c.base=e.__e;break}return z(t)}}function G(t){(!t.__d&&(t.__d=!0)&&U.push(t)&&!J.__r++||E!==S.debounceRendering)&&((E=S.debounceRendering)||H)(J)}function J(){var t,n,e,_,i,o,r,u;for(U.sort(P);t=U.shift();)t.__d&&(n=U.length,_=void 0,o=(i=(e=t).__v).__e,r=[],u=[],e.__P&&((_=L({},i)).__v=i.__v+1,S.vnode&&S.vnode(_),_t(e.__P,_,i,e.__n,e.__P.namespaceURI,32&i.__u?[o]:null,r,null==o?B(i):o,!!(32&i.__u),u),_.__v=i.__v,_.__.__k[_.__i]=_,it(r,_,u),_.__e!=o&&z(_)),U.length>n&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c<y;c++)null!=(a=e.__k[c])&&(h=-1===a.__i?M:v[a.__i]||M,a.__i=c,_t(t,a,h,i,o,r,u,l,s,f),p=a.__e,a.ref&&h.ref!=a.ref&&(h.ref&&rt(h.ref,null,a),f.push(a.ref,a.__c||p,a)),null==d&&null!=p&&(d=p),65536&a.__u||h.__k===a.__k?l=X(a,l,t):"function"==typeof a.type&&void 0!==a.__d?l=a.__d:p&&(l=p.nextSibling),a.__d=void 0,a.__u&=-196609);e.__d=l,e.__e=d}function Q(t,n,e){var _,i,o,r,u,l=n.length,s=e.length,f=s,c=0;for(t.__k=[],_=0;_<l;_++)null!=(i=n[_])&&"boolean"!=typeof i&&"function"!=typeof i?(r=_+c,(i=t.__k[_]="string"==typeof i||"number"==typeof i||"bigint"==typeof i||i.constructor==String?I(null,i,null,null,null):W(i)?I(j,{children:i},null,null,null):void 0===i.constructor&&i.__b>0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_<s;_++)null!=(o=e[_])&&0==(131072&o.__u)&&(o.__e==t.__d&&(t.__d=B(o)),ut(o,o))}function X(t,n,e){var _,i;if("function"==typeof t.type){for(_=t.__k,i=0;_&&i<_.length;i++)_[i]&&(_[i].__=t,n=X(_[i],n,e));return n}t.__e!=n&&(n&&t.type&&!e.contains(n)&&(n=B(t)),e.insertBefore(t.__e,n||null),n=t.__e);do{n=n&&n.nextSibling}while(null!=n&&8===n.nodeType);return n}function Y(t,n){return n=n||[],null==t||"boolean"==typeof t||(W(t)?t.some((function(t){Y(t,n)})):n.push(t)),n}function Z(t,n,e,_){var i=t.key,o=t.type,r=e-1,u=e+1,l=n[e];if(null===l||l&&i==l.key&&o===l.type&&0==(131072&l.__u))return e;if(_>(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u<n.length;){if(r>=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u<n.length){if((l=n[u])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return u;u++}}return-1}function tt(t,n,e){"-"===n[0]?t.setProperty(n,null==e?"":e):t[n]=null==e?"":"number"!=typeof e||F.test(n)?e:e+"px"}function nt(t,n,e,_,i){var o;t:if("style"===n)if("string"==typeof e)t.style.cssText=e;else{if("string"==typeof _&&(t.style.cssText=_=""),_)for(n in _)e&&n in e||tt(t.style,n,"");if(e)for(n in e)_&&e[n]===_[n]||tt(t.style,n,e[n])}else if("o"===n[0]&&"n"===n[1])o=n!==(n=n.replace(/(PointerCapture)$|Capture$/i,"$1")),n=n.toLowerCase()in t||"onFocusOut"===n||"onFocusIn"===n?n.toLowerCase().slice(2):n.slice(2),t.l||(t.l={}),t.l[n+o]=e,e?_?e.u=_.u:(e.u=N,t.addEventListener(n,o?T:$,o)):t.removeEventListener(n,o?T:$,o);else{if("http://www.w3.org/2000/svg"==i)n=n.replace(/xlink(H|:h)/,"h").replace(/sName$/,"s");else if("width"!=n&&"height"!=n&&"href"!=n&&"list"!=n&&"form"!=n&&"tabIndex"!=n&&"download"!=n&&"rowSpan"!=n&&"colSpan"!=n&&"role"!=n&&"popover"!=n&&n in t)try{t[n]=null==e?"":e;break t}catch(t){}"function"==typeof e||(null==e||!1===e&&"-"!==n[4]?t.removeAttribute(n):t.setAttribute(n,"popover"==n&&1==e?"":e))}}function et(t){return function(n){if(this.l){var e=this.l[n.type+t];if(null==n.t)n.t=N++;else if(n.t<e.u)return;return e(S.event?S.event(n):n)}}}function _t(t,n,e,_,i,o,r,u,l,s){var f,c,h,a,p,d,v,y,m,g,b,k,w,x,C,U,E=n.type;if(void 0!==n.constructor)return null;128&e.__u&&(l=!!(32&e.__u),o=[u=n.__e=e.__e]),(f=S.__b)&&f(n);t:if("function"==typeof E)try{if(y=n.props,m="prototype"in E&&E.prototype.render,g=(f=E.contextType)&&_[f.__c],b=f?g?g.props.value:f.__:_,e.__c?v=(c=n.__c=e.__c).__=c.__E:(m?n.__c=c=new E(y,b):(n.__c=c=new q(y,b),c.constructor=E,c.render=lt),g&&g.sub(c),c.props=y,c.state||(c.state={}),c.context=b,c.__n=_,h=c.__d=!0,c.__h=[],c._sb=[]),m&&null==c.__s&&(c.__s=c.state),m&&null!=E.getDerivedStateFromProps&&(c.__s==c.state&&(c.__s=L({},c.__s)),L(c.__s,E.getDerivedStateFromProps(y,c.__s))),a=c.props,p=c.state,c.__v=n,h)m&&null==E.getDerivedStateFromProps&&null!=c.componentWillMount&&c.componentWillMount(),m&&null!=c.componentDidMount&&c.__h.push(c.componentDidMount);else{if(m&&null==E.getDerivedStateFromProps&&y!==a&&null!=c.componentWillReceiveProps&&c.componentWillReceiveProps(y,b),!c.__e&&(null!=c.shouldComponentUpdate&&!1===c.shouldComponentUpdate(y,c.__s,b)||n.__v===e.__v)){for(n.__v!==e.__v&&(c.props=y,c.state=c.__s,c.__d=!1),n.__e=e.__e,n.__k=e.__k,n.__k.some((function(t){t&&(t.__=n)})),k=0;k<c._sb.length;k++)c.__h.push(c._sb[k]);c._sb=[],c.__h.length&&r.push(c);break t}null!=c.componentWillUpdate&&c.componentWillUpdate(y,c.__s,b),m&&null!=c.componentDidUpdate&&c.__h.push((function(){c.componentDidUpdate(a,p,d)}))}if(c.context=b,c.props=y,c.__P=t,c.__e=!1,w=S.__r,x=0,m){for(c.state=c.__s,c.__d=!1,w&&w(n),f=c.render(c.props,c.state,c.context),C=0;C<c._sb.length;C++)c.__h.push(c._sb[C]);c._sb=[]}else do{c.__d=!1,w&&w(n),f=c.render(c.props,c.state,c.context),c.state=c.__s}while(c.__d&&++x<25);c.state=c.__s,null!=c.getChildContext&&(_=L(L({},_),c.getChildContext())),m&&!h&&null!=c.getSnapshotBeforeUpdate&&(d=c.getSnapshotBeforeUpdate(a,p)),K(t,W(U=null!=f&&f.type===j&&null==f.key?f.props.children:f)?U:[U],n,e,_,i,o,r,u,l,s),c.base=n.__e,n.__u&=-161,c.__h.length&&r.push(c),v&&(c.__E=c.__=null)}catch(t){if(n.__v=null,l||null!=o){for(n.__u|=l?160:128;u&&8===u.nodeType&&u.nextSibling;)u=u.nextSibling;o[o.indexOf(u)]=null,n.__e=u}else n.__e=e.__e,n.__k=e.__k;S.__e(t,n,e)}else null==o&&n.__v===e.__v?(n.__k=e.__k,n.__e=e.__e):n.__e=ot(e.__e,n,e,_,i,o,r,l,s);(f=S.diffed)&&f(n)}function it(t,n,e){n.__d=void 0;for(var _=0;_<e.length;_++)rt(e[_],e[++_],e[++_]);S.__c&&S.__c(n,t),t.some((function(n){try{t=n.__h,n.__h=[],t.some((function(t){t.call(n)}))}catch(t){S.__e(t,n.__v)}}))}function ot(t,n,e,_,i,o,r,u,l){var s,f,c,h,a,p,d,v=e.props,y=n.props,m=n.type;if("svg"===m?i="http://www.w3.org/2000/svg":"math"===m?i="http://www.w3.org/1998/Math/MathML":i||(i="http://www.w3.org/1999/xhtml"),null!=o)for(s=0;s<o.length;s++)if((a=o[s])&&"setAttribute"in a==!!m&&(m?a.localName===m:3===a.nodeType)){t=a,o[s]=null;break}if(null==t){if(null===m)return document.createTextNode(y);t=document.createElementNS(i,m,y.is&&y),u&&(S.__m&&S.__m(n,o),u=!1),o=null}if(null===m)v===y||u&&t.data===y||(t.data=y);else{if(o=o&&w.call(t.childNodes),v=e.props||M,!u&&null!=o)for(v={},s=0;s<t.attributes.length;s++)v[(a=t.attributes[s]).name]=a.value;for(s in v)if(a=v[s],"children"==s);else if("dangerouslySetInnerHTML"==s)c=a;else if(!(s in y)){if("value"==s&&"defaultValue"in y||"checked"==s&&"defaultChecked"in y)continue;nt(t,s,null,a,i)}for(s in y)a=y[s],"children"==s?h=a:"dangerouslySetInnerHTML"==s?f=a:"value"==s?p=a:"checked"==s?d=a:u&&"function"!=typeof a||v[s]===a||nt(t,s,a,v[s],i);if(f)u||c&&(f.__html===c.__html||f.__html===t.innerHTML)||(t.innerHTML=f.__html),n.__k=[];else if(c&&(t.innerHTML=""),K(t,W(h)?h:[h],n,e,_,"foreignObject"===m?"http://www.w3.org/1999/xhtml":i,o,r,o?o[0]:e.__k&&B(e,0),u,l),null!=o)for(s=o.length;s--;)O(o[s]);u||(s="value","progress"===m&&null==p?t.removeAttribute("value"):void 0!==p&&(p!==t[s]||"progress"===m&&!p||"option"===m&&p!==v[s])&&nt(t,s,p,v[s],i),s="checked",void 0!==d&&d!==t[s]&&nt(t,s,d,v[s],i))}return t}function rt(t,n,e){try{if("function"==typeof t){var _="function"==typeof t.__u;_&&t.__u(),_&&null==n||(t.__u=t(n))}else t.current=n}catch(t){S.__e(t,e)}}function ut(t,n,e){var _,i;if(S.unmount&&S.unmount(t),(_=t.ref)&&(_.current&&_.current!==t.__e||rt(_,null,n)),null!=(_=t.__c)){if(_.componentWillUnmount)try{_.componentWillUnmount()}catch(t){S.__e(t,n)}_.base=_.__P=null}if(_=t.__k)for(i=0;i<_.length;i++)_[i]&&ut(_[i],n,e||"function"!=typeof t.type);e||O(t.__e),t.__c=t.__=t.__e=t.__d=void 0}function lt(t,n,e){return this.constructor(t,e)}function st(t,n,e){var _,i,o,r;S.__&&S.__(t,n),i=(_="function"==typeof e)?null:e&&e.__k||n.__k,o=[],r=[],_t(n,t=(!_&&e||n).__k=R(j,null,[t]),i||M,M,n.namespaceURI,!_&&e?[e]:i?null:n.firstChild?w.call(n.childNodes):null,o,!_&&e?e:i?i.__e:n.firstChild,_,r),it(o,t,r)}function ft(t,n){st(t,n,ft)}function ct(t,n,e){var _,i,o,r,u=L({},t.props);for(o in t.type&&t.type.defaultProps&&(r=t.type.defaultProps),n)"key"==o?_=n[o]:"ref"==o?i=n[o]:u[o]=void 0===n[o]&&void 0!==r?r[o]:n[o];return arguments.length>2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&&gt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&&gt.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&&gt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o<n.length;o++){var r=n[o++],u=n[o]?(n[0]|=r?1:2,e[n[o++]]):n[++o];3===r?_[0]=u:4===r?_[1]=Object.assign(_[1]||{},u):5===r?(_[1]=_[1]||{})[n[++o]]=u:6===r?_[1][n[++o]]+=u+"":r?(i=t.apply(u,nn(t,u,e,["",null])),_.push(i),u[0]?n[0]|=2:(n[o-2]=0,n[o]=i)):_.push(u)}return _},en=new Map;function _n(t){var n=en.get(this);return n||(n=new Map,en.set(this,n)),(n=nn(this,n.get(t)||(n.set(t,n=function(t){for(var n,e,_=1,i="",o="",r=[0],u=function(t){1===_&&(t||(i=i.replace(/^\s*\n\s*|\s*\n\s*$/g,"")))?r.push(0,t,i):3===_&&(t||i)?(r.push(3,t,i),_=2):2===_&&"..."===i&&t?r.push(4,t,0):2===_&&i&&!t?r.push(5,0,!0,i):_>=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l<t.length;l++){l&&(1===_&&u(),u(l));for(var s=0;s<t[l].length;s++)n=t[l][s],1===_?"<"===n?(u(),r=[r],_=3):i+=n:4===_?"--"===i&&">"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState};

From fbc98b748e7b075e327bcf13237057f647678049 Mon Sep 17 00:00:00 2001
From: MaggotHATE <clay1326@gmail.com>
Date: Tue, 15 Oct 2024 15:54:55 +0500
Subject: [PATCH 051/396] sampling : add XTC sampler (#9742)

* Initial XTC commit

Adds XTC sampler, not activated by default, but recommended settings by default.

* Cleanup

* Simplified chances calculation

To be more inline with the original implementation, chance is calculated once at the beginning.

* First fixes by comments

Still need to look into sorting

* Fixed trailing backspaces

* Fixed RNG to be reproduceable

Thanks to @slaren for directions

* Fixed forgotten header

* Moved `min_keep`

Moved from conditions to a simple check at the end.

* Fixed broken randomization

Thanks to @slaren for explanation

* Swapped sorting for a custom algorithm

Shifts tokens to remove the penalized ones, then puts the penalized at the back. Should make `min_keep` still viable.

* Algorithm rework

1. Scan token from top till the first non-penalizable
2. Remove the last captured token (the least probable above threshold)
3. Shift all tokens to override the remaining penalizable
4. Penalize and put them at the the bottom.

* Added XTC to `test-sampling`

* Simplified algorithm and more tests

* Updated info in common and args

* Merged back lost commits in common and arg

* Update dump info in common

* Fixed incorrect min_keep check

* Added XTC to README

* Renamed parameters, fixed info and defaults

* probability is at 0 by default, but XTC is included in sampling queue
* threshold higher than 0.5 switches XTC off

* Initial server support

* Added XTC to server UIs

* Fixed labels in old server UI

* Made algorithm safer and more readable

* Removed xtc_threshold_max

* Fixed arg after update

* Quick fixes by comments

* Simplified algorithm since threshold_max is removed

* Renamed random distribution

* Fixed tests and outdated README

* Small fixes
---
 common/arg.cpp                        | 14 ++++
 common/common.cpp                     |  2 +
 common/common.h                       |  6 ++
 common/sampling.cpp                   | 13 +++-
 examples/main/README.md               | 13 ++++
 examples/server/public/index-new.html |  6 ++
 examples/server/public/index.html     |  4 ++
 examples/server/server.cpp            |  4 ++
 include/llama.h                       |  3 +
 src/llama-sampling.cpp                | 95 +++++++++++++++++++++++++++
 tests/test-sampling.cpp               | 45 +++++++++++--
 11 files changed, 195 insertions(+), 10 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8969fc107..d6a8e1f6f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -947,6 +947,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sparams.tfs_z = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--xtc-probability"}, "N",
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+        [](common_params & params, const std::string & value) {
+            params.sparams.xtc_probability = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--xtc-threshold"}, "N",
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+        [](common_params & params, const std::string & value) {
+            params.sparams.xtc_threshold = std::stof(value);
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
         string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
diff --git a/common/common.cpp b/common/common.cpp
index 451307b55..c08f01b42 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2104,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
     fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
     fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
diff --git a/common/common.h b/common/common.h
index 5507b1c59..df2ee6bd4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -90,6 +90,8 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TFS_Z       = 4,
     COMMON_SAMPLER_TYPE_TYPICAL_P   = 5,
     COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
+    COMMON_SAMPLER_TYPE_XTC         = 7,
+
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -108,6 +110,8 @@ struct common_sampler_params {
     int32_t top_k             = 40;    // <= 0 to use vocab size
     float   top_p             = 0.95f; // 1.0 = disabled
     float   min_p             = 0.05f; // 0.0 = disabled
+    float   xtc_probability   = 0.00f; // 0.0 = disabled
+    float   xtc_threshold     = 0.10f; // > 0.5 disables XTC
     float   tfs_z             = 1.00f; // 1.0 = disabled
     float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
     float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@@ -124,12 +128,14 @@ struct common_sampler_params {
     bool    ignore_eos        = false;
     bool    no_perf           = false; // disable performance metrics
 
+
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TFS_Z,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
         COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
         COMMON_SAMPLER_TYPE_TEMPERATURE
     };
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index cd49ade69..fb95bcd3b 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            top_k, tfs_z, top_p, min_p, typ_p, temp,
+            top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     case COMMON_SAMPLER_TYPE_MIN_P:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                         break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
                     case COMMON_SAMPLER_TYPE_TFS_Z:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                         break;
@@ -372,6 +375,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         default : return '?';
     }
 }
@@ -384,6 +388,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         default : return "";
     }
 }
@@ -396,6 +401,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "tfs_z",       COMMON_SAMPLER_TYPE_TFS_Z },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
     };
 
     // since samplers names are written multiple ways
@@ -441,7 +447,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC }
     };
 
     std::vector<common_sampler_type> samplers;
diff --git a/examples/main/README.md b/examples/main/README.md
index f0c3031ab..620934dad 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
 
 Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
 
+### XTC Sampling
+
+-   `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
+-   `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
+
+Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
+
+By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
+
+Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
+
+Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
+
 ### Logit Bias
 
 -   `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html
index c87dd8f1e..ad4183cd9 100644
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -43,6 +43,8 @@
       top_k: 0, // <= 0 to use vocab size
       top_p: 1.0, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
+      xtc_probability: 0.0, // 0 = disabled;
+      xtc_threshold: 0.1, // > 0.5 disables XTC;
       tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
@@ -836,6 +838,8 @@ return html`
           ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
           ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
           ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+          ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
+          ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
           ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
         </fieldset>
 
@@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => {
   const snapSettings = {
     temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
     min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
+    xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
     top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
     tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
     typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 07fec6a38..88065705f 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -307,6 +307,8 @@
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
+      xtc_probability: 0.0, // 0 = disabled;
+      xtc_threshold: 0.1, // > 0.5 disables XTC;
       tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
@@ -1013,6 +1015,8 @@
               ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
               ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
               ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+              ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
+              ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
             </fieldset>
             <hr />
             <fieldset class="three">
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 18bcad3f0..8d4380e12 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -863,6 +863,8 @@ struct server_context {
         slot.sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
         slot.sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
         slot.sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
+        slot.sparams.xtc_probability   = json_value(data, "xtc_probability",   default_sparams.xtc_probability);
+        slot.sparams.xtc_threshold     = json_value(data, "xtc_threshold",     default_sparams.xtc_threshold);
         slot.sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
         slot.sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
         slot.sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
@@ -1196,6 +1198,8 @@ struct server_context {
             {"top_k",                     slot.sparams.top_k},
             {"top_p",                     slot.sparams.top_p},
             {"min_p",                     slot.sparams.min_p},
+            {"xtc_probability",           slot.sparams.xtc_probability},
+            {"xtc_threshold",             slot.sparams.xtc_threshold},
             {"tfs_z",                     slot.sparams.tfs_z},
             {"typical_p",                 slot.sparams.typ_p},
             {"repeat_last_n",             slot.sparams.penalty_last_n},
diff --git a/include/llama.h b/include/llama.h
index 9110b5956..92d4c70c1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1101,6 +1101,9 @@ extern "C" {
     /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
     LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);
 
+    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+    LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index e255a8fc4..67a78c3ac 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
     };
 }
 
+// xtc
+
+struct llama_sampler_xtc {
+    const float    probability;
+    const float    threshold;
+    const size_t   min_keep;
+
+    const uint32_t seed;
+    uint32_t       seed_cur;
+
+    std::mt19937   rng;
+};
+
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
+    return "xtc";
+}
+
+static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+
+    if (ctx->probability <= 0.0f
+        || ctx->threshold > 0.5f
+        || cur_p->size < 2) {
+        return;
+    }
+
+    std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+    float chance = distribution(ctx->rng);
+    if (chance > ctx->probability) return;
+
+    // in case it's not sorted/recalculated yet
+    llama_sampler_softmax_impl(cur_p);
+
+    int pos_last = 0;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].p >= ctx->threshold) {
+            pos_last = i;
+        } else break;
+    }
+
+    if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
+        cur_p->data += pos_last;
+        cur_p->size -= pos_last;
+    }
+}
+
+static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
+    auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_xtc *) result->ctx;
+
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_xtc *) smpl->ctx;
+}
+
+static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler_i llama_sampler_xtc_i = {
+    /* .name   = */ llama_sampler_xtc_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sample_xtc_apply,
+    /* .reset  = */ llama_sampler_xtc_reset,
+    /* .clone  = */ llama_sampler_xtc_clone,
+    /* .free   = */ llama_sampler_xtc_free,
+};
+
+struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
+    return new llama_sampler {
+        /* .iface = */ &llama_sampler_xtc_i,
+        /* .ctx   = */ new llama_sampler_xtc {
+            /* .probability   = */ p,
+            /* .threshold     = */ t,
+            /* .min_keep      = */ min_keep,
+            /* .seed          = */ seed,
+            /* .seed_cur      = */ seed_cur,
+            /* .rng           = */ std::mt19937(seed_cur),
+        },
+    };
+}
+
 // mirostat
 
 struct llama_sampler_mirostat {
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6e021c4c7..1372bdf13 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -111,6 +111,28 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
     }
 }
 
+static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) {
+    const size_t n_vocab = probs.size();
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        const float logit = logf(probs[token_id]);
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
+    DUMP(&cur_p);
+
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
+    }
+}
+
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
 
@@ -263,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
     }
     const int64_t t_end = ggml_time_us();
     llama_sampler_free(cnstr);
-    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+    printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
 }
 
 #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
@@ -279,12 +301,13 @@ static void test_perf() {
         data.emplace_back(llama_token_data{i, logit, 0.0f});
     }
 
-    BENCH(llama_sampler_init_top_k    (40),      data, 32);
-    BENCH(llama_sampler_init_top_p    (0.8f, 1), data, 32);
-    BENCH(llama_sampler_init_min_p    (0.2f, 1), data, 32);
-    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_typical  (0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_softmax  (),        data, 32);
+    BENCH(llama_sampler_init_top_k    (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p    (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p    (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_tail_free(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical  (0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc      (1.0f, 0.1f, 1, 1),       data, 32);
+    BENCH(llama_sampler_init_softmax  (),                       data, 32);
 }
 
 int main(void) {
@@ -309,6 +332,14 @@ int main(void) {
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
 
+    printf("XTC should:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.1f},                                0.99f, 0.09f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.2f, 0.1f},                          0.99f, 0.19f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.3f, 0.2f, 0.1f},                    0.99f, 0.29f);
+
+    printf("XTC should not:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
+
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);

From 223c25a72fcc3f65cdfd7f5d57edd5b44b550e18 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 15 Oct 2024 16:28:55 +0300
Subject: [PATCH 052/396] server : improve infill context reuse (#9894)

ggml-ci
---
 examples/server/README.md  | 10 +++---
 examples/server/server.cpp | 73 ++++++++++++++------------------------
 2 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index eb0a7b32e..fcdb02afd 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -524,10 +524,12 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
 
 - `input_prefix`: Set the prefix of the code to infill.
 - `input_suffix`: Set the suffix of the code to infill.
-- `prompt`: Added after the `FIM_MID` token
-- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874
+- `input_extra`:  Additional context inserted before the FIM prefix.
+- `prompt`:       Added after the `FIM_MID` token
 
-It also accepts all the options of `/completion`.
+`input_extra` is array of `{"filename": string, "text": string}` objects.
+
+The endpoint also accepts all the options of `/completion`.
 
 If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
 
@@ -545,7 +547,7 @@ If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](
 If the tokens are missing, then the extra context is simply prefixed at the start:
 
 ```txt
-[extra_context]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+[input_extra]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
 ```
 
 ### **GET** `/props`: Get server global properties.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8d4380e12..d53cca84c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -136,10 +136,6 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
     std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-    json extra_context;
 };
 
 struct server_slot {
@@ -169,6 +165,10 @@ struct server_slot {
 
     json prompt; // can be either a string, array of strings or array of token ids
 
+    json input_prefix;
+    json input_suffix;
+    json input_extra;
+
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
     std::vector<llama_token> extra_tokens;
@@ -910,12 +910,12 @@ struct server_context {
         }
 
         // infill
-        slot.params.input_prefix  = json_value(data, "input_prefix",  default_params.input_prefix);
-        slot.params.input_suffix  = json_value(data, "input_suffix",  default_params.input_suffix);
-        slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
+        slot.input_prefix = json_value(data, "input_prefix", json());
+        slot.input_suffix = json_value(data, "input_suffix", json());
+        slot.input_extra  = json_value(data, "input_extra",  json());
 
-        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
-        for (const auto & chunk : slot.params.extra_context) {
+        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
+        for (const auto & chunk : slot.input_extra) {
             // { "text": string, "filename": string }
             if (!chunk.contains("text") || !chunk["text"].is_string()) {
                 send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
@@ -932,7 +932,7 @@ struct server_context {
         }
 
         // get prompt
-        if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
+        {
             const auto & prompt = data.find("prompt");
             if (prompt == data.end()) {
                 send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@@ -1958,6 +1958,8 @@ struct server_context {
                                 } break;
                             case SERVER_TASK_CMPL_TYPE_INFILL:
                                 {
+                                    // TODO: optimize this block by reducing memory allocations and movement
+
                                     // use FIM repo-level pattern:
                                     // ref: https://arxiv.org/pdf/2409.12186
                                     //
@@ -1968,10 +1970,11 @@ struct server_context {
                                     // extra chunk 1
                                     // ...
                                     // [FIM_SEP]filename
-                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
+                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
                                     //
-                                    auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
-                                    auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
+                                    auto tokens_prefix = tokenize(slot.input_prefix, false, false);
+                                    auto tokens_suffix = tokenize(slot.input_suffix, false, false);
+                                    auto tokens_prompt = tokenize(slot.prompt,       false, false);
 
                                     slot.extra_tokens.clear();
                                     if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
@@ -1981,7 +1984,7 @@ struct server_context {
                                         slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
                                     }
 
-                                    for (const auto & chunk : slot.params.extra_context) {
+                                    for (const auto & chunk : slot.input_extra) {
                                         // { "text": string, "filename": string }
                                         const std::string text     = chunk.value("text", "");
                                         const std::string filename = chunk.value("filename", "tmp");
@@ -2012,20 +2015,21 @@ struct server_context {
                                     }
 
                                     // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
-                                    const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
+                                    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
+                                    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
 
                                     // fill the rest of the context with extra chunks
                                     const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
 
-                                    prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
-                                    suffix_tokens.resize(n_suffix_take);
+                                    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+                                    tokens_suffix.resize(n_suffix_take);
 
-                                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
-                                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
+                                    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+                                    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+                                    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
 
-                                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                                    auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
+                                    auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
 
                                     if (llama_add_bos_token(model)) {
                                         embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
@@ -2140,40 +2144,17 @@ struct server_context {
 
                                     while (head_c < slot.cache_tokens.size() &&
                                            head_p < prompt_tokens.size()) {
-                                        if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
-                                            slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
-                                            slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
-                                            break;
-                                        }
-
-                                        if (llama_token_is_control(model, prompt_tokens[head_p]) &&
-                                            prompt_tokens[head_p] != llama_token_fim_rep(model) &&
-                                            prompt_tokens[head_p] != llama_token_fim_sep(model)) {
-                                            break;
-                                        }
 
                                         size_t n_match = 0;
-
                                         while (head_c + n_match < slot.cache_tokens.size() &&
                                                head_p + n_match < prompt_tokens.size()     &&
                                                slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
-                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
-                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
-                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
-                                                break;
-                                            }
-
-                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
-                                                prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
-                                                prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
-                                                break;
-                                            }
 
                                             n_match++;
                                         }
 
                                         if (n_match >= (size_t) params.n_cache_reuse) {
-                                            SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                             //for (size_t i = head_p; i < head_p + n_match; i++) {
                                             //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
                                             //}

From 755a9b2bf00fbae988e03a47e852b66eaddd113a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 15 Oct 2024 16:35:33 +0300
Subject: [PATCH 053/396] llama : add infill sampler (#9896)

ggml-ci
---
 common/common.h        |   4 +-
 common/sampling.cpp    |   9 +-
 examples/main/main.cpp |  46 +++++-----
 include/llama.h        |  28 ++++++
 src/llama-sampling.cpp | 201 +++++++++++++++++++++++++++++++++++++++++
 src/llama-sampling.h   |   5 +-
 src/llama-vocab.cpp    |  17 ++++
 src/llama-vocab.h      |   8 +-
 src/llama.cpp          |  11 +++
 9 files changed, 300 insertions(+), 29 deletions(-)

diff --git a/common/common.h b/common/common.h
index df2ee6bd4..5ca8fd391 100644
--- a/common/common.h
+++ b/common/common.h
@@ -91,7 +91,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TYPICAL_P   = 5,
     COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
     COMMON_SAMPLER_TYPE_XTC         = 7,
-
+    COMMON_SAMPLER_TYPE_INFILL      = 8,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -136,7 +136,7 @@ struct common_sampler_params {
         COMMON_SAMPLER_TYPE_TOP_P,
         COMMON_SAMPLER_TYPE_MIN_P,
         COMMON_SAMPLER_TYPE_XTC,
-        COMMON_SAMPLER_TYPE_TEMPERATURE
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
     std::string grammar; // optional BNF-like grammar to constrain sampling
diff --git a/common/sampling.cpp b/common/sampling.cpp
index fb95bcd3b..56cd0df6b 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -196,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     case COMMON_SAMPLER_TYPE_TEMPERATURE:
                         llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                         break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                        break;
                     default:
                         GGML_ASSERT(false && "unknown sampler type");
                 }
@@ -376,6 +379,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
         default : return '?';
     }
 }
@@ -389,6 +393,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
         default : return "";
     }
 }
@@ -402,6 +407,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "tfs_z",       COMMON_SAMPLER_TYPE_TFS_Z },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
     };
 
     // since samplers names are written multiple ways
@@ -448,7 +454,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC }
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
     };
 
     std::vector<common_sampler_type> samplers;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index fb10c20c5..65483c45f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -569,30 +569,30 @@ int main(int argc, char ** argv) {
                     if (!params.ctx_shift){
                         LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
                         break;
-                    } else {
-                        if (params.n_predict == -2) {
-                            LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                            break;
-                        }
-
-                        const int n_left    = n_past - params.n_keep;
-                        const int n_discard = n_left/2;
-
-                        LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                                n_past, n_left, n_ctx, params.n_keep, n_discard);
-
-                        llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                        llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
-
-                        n_past -= n_discard;
-
-                        LOG_DBG("after swap: n_past = %d\n", n_past);
-
-                        LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
-
-                        LOG_DBG("clear session path\n");
-                        path_session.clear();
                     }
+
+                    if (params.n_predict == -2) {
+                        LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                        break;
+                    }
+
+                    const int n_left    = n_past - params.n_keep;
+                    const int n_discard = n_left/2;
+
+                    LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                            n_past, n_left, n_ctx, params.n_keep, n_discard);
+
+                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+
+                    n_past -= n_discard;
+
+                    LOG_DBG("after swap: n_past = %d\n", n_past);
+
+                    LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+
+                    LOG_DBG("clear session path\n");
+                    path_session.clear();
                 }
             } else {
                 // context extension via Self-Extend
diff --git a/include/llama.h b/include/llama.h
index 92d4c70c1..02bc7f087 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -953,6 +953,12 @@ extern "C" {
                                int32_t   lstrip,
                                   bool   special);
 
+    // check if token0 is contained as a prefix in token1
+    LLAMA_API bool llama_token_is_prefix(
+              const struct llama_model * model,
+                           llama_token   token0,
+                           llama_token   token1);
+
     /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
     /// @param text The char pointer must be large enough to hold the resulting text.
     /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -1148,6 +1154,28 @@ extern "C" {
                              int32_t   n_logit_bias,
               const llama_logit_bias * logit_bias);
 
+    // this sampler is meant to be used for fill-in-the-middle infilling
+    // it's supposed to be used after top_k + top_p sampling
+    //
+    // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+    // 2. combine probs of tokens that have the same prefix
+    //
+    // example:
+    //
+    // - before:
+    //   "hel":   0.5
+    //   "hell":  0.2
+    //   "hello": 0.1
+    //   "dummy": 0.1
+    //
+    // - after:
+    //   "hel":   0.8
+    //   "dummy": 0.1
+    //
+    // 3. discard non-EOG tokens with low prob
+    // 4. if no tokens are left -> pick EOT
+    //
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
 
     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 67a78c3ac..2e6550682 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1739,6 +1739,207 @@ struct llama_sampler * llama_sampler_init_logit_bias(
     };
 }
 
+// infill
+
+//#define GGML_DEBUG_SAMPLER_INFILL
+
+struct llama_sampler_infill {
+    const struct llama_vocab * vocab;
+};
+
+static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
+    return "infill";
+}
+
+static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_infill *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+#if defined(GGML_DEBUG_SAMPLER_INFILL)
+#define LOG_DBG_CUR LLAMA_LOG_DEBUG
+#else
+#define LOG_DBG_CUR(...)
+#endif
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+    float p_txt_sum = 0.0f;
+    float p_eog_sum = 0.0f;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+            p_eog_sum += cur_p->data[i].p;
+        } else {
+            p_txt_sum += cur_p->data[i].p;
+        }
+    }
+
+    const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
+
+    LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
+
+    if (3*p_eog_sum*cur_p->size > p_txt_sum) {
+        LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
+
+        // keep just the EOG tokens
+        const auto size_org = cur_p->size;
+
+        cur_p->size = 0;
+
+        float p_sum = 0.0f;
+
+        for (size_t i = 0; i < size_org; ++i) {
+            if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+                p_sum += cur_p->data[i].p;
+
+                cur_p->data[cur_p->size++] = cur_p->data[i];
+            }
+        }
+
+        // normalize probs
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].p /= p_sum;
+        }
+
+        return;
+    }
+
+    size_t n_combined = 0; GGML_UNUSED(n_combined);
+
+    // combine tokens with common prefix
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        for (size_t j = 0; j < cur_p->size; ++j) {
+            if (cur_p->data[i].logit == -INFINITY) {
+                break;
+            }
+
+            if (i == j || cur_p->data[j].logit == -INFINITY) {
+                continue;
+            }
+
+            if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) {
+                if (cur_p->data[i].p >  cur_p->data[j].p) {
+                    cur_p->data[i].p += cur_p->data[j].p;
+                    cur_p->data[j].logit = -INFINITY;
+                    cur_p->data[j].p     = 0.0f;
+                } else {
+                    cur_p->data[j].p += cur_p->data[i].p;
+                    cur_p->data[i].logit = -INFINITY;
+                    cur_p->data[i].p     = 0.0f;
+                }
+
+                n_combined++;
+            }
+        }
+    }
+
+    size_t n_non_eog = 0;
+
+    size_t size_org = cur_p->size;
+
+    float p_sum = 0.0f;
+    float thold = 0.2f;
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        if (!is_eog) {
+            ++n_non_eog;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        // keep this token
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
+
+    // if no non-EOG tokens are left -> reduce cur_p to single EOT token
+    if (n_non_eog == 0) {
+        cur_p->size = 1;
+        cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
+        cur_p->data[0].logit = 1.0f;
+
+        return;
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+    size_org = cur_p->size;
+    p_sum = 0.0f;
+    thold = 1.0/(n_non_eog + 1);
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+#undef LOG_DBG_CUR
+}
+
+static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
+    return llama_sampler_init_infill_impl(*ctx->vocab);
+}
+
+static void llama_sampler_infill_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_infill *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_infill_i = {
+    /* .name   = */ llama_sampler_infill_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_infill_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_infill_clone,
+    /* .free   = */ llama_sampler_infill_free,
+};
+
+struct llama_sampler * llama_sampler_init_infill_impl(
+        const struct llama_vocab & vocab) {
+    return new llama_sampler {
+        /* .iface = */ &llama_sampler_infill_i,
+        /* .ctx   = */ new llama_sampler_infill {
+            /* .vocab = */ &vocab,
+        },
+    };
+}
+
 // utils
 
 uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
index d90b14713..2683f1b92 100644
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ -4,8 +4,6 @@
 
 #include "llama-grammar.h"
 
-#include <unordered_map>
-
 struct llama_vocab;
 struct llama_grammar;
 
@@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
         const struct llama_vocab & vocab,
                       const char * grammar_str,
                       const char * grammar_root);
+
+struct llama_sampler * llama_sampler_init_infill_impl(
+        const struct llama_vocab & vocab);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a27394a37..070de9365 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1858,6 +1858,23 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
     return 0;
 }
 
+bool llama_token_is_prefix_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token0,
+                     llama_token   token1) {
+    char text_buf_0[128];
+    char text_buf_1[128];
+
+    const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false);
+    const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false);
+
+    if (len0 <= 0 || len1 <= 0) {
+        return false;
+    }
+
+    return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0;
+}
+
 int32_t llama_detokenize_impl(
         const struct llama_vocab & vocab,
                const llama_token * tokens,
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 17e14488a..d958d0073 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -48,7 +48,7 @@ struct llama_vocab {
     id special_cls_id  = LLAMA_TOKEN_NULL;
     id special_mask_id = LLAMA_TOKEN_NULL;
 
-    id linefeed_id    = 13;
+    id linefeed_id = 13;
 
     // fim tokens
     id special_fim_pre_id = LLAMA_TOKEN_NULL;
@@ -149,6 +149,12 @@ int32_t llama_token_to_piece_impl(
                          int32_t   lstrip,
                             bool   special);
 
+// check if token0 is contained as a prefix in token1
+bool llama_token_is_prefix_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token0,
+                     llama_token   token1);
+
 int32_t llama_detokenize_impl(
         const struct llama_vocab & vocab,
                const llama_token * tokens,
diff --git a/src/llama.cpp b/src/llama.cpp
index 511f91802..8d44c73c8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21500,6 +21500,13 @@ int32_t llama_token_to_piece(
     return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
 }
 
+bool llama_token_is_prefix(
+    const struct llama_model * model,
+                 llama_token   token0,
+                 llama_token   token1) {
+    return llama_token_is_prefix_impl(model->vocab, token0, token1);
+}
+
 int32_t llama_detokenize(
     const struct llama_model * model,
            const llama_token * tokens,
@@ -21830,6 +21837,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
     return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }
 
+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+    return llama_sampler_init_infill_impl(model->vocab);
+}
+
 //
 // model split
 //

From becfd387f6919d99ec34b76c2522f90ac250c489 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Wed, 16 Oct 2024 08:51:46 +0800
Subject: [PATCH 054/396] [CANN] Fix cann compilation error (#9891)

Fix cann compilation error after merging llama.cpp supports dynamically loadable backends.
---
 ggml/src/ggml-cann.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index db5f8f186..ec3c0a688 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
         for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_cann_buffer_type_interface,
+                /* .device    = */ nullptr,
                 /* .context  = */
                  new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},
@@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
     ACL_CHECK(aclrtCreateEvent(&event));
 
     return new ggml_backend_event{
-        /* .backend = */ backend,
+        /* .device = */ nullptr,
         /* .context = */ event,
     };
 }
@@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
  *
  * @param event Pointer to the event structure to be recorded.
  */
-static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
+static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
     ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)event->backend->context;
-
+        (ggml_backend_cann_context*)backend->context;
     ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
 }
 
@@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
                                          ggml_backend_event_t event) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
-
-    if (ggml_backend_is_cann(event->backend)) {
+    if (ggml_backend_is_cann(backend)) {
         ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
                                        (aclrtEvent)event->context));
     } else {

From cd60b88bf7ad7785fb6ac9864e360cf10e42faad Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 9 Oct 2024 16:40:35 +0200
Subject: [PATCH 055/396] ggml-alloc : remove buffer_id from leaf_alloc
 (ggml/987)

This commit removes the buffer_id field from the leaf_alloc struct.

The motivation for is that this field is only written to and never
read/used as far as I can tell. Each tensor_alloc has a buffer_id field
and this is what caused me to look into this more closely, to
understand what the buffer_id in leaf_alloc was used for.
---
 ggml/src/ggml-alloc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 28548fbbb..041de9e3e 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -348,7 +348,6 @@ struct tensor_alloc {
 };
 
 struct leaf_alloc {
-    int buffer_id;
     struct tensor_alloc leaf;
 };
 
@@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;

From 0e41b300ed28f7fe185d938b2e3d56a0bf7411ed Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 16 Oct 2024 11:28:14 +0300
Subject: [PATCH 056/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 3cca9cc2f..6d31b21b9 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-564f42082f858f9674b2a2e06e9e779d9ed2c754
+2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b

From 1f66b699c48cb5ab3265ed72c48e8549b1674291 Mon Sep 17 00:00:00 2001
From: Alexey Parfenov <zxed@alkatrazstudio.net>
Date: Wed, 16 Oct 2024 08:35:53 +0000
Subject: [PATCH 057/396] server : fix the disappearance of the end of the text
 (#9867)

* server: fix the disappearance of the end of the text when streaming with stop strings

* simplify "send text" checks
---
 examples/server/server.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d53cca84c..b5e63384c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1090,22 +1090,21 @@ struct server_context {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
 
             const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
+            bool send_text = true;
 
             size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
             if (stop_pos != std::string::npos) {
-                is_stop_full = true;
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            } else {
-                is_stop_full = false;
+            } else if (slot.has_next_token) {
                 stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
+                send_text = stop_pos == std::string::npos;
             }
 
             // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+            if (send_text) {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
                 slot.n_sent_text += result.text_to_send.size();

From 10433e8b457c4cfd759cbb41fc55fc398db4a5da Mon Sep 17 00:00:00 2001
From: Molly Sophia <mollysophia379@gmail.com>
Date: Wed, 16 Oct 2024 18:10:21 +0800
Subject: [PATCH 058/396] llama : add tensor name for "result_norm" (#9907)

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
---
 src/llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8d44c73c8..c51b49c56 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16095,9 +16095,11 @@ struct llm_build_context {
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_norm", -1);
 
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
+
         ggml_build_forward_expand(gf, cur);
 
         return gf;

From 66c2c93082289325199ae1f773f3b0ab2e399a47 Mon Sep 17 00:00:00 2001
From: Joe Eli McIlvain <joe.eli.mac@gmail.com>
Date: Wed, 16 Oct 2024 09:03:24 -0700
Subject: [PATCH 059/396] grammar : fix JSON Schema for string regex with
 top-level alt. (#9903)

Prior to this commit, using a JSON Schema containing a string
with `pattern` regular expression that uses top-level alternation
(e.g. `"pattern": "^A|B|C|D$"`) would result in invalid JSON
output from the constrained sampling grammar, because it
ended up creating a grammar rule like this for the string:

```
thing ::= "\"" "A" | "B" | "C" | "D" "\"" space
```

Note that this rule will only match a starting quote for the "A" case,
and will only match an ending quote for the "D" case,
so this rule will always produce invalid JSON when used for sampling
(that is, the JSON will always be lacking the starting quote,
the ending quote, or both).

This was fixed in a simple way by adding parentheses to the
generated rule (for all string pattern rules, to keep it simple),
such that the new generated rule looks like this (correct):

```
thing ::= "\"" ("A" | "B" | "C" | "D") "\"" space
```
---
 common/json-schema-to-grammar.cpp             |  2 +-
 examples/json_schema_to_grammar.py            |  2 +-
 .../server/public/json-schema-to-grammar.mjs  |  2 +-
 tests/test-json-schema-to-grammar.cpp         | 21 +++++++++++++++----
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 881eb49e3..dadc18c8b 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -611,7 +611,7 @@ private:
             }
             return join_seq();
         };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
     }
 
     /*
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index a8779bf3b..fc9f0097f 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -540,7 +540,7 @@ class SchemaConverter:
         return self._add_rule(
             name,
             to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
 
 
     def _resolve_ref(self, ref):
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index 7267f3f9c..e67bb15c1 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -529,7 +529,7 @@ export class SchemaConverter {
       return joinSeq();
     };
 
-    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
+    return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
   }
 
   _notStrings(strings) {
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 3a89598a8..9d2db91f5 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
             "pattern": "^abc?d*efg+(hij)?kl$"
         })""",
         R"""(
-            root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
+            root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
             space ::= | " " | "\n" [ \t]{0,20}
         )"""
     });
@@ -709,7 +709,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
             "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
         })""",
         R"""(
-            root ::= "\"" "[]{}()|+*?" "\"" space
+            root ::= "\"" ("[]{}()|+*?") "\"" space
             space ::= | " " | "\n" [ \t]{0,20}
         )"""
     });
@@ -722,7 +722,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
             "pattern": "^\"$"
         })""",
         R"""(
-            root ::= "\"" "\"" "\"" space
+            root ::= "\"" ("\"") "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp with top-level alternation",
+        R"""({
+            "type": "string",
+            "pattern": "^A|B|C|D$"
+        })""",
+        R"""(
+            root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
             space ::= | " " | "\n" [ \t]{0,20}
         )"""
     });
@@ -736,7 +749,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         })""",
         R"""(
             dot ::= [^\x0A\x0D]
-            root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space
+            root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
             root-1 ::= [0-9]
             space ::= | " " | "\n" [ \t]{0,20}
         )"""

From dbf18e4de9e7aa496871f1555f9f0c8d84567108 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 16 Oct 2024 19:24:05 +0200
Subject: [PATCH 060/396] llava : fix typo in error message [no ci] (#9884)

---
 examples/llava/llava.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 8558c6bdc..2c96973c8 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        LOG_ERR("%s: coulnd't embed the image\n", __func__);
+        LOG_ERR("%s: couldn't embed the image\n", __func__);
         return NULL;
     }
 

From 9e041024481f6b249ab8918e18b9477f873b5a5e Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 16 Oct 2024 19:34:28 +0200
Subject: [PATCH 061/396] llama : suppress conversion from 'size_t' to 'int'
 (#9046)

* llama : suppress conversion from 'size_t' to 'int'

This commit updates llm_tokenizer_spm.tokenize to suppress/remove the
following warnings that are generated on Windows when using MSVC:

```console
src\llama-vocab.cpp(211,1): warning C4267: 'argument':
    conversion from 'size_t' to 'int', possible loss of data
src\llama-vocab.cpp(517,1): warning C4267: 'argument':
    conversion from 'size_t' to 'int', possible loss of data
```

This is done by adding a cast for the size_t returned from
symbols.size(). I believe this is safe as it seems unlikely that
symbols, which stores an entry for each UTF8 character, would become
larger than INT_MAX.

The motivation for this change is to reduce the number of warnings that
are currently generated when building on Windows.

* squash! llama : suppress conversion from 'size_t' to 'int'

Move cast into for loop.
---
 src/llama-vocab.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 070de9365..57d56a3d3 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
         }
 
         // seed the work queue with all possible 2-character tokens.
-        for (size_t i = 1; i < symbols.size(); ++i) {
+        for (int i = 1; i < (int) symbols.size(); ++i) {
             try_add_bigram(i - 1, i);
         }
 
@@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
                 index++;
                 symbols.emplace_back(sym);
             }
-            for (size_t i = 1; i < symbols.size(); ++i) {
+            for (int i = 1; i < (int) symbols.size(); ++i) {
                 add_new_bigram(i - 1, i);
             }
 

From 73afe681aa76e818733fc1f30de082c1d6910bcd Mon Sep 17 00:00:00 2001
From: "Gilad S." <7817232+giladgd@users.noreply.github.com>
Date: Thu, 17 Oct 2024 01:36:51 +0300
Subject: [PATCH 062/396] fix: use `vm_allocate` to allocate CPU backend buffer
 on macOS (#9875)

* fix: use `vm_allocate` to allocate CPU backend buffer on macOS

* fix: switch to `posix_memalign` to keep existing `free()` usages work

* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS

* style: formatting

* fix: move const outside of `#ifndef`

* style: formatting

* fix: unused var

* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`

* fix: unused var

* fix: page align to `GGUF_DEFAULT_ALIGNMENT`

* fix: page align to `TENSOR_ALIGNMENT`

* fix: convert `TENSOR_ALIGNMENT` to a macro

* fix: increase page size to `32` on iOS

* fix: iOS page size

* fix: `hbw_posix_memalign` alignment
---
 ggml/src/ggml-backend.cpp |  8 ++---
 ggml/src/ggml-impl.h      |  8 +++++
 ggml/src/ggml.c           | 74 +++++++++++++++++++++++++++------------
 3 files changed, 63 insertions(+), 27 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 15d650150..6d6ffeb4e 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index d3f4bad8c..65c4f8119 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3f01092d9..779b38d12 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.no_alloc_save      =*/ params.no_alloc,
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
                     __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
+                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
             }
 
             found = true;
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
                 struct ggml_cplan * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph           = cgraph;
         threadpool->cplan            = cplan;
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
     const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {

From 21942002780352b4a54f4bd3e5eefa3bc7f14fe6 Mon Sep 17 00:00:00 2001
From: "Gilad S." <7817232+giladgd@users.noreply.github.com>
Date: Thu, 17 Oct 2024 02:34:22 +0300
Subject: [PATCH 063/396] fix: allocating CPU buffer with size `0` (#9917)

---
 ggml/src/ggml-backend.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 6d6ffeb4e..4b9bac21d 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -768,14 +768,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * data = ggml_aligned_malloc(size);
+    auto alloc_size = size;
+    if (alloc_size == 0) {
+        alloc_size = 1;
+    }
+
+    void * data = ggml_aligned_malloc(alloc_size);
 
     if (data == NULL) {
-        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
         return NULL;
     }
 
-    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
 }
 
 static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {

From f010b77a372ffcfaf4338c670d6d3ecd89aa4eb6 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 17 Oct 2024 02:46:58 +0200
Subject: [PATCH 064/396] vulkan : add backend registry / device interfaces
 (#9721)

* vulkan : add backend registry / device interfaces

* llama : print devices used on model load
---
 ggml/include/ggml-vulkan.h |   2 +
 ggml/src/ggml-backend.cpp  |   9 +-
 ggml/src/ggml-vulkan.cpp   | 284 +++++++++++++++++++++++++++----------
 src/llama.cpp              |  54 ++-----
 4 files changed, 226 insertions(+), 123 deletions(-)

diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
index e074042ef..c03bbfe5e 100644
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
+GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4b9bac21d..a3bc79a46 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #ifdef GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
@@ -557,6 +561,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_METAL
         register_backend(ggml_backend_metal_reg());
 #endif
+#ifdef GGML_USE_VULKAN
+        register_backend(ggml_backend_vk_reg());
+#endif
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif
@@ -564,7 +571,7 @@ struct ggml_backend_registry {
         register_backend(ggml_backend_rpc_reg());
 #endif
 
-        // TODO: sycl, vulkan, kompute, cann
+        // TODO: sycl, kompute, cann
 
         register_backend(ggml_backend_cpu_reg());
     }
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 374c6ecd7..e749bbe70 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
         if (device->fp16) {
             device_extensions.push_back("VK_KHR_shader_float16_int8");
         }
-        device->name = device->properties.deviceName.data();
+        device->name = GGML_VK_NAME + std::to_string(idx);
 
         device_create_info = {
             vk::DeviceCreateFlags(),
@@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         device->buffer_type = {
             /* .iface    = */ ggml_backend_vk_buffer_type_interface,
-            /* .device   = */ nullptr,
+            /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
             /* .context  = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
         };
 
@@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
-        /* .device   = */ nullptr,
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
         /* .context  = */ nullptr,
     };
 
@@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     UNUSED(backend);
 }
 
-static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
+// TODO: enable async and synchronize
+static ggml_backend_i ggml_backend_vk_interface = {
+    /* .get_name                = */ ggml_backend_vk_name,
+    /* .free                    = */ ggml_backend_vk_free,
+    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
+    /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
+    /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
+    /* .synchronize             = */ NULL,  // ggml_backend_vk_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_vk_graph_compute,
+    /* .supports_op             = */ NULL,
+    /* .supports_buft           = */ NULL,
+    /* .offload_op              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
 
+static ggml_guid_t ggml_backend_vk_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
+
+    ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
+    ggml_vk_init(ctx, dev_num);
+
+    ggml_backend_t vk_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_vk_guid(),
+        /* .interface = */ ggml_backend_vk_interface,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
+        /* .context   = */ ctx,
+    };
+
+    return vk_backend;
+}
+
+bool ggml_backend_is_vk(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
+}
+
+int ggml_backend_vk_get_device_count() {
+    return ggml_vk_get_device_count();
+}
+
+void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+    int dev_idx = vk_instance.device_indices[device];
+    ggml_vk_get_device_description(dev_idx, description, description_size);
+}
+
+void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+
+    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+
+    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
+        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+            *total = heap.size;
+            *free = heap.size;
+            break;
+        }
+    }
+}
+
+//////////////////////////
+
+struct ggml_backend_vk_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+    ggml_backend_vk_get_device_memory(ctx->device, free, total);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ggml_backend_vk_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    UNUSED(dev);
+    return ggml_backend_vk_host_buffer_type();
+}
+
+static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
+    UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+}
+
+static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_vk_device_get_name(dev);
+    props->description = ggml_backend_vk_device_get_description(dev);
+    props->type        = ggml_backend_vk_device_get_type(dev);
+    ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* async       */ false,
+        /* host_buffer */ true,
+        /* events      */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
+    UNUSED(params);
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ggml_backend_vk_init(ctx->device);
+}
+
+static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
             return false;
     }
 
-    UNUSED(backend);
+    UNUSED(dev);
 }
 
-static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+
+    return buft_ctx->device->idx == ctx->device;
+}
+
+static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
     return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
            (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 
-    UNUSED(backend);
+    UNUSED(dev);
 }
 
-static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->device == ctx->device;
-}
-
-// TODO: enable async and synchronize
-static ggml_backend_i ggml_backend_vk_interface = {
-    /* .get_name                = */ ggml_backend_vk_name,
-    /* .free                    = */ ggml_backend_vk_free,
-    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
-    /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
-    /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
-    /* .synchronize             = */ NULL,  // ggml_backend_vk_synchronize,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_vk_graph_compute,
-    /* .supports_op             = */ ggml_backend_vk_supports_op,
-    /* .supports_buft           = */ ggml_backend_vk_supports_buft,
-    /* .offload_op              = */ ggml_backend_vk_offload_op,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
+static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
+    /* .get_name             = */ ggml_backend_vk_device_get_name,
+    /* .get_description      = */ ggml_backend_vk_device_get_description,
+    /* .get_memory           = */ ggml_backend_vk_device_get_memory,
+    /* .get_type             = */ ggml_backend_vk_device_get_type,
+    /* .get_props            = */ ggml_backend_vk_device_get_props,
+    /* .init_backend         = */ ggml_backend_vk_device_init,
+    /* .get_buffer_type      = */ ggml_backend_vk_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op          = */ ggml_backend_vk_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_vk_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_vk_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
 };
 
-static ggml_guid_t ggml_backend_vk_guid() {
-    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
-    return &guid;
+static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return GGML_VK_NAME;
 }
 
-ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
-    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
-
-    ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
-    ggml_vk_init(ctx, dev_num);
-
-    ggml_backend_t vk_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_vk_guid(),
-        /* .interface = */ ggml_backend_vk_interface,
-        /* .device    = */ nullptr,
-        /* .context   = */ ctx,
-    };
-
-    return vk_backend;
+static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return ggml_backend_vk_get_device_count();
 }
 
-bool ggml_backend_is_vk(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
-}
+static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
 
-int ggml_backend_vk_get_device_count() {
-    return ggml_vk_get_device_count();
-}
+    static bool initialized = false;
 
-void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
-    ggml_vk_get_device_description(device, description, description_size);
-}
-
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
-
-    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
-
-    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
-        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-            *total = heap.size;
-            *free = heap.size;
-            break;
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+                ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
+                char desc[256];
+                ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
+                ctx->device = i;
+                ctx->name = GGML_VK_NAME + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface   = */ ggml_backend_vk_device_i,
+                    /* .reg     = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
         }
     }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
+    /* .get_name         = */ ggml_backend_vk_reg_get_name,
+    /* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_vk_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_vk_reg() {
+    static ggml_backend_reg reg = {
+        /* .iface   = */ ggml_backend_vk_reg_i,
+        /* .context = */ nullptr,
+    };
+
+    return &reg;
 }
 
 // Extension availability
diff --git a/src/llama.cpp b/src/llama.cpp
index c51b49c56..68479c6db 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8,9 +8,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#if defined(GGML_USE_VULKAN)
-#  include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
 #  include "ggml-sycl.h"
 #elif defined(GGML_USE_KOMPUTE)
 #   include "ggml-kompute.h"
@@ -3418,8 +3416,6 @@ static int llama_get_device_count(const llama_model & model) {
 
 #if defined(GGML_USE_SYCL)
     count += ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count += ggml_backend_vk_get_device_count();
 #elif defined(GGML_USE_CANN)
     count += ggml_backend_cann_get_device_count();
 #endif
@@ -3451,10 +3447,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
     }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = ggml_backend_vk_host_buffer_type();
-    }
 #endif
 
     if (buft == nullptr) {
@@ -3473,9 +3465,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
     device -= (int)model.devices.size();
 
-#if defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(device);
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(device);
 #elif defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(device);
@@ -3535,11 +3525,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
     size_t free;
     ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
@@ -19095,8 +19080,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -19227,8 +19211,13 @@ struct llama_model * llama_load_model_from_file(
 
             case GGML_BACKEND_DEVICE_TYPE_GPU:
             case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+            {
+                size_t free, total; // NOLINT
+                ggml_backend_dev_memory(dev, &free, &total);
+                LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
                 model->devices.push_back(dev);
                 break;
+            }
         }
     }
 
@@ -19423,32 +19412,7 @@ struct llama_context * llama_new_context_with_model(
             main_gpu -= (int)model->devices.size();
         }
 
-#if defined(GGML_USE_VULKAN)
-        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_vk_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);

From 3752217ed5a6a11864682fbf009bcb36afffd6bc Mon Sep 17 00:00:00 2001
From: Tim Wang <overocean@gmail.com>
Date: Thu, 17 Oct 2024 17:57:14 +1100
Subject: [PATCH 065/396] readme : update bindings list (#9918)

Co-authored-by: Tim Wang <tim.wang@ing.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 08fe8cc92..707904ddc 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
 **UI:**
 

From 99bd4ac28c32cd17c0e337ff5601393b033dc5fc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 17 Oct 2024 22:32:47 +0300
Subject: [PATCH 066/396] llama : infill sampling handle very long tokens
 (#9924)

* llama : infill sampling handle very long tokens

ggml-ci

* cont : better indices

ggml-ci
---
 include/llama.h        |  6 ------
 src/llama-sampling.cpp | 48 ++++++++++++++++++++++++++++++------------
 src/llama-vocab.cpp    | 17 ---------------
 src/llama.cpp          |  7 ------
 4 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 02bc7f087..1a13360c2 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -953,12 +953,6 @@ extern "C" {
                                int32_t   lstrip,
                                   bool   special);
 
-    // check if token0 is contained as a prefix in token1
-    LLAMA_API bool llama_token_is_prefix(
-              const struct llama_model * model,
-                           llama_token   token0,
-                           llama_token   token1);
-
     /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
     /// @param text The char pointer must be large enough to hold the resulting text.
     /// @return Returns the number of chars/bytes on success, no more than text_len_max.
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 2e6550682..bd750c40e 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1745,6 +1745,9 @@ struct llama_sampler * llama_sampler_init_logit_bias(
 
 struct llama_sampler_infill {
     const struct llama_vocab * vocab;
+
+    std::vector<char> buf0;
+    std::vector<char> buf1;
 };
 
 static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
@@ -1810,27 +1813,44 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     size_t n_combined = 0; GGML_UNUSED(n_combined);
 
     // combine tokens with common prefix
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        for (size_t j = 0; j < cur_p->size; ++j) {
-            if (cur_p->data[i].logit == -INFINITY) {
+    for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
+        for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
+            if (cur_p->data[i0].logit == -INFINITY) {
                 break;
             }
 
-            if (i == j || cur_p->data[j].logit == -INFINITY) {
+            if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
                 continue;
             }
 
-            if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) {
-                if (cur_p->data[i].p >  cur_p->data[j].p) {
-                    cur_p->data[i].p += cur_p->data[j].p;
-                    cur_p->data[j].logit = -INFINITY;
-                    cur_p->data[j].p     = 0.0f;
-                } else {
-                    cur_p->data[j].p += cur_p->data[i].p;
-                    cur_p->data[i].logit = -INFINITY;
-                    cur_p->data[i].p     = 0.0f;
+            int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            if (len0 < 0) {
+                ctx->buf0.resize(len0);
+                len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+                assert(len0 > 0);
+            }
+
+            int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            if (len1 < 0) {
+                ctx->buf1.resize(len1);
+                len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+                assert(len1 > 0);
+            }
+
+            // token i0 is a prefix of token i1
+            if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
+                int dst = i0;
+                int src = i1;
+
+                // merge into the token with higher probability
+                if (cur_p->data[i1].p > cur_p->data[i0].p) {
+                    std::swap(dst, src);
                 }
 
+                cur_p->data[dst].p += cur_p->data[src].p;
+                cur_p->data[src].logit = -INFINITY;
+                cur_p->data[src].p     = 0.0f;
+
                 n_combined++;
             }
         }
@@ -1936,6 +1956,8 @@ struct llama_sampler * llama_sampler_init_infill_impl(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx   = */ new llama_sampler_infill {
             /* .vocab = */ &vocab,
+            /* .buf0 = */ std::vector<char>(512),
+            /* .buf1 = */ std::vector<char>(512),
         },
     };
 }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 57d56a3d3..0a49ddbe3 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1858,23 +1858,6 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
     return 0;
 }
 
-bool llama_token_is_prefix_impl(
-        const struct llama_vocab & vocab,
-                     llama_token   token0,
-                     llama_token   token1) {
-    char text_buf_0[128];
-    char text_buf_1[128];
-
-    const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false);
-    const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false);
-
-    if (len0 <= 0 || len1 <= 0) {
-        return false;
-    }
-
-    return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0;
-}
-
 int32_t llama_detokenize_impl(
         const struct llama_vocab & vocab,
                const llama_token * tokens,
diff --git a/src/llama.cpp b/src/llama.cpp
index 68479c6db..d8e2b006c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21466,13 +21466,6 @@ int32_t llama_token_to_piece(
     return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
 }
 
-bool llama_token_is_prefix(
-    const struct llama_model * model,
-                 llama_token   token0,
-                 llama_token   token1) {
-    return llama_token_is_prefix_impl(model->vocab, token0, token1);
-}
-
 int32_t llama_detokenize(
     const struct llama_model * model,
            const llama_token * tokens,

From 9f45fc1e9950a496febc575cdd196cd5cad000cc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 17 Oct 2024 23:26:32 +0300
Subject: [PATCH 067/396] llama : change warning to debug log

---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d8e2b006c..ffaa6f789 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6735,9 +6735,9 @@ static void llm_load_vocab(
                     vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
             } else {
-                // token is control, but not marked as EOG -> print a warning
+                // token is control, but not marked as EOG -> print a debug log
                 if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
                             __func__, t.second, t.first.c_str());
                 }
             }

From 17bb9280807cfbb6611b853aa1ef05114bd9efe9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 17 Oct 2024 23:43:05 +0300
Subject: [PATCH 068/396] readme : remove --memory-f32 references (#9925)

---
 examples/main/README.md    | 4 ----
 scripts/run-with-preset.py | 6 +++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/examples/main/README.md b/examples/main/README.md
index 620934dad..7e192b9f2 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -297,10 +297,6 @@ These options help improve the performance and memory usage of the LLaMA models.
 
  These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
-### Memory Float 32
-
--   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
-
 ### Batch Size
 
 -   `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py
index ee21eab37..47cacb432 100755
--- a/scripts/run-with-preset.py
+++ b/scripts/run-with-preset.py
@@ -15,7 +15,7 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
     "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
-    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
+    "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
     "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
     "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
     "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
@@ -25,12 +25,12 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
 ]
 
 CLI_ARGS_LLAMA_BENCH = [
-    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
+    "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
     "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
 ]
 
 CLI_ARGS_LLAMA_SERVER = [
-    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base",
     "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
     "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
     "threads", "verbose"

From 6f55bccbb8835d42147add4ee48807450f5ff535 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Fri, 18 Oct 2024 01:41:51 +0200
Subject: [PATCH 069/396] llama : rename batch_all to batch (#8881)

This commit addresses the TODO in the code to rename the `batch_all`
parameter to `batch` in `llama_decode_internal`.
---
 src/llama.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index ffaa6f789..dcb015d12 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17134,10 +17134,10 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch   batch_all) { // TODO: rename back to batch
+           llama_batch   batch) {
 
     lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch_all.n_tokens;
+    const uint32_t n_tokens_all = batch.n_tokens;
 
     if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -17148,12 +17148,12 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
-    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
-    if (batch_all.token) {
+    if (batch.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
                 return -1;
             }
         }
@@ -17184,9 +17184,9 @@ static int llama_decode_internal(
     lctx.embd_seq.clear();
 
     // count outputs
-    if (batch_all.logits && !embd_pooled) {
+    if (batch.logits && !embd_pooled) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            n_outputs += batch_all.logits[i] != 0;
+            n_outputs += batch.logits[i] != 0;
         }
     } else if (lctx.logits_all || embd_pooled) {
         n_outputs = n_tokens_all;
@@ -17195,7 +17195,7 @@ static int llama_decode_internal(
         n_outputs = 1;
     }
 
-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch, n_embd,
         /* simple_split */ !kv_self.recurrent,
         /* logits_all   */ n_outputs == n_tokens_all);
 

From 8901755ba328643c9ab071c20e1939ea52951a0e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 18 Oct 2024 07:32:19 +0300
Subject: [PATCH 070/396] server : add n_indent parameter for line indentation
 requirement (#9929)

ggml-ci
---
 examples/server/README.md  |  2 ++
 examples/server/server.cpp | 54 +++++++++++++++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index fcdb02afd..09f1aa249 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -333,6 +333,8 @@ node index.js
 
     `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
 
+    `n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
+
     `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
     By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b5e63384c..8fd443878 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -131,6 +131,7 @@ struct slot_params {
     int32_t n_keep    =  0; // number of tokens to keep from initial prompt
     int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
     int32_t n_predict = -1; // new tokens to predict
+    int32_t n_indent  =  0; // mininum line indentation for the generated text in number of whitespace characters
 
     int64_t t_max_prompt_ms  = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
@@ -173,6 +174,8 @@ struct server_slot {
     std::vector<llama_token> prompt_tokens;
     std::vector<llama_token> extra_tokens;
 
+    size_t last_nl_pos = 0;
+
     std::string generated_text;
     std::vector<llama_token> cache_tokens;
     std::vector<completion_token_output> generated_token_probs;
@@ -215,6 +218,7 @@ struct server_slot {
         SLT_DBG(*this, "%s", "\n");
 
         n_prompt_tokens    = 0;
+        last_nl_pos        = 0;
         generated_text     = "";
         has_new_line       = false;
         truncated          = false;
@@ -860,6 +864,7 @@ struct server_context {
         slot.params.stream             = json_value(data, "stream",            false);
         slot.params.cache_prompt       = json_value(data, "cache_prompt",      false);
         slot.params.n_predict          = json_value(data, "n_predict",         json_value(data, "max_tokens", default_params.n_predict));
+        slot.params.n_indent           = json_value(data, "n_indent",          default_params.n_indent);
         slot.sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
         slot.sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
         slot.sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
@@ -878,7 +883,7 @@ struct server_context {
         slot.sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
         slot.sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
         slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
-        slot.params.n_keep             = json_value(data, "n_keep",            slot.params.n_keep);
+        slot.params.n_keep             = json_value(data, "n_keep",            default_params.n_keep);
         slot.params.n_discard          = json_value(data, "n_discard",         default_params.n_discard);
         slot.sparams.seed              = json_value(data, "seed",              default_sparams.seed);
         slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
@@ -1129,13 +1134,48 @@ struct server_context {
             SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }
 
-        // if we have already seen a new line, we stop after a certain time limit
-        if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
-            (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
-            slot.stopped_limit  = true;
-            slot.has_next_token = false;
+        if (slot.has_new_line) {
+            // if we have already seen a new line, we stop after a certain time limit
+            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stopped_limit  = true;
+                slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
+
+            // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
+            if (slot.params.n_indent > 0) {
+                // check the current indentation
+                // TODO: improve by not doing it more than once for each new line
+                if (slot.last_nl_pos > 0) {
+                    size_t pos = slot.last_nl_pos;
+
+                    int n_indent = 0;
+                    while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) {
+                        n_indent++;
+                        pos++;
+                    }
+
+                    if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) {
+                        slot.stopped_limit  = true;
+                        slot.has_next_token = false;
+
+                        // cut the last line
+                        slot.generated_text.erase(pos, std::string::npos);
+
+                        SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
+                    }
+                }
+
+                // find the next new line
+                {
+                    const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
+
+                    if (pos != std::string::npos) {
+                        slot.last_nl_pos = pos + 1;
+                    }
+                }
+            }
         }
 
         // check if there is a new line in the generated text

From 60ce97c9d809f4b040e90b597468b839df5728d0 Mon Sep 17 00:00:00 2001
From: Ma Mingfei <mingfei.ma@intel.com>
Date: Fri, 18 Oct 2024 13:34:36 +0800
Subject: [PATCH 071/396] add amx kernel for gemm (#8998)

add intel amx isa detection

add vnni kernel for gemv cases

add vnni and amx kernel support for block_q8_0

code cleanup

fix packing B issue

enable openmp

fine tune amx kernel

switch to aten parallel pattern

add error message for nested parallelism

code cleanup

add f16 support in ggml-amx

add amx kernels for QK_K quant formats: Q4_K, Q5_K, Q6_K and IQ4_XS

update CMakeList

update README

fix some compilation warning

fix compiler warning when amx is not enabled

minor change

ggml-ci

move ggml_amx_init from ggml.c to ggml-amx/mmq.cpp

ggml-ci

update CMakeLists with -mamx-tile, -mamx-int8 and -mamx-bf16

ggml-ci

add amx as an ggml-backend

update header file, the old path for immintrin.h has changed to ggml-cpu-impl.h

minor change

update CMakeLists.txt

minor change

apply weight prepacking in set_tensor method in ggml-backend

fix compile error

ggml-ci

minor change

ggml-ci

update CMakeLists.txt

ggml-ci

add march dependency

minor change

ggml-ci

change ggml_backend_buffer_is_host to return false for amx backend

ggml-ci

fix supports_op

use device reg for AMX backend

ggml-ci

minor change

ggml-ci

minor change

fix rebase

set .buffer_from_host_ptr to be false for AMX backend
---
 CMakeLists.txt             |    4 +
 Makefile                   |   24 +-
 README.md                  |    2 +-
 ggml/CMakeLists.txt        |    4 +
 ggml/include/ggml-amx.h    |   25 +
 ggml/include/ggml.h        |    1 +
 ggml/src/CMakeLists.txt    |   42 +
 ggml/src/ggml-amx.cpp      |  453 +++++++
 ggml/src/ggml-amx/common.h |   93 ++
 ggml/src/ggml-amx/mmq.cpp  | 2509 ++++++++++++++++++++++++++++++++++++
 ggml/src/ggml-amx/mmq.h    |   17 +
 ggml/src/ggml-backend.cpp  |   12 +-
 ggml/src/ggml.c            |    8 +
 src/llama.cpp              |   17 +
 14 files changed, 3204 insertions(+), 7 deletions(-)
 create mode 100644 ggml/include/ggml-amx.h
 create mode 100644 ggml/src/ggml-amx.cpp
 create mode 100644 ggml/src/ggml-amx/common.h
 create mode 100644 ggml/src/ggml-amx/mmq.cpp
 create mode 100644 ggml/src/ggml-amx/mmq.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 64a335378..ef0932a7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
diff --git a/Makefile b/Makefile
index 2793978c3..719f45d16 100644
--- a/Makefile
+++ b/Makefile
@@ -93,11 +93,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif
 
-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
 	OBJ_GGML    += ggml/src/llamafile/sgemm.o
 endif
 
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML    += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 	MK_CPPFLAGS += -DGGML_USE_RPC
 	OBJ_GGML    += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
 
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx.o: \
+	ggml/src/ggml-amx.cpp \
+	ggml/include/ggml-amx.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
diff --git a/README.md b/README.md
index 707904ddc..1088b3338 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 89fdf9d1c..cfa6e3f70 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -99,6 +99,9 @@ option(GGML_AVX512      "ggml: enable AVX512"           OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
 option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
+option(GGML_AMX_TILE    "ggml: enable AMX-TILE"         OFF)
+option(GGML_AMX_INT8    "ggml: enable AMX-INT8"         OFF)
+option(GGML_AMX_BF16    "ggml: enable AMX-BF16"         OFF)
 option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
 if (NOT MSVC)
     option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -158,6 +161,7 @@ set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
+option(GGML_AMX                             "ggml: use AMX"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h
new file mode 100644
index 000000000..22b3f70f4
--- /dev/null
+++ b/ggml/include/ggml-amx.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// buffer_type API
+GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+
+GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
+
+// backend API
+GGML_API ggml_backend_t ggml_backend_amx_init(void);
+
+GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+
+GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 4508da4fb..de3c706fc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2488,6 +2488,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_API int ggml_cpu_has_avx512_vnni(void);
     GGML_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_API int ggml_cpu_has_amx_int8   (void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
     GGML_API int ggml_cpu_has_sve        (void);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 676f85a36..aa405e4d0 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -267,6 +267,26 @@ if (GGML_LLAMAFILE)
     set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
 endif()
 
+if (GGML_AMX)
+    if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
+    else()
+        set(GGML_AMX OFF)
+        message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
+    endif()
+
+    if (GGML_AMX)
+        message(STATUS "Using AMX")
+
+        list(APPEND GGML_CDEF_PUBLIC GGML_USE_AMX)
+
+        file(GLOB   GGML_HEADERS_AMX "ggml-amx/*.h")
+        list(APPEND GGML_HEADERS_AMX "../include/ggml-amx.h")
+
+        file(GLOB   GGML_SOURCES_AMX "ggml-amx/*.cpp")
+        list(APPEND GGML_SOURCES_AMX "ggml-amx.cpp")
+    endif()
+endif()
+
 if (GGML_CUDA)
     cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
 
@@ -1180,6 +1200,18 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
             endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
         elseif (GGML_AVX2)
             list(APPEND ARCH_FLAGS /arch:AVX2)
         elseif (GGML_AVX)
@@ -1215,6 +1247,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (GGML_AVX512_BF16)
             list(APPEND ARCH_FLAGS -mavx512bf16)
         endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
@@ -1340,6 +1381,7 @@ add_library(ggml
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
             ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+            ${GGML_SOURCES_AMX}       ${GGML_HEADERS_AMX}
             ${GGML_SOURCES_CANN}      ${GGML_HEADERS_CANN}
             ggml-aarch64.c            ggml-aarch64.h
             )
diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx.cpp
new file mode 100644
index 000000000..ac6ec2342
--- /dev/null
+++ b/ggml/src/ggml-amx.cpp
@@ -0,0 +1,453 @@
+#include "ggml-amx.h"
+#include "ggml-amx/common.h"
+#include "ggml-amx/mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#if defined(__AMX_INT8__)
+
+// AMX buffer interface
+static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
+    return "AMX";
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+}
+
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)(buffer->context);
+}
+
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    if (qtype_has_amx_kernels(tensor->type)) {
+        ggml_backend_amx_convert_weight(tensor, data, offset, size);
+    } else {
+        memcpy((char *)tensor->data + offset, data, size);
+    }
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        if (qtype_has_amx_kernels(src->type)) {
+            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
+        } else {
+            memcpy(dst->data, src->data, ggml_nbytes(src));
+        }
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .get_name        = */ ggml_backend_amx_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "AMX";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+    return ggml_backend_amx_get_alloc_size(tensor);
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+        /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+        /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
+        },
+        /* .device  = */ NULL,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_type_amx;
+}
+
+// backend interface
+
+static const char * ggml_backend_amx_name(ggml_backend_t backend) {
+    return "AMX";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_amx_free(ggml_backend_t backend) {
+    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
+    delete ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_amx_buffer_type();
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        switch (node->op) {
+        case GGML_OP_MUL_MAT:
+            ggml_backend_amx_mul_mat(ctx, node);
+            break;
+
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            break;
+
+        default:
+            fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+            GGML_ASSERT(false);
+        }
+    }
+
+    return GGML_STATUS_SUCCESS;
+
+    GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i ggml_backend_amx_i = {
+    /* .get_name                = */ ggml_backend_amx_name,
+    /* .free                    = */ ggml_backend_amx_free,
+    /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_amx_graph_compute,
+    /* .supports_op             = */ NULL,
+    /* .supports_buft           = */ NULL,
+    /* .offload_op              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_amx_guid() {
+    static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
+    return &guid;
+}
+
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+
+static bool ggml_amx_init() {
+#if defined(__gnu_linux__)
+    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
+        fprintf(stderr, "AMX is not ready to be used!\n");
+        return false;
+    }
+    return true;
+#elif defined(_WIN32)
+    return true;
+#endif
+}
+
+ggml_backend_t ggml_backend_amx_init() {
+
+    // invoke a Linux system call to request access to AMX features
+    ggml_amx_init();
+
+    // backend context
+    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
+
+    // ggml amx backend
+    ggml_backend_t backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_amx_guid(),
+        /* .interface = */ ggml_backend_amx_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
+        /* .context   = */ ctx,
+    };
+
+    return backend;
+}
+
+bool ggml_backend_is_amx(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
+}
+
+void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_amx(backend_amx));
+
+    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
+    ctx->n_threads = n_threads;
+}
+
+// device interface
+
+static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
+    return "AMX";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
+    return "Intel Advanced Matrix Extensions";
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_amx_device_get_name(dev);
+    props->description = ggml_backend_amx_device_get_description(dev);
+    props->type        = ggml_backend_amx_device_get_type(dev);
+    ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_amx_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_amx_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+
+    // handle only 2d gemm for now
+    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
+        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+    };
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT: {
+            const struct ggml_tensor * src0 = op->src[0];
+            const struct ggml_tensor * src1 = op->src[1];
+
+            const enum ggml_type type = src0->type;
+            const int64_t ne0 = op->ne[0];
+
+            bool is_training = src0->grad || src1->grad;
+
+            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
+            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
+            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
+
+            bool can_use_amx =
+                is_contiguous_2d(src0) &&       // src0 must be contiguous
+                is_contiguous_2d(src1) &&       // src1 must be contiguous
+                !is_training &&                 // inference only
+                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
+                has_amx_kernels &&              // with amx kernel impls
+                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
+
+            return can_use_amx;
+        }
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
+    /* .get_name             = */ ggml_backend_amx_device_get_name,
+    /* .get_description      = */ ggml_backend_amx_device_get_description,
+    /* .get_memory           = */ ggml_backend_amx_device_get_memory,
+    /* .get_type             = */ ggml_backend_amx_device_get_type,
+    /* .get_props            = */ ggml_backend_amx_device_get_props,
+    /* .init_backend         = */ ggml_backend_amx_device_init,
+    /* .get_buffer_type      = */ ggml_backend_amx_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op          = */ ggml_backend_amx_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_amx_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
+    return "AMX";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device ggml_backend_amx_device = {
+        /* .iface   = */ ggml_backend_amx_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_amx_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_amx_set_n_threads;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
+    /* .get_name         = */ ggml_backend_amx_reg_get_name,
+    /* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_amx_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_amx_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_amx_reg(void) {
+    static struct ggml_backend_reg ggml_backend_amx_reg = {
+        /* .iface   = */ ggml_backend_amx_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_amx_reg;
+}
+
+#else // if defined(__AMX_INT8__)
+
+ggml_backend_t ggml_backend_amx_init(void) {
+    fprintf(stderr, "GGML is not compiled with AMX support!\n");
+    return ggml_backend_t{};
+}
+
+void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
+    fprintf(stderr, "GGML is not compiled with AMX support!\n");
+
+    GGML_UNUSED(backend_amx);
+    GGML_UNUSED(n_threads);
+}
+
+#endif
diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-amx/common.h
new file mode 100644
index 000000000..2b6c63527
--- /dev/null
+++ b/ggml/src/ggml-amx/common.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-impl.h" // <immintrin.h>
+
+#include <algorithm>
+#include <memory>
+#include <type_traits>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 32
+#define VNNI_BLK 4
+
+#define AMX_BLK_SIZE 32
+
+#define TMM0 0
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+#define TMM4 4
+#define TMM5 5
+#define TMM6 6
+#define TMM7 7
+
+// parallel routines
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T div_up(T x, T y) { return (x + y - 1) / y; }
+
+template <typename T>
+inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
+#if 0
+    // onednn partition pattern
+    T& n_my = n_end;
+    if (nth <= 1 || n == 0) {
+        n_start = 0;
+        n_my = n;
+    } else {
+        T n1 = div_up(n, nth);
+        T n2 = n1 - 1;
+        T T1 = n - n2 * nth;
+        n_my = ith < T1 ? n1 : n2;
+        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
+    }
+    n_end += n_start;
+#else
+    // pytorch aten partition pattern
+    T n_my = div_up(n, nth);
+    n_start = ith * n_my;
+    n_end = std::min(n_start + n_my, n);
+#endif
+}
+
+template <typename func_t>
+inline void parallel_for(int nth, int n, const func_t& f) {
+#if defined(_OPENMP)
+#pragma omp parallel num_threads(nth)
+{
+    //int nth = omp_get_num_threads();
+    int ith = omp_get_thread_num();
+    int tbegin, tend;
+    balance211(n, nth, ith, tbegin, tend);
+    f(tbegin, tend);
+}
+#else
+    f(0, n);
+
+    GGML_UNUSED(nth);
+#endif
+}
+
+// quantized types that have AMX support
+inline bool qtype_has_amx_kernels(const enum ggml_type type) {
+    // TODO: fix padding for vnni format
+    return (type == GGML_TYPE_Q4_0) ||
+        (type == GGML_TYPE_Q4_1);
+        //(type == GGML_TYPE_Q8_0) ||
+        //(type == GGML_TYPE_Q4_K) ||
+        //(type == GGML_TYPE_Q5_K) ||
+        //(type == GGML_TYPE_Q6_K) ||
+        //(type == GGML_TYPE_IQ4_XS);
+}
+
+// ggml backend context
+struct ggml_backend_amx_context {
+    int n_threads = GGML_DEFAULT_N_THREADS;
+    std::unique_ptr<char[]> work_data;
+    size_t work_size = 0;
+};
diff --git a/ggml/src/ggml-amx/mmq.cpp b/ggml/src/ggml-amx/mmq.cpp
new file mode 100644
index 000000000..239d15121
--- /dev/null
+++ b/ggml/src/ggml-amx/mmq.cpp
@@ -0,0 +1,2509 @@
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wpedantic"
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+#endif
+
+#include "mmq.h"
+#include "ggml-impl.h"
+#include "ggml-quants.h"
+#include <algorithm>
+#include <type_traits>
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#if (defined(_WIN32) || defined(_WIN64))
+#define RESTRICT __restrict
+#else
+#define RESTRICT __restrict__
+#endif
+
+#if (defined(_WIN32) || defined(_WIN64))
+#define ALWAYS_INLINE __forceinline
+#elif __has_attribute(always_inline) || defined(__GNUC__)
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+#else
+#define ALWAYS_INLINE inline
+#endif
+
+#if defined(__AMX_INT8__)
+
+namespace {
+
+// Forced unrolling
+template <int n>
+struct Unroll {
+    template <typename Func, typename... Args>
+    ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
+        Unroll<n - 1>{}(f, args...);
+        f(std::integral_constant<int, n - 1>{}, args...);
+    }
+};
+
+template <>
+struct Unroll<1> {
+    template <typename Func, typename... Args>
+    ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
+        f(std::integral_constant<int, 0>{}, args...);
+    }
+};
+
+// type traits
+template <typename T> struct PackedTypes {};
+template <> struct PackedTypes<block_q4_0> { using type = int8_t; };
+template <> struct PackedTypes<block_q4_1> { using type = uint8_t; };
+template <> struct PackedTypes<block_q8_0> { using type = int8_t; };
+template <typename T> using packed_B_type = typename PackedTypes<T>::type;
+
+template <typename T>
+struct do_compensate : std::integral_constant<bool,
+    std::is_same<T, block_q8_0>::value> {};
+
+template <typename T>
+struct do_unpack : std::integral_constant<bool,
+    std::is_same<T, block_q4_0>::value ||
+    std::is_same<T, block_q4_1>::value> {};
+
+template <typename T>
+struct is_type_qkk : std::integral_constant<bool,
+    std::is_same<T, block_q4_K>::value ||
+    std::is_same<T, block_q5_K>::value ||
+    std::is_same<T, block_q6_K>::value ||
+    std::is_same<T, block_iq4_xs>::value> {};
+
+#define GGML_DISPATCH_FLOATING_TYPES(TYPE, ...)                                        \
+    [&] {                                                                              \
+        switch (TYPE) {                                                                \
+            case GGML_TYPE_F16: {                                                      \
+                using type = ggml_fp16_t;                                              \
+                constexpr int blck_size = 16;                                          \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_BF16: {                                                     \
+                using type = ggml_bf16_t;                                              \
+                constexpr int blck_size = 32;                                          \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            default:                                                                   \
+                fprintf(stderr, "Unsupported floating data type\n");                   \
+        }                                                                              \
+    }()
+
+#define GGML_DISPATCH_QTYPES(QT, ...)                                                  \
+    [&] {                                                                              \
+        switch (QT) {                                                                  \
+            case GGML_TYPE_Q4_0: {                                                     \
+                using type = block_q4_0;                                               \
+                using vec_dot_type = block_q8_0;                                       \
+                constexpr int blck_size = QK4_0;                                       \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q4_1: {                                                     \
+                using type = block_q4_1;                                               \
+                using vec_dot_type = block_q8_1;                                       \
+                constexpr int blck_size = QK4_1;                                       \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q8_0: {                                                     \
+                using type = block_q8_0;                                               \
+                using vec_dot_type = block_q8_0;                                       \
+                constexpr int blck_size = QK8_0;                                       \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q4_K: {                                                     \
+                using type = block_q4_K;                                               \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q5_K: {                                                     \
+                using type = block_q5_K;                                               \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q6_K: {                                                     \
+                using type = block_q6_K;                                               \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_IQ4_XS: {                                                   \
+                using type = block_iq4_xs;                                             \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            default:                                                                   \
+                fprintf(stderr, "Unsupported quantized data type: %d\n", int(TYPE));   \
+        }                                                                              \
+    }()
+
+#define GGML_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...)                                     \
+    [&] {                                                                              \
+        if (BOOL_V) {                                                                  \
+            constexpr bool BOOL_NAME = true;                                           \
+            return __VA_ARGS__();                                                      \
+        } else {                                                                       \
+            constexpr bool BOOL_NAME = false;                                          \
+            return __VA_ARGS__();                                                      \
+        }                                                                              \
+    }()
+
+// define amx tile config data structure
+struct tile_config_t{
+    uint8_t palette_id = 0;
+    uint8_t start_row = 0;
+    uint8_t reserved_0[14] = {0};
+    uint16_t colsb[16] = {0};
+    uint8_t rows[16] = {0};
+};
+
+// Notes: amx tile config
+//
+// Typically, TMUL calculates A and B of size 16 x 64 containing INT8 values,
+// and accumulate the result to a 16 x 16 matrix C containing INT32 values,
+//
+// As many GGUF quantized types as `block_size` of 32, so a 16-16-32 config is used
+// instead of the normally used 16-16-64 config.
+//
+//    Block A: {16, 32}, dtype = int8_t
+//    Block B: {16, 32}, dtype = uint8_t/int8_t
+//    Block C: {16, 16}, dtype = int32_t
+//
+// Block B needs to be prepacked to vnni format before feeding into  TMUL:
+//    packed_B: from {n, k} to {k/vnni_blk, n, vnni_blck}, viewed in 2d, we get {8, 64}
+//
+// Therefore, we get tileconfig:
+//             A    B    C
+//    rows    16    8   16
+//    colsb   32   64   16
+//
+// For tile distribution, follow a 2-2-4 pattern, e.g. A used TMM2-TMM3, B used TMM0-TMM1,
+// C used TMM4-TMM7:
+//            B TMM0  B TMM1
+//    A TMM2  C TMM4  C TMM6
+//    A TMM3  C TMM5  C TMM7
+//
+// Each `amx` kernel handles 4 blocks at a time: 2MB * 2NB, when m < 2 * BLOCK_M, unpack A
+// will be needed.
+//
+// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
+// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
+//
+// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
+//    advanced-matrix-extensions-intrinsics-functions.html
+//
+
+#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb
+void ggml_tile_config_init(void) {
+    static thread_local bool is_first_time = true;
+
+    if (!is_first_time) {
+        return;
+    }
+
+    static thread_local tile_config_t tc;
+    tile_config_t current_tc;
+    _tile_storeconfig(&current_tc);
+
+    // load only when config changes
+    if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
+                               memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
+        tc.palette_id = 1;
+        tc.start_row = 0;
+        TC_CONFIG_TILE(TMM0, 8, 64);
+        TC_CONFIG_TILE(TMM1, 8, 64);
+        TC_CONFIG_TILE(TMM2, 16, 32);
+        TC_CONFIG_TILE(TMM3, 16, 32);
+        TC_CONFIG_TILE(TMM4, 16, 64);
+        TC_CONFIG_TILE(TMM5, 16, 64);
+        TC_CONFIG_TILE(TMM6, 16, 64);
+        TC_CONFIG_TILE(TMM7, 16, 64);
+        _tile_loadconfig(&tc);
+    }
+
+    is_first_time = false;
+}
+
+// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
+// See the notes `s8s8 igemm compensation in avx512-vnni` for detail.
+template <typename TB>
+int get_tile_size() {
+    int tile_size = TILE_N * sizeof(TB);
+    if (do_compensate<TB>::value) {
+        tile_size += TILE_N * sizeof(int32_t);
+    }
+    if (std::is_same<TB, block_q4_K>::value ||
+        std::is_same<TB, block_q5_K>::value) {
+        tile_size += TILE_N * 4;
+    }
+    if (std::is_same<TB, block_iq4_xs>::value) {
+        tile_size += TILE_N * 2;
+    }
+    return tile_size;
+}
+
+template <typename TB, int BLOCK_K>
+int get_row_size(int K) {
+    int KB = K / BLOCK_K;
+    int row_size = KB * sizeof(TB);
+    if (do_compensate<TB>::value) {
+        row_size += KB * sizeof(int32_t);
+    }
+    if (std::is_same<TB, block_q4_K>::value ||
+        std::is_same<TB, block_q5_K>::value) {
+        row_size += KB * 4;
+    }
+    if (std::is_same<TB, block_iq4_xs>::value) {
+        row_size += KB * 2;
+    }
+    return row_size;
+}
+
+// vectorized dtype conversion
+inline float FP16_TO_FP32(ggml_half val) {
+    __m256i v = _mm256_setr_epi16(
+        val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m512 o = _mm512_cvtph_ps(v);
+    return _mm512_cvtss_f32(o);
+}
+
+inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
+    __m256i v = _mm256_set1_epi16(val);
+    return _mm512_cvtph_ps(v);
+}
+
+// horizontal reduce
+inline float _mm512_reduce_max_ps(const __m512 x) {
+    __m512 v = x;
+    __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
+    v = _mm512_max_ps(v, v1);
+    v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
+    v = _mm512_max_ps(v, v1);
+    v1 = _mm512_shuffle_ps(v, v, 0x4E);
+    v = _mm512_max_ps(v, v1);
+    v1 = _mm512_shuffle_ps(v, v, 0xB1);
+    v = _mm512_max_ps(v, v1);
+    return _mm512_cvtss_f32(v);
+}
+
+// transpose utils
+#define SHUFFLE_EPI32(a, b, mask) \
+    _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))
+inline void transpose_8x8_32bit(__m256i * v, __m256i * v1) {
+    // unpacking and 32-bit elements
+    v1[0] = _mm256_unpacklo_epi32(v[0], v[1]);
+    v1[1] = _mm256_unpackhi_epi32(v[0], v[1]);
+    v1[2] = _mm256_unpacklo_epi32(v[2], v[3]);
+    v1[3] = _mm256_unpackhi_epi32(v[2], v[3]);
+    v1[4] = _mm256_unpacklo_epi32(v[4], v[5]);
+    v1[5] = _mm256_unpackhi_epi32(v[4], v[5]);
+    v1[6] = _mm256_unpacklo_epi32(v[6], v[7]);
+    v1[7] = _mm256_unpackhi_epi32(v[6], v[7]);
+
+    // shuffling the 32-bit elements
+    v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44);
+    v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee);
+    v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44);
+    v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee);
+    v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44);
+    v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee);
+    v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44);
+    v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee);
+
+    // shuffling 128-bit elements
+    v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02);
+    v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02);
+    v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02);
+    v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02);
+    v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13);
+    v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13);
+    v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13);
+    v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13);
+}
+
+inline void transpose_16x4_32bit(__m512i * r, __m512i * d) {
+
+    static const __m512i index1 = _mm512_set_epi32(
+        0x0f, 0x0b, 0x07, 0x03,
+        0x0e, 0x0a, 0x06, 0x02,
+        0x0d, 0x09, 0x05, 0x01,
+        0x0c, 0x08, 0x04, 0x00);
+
+    d[0] = _mm512_permutexvar_epi32(index1, r[0]);
+    d[1] = _mm512_permutexvar_epi32(index1, r[1]);
+    d[2] = _mm512_permutexvar_epi32(index1, r[2]);
+    d[3] = _mm512_permutexvar_epi32(index1, r[3]);
+
+    r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44);
+    r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee);
+    r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44);
+    r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee);
+
+    d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88);
+    d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd);
+    d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88);
+    d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd);
+}
+
+inline void transpose_16x16_32bit(__m512i * v) {
+    __m512i v1[16];
+    v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
+    v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
+    v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
+    v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
+    v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
+    v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
+    v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
+    v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
+    v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
+    v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
+    v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
+    v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
+    v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
+    v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
+    v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
+    v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);
+
+    v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
+    v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
+    v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
+    v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
+    v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
+    v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
+    v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
+    v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
+    v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
+    v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
+    v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
+    v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
+    v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
+    v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
+    v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
+    v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);
+
+    v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
+    v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
+    v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
+    v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
+    v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
+    v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
+    v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
+    v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
+    v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
+    v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
+    v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
+    v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
+    v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
+    v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
+    v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
+    v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);
+
+    v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
+    v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
+    v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
+    v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
+    v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
+    v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
+    v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
+    v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
+    v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
+    v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
+    v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
+    v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
+    v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
+    v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
+    v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
+    v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
+}
+
+void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    const int KB = k / QK_K;
+    constexpr int kVecs = QK_K / 16;
+
+    block_q8_K * y = reinterpret_cast<block_q8_K *>(vy);
+
+    // hold 16 float vecs from x
+    __m512  v[kVecs];
+
+    // hold the quants vecs
+    __m512i vq[kVecs / 4];
+
+    // hold the packed quants vecs
+    __m512i vq_packed[kVecs / 4];
+
+    const __m512 signBit = _mm512_set1_ps(-0.f);
+
+    for (int i = 0; i < KB; ++i) {
+        // Compute max(abs(e)) for the block
+        __m512 vamax = _mm512_set1_ps(0.f);
+        for (int j = 0; j < kVecs; ++j) {
+            v[j] = _mm512_loadu_ps(x); x += 16;
+            vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(signBit, v[j]));
+        }
+        const float amax = _mm512_reduce_max_ps(vamax);
+
+        // Quantize these floats
+        const float iscale = 127.f / amax;
+        y[i].d = GGML_FP32_TO_FP16(1 / iscale);
+        const float id = ( amax != 0.0f ) ? iscale : 0.f;
+        const __m512 vscale = _mm512_set1_ps(id);
+
+        // Apply multiplier and round to nearest integer
+        for (int j = 0; j < kVecs; ++j) {
+            v[j] = _mm512_mul_ps(v[j], vscale);
+            v[j] = _mm512_roundscale_ps(v[j], (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        }
+
+        // Pack to epi8 vecs
+        for (int j = 0; j < kVecs / 4; ++j) {
+            __m128i q8_0 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 0]));
+            __m128i q8_1 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 1]));
+            __m128i q8_2 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 2]));
+            __m128i q8_3 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 3]));
+
+            __m256i q8_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_0), (q8_1), 1);
+            __m256i q8_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_2), (q8_3), 1);
+
+            vq[j] = _mm512_inserti32x8(_mm512_castsi256_si512(q8_01), q8_23, 1);
+            _mm512_storeu_si512((__m512i *)(y[i].qs + j * 64), vq[j]);
+        }
+
+        // Compute the bsums with vnni
+        transpose_16x4_32bit(vq, vq_packed);
+
+        const __m512i one = _mm512_set1_epi8(1);
+        __m512i sum = _mm512_setzero_si512();
+        for (int k = 0; k < 4; ++k) {
+            sum = _mm512_dpbusd_epi32(sum, one, vq_packed[k]);
+        }
+        _mm256_storeu_si256((__m256i *)(y[i].bsums), _mm512_cvtepi32_epi16(sum));
+    }
+}
+
+// quantize A from float to `vec_dot_type`
+template <typename T>
+inline void from_float(const float * x, char * vy, int64_t k);
+
+template <>
+inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
+    quantize_row_q8_0(x, vy, k);
+}
+
+template <>
+inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
+    quantize_row_q8_1(x, vy, k);
+}
+
+template <>
+inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
+#if 1
+    // TODO: this is reference impl!
+    quantize_row_q8_K(x, vy, k);
+#else
+    quantize_row_q8_K_vnni(x, vy, k);
+#endif
+}
+
+// load A from memory to array when nrows can not fill in whole tile
+void unpack_A(int8_t * RESTRICT tile, const block_q8_0 * RESTRICT A, int lda, int nr) {
+    assert(nr != TILE_M);
+    for (int m = 0; m < nr; ++m) {
+        const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs));
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v);
+    }
+}
+
+void unpack_A(int8_t * RESTRICT tile, const block_q8_1 * RESTRICT A, int lda, int nr) {
+    assert(nr != TILE_M);
+    for (int m = 0; m < nr; ++m) {
+        const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs));
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v);
+    }
+}
+
+template <typename TB>
+void unpack_A(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) {
+    assert(nr <= TILE_M);
+    for (int m = 0; m < nr; ++m) {
+        const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs + k * 32));
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v);
+    }
+}
+
+template <>
+void unpack_A<block_q6_K>(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) {
+    assert(nr <= TILE_M);
+    // zero padding k from 16 to 32, so that we don't have to re-config amx
+    const __m128i zero = _mm_setzero_si128();
+    for (int m = 0; m < nr; ++m) {
+        const __m128i v = _mm_loadu_si128((const __m128i *)(A[m * lda].qs + k * 16));
+        const __m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(v), zero, 1);
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), r);
+    }
+}
+
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i lowMask = _mm256_set1_epi8(0xF);
+    return _mm256_and_si256(lowMask, bytes);
+}
+
+// used for block_q4_K
+inline __m512i bytes_from_nibbles_64(const uint8_t * rsi) {
+    const __m256i tmp = _mm256_loadu_si256((const __m256i *)rsi);
+    const __m256i lowMask = _mm256_set1_epi8(0xF);
+    const __m256i q4l = _mm256_and_si256(tmp, lowMask);
+    const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(tmp, 4), lowMask);
+    return _mm512_inserti32x8(_mm512_castsi256_si512(q4l), q4h, 1);
+}
+
+// used for block_q5_K
+inline __m512i bytes_from_nibbles_64(const uint8_t * qs, const uint8_t * qh, int k) {
+    const __m256i lowMask = _mm256_set1_epi8(0xF);
+    __m256i hmask = _mm256_set1_epi8(1);
+    hmask = _mm256_slli_epi16(hmask, k);
+
+    const __m256i q5bits = _mm256_loadu_si256((const __m256i *)qs);
+    const __m256i hbits = _mm256_loadu_si256((const __m256i *)qh);
+
+    const __m256i q5l_0 = _mm256_and_si256(q5bits, lowMask);
+    const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 0), 4);
+    const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
+    hmask = _mm256_slli_epi16(hmask, 1);
+
+    const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), lowMask);
+    const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 1), 4);
+    const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
+
+    return _mm512_inserti32x8(_mm512_castsi256_si512(q5_0), q5_1, 1);
+}
+
+// used for block_q6_K
+inline void bytes_from_nibbles_128(__m512i& r0, __m512i& r1, const uint8_t * qs, const uint8_t * qh) {
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+    const __m256i m2 = _mm256_set1_epi8(0x3);
+
+    const __m256i q6bits1 = _mm256_loadu_si256((const __m256i *)qs);
+    const __m256i q6bits2 = _mm256_loadu_si256((const __m256i *)(qs + 32));
+    const __m256i q6bitsH = _mm256_loadu_si256((const __m256i *)qh);
+
+    const __m256i q6h_0 = _mm256_slli_epi16(_mm256_and_si256(                  q6bitsH,     m2), 4);
+    const __m256i q6h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 2), m2), 4);
+    const __m256i q6h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 4), m2), 4);
+    const __m256i q6h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 6), m2), 4);
+
+    const __m256i q6_0 = _mm256_or_si256(_mm256_and_si256(q6bits1, m4), q6h_0);
+    const __m256i q6_1 = _mm256_or_si256(_mm256_and_si256(q6bits2, m4), q6h_1);
+    const __m256i q6_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits1, 4), m4), q6h_2);
+    const __m256i q6_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits2, 4), m4), q6h_3);
+
+    r0 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_0), q6_1, 1);
+    r1 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_2), q6_3, 1);
+}
+
+inline __m512i packNibbles(__m512i r0, __m512i r1) {
+    return _mm512_or_si512(r0, _mm512_slli_epi16(r1, 4));
+}
+
+template <typename TB>
+inline void pack_qs(void * RESTRICT packed_B, const TB * RESTRICT B, int KB) {
+    int8_t tmp[8 * 64];
+    __m256i v[8], v2[8];
+    for (int n = 0; n < 8; ++n) {
+        v[n] = bytes_from_nibbles_32(B[n * KB].qs);
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)(tmp + n * 64), v2[n]);
+    }
+    for (int n = 0; n < 8; ++n) {
+        v[n] = bytes_from_nibbles_32(B[(n + 8) * KB].qs);
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)(tmp + n * 64 + 32), v2[n]);
+    }
+
+    // pack again with 128 to fully utilize vector length
+    for (int n = 0; n < 8; n += 2) {
+        __m512i r0 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64));
+        __m512i r1 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64 + 64));
+        __m512i r1r0 = packNibbles(r0, r1);
+        _mm512_storeu_si512((__m512i *)((char *)packed_B + n * 32), r1r0);
+    }
+}
+
+template <>
+inline void pack_qs<block_q8_0>(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) {
+    __m256i v[8], v2[8];
+    for (int n = 0; n < 8; ++n) {
+        v[n] = _mm256_loadu_si256((const __m256i *)(B[n * KB].qs));
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64), v2[n]);
+    }
+    for (int n = 0; n < 8; ++n) {
+        v[n] = _mm256_loadu_si256((const __m256i *)(B[(n + 8) * KB].qs));
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64 + 32), v2[n]);
+    }
+}
+
+template <>
+inline void pack_qs<block_q4_K>(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) {
+    __m512i v[16];
+    // QK_K 256 with 8 groups, handle 2 groups at a time
+    char * pb = (char *)packed_B;
+    for (int k = 0; k < QK_K / 64; ++k) {
+        // pack 2 groups { n, g,  k} to {g, k/4, 4n}
+        //          e.g. {16, 2, 32} to {2,   8, 64}
+        for (int n = 0; n < TILE_N; ++n) {
+            v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32);
+        }
+
+        transpose_16x16_32bit(v);
+
+        // pack again with 128 to fully utilize vector length
+        for (int n = 0; n < TILE_N; n += 2) {
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1]));
+            pb += 64;
+        }
+    }
+}
+
+template <>
+inline void pack_qs<block_q5_K>(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) {
+    __m512i v[16];
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    // QK_K 256 with 8 groups, handle 2 groups at a time
+    char * pb = (char *)packed_B;
+    char * ph = (char *)packed_B + (QK_K / 2) * TILE_N;
+    for (int k = 0; k < QK_K / 64; ++k) {
+        // pack 2 groups { n, g,  k} to {g, k/4, 4n}
+        //          e.g. {16, 2, 32} to {2,   8, 64}
+        for (int n = 0; n < TILE_N; ++n) {
+            v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32, B[n * KB].qh, /* group */2 * k);
+        }
+
+        transpose_16x16_32bit(v);
+
+        // 1. pack lower 4bits with 2 groups
+        for (int n = 0; n < TILE_N; n += 2) {
+            // get lower 4 bits
+            const __m512i r0 = _mm512_and_si512(v[n], lowMask);
+            const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64;
+        }
+
+        // 2. pack higher 1bit with 2 groups
+        const __m512i hmask = _mm512_set1_epi8(0x10);
+        for (int g = 0; g < 2; ++g) {
+            __m512i hbits = _mm512_setzero_si512();
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 0], hmask), 4));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 1], hmask), 3));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 2], hmask), 2));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 3], hmask), 1));
+            hbits = _mm512_add_epi8(hbits,                   _mm512_and_si512(v[g * 8 + 4], hmask)    );
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 5], hmask), 1));
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 6], hmask), 2));
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 7], hmask), 3));
+            _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64;
+        }
+    }
+}
+
+template <>
+inline void pack_qs<block_q6_K>(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) {
+    __m512i v[32];
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    // QK_K 256 with 8 groups, handle 4 groups at a time
+    char * pb = (char *)packed_B;
+    char * ph = (char *)packed_B + (QK_K / 2) * TILE_N;
+    for (int k = 0; k < QK_K / 128; ++k) {
+        for (int n = 0; n < TILE_N; ++n) {
+            bytes_from_nibbles_128(v[n], v[n + 16], B[n * KB].ql + k * 64, B[n * KB].qh + k * 32);
+        }
+
+        // top half: group 0,1 or 4,5; bottom half: group 2,3 or 6,7
+        transpose_16x16_32bit(v);
+        transpose_16x16_32bit(v + 16);
+
+        // 1. pack lower 4bits with 4 groups
+        for (int n = 0; n < 32; n += 2) {
+            const __m512i r0 = _mm512_and_si512(v[n], lowMask);
+            const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64;
+        }
+
+        // 2. pack higher 2bit with 4 groups
+        const __m512i hmask = _mm512_set1_epi8(0x30);
+        for (int g = 0; g < 8; ++g) {
+            __m512i hbits = _mm512_setzero_si512();
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 0], hmask), 4));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 1], hmask), 2));
+            hbits = _mm512_add_epi8(hbits,                   _mm512_and_si512(v[g * 4 + 2], hmask)    );
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 4 + 3], hmask), 2));
+            _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64;
+        }
+    }
+}
+
+template <>
+inline void pack_qs<block_iq4_xs>(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) {
+    __m512i v[16];
+    char * pb = (char *)packed_B;
+    for (int k = 0; k < QK_K / 64; ++k) {
+        for (int n = 0; n < TILE_N; ++n) {
+            __m256i r0 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 +  0);
+            __m256i r1 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 16);
+            v[n] = _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+        }
+
+        transpose_16x16_32bit(v);
+
+        // pack again with 128 to fully utilize vector length
+        for (int n = 0; n < TILE_N; n += 2) {
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1]));
+            pb += 64;
+        }
+    }
+}
+
+// pack B to vnni formats in 4bits or 8 bits
+void pack_B(void * RESTRICT packed_B, const block_q4_0 * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+    ggml_half * d0 = reinterpret_cast<ggml_half *>((char *)packed_B + TILE_N * TILE_K / 2);
+    for (int n = 0; n < TILE_N; ++n) {
+        d0[n] = B[n * KB].d;
+    }
+}
+
+void pack_B(void * RESTRICT packed_B, const block_q4_1 * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+    ggml_half * d0 = reinterpret_cast<ggml_half *>((char *)packed_B + TILE_N * TILE_K / 2);
+    ggml_half * m0 = d0 + TILE_N;
+    for (int n = 0; n < TILE_N; ++n) {
+        d0[n] = B[n * KB].d;
+        m0[n] = B[n * KB].m;
+    }
+}
+
+inline void s8s8_compensation(void * RESTRICT packed_B) {
+    // packed_B layout:
+    //   quants {TILE_N, TILEK}  int8_t
+    //   d0     {TILE_N}      ggml_half
+    //   comp   {TILE_N}        int32_t
+    const int offset = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
+    __m512i vcomp = _mm512_setzero_si512();
+    const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+    for (int k = 0; k < 8; ++k) {
+        __m512i vb = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + k * 64));
+        vcomp = _mm512_dpbusd_epi32(vcomp, off, vb);
+    }
+    _mm512_storeu_si512((__m512i *)((char *)(packed_B) + offset), vcomp);
+}
+
+void pack_B(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+    ggml_half * d0 = reinterpret_cast<ggml_half *>((char *)packed_B + TILE_N * TILE_K);
+    for (int n = 0; n < TILE_N; ++n) {
+        d0[n] = B[n * KB].d;
+    }
+    s8s8_compensation(packed_B);
+}
+
+// convert 8 * {min, scale} from int6 to int8
+inline void unpack_mins_and_scales(const uint8_t * scales, uint32_t * utmp) {
+    const uint32_t kmask1 = 0x3f3f3f3f;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+    const uint32_t kmask3 = 0x03030303;
+
+    memcpy(utmp, scales, 12);
+    utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+    const uint32_t uaux = utmp[1] & kmask1;
+    utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+    utmp[2] = uaux;
+    utmp[0] &= kmask1;
+}
+
+// packed_B layout:
+//   quants {8, TILE_N, 16}  uint8
+//   scales {8, TILE_N}      uint8
+//   mins   {8, TILE_N}      uint8
+//   d      {TILE_N}     ggml_half
+//   dmin   {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    uint8_t * scales = reinterpret_cast<uint8_t *>((char *)packed_B + (QK_K / 2) * TILE_N);
+    uint8_t * mins = scales + 8 * TILE_N;
+    ggml_half * d = reinterpret_cast<ggml_half *>(mins + 8 * TILE_N);
+    ggml_half * dmin = d + TILE_N;
+
+    union {
+        uint32_t u32[4];
+        uint8_t  u8[16];
+    } s;
+
+    for (int n = 0; n < TILE_N; ++n) {
+        unpack_mins_and_scales(B[n * KB].scales, s.u32);
+        for (int k = 0; k < 8; ++k) {
+            scales[k * TILE_N + n] = s.u8[k];
+            mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
+        }
+        d[n] = B[n * KB].d;
+        dmin[n] = B[n * KB].dmin;
+    }
+}
+
+// packed_B layout:
+//   quants {8, TILE_N, 16}  uint8
+//   qh     {8, TILE_N,  4}  uint8
+//   scales {8, TILE_N}      uint8
+//   mins   {8, TILE_N}      uint8
+//   d      {TILE_N}     ggml_half
+//   dmin   {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    uint8_t * scales = reinterpret_cast<uint8_t *>((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
+    uint8_t * mins = scales + 8 * TILE_N;
+    ggml_half * d = reinterpret_cast<ggml_half *>(mins + 8 * TILE_N);
+    ggml_half * dmin = d + TILE_N;
+
+    union {
+        uint32_t u32[4];
+        uint8_t  u8[16];
+    } s;
+
+    for (int n = 0; n < TILE_N; ++n) {
+        unpack_mins_and_scales(B[n * KB].scales, s.u32);
+        for (int k = 0; k < 8; ++k) {
+            scales[k * TILE_N + n] = s.u8[k];
+            mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
+        }
+        d[n] = B[n * KB].d;
+        dmin[n] = B[n * KB].dmin;
+    }
+}
+
+// packed_B layout:
+//   quants {16, TILE_N, 8}  uint8
+//   qh     {16, TILE_N, 4}  uint8
+//   scales {16, TILE_N}      uint8
+//   d      {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    uint8_t * scales = reinterpret_cast<uint8_t *>((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
+    ggml_half * d = reinterpret_cast<ggml_half *>(scales + 16 * TILE_N);
+    for (int n = 0; n < TILE_N; ++n) {
+        const int8_t * ps = B[n * KB].scales;
+        for (int k = 0; k < 16; ++k) {
+            scales[k * TILE_N + n] = ps[k];
+        }
+        d[n] = B[n * KB].d;
+    }
+}
+
+// packed_B layout:
+//   quants {8, TILE_N, 16}  uint8
+//   scales {8, TILE_N}       int8
+//   d      {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    int8_t * scales = reinterpret_cast<int8_t *>((char *)packed_B + (QK_K / 2) * TILE_N);
+    ggml_half * d = reinterpret_cast<ggml_half *>(scales + 8 * TILE_N);
+
+    // pack the scales
+    for (int n = 0; n < TILE_N; ++n) {
+        uint16_t sh = B[n * KB].scales_h;
+        for (int k = 0; k < 8; k += 2) {
+            const int16_t ls1 = ((B[n * KB].scales_l[k / 2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((B[n * KB].scales_l[k / 2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            scales[(k + 0) * TILE_N + n] = ls1;
+            scales[(k + 1) * TILE_N + n] = ls2;
+            sh >>= 4;
+        }
+        d[n] = B[n * KB].d;
+    }
+}
+
+template<typename TB, typename packed_B_t = packed_B_type<TB>>
+void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
+    GGML_UNUSED(tile);
+    GGML_UNUSED(packed_B);
+};
+
+template <>
+void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
+  const __m512i off = _mm512_set1_epi8(8);
+  const __m512i lowMask = _mm512_set1_epi8(0xF);
+  for (int n = 0; n < 8; n += 2) {
+    __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32));
+    const __m512i r0 = _mm512_sub_epi8(_mm512_and_si512(bytes, lowMask), off);
+    const __m512i r1 = _mm512_sub_epi8(_mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask), off);
+    _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+    _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+  }
+}
+
+template <>
+void unpack_B<block_q4_1>(uint8_t * RESTRICT tile, const void * RESTRICT packed_B) {
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32));
+        const __m512i r0 = _mm512_and_si512(bytes, lowMask);
+        const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+// packed_B_t for QKK is int8_t
+template <typename TB>
+void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
+    const char * packed_B_group = (const char *)packed_B + k * packed_B_group_size;
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512(packed_B_group + n * 32);
+        const __m512i r0 = _mm512_and_si512(bytes, lowMask);
+        const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+template <>
+void unpack_B<block_q5_K>(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    // lower 4bits, stride 256 bytes
+    const int packed_l4_group_size = QK_K / 2 * TILE_N / 8;
+    const char * pb = (const char *)packed_B + k * packed_l4_group_size;
+
+    // higher 1bit, stride 64 bytes
+    const int packed_h1_group_size = QK_K / 8 * TILE_N / 8;
+    const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h1_group_size;
+    const __m512i hbits = _mm512_loadu_si512(ph);
+
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    __m512i hmask0 = _mm512_set1_epi8(0x1);
+    __m512i hmask1 = _mm512_set1_epi8(0x2);
+
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512(pb + n * 32);
+        __m512i r0 = _mm512_and_si512(bytes, lowMask);
+        __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+        __m512i h0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), n), 4);
+        __m512i h1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), n + 1), 4);
+
+        hmask0 = _mm512_slli_epi16(hmask0, 2);
+        hmask1 = _mm512_slli_epi16(hmask1, 2);
+        r0 = _mm512_add_epi8(r0, h0);
+        r1 = _mm512_add_epi8(r1, h1);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+template <>
+void unpack_B<block_q6_K>(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    // lower 4bits, stride 128 bytes
+    const int packed_l4_group_size = QK_K / 2 * TILE_N / 16;
+    const char * pb = (const char *)packed_B + k * packed_l4_group_size;
+
+    // higher 2bits, stride 64 bytes
+    const int packed_h2_group_size = QK_K / 4 * TILE_N / 16;
+    const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h2_group_size;
+    const __m512i hbits = _mm512_loadu_si512(ph);
+
+    const __m512i off = _mm512_set1_epi8(32);
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    __m512i hmask0 = _mm512_set1_epi8(0x3); // 0011
+    __m512i hmask1 = _mm512_set1_epi8(0xC); // 1100
+
+    // notes: skip zero padding from row4 to row7 as we have done so in `unpack_A`
+    __m512i bytes = _mm512_loadu_si512(pb);
+    __m512i r0 = _mm512_and_si512(bytes, lowMask);
+    __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+    __m512i h0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask0), 4);
+    __m512i h1 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask1), 2);
+    _mm512_storeu_si512((__m512i *)(tile +  0), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
+    _mm512_storeu_si512((__m512i *)(tile + 64), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));
+
+    hmask0 = _mm512_slli_epi16(hmask0, 4);
+    hmask1 = _mm512_slli_epi16(hmask1, 4);
+
+    bytes = _mm512_loadu_si512(pb + 64);
+    r0 = _mm512_and_si512(bytes, lowMask);
+    r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+    h0 =                   _mm512_and_si512(hbits, hmask0);
+    h1 = _mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), 2);
+    _mm512_storeu_si512((__m512i *)(tile + 128), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
+    _mm512_storeu_si512((__m512i *)(tile + 192), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));
+}
+
+template <>
+void unpack_B<block_iq4_xs>(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    static const __m512i values128 = _mm512_set_epi8(
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127
+    );
+
+    const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
+    const char * pb = (const char *)packed_B + k * packed_B_group_size;
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512(pb + n * 32);
+        const __m512i r0 = _mm512_shuffle_epi8(values128, _mm512_and_si512(bytes, lowMask));
+        const __m512i r1 = _mm512_shuffle_epi8(values128, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+template <typename TA, typename TB, bool is_acc>
+struct acc_C {};
+
+template <bool is_acc>
+struct acc_C<block_q8_0, block_q4_0, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * A, int lda, const void * packed_B, int nr) {
+        const int offset = TILE_N * TILE_K / 2;
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
+
+        for (int m = 0; m < nr; ++m) {
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+            vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_1, block_q4_1, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_1 * A, int lda, const void * packed_B, int nr) {
+        const int offset = TILE_N * TILE_K / 2;
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
+        const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
+
+        for (int m = 0; m < nr; ++m) {
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+            vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
+            vsum = _mm512_fmadd_ps(vm0, vs1, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_0, block_q8_0, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * A, int lda, const void * packed_B, int nr) {
+        const int offset = TILE_N * TILE_K;
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
+
+        for (int m = 0; m < nr; ++m) {
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+            vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_q4_K, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N);
+        const uint8_t * mins = scales + 8 * TILE_N;
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(mins + 8 * TILE_N);
+        const ggml_half * dmin = d0 + TILE_N;
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+        const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums);
+            const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_q5_K, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
+        const uint8_t * mins = scales + 8 * TILE_N;
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(mins + 8 * TILE_N);
+        const ggml_half * dmin = d0 + TILE_N;
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+        const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums);
+            const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_q6_K, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(scales + 16 * TILE_N);
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_iq4_xs, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const int8_t * scales = reinterpret_cast<const int8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N);
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(scales + 8 * TILE_N);
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <typename TB> constexpr int get_quants_size();
+template <> constexpr int get_quants_size<block_q4_K>() { return (QK_K / 2) * TILE_N; }
+template <> constexpr int get_quants_size<block_q5_K>() { return (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N; }
+template <> constexpr int get_quants_size<block_q6_K>() { return (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N; }
+template <> constexpr int get_quants_size<block_iq4_xs>() { return (QK_K / 2) * TILE_N; }
+
+// used for QKK format
+template <typename TB, bool is_acc,
+          typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
+inline void scale_C(const int32_t * RESTRICT tile, int32_t * RESTRICT sumi, const void * packed_B, int k, int nr) {
+    const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + get_quants_size<TB>());
+    const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(scales + k * TILE_N)));
+
+    for (int m = 0; m < nr; ++m) {
+        __m512i vsumi;
+        if (is_acc) {
+            vsumi = _mm512_loadu_si512(sumi + m * TILE_N);
+        } else {
+            vsumi = _mm512_setzero_si512();
+        }
+        __m512i vtile = _mm512_loadu_si512(tile + m * TILE_N);
+        vsumi = _mm512_add_epi32(vsumi, _mm512_mullo_epi32(vtile, vscale));
+        _mm512_storeu_si512((__m512i *)(sumi + m * TILE_N), vsumi);
+    }
+}
+
+template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_avx {
+    static void apply(int K, const TA * RESTRICT A, const TB * RESTRICT B, TC * RESTRICT C, int ldc) {
+        GGML_UNUSED(K);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        GGML_UNUSED(C);
+        GGML_UNUSED(ldc);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int K, const float * RESTRICT A, const ggml_fp16_t * RESTRICT B, float * RESTRICT C, int ldc) {
+        constexpr int ROWS = BLOCK_M;
+        constexpr int COLS = BLOCK_N;
+        assert(BLOCK_K == 16);
+
+        __m512 va;
+        __m512 vb[COLS];
+        __m512 vc[ROWS * COLS];
+
+        auto loadc = [&](int idx) {
+            vc[idx] = _mm512_setzero_ps();
+        };
+        Unroll<ROWS * COLS>{}(loadc);
+
+        auto compute = [&](int idx, int k) {
+            // TODO: use `constexpr` here to get rid of interger div
+            // when upgraded to C++17
+            const int row = idx / COLS;
+            const int col = idx % COLS;
+
+            if (col == 0) {
+                va = _mm512_loadu_ps(A + row * K + k);
+            }
+            if (row == 0) {
+                vb[col] =  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
+            }
+            vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
+        };
+
+        for (int k = 0; k < K; k += 16) {
+            Unroll<ROWS * COLS>{}(compute, k);
+        }
+
+        auto storec = [&](int idx) {
+            const int row = idx / COLS;
+            const int col = idx % COLS;
+            C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
+        };
+        Unroll<ROWS * COLS>{}(storec);
+    }
+};
+
+#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE)                                \
+    tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply(    \
+        K, (const float *)src1->data + mb_start * K,                                \
+        (const type *)src0->data + nb_start * K,                                    \
+        (float *)dst->data + mb_start * ldc + nb_start, ldc);
+
+
+// re-organize in the format {NB, KB, TILE_SIZE}:
+#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
+
+template<typename TB, int BLOCK_K>
+void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
+    const int NB = N / TILE_N;
+    const int KB = K / BLOCK_K;
+    const int TILE_SIZE = get_tile_size<TB>();
+
+    // parallel on NB should be enough
+    parallel_for(n_threads, NB, [&](int begin, int end) {
+        for (int n = begin; n < end; ++n) {
+            for (int k = 0; k < KB; ++k) {
+                int n0 = n * TILE_N;
+                pack_B((char *)packed_B + PACKED_INDEX(n, k, KB, TILE_SIZE), &B[n0 * KB + k], KB);
+            }
+        }
+    });
+}
+
+template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni {};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q4_0);
+
+        const block_q8_0 * RESTRICT A = static_cast<const block_q8_0 *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        __m512i va[8];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // sum of offsets, shared across COLS
+        //
+        // avx512-vnni does not have `_mm512_dpbssd_epi32`,
+        // need to transfrom ss to us:
+        //   a * (b - 8) is equavilent to b * a - 8 * a
+        //   s    u   u                   u   s   u   s
+        //
+        __m512i vcomp;
+
+        const __m512i off = _mm512_set1_epi8(8);
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](int col, int i) {
+            // load a and compute compensation
+            if (col == 0) {
+                const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
+                vcomp = _mm512_setzero_si512();
+                for (int k = 0; k < 8; ++k) {
+                    va[k] = _mm512_set1_epi32(a_ptr[k]);
+                    vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
+                }
+                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            }
+
+            // load b
+            __m512i vsum = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            for (int k = 0; k < 8; k += 2) {
+                __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32));
+                __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                vsum = _mm512_dpbusd_epi32(vsum, vb0, va[k + 0]);
+                __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                vsum = _mm512_dpbusd_epi32(vsum, vb1, va[k + 1]);
+            }
+            const int offset = TILE_N * TILE_K / 2;
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+            vsum = _mm512_sub_epi32(vsum, vcomp);
+
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q4_1);
+
+        const block_q8_1 * RESTRICT A = static_cast<const block_q8_1 *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        __m512i va[8];
+        __m512i vb[8];
+        __m512 vc[COLS];
+        __m512 vd1, vs1;
+
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](int col, int i) {
+            // load a
+            if (col == 0) {
+                const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
+                for (int k = 0; k < 8; ++k) {
+                    va[k] = _mm512_set1_epi32(a_ptr[k]);
+                }
+                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+                vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
+            }
+
+            // load b
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            for (int k = 0; k < 8; k += 2) {
+                __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32));
+                vb[k + 0] = _mm512_and_si512(bytes, lowMask);
+                vb[k + 1] = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+            }
+            const int offset = TILE_N * TILE_K / 2;
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+            const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset + TILE_N * sizeof(ggml_half))));
+
+            __m512i vsum = _mm512_setzero_si512();
+            for (int k = 0; k < 8; ++k) {
+                vsum = _mm512_dpbusd_epi32(vsum, vb[k], va[k]);
+            }
+
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
+            vc[col] = _mm512_fmadd_ps(vm0, vs1, vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q8_0) + TILE_N * sizeof(int32_t);
+
+        const block_q8_0 * RESTRICT A = static_cast<const block_q8_0 *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        __m512i va[8];
+        __m512i vb[8];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // Notes: s8s8 igemm compensation in avx512-vnni
+        // change s8s8 to u8s8 with compensate
+        //   a * b = (a + 128) * b - 128 * b
+        //   s   s       u       s    u    s
+        //
+        // (128 * b is pre-computed when packing B to vnni formats)
+        //
+        const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](int col, int i) {
+            // load a and add offset 128
+            if (col == 0) {
+                const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
+                for (int k = 0; k < 8; ++k) {
+                    va[k] = _mm512_set1_epi32(a_ptr[k]);
+                    va[k] = _mm512_add_epi8(va[k], off);
+                }
+                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            }
+
+            // load b
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            for (int k = 0; k < 8; ++k) {
+                vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64));
+            }
+            const int offset = TILE_N * TILE_K;
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+            const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
+            const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2));
+
+            __m512i vsum = _mm512_setzero_si512();
+            for (int k = 0; k < 8; ++k) {
+                vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
+            }
+            vsum = _mm512_sub_epi32(vsum, vcomp);
+
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q4_K) + TILE_N * 4;
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // a.qs:   8 groups, 32 bytes each group (m256i)
+        __m512i va[8];
+        // a.bsum: 8 groups,  2 bytes each group (m128i)
+        __m512i va_bsum;
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_scales = (QK_K / 2) * TILE_N;
+        const int offset_mins   = (QK_K / 2) * TILE_N +  8 * TILE_N;
+        const int offset_d0     = (QK_K / 2) * TILE_N + 16 * TILE_N;
+        const int offset_dmin   = (QK_K / 2) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);
+
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        // Notes: vnni formats in QK_K
+        //   a) quants vnni format
+        //     int8  {k/4, n, 4}, viewed as 2d {k/4, 4n}, k = 32
+        //     from {16, 32} to {8, 64}
+        //
+        //   b) min vnni format
+        //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
+        //     from {16,  8} to {4, 32}
+        //
+        auto compute = [&](int col, int i) {
+            // load a
+            if (col == 0) {
+                for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                    va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
+                }
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+                va_bsum = _mm512_castsi128_si512(q8s);
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // step 1: accumultate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs  = b_ptr;
+            for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                __m512i vsum = _mm512_setzero_si512();
+                for (int k = 0; k < 8; k += 2) {
+                    __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
+                    __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);
+
+                    __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs);
+                    __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                    __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+
+                    b_qs += 64;
+                }
+                // vacc += scale * (q8 @ q4)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+
+            // step 2: accumulate the mins
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+            const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin)));
+            vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4;
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // a.qs:   8 groups, 32 bytes each group (m256i)
+        __m512i va[8];
+        // a.bsum: 8 groups,  2 bytes each group (m128i)
+        __m512i va_bsum;
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_qh     = (QK_K / 2) * TILE_N;
+        const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N;
+        const int offset_mins   = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N +  8 * TILE_N;
+        const int offset_d0     = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N;
+        const int offset_dmin   = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);
+
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        // Q5_K and Q4_K shares the same vnni formats, refer to notes above.
+        auto compute = [&](int col, int i) {
+            // load a
+            if (col == 0) {
+                for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                    va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
+                }
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+                va_bsum = _mm512_castsi128_si512(q8s);
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // step 1: accumultate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs  = b_ptr;
+            const char * b_qh  = b_ptr + offset_qh;
+            for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                __m512i vsum = _mm512_setzero_si512();
+                __m512i hmask0 = _mm512_set1_epi8(0x1);
+                __m512i hmask1 = _mm512_set1_epi8(0x2);
+                __m512i hbits = _mm512_loadu_si512((const __m512i *)(b_qh + k_group * 64));
+                for (int k = 0; k < 8; k += 2) {
+                    __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
+                    __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);
+
+                    __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs);
+                    __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                    __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+
+                    __m512i vh0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), k), 4);
+                    __m512i vh1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), k + 1), 4);
+
+                    hmask0 = _mm512_slli_epi16(hmask0, 2);
+                    hmask1 = _mm512_slli_epi16(hmask1, 2);
+                    vb0 = _mm512_add_epi8(vb0, vh0);
+                    vb1 = _mm512_add_epi8(vb1, vh1);
+
+                    vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+
+                    b_qs += 64;
+                }
+                // vacc += scale * (q8 @ q5)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+
+            // step 2: accumulate the mins
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+            const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin)));
+            vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q6_K);
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // load the 256 bytes from A to 4 avx512 vectors
+        __m512i va[4];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_qh     = (QK_K / 2) * TILE_N;
+        const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N;
+        const int offset_d0     = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N + 16 * TILE_N;
+
+        // compensation
+        __m512i vcomp;
+
+        const __m512i m32s = _mm512_set1_epi32(32);
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](int col, int i) {
+            if (col == 0) {
+                // load a
+                va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0));
+                va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64));
+                va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128));
+                va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192));
+
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                vcomp = _mm512_mullo_epi32(_mm512_cvtepi16_epi32(q8sums), m32s);
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // accmulate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs = b_ptr;
+            const char * b_qh = b_ptr + offset_qh;
+            int mask = 0;
+            for (int k_group = 0; k_group < QK_K / 16; ++k_group) {
+                int r = k_group >> 2;
+                __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+                __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+
+                __m512i vsum = _mm512_setzero_si512();
+                __m512i hmask = _mm512_set1_epi8(0x3);
+
+                __m512i bytes = _mm512_loadu_si512(b_qs);
+                __m512i hbits = _mm512_loadu_si512(b_qh);
+                __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                __m512i vh0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask), 4);
+                __m512i vh1 = _mm512_slli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 2)), 2);
+
+                vb0 = _mm512_add_epi8(vb0, vh0);
+                vb1 = _mm512_add_epi8(vb1, vh1);
+                vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+                b_qs += 64;
+
+                va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+                va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+
+                bytes = _mm512_loadu_si512(b_qs);
+                vb0 = _mm512_and_si512(bytes, lowMask);
+                vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                vh0 =                   _mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 4));
+                vh1 = _mm512_srli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 6)), 2);
+                vb0 = _mm512_add_epi8(vb0, vh0);
+                vb1 = _mm512_add_epi8(vb1, vh1);
+                vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+                b_qs += 64;
+                b_qh += 64;
+
+                // B * A - 32 * A
+                __m512i vmask = _mm512_set1_epi32(k_group);
+                vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));
+
+                // vacc += scale * (q8 @ q6)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_iq4_xs) + TILE_N * 2;
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // load the 256 bytes from A to 4 avx512 vectors
+        __m512i va[4];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_scales = (QK_K / 2) * TILE_N ;
+        const int offset_d0     = (QK_K / 2) * TILE_N + 8 * TILE_N;
+
+        // compensation
+        __m512i vcomp;
+
+        const __m256i m128s = _mm256_set1_epi16(128);
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        const __m512i values128 = _mm512_set_epi8(
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127
+        );
+        const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+        const __m512i values256 = _mm512_add_epi8(values128, off);
+
+        auto loadc = [&](int col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](int col, int i) {
+            if (col == 0) {
+                // load a
+                va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0));
+                va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64));
+                va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128));
+                va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192));
+
+                // compensation: 128 * A
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                vcomp = _mm512_castsi256_si512(_mm256_madd_epi16(q8sums, m128s));
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // accmulate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs = b_ptr;
+            int mask = 0;
+            for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                int r = k_group >> 1;
+                __m512i vmask = _mm512_set1_epi32(k_group);
+                __m512i vsum = _mm512_setzero_si512();
+                for (int k = 0; k < 8; k += 2) {
+                    __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+                    __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+
+                    __m512i bytes = _mm512_loadu_si512(b_qs);
+                    __m512i vb0 = _mm512_shuffle_epi8(values256, _mm512_and_si512(bytes, lowMask));
+                    __m512i vb1 = _mm512_shuffle_epi8(values256, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));
+
+                    vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+                    b_qs += 64;
+                }
+                // (B + 128) * A - 128 * A
+                vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));
+
+                // vacc += scale * (q8 @ q4)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE)                                         \
+    tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply(   \
+        KB, (const char *)wdata + 0 * row_size_A,                                    \
+        (const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE),     \
+        (float *) dst->data + 0 * N + nb_start, ldc)
+
+template <typename TA, typename TB, typename TC, int BLOCK_K,
+          typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
+void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, TC * RESTRICT C, int ldc) {
+    using packed_B_t = packed_B_type<TB>;
+    const int TILE_SIZE = get_tile_size<TB>();
+    const bool need_unpack = do_unpack<TB>::value;
+
+    GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
+    const TA * RESTRICT A = static_cast<const TA *>(_A);
+    const char * RESTRICT B = static_cast<const char *>(_B);
+
+    const int m0 = std::min(M, TILE_M);
+    const int m1 = std::max(M - TILE_M, 0);
+    const int lda = KB * sizeof(TA);
+    //const int ldb = KB * sizeof(TB);
+
+    static thread_local packed_B_t Tile0[TILE_N * TILE_K];
+    static thread_local packed_B_t Tile1[TILE_N * TILE_K];
+    static thread_local int8_t Tile23[TILE_M * TILE_K];
+
+    static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
+    static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
+
+    // double buffering C to interleave avx512 and amx
+    int32_t * C_cur = TileC0;
+    int32_t * C_pre = TileC1;
+
+    auto Tile4 = [&](int32_t * base) { return base; };
+    auto Tile5 = [&](int32_t * base) { return base + TILE_M * TILE_N; };
+    auto Tile6 = [&](int32_t * base) { return base + 2 * TILE_M * TILE_N; };
+    auto Tile7 = [&](int32_t * base) { return base + 3 * TILE_M * TILE_N; };
+
+    if (M == 2 * TILE_M) {
+        // i = 0
+        const char * B_blk0 = B + PACKED_INDEX(0, 0, KB, TILE_SIZE);
+        const char * B_blk1 = B + PACKED_INDEX(1, 0, KB, TILE_SIZE);
+        if (need_unpack) {
+            unpack_B<TB>(Tile0, B_blk0);
+            _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+        } else {
+            _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
+        }
+
+        _tile_zero(TMM4);
+        _tile_loadd(TMM2, A[0].qs, lda);
+        _tile_dpbssd(TMM4, TMM2, TMM0);
+        _tile_stored(TMM4, Tile4(C_pre), TILE_N * sizeof(int32_t));
+
+        _tile_zero(TMM5);
+        _tile_loadd(TMM3, A[TILE_M * KB + 0].qs, lda);
+        _tile_dpbssd(TMM5, TMM3, TMM0);
+        _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));
+
+        if (need_unpack) {
+            unpack_B<TB>(Tile1, B_blk0);
+            _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+        } else {
+            _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
+        }
+
+        _tile_zero(TMM6);
+        _tile_dpbssd(TMM6, TMM2, TMM1);
+        _tile_stored(TMM6, Tile6(C_pre), TILE_N * sizeof(int32_t));
+
+        _tile_zero(TMM7);
+        _tile_dpbssd(TMM7, TMM3, TMM1);
+        _tile_stored(TMM7, Tile7(C_pre), TILE_N * sizeof(int32_t));
+
+        for (int i = 1; i < KB; ++i) {
+            // index of previous iter
+            const int ii = i - 1;
+            const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
+            const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
+            GGML_DISPATCH_BOOL(ii > 0, is_acc, [&] {
+                if (need_unpack) {
+                    unpack_B<TB>(Tile0, B_blk0);
+                    _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+                } else {
+                    _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
+                }
+                _tile_zero(TMM4);
+                _tile_loadd(TMM2, A[i].qs, lda);
+                acc_C<TA, TB, is_acc>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM4, TMM2, TMM0);
+                _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));
+
+                _tile_zero(TMM5);
+                _tile_loadd(TMM3, A[TILE_M * KB + i].qs, lda);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM5, TMM3, TMM0);
+                _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));
+
+                if (need_unpack) {
+                    unpack_B<TB>(Tile1, B_blk1);
+                    _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+                } else {
+                    _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
+                }
+                _tile_zero(TMM6);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM6, TMM2, TMM1);
+                _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));
+
+                _tile_zero(TMM7);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM7, TMM3, TMM1);
+                _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));
+
+                std::swap(C_cur, C_pre);
+            });
+        }
+        // final accumulation
+        {
+            int ii = KB - 1;
+            acc_C<TA, TB, true>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+            acc_C<TA, TB, true>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+            acc_C<TA, TB, true>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+            acc_C<TA, TB, true>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+        }
+    } else {
+        for (int i = 0; i < KB; ++i) {
+            _tile_zero(TMM4);
+            _tile_zero(TMM6);
+            if (m1 != 0) {
+                _tile_zero(TMM5);
+                _tile_zero(TMM7);
+            }
+
+            const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
+            const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
+            if (need_unpack) {
+                unpack_B<TB>(Tile0, B_blk0);
+                _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+            } else {
+                _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
+            }
+
+            if (need_unpack) {
+                unpack_B<TB>(Tile1, B_blk1);
+                _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+            } else {
+                _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
+            }
+
+            if (m0 == TILE_M) {
+                _tile_loadd(TMM2, A[i].qs, lda);
+            } else {
+                unpack_A(Tile23, &A[i], KB, m0);
+                _tile_loadd(TMM2, Tile23, TILE_K);
+            }
+
+            _tile_dpbssd(TMM4, TMM2, TMM0);
+            _tile_dpbssd(TMM6, TMM2, TMM1);
+
+            _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));
+            _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));
+
+            GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
+                acc_C<TA, TB, is_acc>::apply(C,          ldc, Tile4(C_cur), &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_cur), &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0);
+            });
+
+            if (m1 != 0) {
+                unpack_A(Tile23, &A[TILE_M * KB + i], KB, m1);
+                _tile_loadd(TMM3, Tile23, TILE_K);
+
+                _tile_dpbssd(TMM5, TMM3, TMM0);
+                _tile_dpbssd(TMM7, TMM3, TMM1);
+                _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));
+                _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));
+                GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
+                    acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc,          ldc, Tile5(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
+                    acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
+                });
+            }
+        }
+    }
+    return;
+}
+
+template <typename TA, typename TB, typename TC, int BLOCK_K,
+          typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
+void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+    static_assert(std::is_same<TA, block_q8_K>::value);
+    const int TILE_SIZE = get_tile_size<TB>();
+
+    GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
+    const TA * RESTRICT A = static_cast<const TA *>(_A);
+    const char * RESTRICT B = static_cast<const char *>(_B);
+
+    const int m0 = std::min(M, TILE_M);
+    const int m1 = std::max(M - TILE_M, 0);
+    //const int lda = KB * sizeof(TA);
+
+    static thread_local int8_t Tile0[TILE_N * TILE_K];
+    static thread_local int8_t Tile1[TILE_N * TILE_K];
+    static thread_local int8_t Tile23[TILE_M * TILE_K];
+
+    // mat mul result for each group
+    static thread_local int32_t Tile4[TILE_M * TILE_N];
+    static thread_local int32_t Tile5[TILE_M * TILE_N];
+    static thread_local int32_t Tile6[TILE_M * TILE_N];
+    static thread_local int32_t Tile7[TILE_M * TILE_N];
+
+    // sum of each QK_K block, contains 8 groups, int32
+    static thread_local int32_t Sumi4[TILE_M * TILE_N];
+    static thread_local int32_t Sumi5[TILE_M * TILE_N];
+    static thread_local int32_t Sumi6[TILE_M * TILE_N];
+    static thread_local int32_t Sumi7[TILE_M * TILE_N];
+
+    const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
+    for (int i = 0; i < KB; ++i) {
+        // step 1: accumulate the quants across 8 groups, each group with 32
+        for (int k = 0; k < QK_K / k_group_size; ++k) {
+            GGML_DISPATCH_BOOL(k > 0, is_acc, [&] {
+                _tile_zero(TMM4);
+                _tile_zero(TMM6);
+
+                unpack_B<TB>(Tile0, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k);
+                _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+
+                unpack_B<TB>(Tile1, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k);
+                _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+
+                unpack_A<TB>(Tile23, &A[i], KB, k, m0);
+                _tile_loadd(TMM2, Tile23, TILE_K);
+
+                _tile_dpbssd(TMM4, TMM2, TMM0);
+                _tile_dpbssd(TMM6, TMM2, TMM1);
+
+                _tile_stored(TMM4, Tile4, TILE_N * sizeof(int32_t));
+                _tile_stored(TMM6, Tile6, TILE_N * sizeof(int32_t));
+
+                scale_C<TB, is_acc>(Tile4, Sumi4, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m0);
+                scale_C<TB, is_acc>(Tile6, Sumi6, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m0);
+
+                if (m1 != 0) {
+                    _tile_zero(TMM5);
+                    _tile_zero(TMM7);
+
+                    unpack_A<TB>(Tile23, &A[TILE_M * KB + i], KB, k, m1);
+                    _tile_loadd(TMM3, Tile23, TILE_K);
+
+                    _tile_dpbssd(TMM5, TMM3, TMM0);
+                    _tile_dpbssd(TMM7, TMM3, TMM1);
+
+                    _tile_stored(TMM5, Tile5, TILE_N * sizeof(int32_t));
+                    _tile_stored(TMM7, Tile7, TILE_N * sizeof(int32_t));
+
+                    scale_C<TB, is_acc>(Tile5, Sumi5, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m1);
+                    scale_C<TB, is_acc>(Tile7, Sumi7, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m1);
+                }
+            });
+        }
+
+        // step 2: accmulate the mins
+        GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
+            acc_C<TA, TB, is_acc>::apply(C,          ldc, Sumi4, &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
+            acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Sumi6, &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0);
+            if (m1 != 0) {
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc,          ldc, Sumi5, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Sumi7, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
+            }
+        });
+    }
+    return;
+}
+
+} // anonymous namespace
+
+// get the packed tensor size for quantized weights
+size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) {
+    const enum ggml_type TYPE = tensor->type;
+
+    const int K = tensor->ne[0]; // ne0: in_features
+    const int N = tensor->ne[1]; // ne1: out_features
+
+    auto get_tensor_size = [&] {
+        size_t row_size_B{0};
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            row_size_B = get_row_size<type, blck_size>(K);
+        });
+        return N * row_size_B;
+    };
+
+    if (qtype_has_amx_kernels(TYPE)) {
+        return get_tensor_size();
+    } else {
+        // for f16, bf16 we don't do packing
+        return ggml_nbytes(tensor);
+    }
+}
+
+// pack weight to vnni format
+void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+
+    size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor);
+    GGML_ASSERT(alloc_size == size);
+
+    const enum ggml_type TYPE = tensor->type;
+
+    const int K = tensor->ne[0]; // ne0: in_features
+    const int N = tensor->ne[1]; // ne1: out_features
+
+#if defined(_OPENMP)
+    // the buffer ctx is not initialized when .set_tensor is called
+    int n_threads = omp_get_num_threads();
+#else
+    int n_threads = 1;
+#endif
+
+    GGML_DISPATCH_QTYPES(TYPE, [&] {
+        convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
+    });
+}
+
+// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
+//
+// src0: weight in shape of {N, K}, quantized
+// src1: input  in shape of {M, K}, float32
+// dst:  output in shape of {M, N}, float32
+//
+// the function performs: dst = src1 @ src0.T
+//
+void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
+    struct ggml_tensor * src0 = dst->src[0];
+    struct ggml_tensor * src1 = dst->src[1];
+
+    const enum ggml_type TYPE = src0->type;
+
+    const int n_threads = ctx->n_threads;
+
+    // f16 only has avx512 kernels for now,
+    // amx kernels will be added once 6th gen xeon is released.
+    const bool is_floating_type = TYPE == GGML_TYPE_F16;
+
+    const int M = dst->ne[1];
+    const int N = dst->ne[0];
+    const int K = src0->ne[0];
+    const int ldc = dst->nb[1] / dst->nb[0];
+
+    if (is_floating_type) {
+        constexpr int BLOCK_M = 4;
+        constexpr int BLOCK_N = 6;
+        const int MB = div_up(M, BLOCK_M);
+        const int NB = div_up(N, BLOCK_N);
+
+        parallel_for(n_threads, MB * NB, [&](int begin, int end) {
+            GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
+                for (int i = begin; i < end; ++i) {
+                    int mb = i / NB;
+                    int nb = i % NB;
+
+                    int mb_start = mb * BLOCK_M;
+                    int mb_size = std::min(BLOCK_M, M - mb_start);
+                    int nb_start = nb * BLOCK_N;
+                    int nb_size = std::min(BLOCK_N, N - nb_start);
+
+                    switch (mb_size << 4 | nb_size) {
+                        case 0x12: LAUNCH_TINYGEMM_KERNEL_AVX(1, 2); break;
+                        case 0x14: LAUNCH_TINYGEMM_KERNEL_AVX(1, 4); break;
+                        case 0x16: LAUNCH_TINYGEMM_KERNEL_AVX(1, 6); break;
+                        case 0x22: LAUNCH_TINYGEMM_KERNEL_AVX(2, 2); break;
+                        case 0x24: LAUNCH_TINYGEMM_KERNEL_AVX(2, 4); break;
+                        case 0x26: LAUNCH_TINYGEMM_KERNEL_AVX(2, 6); break;
+                        case 0x32: LAUNCH_TINYGEMM_KERNEL_AVX(3, 2); break;
+                        case 0x34: LAUNCH_TINYGEMM_KERNEL_AVX(3, 4); break;
+                        case 0x36: LAUNCH_TINYGEMM_KERNEL_AVX(3, 6); break;
+                        case 0x42: LAUNCH_TINYGEMM_KERNEL_AVX(4, 2); break;
+                        case 0x44: LAUNCH_TINYGEMM_KERNEL_AVX(4, 4); break;
+                        case 0x46: LAUNCH_TINYGEMM_KERNEL_AVX(4, 6); break;
+                        default: fprintf(stderr, "Unexpected block size!\n");
+                    }
+                }
+            });
+        });
+        return;
+    }
+
+    // pointer to work space, used convert A from float to quantized type
+    void * wdata = nullptr;
+
+    //TODO: performance improvement: merge quant A
+    GGML_DISPATCH_QTYPES(TYPE, [&] {
+        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+        const size_t desired_wsize = M * row_size_A;
+        if (ctx->work_size < desired_wsize) {
+            ctx->work_data.reset(new char[desired_wsize]);
+            ctx->work_size = desired_wsize;
+        }
+        wdata = ctx->work_data.get();
+
+        // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
+        // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
+        GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
+
+        const float * A_data = static_cast<const float *>(src1->data);
+        for (int m = 0; m < M; ++m) {
+            from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
+        }
+    });
+
+    if (M == 1) {
+        // MB = 1 and handle 8 tiles in each block
+        constexpr int kTilesN = 4;
+        constexpr int BLOCK_N = TILE_N * kTilesN;
+        const int NB = div_up(N, BLOCK_N);
+
+        parallel_for(n_threads, NB, [&](int begin, int end) {
+            GGML_DISPATCH_QTYPES(TYPE, [&] {
+                const int KB = K / blck_size;
+                const int TILE_SIZE = get_tile_size<type>();
+                const int row_size_A = KB * sizeof(vec_dot_type);
+                for (int i = begin; i < end; ++i) {
+                    int nb = i;
+                    int nb_start = nb * BLOCK_N;
+                    int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96
+
+                    switch (nb_size) {
+                        //case 160: LAUNCH_TINYGEMM_KERNEL_VNNI(160); break;
+                        case 128: LAUNCH_TINYGEMM_KERNEL_VNNI(128); break;
+                        case 96: LAUNCH_TINYGEMM_KERNEL_VNNI(96); break;
+                        case 64: LAUNCH_TINYGEMM_KERNEL_VNNI(64); break;
+                        case 32: LAUNCH_TINYGEMM_KERNEL_VNNI(32); break;
+                        default: fprintf(stderr, "Unexpected n block size!\n");
+                    }
+                }
+            });
+        });
+        return;
+    }
+
+    // handle 4 tiles at a tile
+    constexpr int BLOCK_M = TILE_M * 2;
+    constexpr int BLOCK_N = TILE_N * 2;
+    const int MB = div_up(M, BLOCK_M);
+    const int NB = div_up(N, BLOCK_N);
+
+    parallel_for(n_threads, MB * NB, [&](int begin, int end) {
+        // init tile config for each thread
+        ggml_tile_config_init();
+
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            const int KB = K / blck_size;
+            const int TILE_SIZE = get_tile_size<type>();
+            const int row_size_A = KB * sizeof(vec_dot_type);
+
+            for (int i = begin; i < end; ++i) {
+                int mb = i / NB;
+                int nb = i % NB;
+
+                int mb_start = mb * BLOCK_M;
+                int mb_size = std::min(BLOCK_M, M - mb_start);
+                int nb_start = nb * BLOCK_N;
+                int nb_size = BLOCK_N;
+
+                tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
+                    mb_size, nb_size, KB,
+                    (const char *)wdata + mb_start * row_size_A,
+                    (const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
+                    (float *) dst->data + mb_start * N + nb_start, ldc);
+            }
+        });
+    });
+}
+
+#else // if defined(__AMX_INT8__)
+
+void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
+    fprintf(stderr, "GGML is not compiled with AMX support!\n");
+
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+#endif // if defined(__AMX_INT8__)
diff --git a/ggml/src/ggml-amx/mmq.h b/ggml/src/ggml-amx/mmq.h
new file mode 100644
index 000000000..cf0920620
--- /dev/null
+++ b/ggml/src/ggml-amx/mmq.h
@@ -0,0 +1,17 @@
+#pragma once
+#include "common.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
+
+void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+
+void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a3bc79a46..1c17dde30 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -329,7 +329,6 @@ bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type
     if (backend->device) {
         return ggml_backend_dev_supports_buft(backend->device, buft);
     }
-
     return backend->iface.supports_buft(backend, buft);
 }
 
@@ -550,6 +549,14 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-rpc.h"
 #endif
 
+#ifndef __AMX_INT8__
+#undef GGML_USE_AMX
+#endif
+
+#ifdef GGML_USE_AMX
+#  include "ggml-amx.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -570,6 +577,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
+#ifdef GGML_USE_AMX
+        register_backend(ggml_backend_amx_reg());
+#endif
 
         // TODO: sycl, kompute, cann
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 779b38d12..7e24313ed 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -23252,6 +23252,14 @@ int ggml_cpu_has_avx512_bf16(void) {
 #endif
 }
 
+int ggml_cpu_has_amx_int8(void) {
+#if defined(__AMX_INT8__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
diff --git a/src/llama.cpp b/src/llama.cpp
index dcb015d12..0025e94b8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16,6 +16,14 @@
 #   include "ggml-cann.h"
 #endif
 
+#ifndef __AMX_INT8__
+#undef GGML_USE_AMX
+#endif
+
+#ifdef GGML_USE_AMX
+#  include "ggml-amx.h"
+#endif
+
 // TODO: replace with ggml API call
 #define QK_K 256
 
@@ -3533,6 +3541,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 #else
     return 1;
 #endif
+
     GGML_UNUSED(model);
     GGML_UNUSED(device);
 }
@@ -7031,7 +7040,14 @@ static bool llm_load_tensors(
 
     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef GGML_USE_AMX
+        model.buft_layer[i] = {
+            ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(model, true)
+        };
+#else
         model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
+#endif
     }
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -21839,6 +21855,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
+    s += "AMX_INT8 = "    + std::to_string(ggml_cpu_has_amx_int8())    + " | ";
     s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
     s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
     s += "SVE = "         + std::to_string(ggml_cpu_has_sve())         + " | ";

From 87421a23e8c60e00a7b227d501e8aab2a1aff7ce Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Fri, 18 Oct 2024 06:46:16 +0100
Subject: [PATCH 072/396] [SYCL] Add SYCL Backend registry, device and Event
 Interfaces (#9705)

* implemented missing SYCL event APIs

* sycl : Added device and backend reg interfaces

* Restructured ggml-sycl.cpp
---
 examples/llama-bench/llama-bench.cpp |    2 +-
 ggml/include/ggml-sycl.h             |   11 +-
 ggml/src/ggml-backend.cpp            |   10 +-
 ggml/src/ggml-sycl.cpp               | 2689 ++++++++++++++------------
 src/llama.cpp                        |   61 +-
 5 files changed, 1492 insertions(+), 1281 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index c22bdedcf..60a7aef5b 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
     int count = ggml_backend_sycl_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
-        ggml_sycl_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
         id += buf;
         if (i < count - 1) {
             id += "/";
diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
index 03b698e61..af521f599 100644
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@@ -19,6 +19,8 @@ extern "C" {
 // backend API
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 
+GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+
 // devide buffer
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 
@@ -29,14 +31,19 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const fl
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API void ggml_backend_sycl_get_device_description(int device,
+                                                       char *description,
+                                                       size_t description_size);
 GGML_API int  ggml_backend_sycl_get_device_count();
 GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
 // GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+
+GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1c17dde30..81d09cd8b 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -537,6 +537,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #ifdef GGML_USE_VULKAN
 #include "ggml-vulkan.h"
 #endif
@@ -568,6 +572,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_METAL
         register_backend(ggml_backend_metal_reg());
 #endif
+#ifdef GGML_USE_SYCL
+        register_backend(ggml_backend_sycl_reg());
+#endif
 #ifdef GGML_USE_VULKAN
         register_backend(ggml_backend_vk_reg());
 #endif
@@ -581,7 +588,7 @@ struct ggml_backend_registry {
         register_backend(ggml_backend_amx_reg());
 #endif
 
-        // TODO: sycl, kompute, cann
+        // TODO: kompute, cann
 
         register_backend(ggml_backend_cpu_reg());
     }
@@ -2254,6 +2261,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
         GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 4d3f1c5ce..4d91ee460 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -40,17 +40,316 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 
-bool   ggml_sycl_loaded(void);
-void   ggml_sycl_free_data(struct ggml_tensor * tensor);
-void   ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
-void   ggml_sycl_set_main_device(int main_device);
-void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
-void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
-bool   ggml_backend_is_sycl(ggml_backend_t backend);
-int    ggml_backend_sycl_get_device(ggml_backend_t backend);
-static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
-static inline int get_sycl_env(const char *env_name, int default_val);
+static bool g_sycl_loaded = false;
 
+static ggml_sycl_device_info ggml_sycl_init() {
+    ggml_sycl_device_info info = {};
+
+    info.device_count = dpct::dev_mgr::instance().device_count();
+    if (info.device_count == 0) {
+        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
+        return info;
+    }
+
+    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
+
+    int64_t total_vram = 0;
+#if defined(GGML_SYCL_FORCE_MMQ)
+    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ:   yes\n", __func__);
+#else
+    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ:   no\n", __func__);
+#endif
+#if defined(SYCL_USE_XMX)
+    fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+    fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
+
+    for (int i = 0; i < info.device_count; ++i) {
+        info.devices[i].vmm = 0;
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(i))));
+
+        info.default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
+
+        info.devices[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+
+        info.max_work_group_sizes[i] = prop.get_max_work_group_size();
+    }
+
+    for (int id = 0; id < info.device_count; ++id) {
+        info.default_tensor_split[id] /= total_vram;
+    }
+    return info;
+}
+
+const ggml_sycl_device_info & ggml_sycl_info() {
+    static ggml_sycl_device_info info = ggml_sycl_init();
+    return info;
+}
+
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
+    dpct::device_info prop;
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        dpct::get_device_info(prop, device)));
+
+    std::string version;
+    version += std::to_string(prop.get_major_version());
+    version += ".";
+    version += std::to_string(prop.get_minor_version());
+
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+    std::string name = std::string(prop.get_name());
+    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
+    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
+
+    auto global_mem_size = prop.get_global_mem_size()/1000000;
+
+    fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+            name.c_str(), version.c_str(), prop.get_max_compute_units(),
+            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
+            global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
+}
+
+void ggml_backend_sycl_print_sycl_devices() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
+    int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
+    fprintf(stderr, "found %d SYCL devices:\n", device_count);
+    fprintf(stderr, "|  |                   |                                       |       |Max    |        |Max  |Global |                     |\n");
+    fprintf(stderr, "|  |                   |                                       |       |compute|Max work|sub  |mem    |                     |\n");
+    fprintf(stderr, "|ID|        Device Type|                                   Name|Version|units  |group   |group|size   |       Driver version|\n");
+    fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
+    for (int id = 0; id < device_count; ++id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        sycl::backend backend = device.get_backend();
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id=DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" <<  backend_type << ":" << std::to_string(type_id) << "]";
+        print_device_detail(id, device, device_type.str());
+    }
+}
+
+static inline int get_sycl_env(const char *env_name, int default_val) {
+    char *user_device_string = getenv(env_name);
+    int user_number = default_val;
+
+    unsigned n;
+    if (user_device_string != NULL &&
+        sscanf(user_device_string, " %u", &n) == 1) {
+        user_number = (int)n;
+    } else {
+        user_number = default_val;
+    }
+    return user_number;
+}
+
+static void ggml_check_sycl() try {
+    static bool initialized = false;
+
+    if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_check_sycl\n");
+        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
+        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
+
+#if defined(GGML_SYCL_F16)
+        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
+#endif
+
+/* NOT REMOVE, keep it for next optimize for XMX.
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+*/
+
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+                            dpct::dev_mgr::instance().device_count()) != 0) {
+            initialized = true;
+            g_sycl_loaded = false;
+            return;
+        }
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+        ggml_backend_sycl_print_sycl_devices();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+/*
+device_index: device index from 0 to n (continue numbers).
+    It is used for device select/set in SYCL backend internal data structure.
+*/
+inline void check_allow_gpu_index(const int device_index) {
+  if (device_index >= ggml_sycl_info().device_count) {
+    char error_buf[256];
+    snprintf(
+        error_buf,
+        sizeof(error_buf),
+        "%s error: device_index:%d is out of range: [0-%d]",
+        __func__,
+        device_index,
+        ggml_sycl_info().device_count - 1);
+    fprintf(stderr, "%s\n", error_buf);
+    assert(false);
+  }
+}
+
+GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_gpu_list\n");
+    for(int i=0;i<max_len;i++) id_list[i] = -1;
+
+    for (int i=0;i< ggml_sycl_info().device_count;i++){
+        if (i>=max_len) break;
+        id_list[i] = i;
+    }
+    return;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+// sycl buffer
+
+struct ggml_backend_sycl_buffer_context {
+    int device;
+    void * dev_ptr = nullptr;
+    queue_ptr stream;
+    std::string name;
+
+     ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
+        device(device), dev_ptr(dev_ptr), stream(stream) {
+            check_allow_gpu_index(device);
+            name = (GGML_SYCL_NAME + std::to_string(device));
+        }
+
+
+    ~ggml_backend_sycl_buffer_context() {
+        if (dev_ptr != nullptr) {
+            ggml_sycl_set_device(device);
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
+        }
+    }
+};
+
+static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
+    return ctx->name.c_str();
+}
+
+static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
+}
+
+static void
+ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+    ggml_sycl_set_device(ctx->device);
+
+    delete ctx;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+    return ctx->dev_ptr;
+}
+
+static void
+ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                     ggml_tensor *tensor) try {
+    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->buft == buffer->buft);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        size_t original_size = ggml_nbytes(tensor);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset(
+                (char *)tensor->data + original_size, 0,
+                padded_size - original_size).wait()));
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                ggml_tensor *tensor,
+                                                const void *data, size_t offset,
+                                                size_t size) try {
+
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    ggml_sycl_set_device(ctx->device);
+    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+    char* host_buf = (char*)malloc(size);
+    memcpy(host_buf, data, size);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
+                             .wait()));
+    free(host_buf);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                const ggml_tensor *tensor,
+                                                void *data, size_t offset,
+                                                size_t size) try {
+
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    ggml_sycl_set_device(ctx->device);
+    auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue();
+
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream.memcpy(data, (const char *)tensor->data + offset, size)
+            .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
 
 void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
@@ -60,6 +359,850 @@ void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
     free(host_buf);
 }
 
+static bool
+ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
+                                    const ggml_tensor *src,
+                                    ggml_tensor *dst) try {
+    if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
+        ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
+
+        ggml_sycl_set_device(src_ctx->device);
+        /*
+        DPCT1009:198: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw()));
+        ggml_sycl_set_device(dst_ctx->device);
+        /*
+        DPCT1009:199: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw()));
+        /*
+        DPCT1009:200: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+
+        queue_ptr stream_dst = dst_ctx->stream;
+        queue_ptr stream_src = src_ctx->stream;
+        size_t size = ggml_nbytes(src);
+
+        //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs.
+        dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size);
+
+//todo, it's known issue：error in device2device cross GPUs. reused when the issue is fixed. DON"T remove
+#if 0
+        SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(
+            (char *)dst->data, (const char *)src->data, size).wait()));
+
+        /*
+        DPCT1009:201: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw()));
+#endif
+        return true;
+    }
+    return false;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+
+static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
+                                           uint8_t value) try {
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    ggml_sycl_set_device(ctx->device);
+    queue_ptr stream = ctx->stream;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
+                                    .memset(ctx->dev_ptr, value, buffer->size)
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
+    /* .get_name        = */ ggml_backend_sycl_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_sycl_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_sycl_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// sycl buffer type
+struct ggml_backend_sycl_buffer_type_context {
+    int device;
+    std::string name;
+
+    // each buffer type has its own stream
+    queue_ptr stream = nullptr;
+};
+
+static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+
+static ggml_backend_buffer_t
+ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                           size_t size) try {
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_sycl_set_device(buft_ctx->device);
+    const queue_ptr stream = buft_ctx->stream;
+    size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
+
+    void * dev_ptr;
+    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
+                                    size, *stream)));
+    if (!dev_ptr) {
+        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+        return nullptr;
+    }
+    ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
+    return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    return dpct::get_current_device().get_max_mem_alloc_size();
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    size_t size = ggml_nbytes(tensor);
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    GGML_UNUSED(buft);
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_sycl_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_sycl_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
+    auto dev_count = ggml_backend_sycl_get_device_count();
+
+    if (device>=dev_count or device<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device, dev_count-1);
+        GGML_ASSERT(device<dev_count);
+    }
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
+
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
+
+    if (!ggml_backend_sycl_buffer_type_initialized) {
+        for (int i = 0; i < dev_count; i++) {
+            auto & device_i = dpct::dev_mgr::instance().get_device(i);
+            queue_ptr stream = &(device_i.default_queue());
+            ggml_backend_sycl_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
+                /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), i),
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
+            };
+        }
+        ggml_backend_sycl_buffer_type_initialized = true;
+    }
+    return &ggml_backend_sycl_buffer_types[device];
+}
+
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
+    int device = ctx->device;
+    if (device>=ggml_sycl_info().device_count or device<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device, ggml_sycl_info().device_count-1);
+        GGML_ASSERT(device<ggml_sycl_info().device_count);
+    }
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
+
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
+
+    if (!ggml_backend_sycl_buffer_type_initialized) {
+        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+            ggml_backend_sycl_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
+                /* .device   = */ nullptr,
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
+            };
+        }
+        ggml_backend_sycl_buffer_type_initialized = true;
+    }
+    return &ggml_backend_sycl_buffer_types[device];
+}
+
+// sycl split buffer
+
+static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
+            if (min_compute_capability > ggml_sycl_info().devices[i].cc) {
+                min_compute_capability = ggml_sycl_info().devices[i].cc;
+            }
+            if (max_compute_capability < ggml_sycl_info().devices[i].cc) {
+                max_compute_capability = ggml_sycl_info().devices[i].cc;
+            }
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
+        case GGML_TYPE_IQ3_S:
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split, int id) {
+    const int64_t nrows = ggml_nrows(tensor);
+    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+
+    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
+    *row_low -= *row_low % rounding;
+    if (id == ggml_sycl_info().device_count - 1) {
+        *row_high = nrows;
+    } else {
+        *row_high = nrows*tensor_split[id + 1];
+        *row_high -= *row_high % rounding;
+    }
+}
+
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
+struct ggml_backend_sycl_split_buffer_type_context {
+    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split;
+};
+
+struct ggml_backend_sycl_split_buffer_context {
+    ~ggml_backend_sycl_split_buffer_context() try {
+        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+                for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+                    if (extra->events[i][is] != nullptr) {
+                        /*
+                        DPCT1009:206: SYCL uses exceptions to report errors and
+                        does not use the error codes. The original code was
+                        commented out and a warning string was inserted. You
+                        need to rewrite this code.
+                        */
+                        SYCL_CHECK(CHECK_TRY_ERROR(
+                            dpct::destroy_event(extra->events[i][is])));
+                    }
+                }
+                if (extra->data_device[i] != nullptr) {
+                    /*
+                    DPCT1009:207: SYCL uses exceptions to report errors and does
+                    not use the error codes. The original code was commented out
+                    and a warning string was inserted. You need to rewrite this
+                    code.
+                    */
+                    ggml_sycl_set_device(i);
+                    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
+                        extra->data_device[i], *(streams[i]))));
+                }
+            }
+            delete extra;
+        }
+    }
+    catch (sycl::exception const &exc) {
+      std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+                << ", line:" << __LINE__ << std::endl;
+      std::exit(1);
+    }
+
+    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
+    std::vector<queue_ptr> streams;
+};
+
+static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+    return GGML_SYCL_NAME "_Split";
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+   return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
+}
+
+static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+static void
+ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                           ggml_tensor *tensor) try {
+    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+
+    ctx->tensor_extras.push_back(extra);
+        ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        // FIXME: do not crash if cudaMalloc fails
+        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
+        ggml_sycl_set_device(i);
+        const queue_ptr stream = ctx->streams[i];
+        char * buf;
+        /*
+        DPCT1009:208: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
+                                        size, *stream)));
+        if (!buf) {
+            char err_buf[1024];
+            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+            throw std::runtime_error(err_buf);
+        }
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            /*
+            DPCT1009:209: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                (*stream)
+                    .memset(buf + original_size, 0, size - original_size)
+                    .wait()));
+        }
+
+        extra->data_device[i] = buf;
+
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            /*
+            DPCT1009:210: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
+        }
+    }
+    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void
+ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                          ggml_tensor *tensor, const void *data,
+                                          size_t offset, size_t size) try {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        const char * buf_host = (const char *)data + offset_split;
+        /*
+        DPCT1009:211: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        ggml_sycl_set_device(i);
+        const queue_ptr stream = ctx->streams[i];
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            (*stream)
+                .memcpy(extra->data_device[i], buf_host, original_size)
+                .wait()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void
+ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                          const ggml_tensor *tensor, void *data,
+                                          size_t offset, size_t size) try {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf_host = (char *)data + offset_split;
+        /*
+        DPCT1009:212: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        ggml_sycl_set_device(i);
+        const queue_ptr stream = ctx->streams[i];
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            (*stream)
+                .memcpy(buf_host, extra->data_device[i], original_size)
+                .wait()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
+    /* .get_name        = */ ggml_backend_sycl_split_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_sycl_split_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_sycl_split_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_sycl_split_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// sycl split buffer type
+
+static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return GGML_SYCL_NAME "_Split";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // instead, we allocate them for each tensor separately in init_tensor
+    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        total_size += ggml_nbytes_split(tensor, nrows_split);
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_sycl_split_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_sycl_split_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_sycl_split_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size   = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
+    /* .is_host          = */ ggml_backend_sycl_split_buffer_type_is_host,
+};
+
+ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+    ggml_check_sycl();
+    // FIXME: this is not thread safe
+    static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split_arr = {};
+
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; });
+    if (all_zero) {
+        tensor_split_arr = ggml_sycl_info().default_tensor_split;
+    } else {
+        float split_sum = 0.0f;
+        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            tensor_split_arr[i] = split_sum;
+            split_sum += tensor_split[i];
+        }
+        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            tensor_split_arr[i] /= split_sum;
+        }
+    }
+
+    auto it = buft_map.find(tensor_split_arr);
+    if (it != buft_map.end()) {
+        return &it->second;
+    }
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_sycl_split_buffer_type_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0),
+        /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
+    };
+
+    auto result = buft_map.emplace(tensor_split_arr, buft);
+    return &result.first->second;
+}
+
+// host buffer type
+
+static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_SYCL_NAME "_Host";
+
+    GGML_UNUSED(buft);
+}
+
+static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return GGML_SYCL_NAME "_Host";
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_sycl_host_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_sycl_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
+    buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_sycl_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0),
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_sycl_buffer_type_host;
+}
+
+// buffer pool for sycl (legacy)
+struct ggml_sycl_pool_leg : public ggml_sycl_pool {
+    static const int MAX_SYCL_BUFFERS = 256;
+
+    int device;
+    queue_ptr qptr;
+    struct ggml_sycl_buffer {
+        void * ptr = nullptr;
+        size_t size = 0;
+    };
+
+    ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
+    size_t pool_size = 0;
+
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
+        qptr(qptr_),
+        device(device_) {
+    }
+
+    ~ggml_sycl_pool_leg() {
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            ggml_sycl_buffer & b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
+                pool_size -= b.size;
+            }
+        }
+        GGML_ASSERT(pool_size == 0);
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+#ifdef DEBUG_sycl_MALLOC
+        int nnz = 0;
+        size_t max_size = 0;
+#endif
+        size_t best_diff = 1ull << 36;
+        int ibest = -1;
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            ggml_sycl_buffer& b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+#ifdef DEBUG_sycl_MALLOC
+                ++nnz;
+                if (b.size > max_size) max_size = b.size;
+#endif
+                if (b.size >= size) {
+                    size_t diff = b.size - size;
+                    if (diff < best_diff) {
+                        best_diff = diff;
+                        ibest = i;
+                        if (!best_diff) {
+                            void * ptr = b.ptr;
+                            *actual_size = b.size;
+                            b.ptr = nullptr;
+                            b.size = 0;
+                            return ptr;
+                        }
+                    }
+                }
+            }
+        }
+        if (ibest >= 0) {
+            ggml_sycl_buffer& b = buffer_pool[ibest];
+            void * ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
+        void * ptr;
+        size_t look_ahead_size = (size_t) (1.05 * size);
+
+        SYCL_CHECK(
+            CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
+                                look_ahead_size, *qptr)));
+        if (!ptr) {
+            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+            return nullptr;
+        }
+
+        *actual_size = look_ahead_size;
+        pool_size += look_ahead_size;
+
+    #ifdef DEBUG_SYCL_MALLOC
+        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+                (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+    #endif
+        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
+        return ptr;
+    }
+
+    void free(void * ptr, size_t size) override {
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            ggml_sycl_buffer& b = buffer_pool[i];
+            if (b.ptr == nullptr) {
+                b.ptr = ptr;
+                b.size = size;
+                return;
+            }
+        }
+        fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+        pool_size -= size;
+    }
+};
+
+std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
+    // TBD: NO VMM support
+    // if (ggml_sycl_info().devices[device].vmm) {
+    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
+    // }
+   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
+}
+
+// TBD pool with virtual memory management
+// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
+
+/// kernels
+
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_sycl_func_t)(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_sycl_op_mul_mat_t)(
@@ -1706,296 +2849,6 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
                          });
 }
 
-static bool g_sycl_loaded = false;
-
-bool ggml_sycl_loaded(void) {
-    return g_sycl_loaded;
-}
-
-void print_device_detail(int id, sycl::device &device, std::string device_type) {
-
-    dpct::device_info prop;
-    SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_device_info(prop, device)));
-
-    std::string version;
-    version += std::to_string(prop.get_major_version());
-    version += ".";
-    version += std::to_string(prop.get_minor_version());
-
-    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
-    std::string name = std::string(prop.get_name());
-    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
-    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
-
-    auto global_mem_size = prop.get_global_mem_size()/1000000;
-
-    fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
-            name.c_str(), version.c_str(), prop.get_max_compute_units(),
-            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
-            global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
-}
-
-void ggml_backend_sycl_print_sycl_devices() {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
-    int device_count = dpct::dev_mgr::instance().device_count();
-    std::map<std::string, size_t> DeviceNums;
-    fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|  |                   |                                       |       |Max    |        |Max  |Global |                     |\n");
-    fprintf(stderr, "|  |                   |                                       |       |compute|Max work|sub  |mem    |                     |\n");
-    fprintf(stderr, "|ID|        Device Type|                                   Name|Version|units  |group   |group|size   |       Driver version|\n");
-    fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
-    for (int id = 0; id < device_count; ++id) {
-        sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
-        std::string backend_type = get_device_backend_and_type(device);
-        int type_id=DeviceNums[backend_type]++;
-        std::stringstream device_type;
-        device_type << "[" <<  backend_type << ":" << std::to_string(type_id) << "]";
-        print_device_detail(id, device, device_type.str());
-    }
-}
-
-static inline int get_sycl_env(const char *env_name, int default_val) {
-    char *user_device_string = getenv(env_name);
-    int user_number = default_val;
-
-    unsigned n;
-    if (user_device_string != NULL &&
-        sscanf(user_device_string, " %u", &n) == 1) {
-        user_number = (int)n;
-    } else {
-        user_number = default_val;
-    }
-    return user_number;
-}
-
-static void ggml_check_sycl() try {
-    static bool initialized = false;
-
-    if (!initialized) {
-        fprintf(stderr, "[SYCL] call ggml_check_sycl\n");
-        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-
-        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
-
-#if defined(GGML_SYCL_F16)
-        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
-#endif
-
-/* NOT REMOVE, keep it for next optimize for XMX.
-#if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-*/
-
-        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
-                            dpct::dev_mgr::instance().device_count()) != 0) {
-            initialized = true;
-            g_sycl_loaded = false;
-            return;
-        }
-        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        ggml_backend_sycl_print_sycl_devices();
-        initialized = true;
-        g_sycl_loaded = true;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_sycl_device_info ggml_sycl_init() {
-    ggml_sycl_device_info info = {};
-
-    info.device_count = dpct::dev_mgr::instance().device_count();
-    if (info.device_count == 0) {
-        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
-        return info;
-    }
-
-    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
-
-    int64_t total_vram = 0;
-#if defined(GGML_SYCL_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ:   yes\n", __func__);
-#else
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(SYCL_USE_XMX)
-    fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-    fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
-
-    for (int i = 0; i < info.device_count; ++i) {
-        info.devices[i].vmm = 0;
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(i))));
-
-        info.default_tensor_split[i] = total_vram;
-        total_vram += prop.get_global_mem_size();
-
-        info.devices[i].cc =
-            100 * prop.get_major_version() + 10 * prop.get_minor_version();
-
-        info.max_work_group_sizes[i] = prop.get_max_work_group_size();
-    }
-
-    for (int id = 0; id < info.device_count; ++id) {
-        info.default_tensor_split[id] /= total_vram;
-    }
-    return info;
-}
-
-const ggml_sycl_device_info & ggml_sycl_info() {
-    static ggml_sycl_device_info info = ggml_sycl_init();
-    return info;
-}
-
-/*
-device_index: device index from 0 to n (continue numbers).
-    It is used for device select/set in SYCL backend internal data structure.
-*/
-inline void check_allow_gpu_index(const int device_index) {
-  if (device_index >= ggml_sycl_info().device_count) {
-    char error_buf[256];
-    snprintf(
-        error_buf,
-        sizeof(error_buf),
-        "%s error: device_index:%d is out of range: [0-%d]",
-        __func__,
-        device_index,
-        ggml_sycl_info().device_count - 1);
-    fprintf(stderr, "%s\n", error_buf);
-    assert(false);
-  }
-}
-
-// buffer pool for sycl (legacy)
-struct ggml_sycl_pool_leg : public ggml_sycl_pool {
-    static const int MAX_SYCL_BUFFERS = 256;
-
-    int device;
-    queue_ptr qptr;
-    struct ggml_sycl_buffer {
-        void * ptr = nullptr;
-        size_t size = 0;
-    };
-
-    ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
-    size_t pool_size = 0;
-
-    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
-        qptr(qptr_),
-        device(device_) {
-    }
-
-    ~ggml_sycl_pool_leg() {
-        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
-            ggml_sycl_buffer & b = buffer_pool[i];
-            if (b.ptr != nullptr) {
-                SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
-                pool_size -= b.size;
-            }
-        }
-        GGML_ASSERT(pool_size == 0);
-    }
-
-    void * alloc(size_t size, size_t * actual_size) override {
-#ifdef DEBUG_sycl_MALLOC
-        int nnz = 0;
-        size_t max_size = 0;
-#endif
-        size_t best_diff = 1ull << 36;
-        int ibest = -1;
-        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
-            ggml_sycl_buffer& b = buffer_pool[i];
-            if (b.ptr != nullptr) {
-#ifdef DEBUG_sycl_MALLOC
-                ++nnz;
-                if (b.size > max_size) max_size = b.size;
-#endif
-                if (b.size >= size) {
-                    size_t diff = b.size - size;
-                    if (diff < best_diff) {
-                        best_diff = diff;
-                        ibest = i;
-                        if (!best_diff) {
-                            void * ptr = b.ptr;
-                            *actual_size = b.size;
-                            b.ptr = nullptr;
-                            b.size = 0;
-                            return ptr;
-                        }
-                    }
-                }
-            }
-        }
-        if (ibest >= 0) {
-            ggml_sycl_buffer& b = buffer_pool[ibest];
-            void * ptr = b.ptr;
-            *actual_size = b.size;
-            b.ptr = nullptr;
-            b.size = 0;
-            return ptr;
-        }
-        void * ptr;
-        size_t look_ahead_size = (size_t) (1.05 * size);
-
-        SYCL_CHECK(
-            CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
-                                look_ahead_size, *qptr)));
-        if (!ptr) {
-            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
-            return nullptr;
-        }
-
-        *actual_size = look_ahead_size;
-        pool_size += look_ahead_size;
-
-    #ifdef DEBUG_SYCL_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
-                (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
-    #endif
-        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
-        return ptr;
-    }
-
-    void free(void * ptr, size_t size) override {
-        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
-            ggml_sycl_buffer& b = buffer_pool[i];
-            if (b.ptr == nullptr) {
-                b.ptr = ptr;
-                b.size = size;
-                return;
-            }
-        }
-        fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
-        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
-        pool_size -= size;
-    }
-};
-
-std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
-    // TBD: NO VMM support
-    // if (ggml_sycl_info().devices[device].vmm) {
-    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
-    // }
-   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
-}
-
-// TBD pool with virtual memory management
-// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
-
 static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
                                           const struct ggml_tensor *src,
                                           int64_t i3, int64_t i2,
@@ -2376,54 +3229,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_sycl_info().devices[i].cc) {
-                min_compute_capability = ggml_sycl_info().devices[i].cc;
-            }
-            if (max_compute_capability < ggml_sycl_info().devices[i].cc) {
-                max_compute_capability = ggml_sycl_info().devices[i].cc;
-            }
-        }
-    }
-
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= VER_GEN9 ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ4_NL:
-            return max_compute_capability >= VER_GEN9 ? 128 : 64;
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= VER_GEN9 ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ABORT("fatal error");
-    }
-
-}
-
 inline void ggml_sycl_op_mul_mat_sycl(
     ggml_backend_sycl_context & ctx,
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
@@ -2783,10 +3588,6 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     peer_access_enabled = enable_peer_access;
 }
 
-struct ggml_backend_sycl_split_buffer_type_context {
-    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split;
-};
-
 static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  ggml_sycl_op_mul_mat_t op,
@@ -3865,12 +4666,6 @@ static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * s
     (void) dst;
 }
 
-static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
-}
-
 void ggml_sycl_set_main_device(const int main_device) try {
     if (dpct::get_current_device_id() == main_device) return;
     check_allow_gpu_index(main_device);
@@ -4038,39 +4833,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
     return true;
 }
 
-GGML_API void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
-    for(int i=0;i<max_len;i++) id_list[i] = -1;
-
-    for (int i=0;i< ggml_sycl_info().device_count;i++){
-        if (i>=max_len) break;
-        id_list[i] = i;
-    }
-    return;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-int ggml_sycl_get_device_count() try {
-    int device_count;
-    if (CHECK_TRY_ERROR(device_count =
-                             dpct::dev_mgr::instance().device_count()) != 0) {
-        return 0;
-    }
-    return device_count;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-GGML_API void ggml_sycl_get_device_description(int device, char *description,
+GGML_API void ggml_backend_sycl_get_device_description(int device, char *description,
                                       size_t description_size) try {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_description\n");
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
         prop, dpct::dev_mgr::instance().get_device(device))));
@@ -4108,801 +4873,9 @@ catch (sycl::exception const &exc) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-// sycl buffer
-
-struct ggml_backend_sycl_buffer_context {
-    int device;
-    void * dev_ptr = nullptr;
-    queue_ptr stream;
-    std::string name;
-
-     ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
-        device(device), dev_ptr(dev_ptr), stream(stream) {
-            check_allow_gpu_index(device);
-            name = (GGML_SYCL_NAME + std::to_string(device));
-        }
-
-
-    ~ggml_backend_sycl_buffer_context() {
-        if (dev_ptr != nullptr) {
-            ggml_sycl_set_device(device);
-            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
-        }
-    }
-};
-
-static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
-static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
-}
-
-static void
-ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-    ggml_sycl_set_device(ctx->device);
-
-    delete ctx;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-    return ctx->dev_ptr;
-}
-
-static void
-ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
-                                     ggml_tensor *tensor) try {
-    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset(
-                (char *)tensor->data + original_size, 0,
-                padded_size - original_size).wait()));
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
-                                                ggml_tensor *tensor,
-                                                const void *data, size_t offset,
-                                                size_t size) try {
-
-    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    ggml_sycl_set_device(ctx->device);
-    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-    char* host_buf = (char*)malloc(size);
-    memcpy(host_buf, data, size);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
-                             .wait()));
-    free(host_buf);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
-                                                const ggml_tensor *tensor,
-                                                void *data, size_t offset,
-                                                size_t size) try {
-
-    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    ggml_sycl_set_device(ctx->device);
-    auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue();
-
-    SYCL_CHECK(CHECK_TRY_ERROR(
-        stream.memcpy(data, (const char *)tensor->data + offset, size)
-            .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static bool
-ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
-                                    const ggml_tensor *src,
-                                    ggml_tensor *dst) try {
-    if (ggml_backend_buffer_is_sycl(src->buffer)) {
-        ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
-        ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
-
-        ggml_sycl_set_device(src_ctx->device);
-        /*
-        DPCT1009:198: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw()));
-        ggml_sycl_set_device(dst_ctx->device);
-        /*
-        DPCT1009:199: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw()));
-        /*
-        DPCT1009:200: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-
-        queue_ptr stream_dst = dst_ctx->stream;
-        queue_ptr stream_src = src_ctx->stream;
-        size_t size = ggml_nbytes(src);
-
-        //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs.
-        dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size);
-
-//todo, it's known issue：error in device2device cross GPUs. reused when the issue is fixed. DON"T remove
-#if 0
-        SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(
-            (char *)dst->data, (const char *)src->data, size).wait()));
-
-        /*
-        DPCT1009:201: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw()));
-#endif
-        return true;
-    }
-    return false;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-
-static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
-                                           uint8_t value) try {
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    ggml_sycl_set_device(ctx->device);
-    queue_ptr stream = ctx->stream;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
-                                    .memset(ctx->dev_ptr, value, buffer->size)
-                                    .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
-    /* .memset_tensor   = */ NULL,
-    /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_sycl_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_sycl_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// sycl buffer type
-struct ggml_backend_sycl_buffer_type_context {
-    int device;
-    std::string name;
-
-    // each buffer type has its own stream
-    queue_ptr stream = nullptr;
-};
-
-static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-
-    return ctx->name.c_str();
-}
-static ggml_backend_buffer_t
-ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
-                                           size_t size) try {
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_sycl_set_device(buft_ctx->device);
-    const queue_ptr stream = buft_ctx->stream;
-    size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
-
-    void * dev_ptr;
-    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
-                                    size, *stream)));
-    if (!dev_ptr) {
-        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
-        return nullptr;
-    }
-    ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
-    return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    return dpct::get_current_device().get_max_mem_alloc_size();
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    size_t size = ggml_nbytes(tensor);
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return size;
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_sycl_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
-    /* .get_max_size     = */ ggml_backend_sycl_buffer_type_get_max_size,
-    /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .is_host          = */ nullptr,
-};
-
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
-    static std::mutex mutex;
-    std::lock_guard<std::mutex> lock(mutex);
-
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
-
-    if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device, ggml_sycl_info().device_count-1);
-        GGML_ASSERT(device<ggml_sycl_info().device_count);
-    }
-    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
-
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            auto & device_i = dpct::dev_mgr::instance().get_device(i);
-            queue_ptr stream = &(device_i.default_queue());
-            ggml_backend_sycl_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
-                /* .device   = */ nullptr,
-                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
-            };
-        }
-        ggml_backend_sycl_buffer_type_initialized = true;
-    }
-    return &ggml_backend_sycl_buffer_types[device];
-}
-
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
-
-    int device = ctx->device;
-    if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device, ggml_sycl_info().device_count-1);
-        GGML_ASSERT(device<ggml_sycl_info().device_count);
-    }
-    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
-
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            ggml_backend_sycl_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
-                /* .device   = */ nullptr,
-                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
-            };
-        }
-        ggml_backend_sycl_buffer_type_initialized = true;
-    }
-    return &ggml_backend_sycl_buffer_types[device];
-}
-
-// sycl split buffer type
-static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split, int id) {
-    const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
-
-    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
-    *row_low -= *row_low % rounding;
-    if (id == ggml_sycl_info().device_count - 1) {
-        *row_high = nrows;
-    } else {
-        *row_high = nrows*tensor_split[id + 1];
-        *row_high -= *row_high % rounding;
-    }
-}
-
-struct ggml_backend_sycl_split_buffer_context {
-    ~ggml_backend_sycl_split_buffer_context() try {
-        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-                for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
-                    if (extra->events[i][is] != nullptr) {
-                        /*
-                        DPCT1009:206: SYCL uses exceptions to report errors and
-                        does not use the error codes. The original code was
-                        commented out and a warning string was inserted. You
-                        need to rewrite this code.
-                        */
-                        SYCL_CHECK(CHECK_TRY_ERROR(
-                            dpct::destroy_event(extra->events[i][is])));
-                    }
-                }
-                if (extra->data_device[i] != nullptr) {
-                    /*
-                    DPCT1009:207: SYCL uses exceptions to report errors and does
-                    not use the error codes. The original code was commented out
-                    and a warning string was inserted. You need to rewrite this
-                    code.
-                    */
-                    ggml_sycl_set_device(i);
-                    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
-                        extra->data_device[i], *(streams[i]))));
-                }
-            }
-            delete extra;
-        }
-    }
-    catch (sycl::exception const &exc) {
-      std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-                << ", line:" << __LINE__ << std::endl;
-      std::exit(1);
-    }
-
-    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
-    std::vector<queue_ptr> streams;
-};
-
-static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Split";
-
-    UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
-   return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
-}
-
-static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
-    delete ctx;
-}
-
-static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
-    return (void *)0x1000;
-
-    UNUSED(buffer);
-}
-
-static void
-ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
-                                           ggml_tensor *tensor) try {
-    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
-
-    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
-    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-
-    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
-
-    ctx->tensor_extras.push_back(extra);
-        ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
-
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        // FIXME: do not crash if cudaMalloc fails
-        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
-        char * buf;
-        /*
-        DPCT1009:208: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
-                                        size, *stream)));
-        if (!buf) {
-            char err_buf[1024];
-            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
-            throw std::runtime_error(err_buf);
-        }
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            /*
-            DPCT1009:209: SYCL uses exceptions to report errors and does not use
-            the error codes. The original code was commented out and a warning
-            string was inserted. You need to rewrite this code.
-            */
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                (*stream)
-                    .memset(buf + original_size, 0, size - original_size)
-                    .wait()));
-        }
-
-        extra->data_device[i] = buf;
-
-        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
-            /*
-            DPCT1009:210: SYCL uses exceptions to report errors and does not use
-            the error codes. The original code was commented out and a warning
-            string was inserted. You need to rewrite this code.
-            */
-            SYCL_CHECK(
-                CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
-        }
-    }
-    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void
-ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
-                                          ggml_tensor *tensor, const void *data,
-                                          size_t offset, size_t size) try {
-    // split tensors must always be set in their entirety at once
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
-    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        const char * buf_host = (const char *)data + offset_split;
-        /*
-        DPCT1009:211: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            (*stream)
-                .memcpy(extra->data_device[i], buf_host, original_size)
-                .wait()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void
-ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
-                                          const ggml_tensor *tensor, void *data,
-                                          size_t offset, size_t size) try {
-    // split tensors must always be set in their entirety at once
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
-    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        char * buf_host = (char *)data + offset_split;
-        /*
-        DPCT1009:212: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            (*stream)
-                .memcpy(buf_host, extra->data_device[i], original_size)
-                .wait()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    UNUSED(buffer);
-    UNUSED(value);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_split_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
-    /* .memset_tensor   = */ NULL,
-    /* .set_tensor      = */ ggml_backend_sycl_split_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_sycl_split_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_sycl_split_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_SYCL_NAME "_Split";
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
-    // instead, we allocate them for each tensor separately in init_tensor
-    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
-    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
-    ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context();
-
-    return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
-
-    size_t total_size = 0;
-
-    const int64_t ne0 = tensor->ne[0];
-
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        total_size += ggml_nbytes_split(tensor, nrows_split);
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return total_size;
-}
-
-static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return false;
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_sycl_split_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_sycl_split_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_sycl_split_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-    /* .get_alloc_size   = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
-    /* .is_host          = */ ggml_backend_sycl_split_buffer_type_is_host,
-};
-
-ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
-    static std::mutex mutex;
-    std::lock_guard<std::mutex> lock(mutex);
-
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
-    ggml_check_sycl();
-    // FIXME: this is not thread safe
-    static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
-
-    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split_arr = {};
-
-    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; });
-    if (all_zero) {
-        tensor_split_arr = ggml_sycl_info().default_tensor_split;
-    } else {
-        float split_sum = 0.0f;
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            tensor_split_arr[i] = split_sum;
-            split_sum += tensor_split[i];
-        }
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            tensor_split_arr[i] /= split_sum;
-        }
-    }
-
-    auto it = buft_map.find(tensor_split_arr);
-    if (it != buft_map.end()) {
-        return &it->second;
-    }
-
-    struct ggml_backend_buffer_type buft {
-        /* .iface   = */ ggml_backend_sycl_split_buffer_type_interface,
-        /* .device  = */ nullptr,
-        /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
-    };
-
-    auto result = buft_map.emplace(tensor_split_arr, buft);
-    return &result.first->second;
-}
-
-// host buffer type
-
-static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_SYCL_NAME "_Host";
-
-    UNUSED(buft);
-}
-
-static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Host";
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_sycl_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_sycl_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    // FIXME: this is a hack to avoid having to implement a new buffer type
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
-    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_sycl_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .device   = */ nullptr,
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_sycl_buffer_type_host;
-}
-
 // backend
 
-static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
+static const char * ggml_backend_sycl_get_name(ggml_backend_t backend) {
 
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
@@ -4931,8 +4904,8 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
 
     GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
-    SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
-        (char *)tensor->data + offset, data, size).wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        (stream)->memcpy((char *)tensor->data + offset, data, size)));
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -4987,7 +4960,7 @@ static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
     const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
     SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
 
-    UNUSED(backend);
+    GGML_UNUSED(backend);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -5023,7 +4996,151 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
     return GGML_STATUS_SUCCESS;
 }
 
-static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+static void ggml_backend_sycl_event_record(ggml_backend_t backend, ggml_backend_event_t event)
+try
+{
+    ggml_backend_sycl_context *sycl_ctx =
+        (ggml_backend_sycl_context *)backend->context;
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+
+    const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    // Record the current state of the queue
+    SYCL_CHECK(CHECK_TRY_ERROR(*sycl_event = stream->ext_oneapi_submit_barrier()));
+}
+catch (sycl::exception const &exc)
+{
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
+    ggml_backend_sycl_context* sycl_ctx = static_cast<ggml_backend_sycl_context*>(backend->context);
+    sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
+
+    if (ggml_backend_is_sycl(backend)) {
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
+    } else
+        GGML_ABORT("fatal error");
+} catch (sycl::exception const& exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static ggml_backend_i ggml_backend_sycl_interface = {
+    /* .get_name                = */ ggml_backend_sycl_get_name,
+    /* .free                    = */ ggml_backend_sycl_free,
+    /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
+    /* .cpy_tensor_async        = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
+                                           // // TODO: update for the new
+                                           // interface
+    /* .synchronize             = */ ggml_backend_sycl_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
+    /* .supports_op             = */ NULL, // moved to device
+    /* .supports_buft           = */ NULL, // moved to device
+    /* .offload_op              = */ NULL, // moved to device
+    /* .event_record            = */ ggml_backend_sycl_event_record,
+    /* .event_wait              = */ ggml_backend_sycl_event_wait,
+};
+
+static ggml_guid_t ggml_backend_sycl_guid() {
+    static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
+    return &guid;
+}
+
+bool ggml_backend_is_sycl(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
+}
+
+int ggml_backend_sycl_get_device_count() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
+    return ggml_sycl_info().device_count;
+}
+
+
+// backend device
+
+struct ggml_backend_sycl_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_sycl_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    ggml_sycl_set_device(ctx->device);
+    SYCL_CHECK(CHECK_TRY_ERROR(
+    dpct::dev_mgr::instance().get_device(ctx->device).get_memory_info(*free, *total)));
+}
+
+static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+}
+
+static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_sycl_device_get_name(dev);
+    props->description = ggml_backend_sycl_device_get_description(dev);
+    props->type        = ggml_backend_sycl_device_get_type(dev);
+    ggml_backend_sycl_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_SYCL_NO_PINNED") == nullptr;
+#ifdef GGML_SYCL_NO_PEER_COPY
+    bool events = false;
+#else
+    bool events = true;
+#endif
+
+    props->caps = {
+        /* .async                 = */ true,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ events,
+    };
+}
+
+static ggml_backend_t ggml_backend_sycl_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ggml_backend_sycl_init(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_sycl_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ggml_backend_sycl_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_sycl_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_sycl_host_buffer_type();
+}
+
+static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(ptr);
+    GGML_UNUSED(size);
+    GGML_UNUSED(max_tensor_size);
+    return nullptr;
+}
+
+static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
@@ -5167,47 +5284,173 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
             return false;
     }
 
-    UNUSED(backend);
+    GGML_UNUSED(dev);
 }
 
-static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
         return false;
     }
     ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
     return buft_ctx->device == sycl_ctx->device;
 }
 
-static ggml_backend_i ggml_backend_sycl_interface = {
-    /* .get_name                = */ ggml_backend_sycl_name,
-    /* .free                    = */ ggml_backend_sycl_free,
-    /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_async        = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
-    /* .synchronize             = */ ggml_backend_sycl_synchronize,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
-    /* .supports_op             = */ ggml_backend_sycl_supports_op,
-    /* .supports_buft           = */ ggml_backend_sycl_supports_buft,
-    /* .offload_op              = */ ggml_backend_sycl_offload_op,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
+static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_event_t
+ggml_backend_sycl_device_event_new(ggml_backend_dev_t dev) {
+
+#ifdef GGML_SYCL_NO_PEER_COPY
+    return nullptr;
+#else
+  sycl::event *event_ptr = new sycl::event();
+
+  return new ggml_backend_event{
+      /* .device = */ dev,
+      /* .context = */ event_ptr,
+  };
+#endif
+}
+
+static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
+  GGML_UNUSED(dev);
+  if (event == nullptr) {
+    return;
+  }
+
+  if (event->context != nullptr) {
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+    delete sycl_event;
+    event->context = nullptr;
+  }
+
+  delete event;
+} catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+
+static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
+  GGML_UNUSED(dev);
+
+  sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+  SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
+} catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static const ggml_backend_device_i ggml_backend_sycl_device_interface = {
+    /* .get_name                = */ ggml_backend_sycl_device_get_name,
+    /* .get_description         = */ ggml_backend_sycl_device_get_description,
+    /* .get_memory              = */ ggml_backend_sycl_device_get_memory,
+    /* .get_type                = */ ggml_backend_sycl_device_get_type,
+    /* .get_props               = */ ggml_backend_sycl_device_get_props,
+    /* .init_backend            = */ ggml_backend_sycl_device_init,
+    /* .get_buffer_type         = */ ggml_backend_sycl_device_get_buffer_type,
+    /* .get_host_buffer_type    = */ ggml_backend_sycl_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr    = */ ggml_backend_sycl_device_buffer_from_host_ptr,
+    /* .supports_op             = */ ggml_backend_sycl_device_supports_op,
+    /* .supports_buft           = */ ggml_backend_sycl_device_supports_buft,
+    /* .offload_op              = */ ggml_backend_sycl_device_offload_op,
+    /* .event_new               = */ ggml_backend_sycl_device_event_new,
+    /* .event_free              = */ ggml_backend_sycl_device_event_free,
+    /* .event_synchronize       = */ ggml_backend_sycl_device_event_synchronize,
 };
 
-static ggml_guid_t ggml_backend_sycl_guid() {
-    static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
-    return &guid;
+// backend reg
+
+struct ggml_backend_sycl_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_sycl_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return GGML_SYCL_NAME;
+}
+
+static size_t ggml_backend_sycl_reg_get_device_count(ggml_backend_reg_t reg) {
+    ggml_backend_sycl_reg_context * ctx = (ggml_backend_sycl_reg_context *)reg->context;
+    return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    ggml_backend_sycl_reg_context * ctx = (ggml_backend_sycl_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+}
+
+static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
+{
+    GGML_UNUSED(reg);
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_sycl_split_buffer_type;
+    }
+    // SYCL doesn't support registering host memory, left here for reference
+    // "ggml_backend_register_host_buffer"
+    // "ggml_backend_unregister_host_buffer"
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
+    /* .get_name          = */ ggml_backend_sycl_reg_get_name,
+    /* .get_device_count  = */ ggml_backend_sycl_reg_get_device_count,
+    /* .get_device_get    = */ ggml_backend_sycl_reg_get_device,
+    /* .get_proc_address  = */ ggml_backend_sycl_reg_get_proc_address,
+};
+
+
+// backend registry
+
+ggml_backend_reg_t ggml_backend_sycl_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+
+            for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+                ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_SYCL_NAME + std::to_string(i);
+
+                ggml_sycl_set_device(i);
+
+                dpct::device_info prop;
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                    prop, dpct::dev_mgr::instance().get_device(i))));
+
+                dev_ctx->description = prop.get_name();
+
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .interface = */ ggml_backend_sycl_device_interface,
+                    /* .reg       = */ &reg,
+                    /* .context   = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .interface = */ ggml_backend_sycl_reg_interface,
+                /* .context   = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
 }
 
 ggml_backend_t ggml_backend_sycl_init(int device) {
@@ -5225,18 +5468,10 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     ggml_backend_t sycl_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_sycl_guid(),
         /* .interface = */ ggml_backend_sycl_interface,
-        /* .device    = */ nullptr,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
         /* .context   = */ ctx
     };
 
     return sycl_backend;
 }
 
-bool ggml_backend_is_sycl(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
-}
-
-int ggml_backend_sycl_get_device_count() {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
-    return ggml_sycl_info().device_count;
-}
diff --git a/src/llama.cpp b/src/llama.cpp
index 0025e94b8..10c975bf4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8,9 +8,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#if defined(GGML_USE_SYCL)
-#  include "ggml-sycl.h"
-#elif defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
 #   include "ggml-kompute.h"
 #elif defined(GGML_USE_CANN)
 #   include "ggml-cann.h"
@@ -3422,9 +3420,11 @@ struct llama_lora_adapter {
 static int llama_get_device_count(const llama_model & model) {
     int count = (int) model.devices.size();
 
-#if defined(GGML_USE_SYCL)
-    count += ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_CANN)
+#if defined(GGML_USE_RPC)
+    count += (int) model.rpc_servers.size();
+#endif
+
+#if defined(GGML_USE_CANN)
     count += ggml_backend_cann_get_device_count();
 #endif
 
@@ -3445,11 +3445,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
         }
     }
 
-#if defined(GGML_USE_SYCL)
-    if (host_buffer) {
-        buft = ggml_backend_sycl_host_buffer_type();
-    }
-#elif defined(GGML_USE_CANN)
+#if defined(GGML_USE_CANN)
     if (host_buffer) {
         buft = ggml_backend_cann_host_buffer_type();
     }
@@ -3473,9 +3469,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
     device -= (int)model.devices.size();
 
-#if defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(device);
-#elif defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(device);
 #elif defined(GGML_USE_CANN)
     buft = ggml_backend_cann_buffer_type(device);
@@ -3505,12 +3499,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
         }
     }
 
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(model, fallback_gpu);
     }
@@ -3528,12 +3516,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
         return free;
     }
 
-#if defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_CANN)
+#if defined(GGML_USE_CANN)
     size_t total;
     size_t free;
     ggml_backend_cann_get_device_memory(device, &free, &total);
@@ -19096,7 +19079,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -19428,29 +19411,7 @@ struct llama_context * llama_new_context_with_model(
             main_gpu -= (int)model->devices.size();
         }
 
-#if defined(GGML_USE_SYCL)
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_LAYER requires a backend for each GPU
-            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                ggml_backend_t backend = ggml_backend_sycl_init(i);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#elif defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
             auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {

From afd9909a6481402844aecefa8a8908afdd7f52f1 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Fri, 18 Oct 2024 14:33:58 +0300
Subject: [PATCH 073/396] rpc : backend refactoring (#9912)

* rpc : refactor backend

Use structs for RPC request/response messages

* rpc : refactor server
---
 ggml/src/ggml-rpc.cpp | 571 +++++++++++++++++++++++-------------------
 1 file changed, 310 insertions(+), 261 deletions(-)

diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index 13c7dd436..f95233284 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -58,7 +58,7 @@ struct socket_t {
 };
 
 // ggml_tensor is serialized into rpc_tensor
-#pragma pack(push, 1)
+#pragma pack(1)
 struct rpc_tensor {
     uint64_t id;
     uint32_t type;
@@ -76,7 +76,6 @@ struct rpc_tensor {
 
     char padding[4];
 };
-#pragma pack(pop)
 
 static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
 
@@ -96,6 +95,77 @@ enum rpc_cmd {
     RPC_CMD_COUNT,
 };
 
+#pragma pack(1)
+struct rpc_msg_alloc_buffer_req {
+    uint64_t size;
+};
+
+#pragma pack(1)
+struct rpc_msg_alloc_buffer_rsp {
+    uint64_t remote_ptr;
+    uint64_t remote_size;
+};
+
+#pragma pack(1)
+struct rpc_msg_get_alignment_rsp {
+    uint64_t alignment;
+};
+
+#pragma pack(1)
+struct rpc_msg_get_max_size_rsp {
+    uint64_t max_size;
+};
+
+#pragma pack(1)
+struct rpc_msg_buffer_get_base_req {
+    uint64_t remote_ptr;
+};
+
+#pragma pack(1)
+struct rpc_msg_buffer_get_base_rsp {
+    uint64_t base_ptr;
+};
+
+#pragma pack(1)
+struct rpc_msg_free_buffer_req {
+    uint64_t remote_ptr;
+};
+
+#pragma pack(1)
+struct rpc_msg_buffer_clear_req {
+    uint64_t remote_ptr;
+    uint8_t value;
+};
+
+#pragma pack(1)
+struct rpc_msg_get_tensor_req {
+    rpc_tensor tensor;
+    uint64_t offset;
+    uint64_t size;
+};
+
+#pragma pack(1)
+struct rpc_msg_copy_tensor_req {
+    rpc_tensor src;
+    rpc_tensor dst;
+};
+
+#pragma pack(1)
+struct rpc_msg_copy_tensor_rsp {
+    uint8_t result;
+};
+
+#pragma pack(1)
+struct rpc_msg_graph_compute_rsp {
+    uint8_t result;
+};
+
+#pragma pack(1)
+struct rpc_msg_get_device_memory_rsp {
+    uint64_t free_mem;
+    uint64_t total_mem;
+};
+
 // RPC data structures
 
 static ggml_guid_t ggml_backend_rpc_guid() {
@@ -240,6 +310,38 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
     return true;
 }
 
+static bool send_msg(sockfd_t sockfd, const void * msg, size_t msg_size) {
+    if (!send_data(sockfd, &msg_size, sizeof(msg_size))) {
+        return false;
+    }
+    return send_data(sockfd, msg, msg_size);
+}
+
+static bool recv_msg(sockfd_t sockfd, void * msg, size_t msg_size) {
+    uint64_t size;
+    if (!recv_data(sockfd, &size, sizeof(size))) {
+        return false;
+    }
+    if (size != msg_size) {
+        return false;
+    }
+    return recv_data(sockfd, msg, msg_size);
+}
+
+static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
+    uint64_t size;
+    if (!recv_data(sockfd, &size, sizeof(size))) {
+        return false;
+    }
+    try {
+        input.resize(size);
+    } catch (const std::bad_alloc & e) {
+        fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size);
+        return false;
+    }
+    return recv_data(sockfd, input.data(), size);
+}
+
 static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
     size_t pos = endpoint.find(':');
     if (pos == std::string::npos) {
@@ -252,28 +354,27 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
 
 // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
 // RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
-static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
+static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
     uint8_t cmd_byte = cmd;
     if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
         return false;
     }
-    uint64_t input_size = input.size();
     if (!send_data(sock->fd, &input_size, sizeof(input_size))) {
         return false;
     }
-    if (!send_data(sock->fd, input.data(), input.size())) {
+    if (!send_data(sock->fd, input, input_size)) {
         return false;
     }
-    uint64_t output_size;
-    if (!recv_data(sock->fd, &output_size, sizeof(output_size))) {
+    // TODO: currently the output_size is always known, do we need support for commands with variable output size?
+    // even if we do, we can skip sending output_size from the server for commands with known output size
+    uint64_t out_size;
+    if (!recv_data(sock->fd, &out_size, sizeof(out_size))) {
         return false;
     }
-    if (output_size == 0) {
-        output.clear();
-        return true;
+    if (out_size != output_size) {
+        return false;
     }
-    output.resize(output_size);
-    if (!recv_data(sock->fd, output.data(), output_size)) {
+    if (!recv_data(sock->fd, output, output_size)) {
         return false;
     }
     return true;
@@ -326,14 +427,9 @@ static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffe
 
 static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    // input serialization format: | remote_ptr (8 bytes) |
-    std::vector<uint8_t> input(sizeof(uint64_t), 0);
-    uint64_t remote_ptr = ctx->remote_ptr;
-    memcpy(input.data(), &remote_ptr, sizeof(remote_ptr));
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, input, output);
+    rpc_msg_free_buffer_req request = {ctx->remote_ptr};
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     GGML_ASSERT(status);
-    GGML_ASSERT(output.empty());
     delete ctx;
 }
 
@@ -342,20 +438,13 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
     if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
         return ctx->base_cache[buffer];
     }
-    // input serialization format: | remote_ptr (8 bytes) |
-    std::vector<uint8_t> input(sizeof(uint64_t), 0);
-    uint64_t remote_ptr = ctx->remote_ptr;
-    memcpy(input.data(), &remote_ptr, sizeof(remote_ptr));
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, input, output);
+    rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
+    rpc_msg_buffer_get_base_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == sizeof(uint64_t));
-    // output serialization format: | base_ptr (8 bytes) |
-    uint64_t base_ptr;
-    memcpy(&base_ptr, output.data(), sizeof(base_ptr));
-    void * base = reinterpret_cast<void *>(base_ptr);
-    ctx->base_cache[buffer] = base;
-    return base;
+    void * base_ptr = reinterpret_cast<void *>(response.base_ptr);
+    ctx->base_cache[buffer] = base_ptr;
+    return base_ptr;
 }
 
 static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
@@ -405,26 +494,18 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
     memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
     memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
     memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input, output);
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
     GGML_ASSERT(status);
 }
 
 static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
-    int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t);
-    std::vector<uint8_t> input(input_size, 0);
-    rpc_tensor rpc_tensor = serialize_tensor(tensor);
-    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
-    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
-    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &size, sizeof(size));
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, input, output);
+    rpc_msg_get_tensor_req request;
+    request.tensor = serialize_tensor(tensor);
+    request.offset = offset;
+    request.size = size;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == size);
-    // output serialization format: | data (size bytes) |
-    memcpy(data, output.data(), size);
 }
 
 static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -437,30 +518,19 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
         return false;
     }
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    // input serialization format: | rpc_tensor src | rpc_tensor dst |
-    int input_size = 2*sizeof(rpc_tensor);
-    std::vector<uint8_t> input(input_size, 0);
-    rpc_tensor rpc_src = serialize_tensor(src);
-    rpc_tensor rpc_dst = serialize_tensor(dst);
-    memcpy(input.data(), &rpc_src, sizeof(rpc_src));
-    memcpy(input.data() + sizeof(rpc_src), &rpc_dst, sizeof(rpc_dst));
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, input, output);
+    rpc_msg_copy_tensor_req request;
+    request.src = serialize_tensor(src);
+    request.dst = serialize_tensor(dst);
+    rpc_msg_copy_tensor_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
     GGML_ASSERT(status);
-    // output serialization format: | result (1 byte) |
-    GGML_ASSERT(output.size() == 1);
-    return output[0];
+    return response.result;
 }
 
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    // serialization format: | bufptr (8 bytes) | value (1 byte) |
-    int input_size = sizeof(uint64_t) + sizeof(uint8_t);
-    std::vector<uint8_t> input(input_size, 0);
-    memcpy(input.data(), &ctx->remote_ptr, sizeof(ctx->remote_ptr));
-    memcpy(input.data() + sizeof(ctx->remote_ptr), &value, sizeof(value));
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, input, output);
+    rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
     GGML_ASSERT(status);
 }
 
@@ -484,25 +554,16 @@ static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t
 
 static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    // input serialization format: | size (8 bytes) |
-    int input_size = sizeof(uint64_t);
-    std::vector<uint8_t> input(input_size, 0);
-    memcpy(input.data(), &size, sizeof(size));
-    std::vector<uint8_t> output;
+    rpc_msg_alloc_buffer_req request = {size};
+    rpc_msg_alloc_buffer_rsp response;
     auto sock = get_socket(buft_ctx->endpoint);
-    bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, input, output);
+    bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
-    // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
-    uint64_t remote_ptr;
-    memcpy(&remote_ptr, output.data(), sizeof(remote_ptr));
-    size_t remote_size;
-    memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
-    if (remote_ptr != 0) {
+    if (response.remote_ptr != 0) {
         ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
             ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
-            remote_size);
+            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
+            response.remote_size);
         return buffer;
     } else {
         return nullptr;
@@ -510,16 +571,10 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
 }
 
 static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
-    // input serialization format: | 0 bytes |
-    std::vector<uint8_t> input;
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, input, output);
+    rpc_msg_get_alignment_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == sizeof(uint64_t));
-    // output serialization format: | alignment (8 bytes) |
-    uint64_t alignment;
-    memcpy(&alignment, output.data(), sizeof(alignment));
-    return alignment;
+    return response.alignment;
 }
 
 static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -528,16 +583,10 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
 }
 
 static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
-    // input serialization format: | 0 bytes |
-    std::vector<uint8_t> input;
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, input, output);
+    rpc_msg_get_max_size_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == sizeof(uint64_t));
-    // output serialization format: | max_size (8 bytes) |
-    uint64_t max_size;
-    memcpy(&max_size, output.data(), sizeof(max_size));
-    return max_size;
+    return response.max_size;
 }
 
 static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -622,12 +671,11 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
     std::vector<uint8_t> input;
     serialize_graph(cgraph, input);
-    std::vector<uint8_t> output;
+    rpc_msg_graph_compute_rsp response;
     auto sock = get_socket(rpc_ctx->endpoint);
-    bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input, output);
+    bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == 1);
-    return (enum ggml_status)output[0];
+    return (enum ggml_status)response.result;
 }
 
 static ggml_backend_i ggml_backend_rpc_interface = {
@@ -702,19 +750,11 @@ GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
 }
 
 static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
-    // input serialization format: | 0 bytes |
-    std::vector<uint8_t> input;
-    std::vector<uint8_t> output;
-    bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, input, output);
+    rpc_msg_get_device_memory_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
     GGML_ASSERT(status);
-    GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
-    // output serialization format: | free (8 bytes) | total (8 bytes) |
-    uint64_t free_mem;
-    memcpy(&free_mem, output.data(), sizeof(free_mem));
-    uint64_t total_mem;
-    memcpy(&total_mem, output.data() + sizeof(uint64_t), sizeof(total_mem));
-    *free = free_mem;
-    *total = total_mem;
+    *free = response.free_mem;
+    *total = response.total_mem;
 }
 
 GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
@@ -734,16 +774,16 @@ public:
     rpc_server(ggml_backend_t backend) : backend(backend) {}
     ~rpc_server();
 
-    bool alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
-    void get_alignment(std::vector<uint8_t> & output);
-    void get_max_size(std::vector<uint8_t> & output);
-    bool buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
-    bool free_buffer(const std::vector<uint8_t> & input);
-    bool buffer_clear(const std::vector<uint8_t> & input);
+    void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
+    void get_alignment(rpc_msg_get_alignment_rsp & response);
+    void get_max_size(rpc_msg_get_max_size_rsp & response);
+    bool buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response);
+    bool free_buffer(const rpc_msg_free_buffer_req & request);
+    bool buffer_clear(const rpc_msg_buffer_clear_req & request);
     bool set_tensor(const std::vector<uint8_t> & input);
-    bool get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
-    bool copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
-    bool graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
+    bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
+    bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
 
 private:
     ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@@ -757,80 +797,50 @@ private:
     std::unordered_set<ggml_backend_buffer_t> buffers;
 };
 
-bool rpc_server::alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
-    // input serialization format: | size (8 bytes) |
-    if (input.size() != sizeof(uint64_t)) {
-        return false;
-    }
-    uint64_t size;
-    memcpy(&size, input.data(), sizeof(size));
+void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    uint64_t remote_ptr = 0;
-    uint64_t remote_size = 0;
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
+    response.remote_ptr = 0;
+    response.remote_size = 0;
     if (buffer != nullptr) {
-        remote_ptr = reinterpret_cast<uint64_t>(buffer);
-        remote_size = buffer->size;
-        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
+        response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
+        response.remote_size = buffer->size;
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
         buffers.insert(buffer);
     } else {
-        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size);
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
     }
-    // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
-    output.resize(2*sizeof(uint64_t), 0);
-    memcpy(output.data(), &remote_ptr, sizeof(remote_ptr));
-    memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size));
-    return true;
 }
 
-void rpc_server::get_alignment(std::vector<uint8_t> & output) {
+void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     size_t alignment = ggml_backend_buft_get_alignment(buft);
     GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
-    // output serialization format: | alignment (8 bytes) |
-    output.resize(sizeof(uint64_t), 0);
-    memcpy(output.data(), &alignment, sizeof(alignment));
+    response.alignment = alignment;
 }
 
-void rpc_server::get_max_size(std::vector<uint8_t> & output) {
+void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     size_t max_size = ggml_backend_buft_get_max_size(buft);
     GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
-    // output serialization format: | max_size (8 bytes) |
-    output.resize(sizeof(uint64_t), 0);
-    memcpy(output.data(), &max_size, sizeof(max_size));
+    response.max_size = max_size;
 }
 
-bool rpc_server::buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
-    // input serialization format: | remote_ptr (8 bytes) |
-    if (input.size() != sizeof(uint64_t)) {
-        return false;
-    }
-    uint64_t remote_ptr;
-    memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
-    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
-    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
+    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
+    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
     if (buffers.find(buffer) == buffers.end()) {
         GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
         return false;
     }
     void * base = ggml_backend_buffer_get_base(buffer);
-    // output serialization format: | base_ptr (8 bytes) |
-    uint64_t base_ptr = reinterpret_cast<uint64_t>(base);
-    output.resize(sizeof(uint64_t), 0);
-    memcpy(output.data(), &base_ptr, sizeof(base_ptr));
+    response.base_ptr = reinterpret_cast<uint64_t>(base);
     return true;
 }
 
-bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
-    // input serialization format: | remote_ptr (8 bytes) |
-    if (input.size() != sizeof(uint64_t)) {
-        return false;
-    }
-    uint64_t remote_ptr;
-    memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
-    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
-    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
+    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
+    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
     if (buffers.find(buffer) == buffers.end()) {
         GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
         return false;
@@ -840,22 +850,14 @@ bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
     return true;
 }
 
-bool rpc_server::buffer_clear(const std::vector<uint8_t> & input) {
-    // input serialization format: | remote_ptr (8 bytes) | value (1 byte) |
-    if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) {
-        return false;
-    }
-    uint64_t remote_ptr;
-    memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
-    uint8_t value;
-    memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value));
-    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value);
-    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
+    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
+    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
     if (buffers.find(buffer) == buffers.end()) {
         GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
         return false;
     }
-    ggml_backend_buffer_clear(buffer, value);
+    ggml_backend_buffer_clear(buffer, request.value);
     return true;
 }
 
@@ -930,74 +932,55 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     return true;
 }
 
-bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
-    // serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
-    if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) {
-        return false;
-    }
-    const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
-    uint64_t offset;
-    memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
-    uint64_t size;
-    memcpy(&size, input.data() + sizeof(rpc_tensor) + sizeof(offset), sizeof(size));
-
+bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ true,
     };
     struct ggml_context * ctx = ggml_init(params);
-    ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
     if (tensor == nullptr) {
         GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
         ggml_free(ctx);
         return false;
     }
-    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
 
     // sanitize tensor->data
     {
         const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
         const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
 
-        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
-            GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+        if (request.tensor.data + request.offset < p0 ||
+            request.tensor.data + request.offset >= p1 ||
+            request.size > (p1 - request.tensor.data - request.offset)) {
+                GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
         }
     }
 
-    // output serialization format: | data (size bytes) |
-    output.resize(size, 0);
-    ggml_backend_tensor_get(tensor, output.data(), offset, size);
+    response.resize(request.size, 0);
+    ggml_backend_tensor_get(tensor, response.data(), request.offset, request.size);
     ggml_free(ctx);
     return true;
 }
 
-bool rpc_server::copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
-    // serialization format: | rpc_tensor src | rpc_tensor dst |
-    if (input.size() != 2*sizeof(rpc_tensor)) {
-        return false;
-    }
-    const rpc_tensor * rpc_src = (const rpc_tensor *)input.data();
-    const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src));
-
+bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response) {
     struct ggml_init_params params {
         /*.mem_size   =*/ 2*ggml_tensor_overhead(),
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ true,
     };
     struct ggml_context * ctx = ggml_init(params);
-    ggml_tensor * src = deserialize_tensor(ctx, rpc_src);
-    ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst);
+    ggml_tensor * src = deserialize_tensor(ctx, &request.src);
+    ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
     if (src == nullptr || dst == nullptr) {
         GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
         ggml_free(ctx);
         return false;
     }
     GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
-    bool result = ggml_backend_buffer_copy_tensor(src, dst);
-    // output serialization format: | result (1 byte) |
-    output.resize(1, 0);
-    output[0] = result;
+    response.result = ggml_backend_buffer_copy_tensor(src, dst);
     ggml_free(ctx);
     return true;
 }
@@ -1026,7 +1009,7 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
     return result;
 }
 
-bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
+bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response) {
     // serialization format:
     // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
     if (input.size() < sizeof(uint32_t)) {
@@ -1066,9 +1049,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
         graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
     }
     ggml_status status = ggml_backend_graph_compute(backend, graph);
-    // output serialization format: | status (1 byte) |
-    output.resize(1, 0);
-    output[0] = status;
+    response.result = status;
     ggml_free(ctx);
     return true;
 }
@@ -1091,85 +1072,153 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
             fprintf(stderr, "Unknown command: %d\n", cmd);
             break;
         }
-        std::vector<uint8_t> input;
-        std::vector<uint8_t> output;
-        uint64_t input_size;
-        if (!recv_data(sockfd, &input_size, sizeof(input_size))) {
-            break;
-        }
-        try {
-            input.resize(input_size);
-        } catch (const std::bad_alloc & e) {
-            fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", input_size);
-            break;
-        }
-        if (!recv_data(sockfd, input.data(), input_size)) {
-            break;
-        }
-        bool ok = true;
         switch (cmd) {
             case RPC_CMD_ALLOC_BUFFER: {
-                ok = server.alloc_buffer(input, output);
+                rpc_msg_alloc_buffer_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_alloc_buffer_rsp response;
+                server.alloc_buffer(request, response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_GET_ALIGNMENT: {
-                server.get_alignment(output);
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_get_alignment_rsp response;
+                server.get_alignment(response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_GET_MAX_SIZE: {
-                server.get_max_size(output);
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_get_max_size_rsp response;
+                server.get_max_size(response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_BUFFER_GET_BASE: {
-                ok = server.buffer_get_base(input, output);
+                rpc_msg_buffer_get_base_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_buffer_get_base_rsp response;
+                if (!server.buffer_get_base(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_FREE_BUFFER: {
-                ok = server.free_buffer(input);
+                rpc_msg_free_buffer_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                if (!server.free_buffer(request)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_BUFFER_CLEAR: {
-                ok = server.buffer_clear(input);
+                rpc_msg_buffer_clear_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                if (!server.buffer_clear(request)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_SET_TENSOR: {
-                ok = server.set_tensor(input);
+                std::vector<uint8_t> input;
+                if (!recv_msg(sockfd, input)) {
+                    return;
+                }
+                if (!server.set_tensor(input)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_GET_TENSOR: {
-                ok = server.get_tensor(input, output);
+                rpc_msg_get_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                std::vector<uint8_t> response;
+                if (!server.get_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, response.data(), response.size())) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_COPY_TENSOR: {
-                ok = server.copy_tensor(input, output);
+                rpc_msg_copy_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_copy_tensor_rsp response;
+                if (!server.copy_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_GRAPH_COMPUTE: {
-                ok = server.graph_compute(input, output);
+                std::vector<uint8_t> input;
+                if (!recv_msg(sockfd, input)) {
+                    return;
+                }
+                rpc_msg_graph_compute_rsp response;
+                if (!server.graph_compute(input, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             case RPC_CMD_GET_DEVICE_MEMORY: {
-                // output serialization format: | free (8 bytes) | total (8 bytes) |
-                output.resize(2*sizeof(uint64_t), 0);
-                memcpy(output.data(), &free_mem, sizeof(free_mem));
-                memcpy(output.data() + sizeof(uint64_t), &total_mem, sizeof(total_mem));
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_get_device_memory_rsp response;
+                response.free_mem = free_mem;
+                response.total_mem = total_mem;
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
                 break;
             }
             default: {
                 fprintf(stderr, "Unknown command: %d\n", cmd);
-                ok = false;
+                return;
             }
         }
-        if (!ok) {
-            break;
-        }
-        uint64_t output_size = output.size();
-        if (!send_data(sockfd, &output_size, sizeof(output_size))) {
-            break;
-        }
-        if (!send_data(sockfd, output.data(), output_size)) {
-            break;
-        }
     }
 }
 

From cda0e4b648dde8fac162b3430b14a99597d3d74f Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 18 Oct 2024 23:18:01 +0200
Subject: [PATCH 074/396] llama : remove all_pos_0, all_pos_1, all_seq_id from
 llama_batch (#9745)

* refactor llama_batch_get_one

* adapt all examples

* fix simple.cpp

* fix llama_bench

* fix

* fix context shifting

* free batch before return

* use common_batch_add, reuse llama_batch in loop

* null terminated seq_id list

* fix save-load-state example

* fix perplexity

* correct token pos in llama_batch_allocr
---
 common/common.cpp                             |   4 +-
 examples/batched-bench/batched-bench.cpp      |   1 -
 .../cvector-generator/cvector-generator.cpp   |   2 +-
 examples/eval-callback/eval-callback.cpp      |   2 +-
 examples/imatrix/imatrix.cpp                  |  13 +-
 examples/infill/infill.cpp                    |   2 +-
 examples/llama-bench/llama-bench.cpp          |  16 +-
 .../llama/src/main/cpp/llama-android.cpp      |   3 -
 examples/llava/llava-cli.cpp                  |   2 +-
 examples/llava/llava.cpp                      |  38 ++++-
 examples/llava/minicpmv-cli.cpp               |   2 +-
 examples/lookahead/lookahead.cpp              |   4 +-
 examples/lookup/lookup.cpp                    |   4 +-
 examples/main/main.cpp                        |   4 +-
 examples/parallel/parallel.cpp                |   1 -
 examples/perplexity/perplexity.cpp            |  27 +++-
 examples/save-load-state/save-load-state.cpp  |  30 +++-
 examples/server/server.cpp                    |   1 -
 examples/simple/simple.cpp                    |   4 +-
 examples/speculative/speculative.cpp          |   6 +-
 include/llama.h                               |  20 +--
 src/llama.cpp                                 | 137 ++++++++++--------
 22 files changed, 205 insertions(+), 118 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c08f01b42..2bc0b8800 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -955,7 +955,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -964,7 +964,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 81c3220ad..a3b21ad6b 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 69e141ecb..d1731bba6 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
     }
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index fb52db4e1..c08e3e5f6 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index d1ff3e8bc..70ff47768 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }
 
@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index f82c614f5..f18362c91 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -396,7 +396,7 @@ int main(int argc, char ** argv) {
 
                 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                     LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 60a7aef5b..4a8ea9676 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1428,7 +1428,7 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }
 
     llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1460,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
@@ -1596,13 +1596,13 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
             }
-            test_gen(ctx, 1, 0, t.n_threads);
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
@@ -1614,13 +1614,13 @@ int main(int argc, char ** argv) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
                 }
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
                 }
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                test_gen(ctx, t.n_gen, t.n_threads);
             }
 
             uint64_t t_ns = get_time_ns() - t_start;
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
index f5ffd063f..b3858ddfb 100644
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -283,9 +283,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
         nullptr,
         nullptr,
         nullptr,
-        0,
-        0,
-        0,
     };
 
     if (embd) {
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 5f9abe2b6..161098585 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -20,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 2c96973c8..be6988540 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -401,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }
 
+struct llava_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
     int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
 
@@ -409,8 +442,9 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
+        float * embd = image_embed->embed+i*n_embd;
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 6b666de1b..cbecec343 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -97,7 +97,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index f9e4aba81..3c0ccfea2 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -89,8 +89,8 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
 
     for (int s = 1; s < W + G + 1; ++s) {
         llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 82fc7d466..a04728b18 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -89,8 +89,8 @@ int main(int argc, char ** argv){
 
     const auto t_enc_start = ggml_time_us();
 
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
 
     const auto t_enc_end = ggml_time_us();
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 65483c45f..374ed47ad 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -528,7 +528,7 @@ int main(int argc, char ** argv) {
         int enc_input_size = embd_inp.size();
         llama_token * enc_input_buf = embd_inp.data();
 
-        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }
@@ -648,7 +648,7 @@ int main(int argc, char ** argv) {
 
                 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                     LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 20274c147..43c8f3ed5 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -308,7 +308,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index efb41b80a..e803ff143 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -408,14 +408,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
 
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
             //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-            // TODO: use llama_batch.logits instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            if (llama_decode(ctx, batch)) {
                 //LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return {tokens, -1, logit_history, prob_history};
             }
 
@@ -435,6 +442,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
@@ -704,7 +713,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
-            0, 0, 0, // unused
         };
 
         const int ret = llama_decode(ctx, batch_view);
@@ -1791,6 +1799,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
@@ -1803,9 +1813,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
-            // TODO: use llama_batch.logits instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return;
             }
 
@@ -1818,6 +1833,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 3866cfa27..5f60a86cb 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -48,9 +48,16 @@ int main(int argc, char ** argv) {
     // tokenize prompt
     auto tokens = common_tokenize(ctx, params.prompt, true);
 
+    // prepare the batch
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {0}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true; // generate next token
+
     // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
-    n_past += tokens.size();
+    llama_decode(ctx, batch);
+    n_past += batch.n_tokens;
 
     // save state (rng, logits, embedding and kv_cache) to file
     {
@@ -77,8 +84,12 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
 
-        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {0}, true);
+
+        if (llama_decode(ctx, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             llama_free(ctx);
             llama_free_model(model);
             return 1;
@@ -133,8 +144,12 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
 
-        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {0}, true);
+
+        if (llama_decode(ctx2, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             llama_free(ctx2);
             llama_free_model(model);
             return 1;
@@ -221,8 +236,12 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;
 
-        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {1}, true);
+
+        if (llama_decode(ctx3, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             llama_free(ctx3);
             llama_free_model(model);
             return 1;
@@ -236,6 +255,7 @@ int main(int argc, char ** argv) {
     llama_sampler_free(smpl2);
     llama_sampler_free(smpl3);
 
+    llama_batch_free(batch);
     llama_free(ctx3);
     llama_free_model(model);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fd443878..3992108e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2326,7 +2326,6 @@ struct server_context {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index be91b2891..59760fe95 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
 
     // prepare a batch for the prompt
 
-    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0);
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
     // main loop
 
@@ -175,7 +175,7 @@ int main(int argc, char ** argv) {
             fflush(stdout);
 
             // prepare the next batch with the sampled token
-            batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0);
+            batch = llama_batch_get_one(&new_token_id, 1);
 
             n_decode += 1;
         }
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 5a7b3084f..b201bd714 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -155,9 +155,9 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt with both models
-    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
-    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input,     0,           0));
+    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1));
+    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
 
     const auto t_enc_end = ggml_time_us();
 
diff --git a/include/llama.h b/include/llama.h
index 1a13360c2..2558e9267 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -232,8 +232,11 @@ extern "C" {
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
+    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
     // - seq_id : the sequence to which the respective token belongs
+    //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+    //            (if set to NULL, only the logits for last token will be returned)
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -244,15 +247,6 @@ extern "C" {
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       *  logits; // TODO: rename this to "output"
-
-        // NOTE: helpers for smooth API transition - can be deprecated in the future
-        //       for future-proof code, use the above fields instead and ignore everything below
-        //
-        // pos[i] = all_pos_0 + i*all_pos_1
-        //
-        llama_pos    all_pos_0;  // used if pos == NULL
-        llama_pos    all_pos_1;  // used if pos == NULL
-        llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
 
     enum llama_model_kv_override_type {
@@ -776,15 +770,15 @@ extern "C" {
     // Decoding
     //
 
-    // Return batch for single sequence of tokens starting at pos_0
+    // Return batch for single sequence of tokens
+    // The sequence ID will be fixed to 0
+    // The position of the tokens will be tracked automatically by llama_decode
     //
     // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
     //
     LLAMA_API struct llama_batch llama_batch_get_one(
                   llama_token * tokens,
-                      int32_t   n_tokens,
-                    llama_pos   pos_0,
-                 llama_seq_id   seq_id);
+                      int32_t   n_tokens);
 
     // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
     // Each token can be assigned up to n_seq_max sequence ids
diff --git a/src/llama.cpp b/src/llama.cpp
index 10c975bf4..1813dd29b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2949,9 +2949,6 @@ struct llama_sbatch_seq {
     llama_seq_id * seq_id;
     size_t offset;
     size_t length;
-
-    // helper for smoother batch API transition -- can be deprecated in the future
-    llama_seq_id all_seq_id; // used if seq_id == NULL
 };
 
 // sequence-length-aware batch splitting
@@ -3046,30 +3043,18 @@ struct llama_sbatch {
         } else {
             ubatch.embd = nullptr;
         }
-        // from here on, the else branches are deprecated;
-        // they are helpers for smoother batch API transition
-        if (batch->pos) {
-            if (ubatch.equal_seqs) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
-                }
-            } else {
-                // simple split
-                ubatch.pos = batch->pos + seq.offset;
+        if (ubatch.equal_seqs) {
+            for (size_t i = 0; i < length; ++i) {
+                ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
             }
         } else {
-            for (size_t i = 0; i < length; ++i) {
-                llama_pos bi = ids[seq.offset + i];
-                ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
-            }
+            // simple split
+            ubatch.pos = batch->pos + seq.offset;
         }
         if (ubatch.equal_seqs) {
             ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
             if (seq.seq_id) {
                 ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
-            } else {
-                GGML_ASSERT(seq.n_seq_id == 1);
-                ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
             }
         } else {
             // simple split
@@ -3082,10 +3067,6 @@ struct llama_sbatch {
             }
             if (batch->seq_id) {
                 ubatch.seq_id = batch->seq_id + seq.offset;
-            } else {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
-                }
             }
         }
         if (logits_all) {
@@ -3204,7 +3185,6 @@ struct llama_sbatch {
             s.seq_id = nullptr;
             s.offset = 0;
             s.length = n_tokens;
-            s.all_seq_id = batch.all_seq_id;
             return;
         }
         std::sort(ids.begin(), ids.end(),
@@ -3227,7 +3207,7 @@ struct llama_sbatch {
                     if (batch.pos) {
                         return batch.pos[a] < batch.pos[b];
                     }
-                    // no pos, sort by id (assuming batch.all_pos_1 is positive)
+                    // no pos, sort by id
                     return a < b;
                 }
                 // shared prompts go first
@@ -3237,30 +3217,25 @@ struct llama_sbatch {
         // init seq
         llama_sbatch_seq * last_seq = nullptr;
 
-        if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
-            for (size_t i = 0; i < n_tokens; ++i) {
-                const size_t bi = ids[i];
-                const int32_t n_seqs = batch.n_seq_id[bi];
-                llama_seq_id * seq_ids = batch.seq_id[bi];
-                if (last_seq != nullptr) {
-                    bool same = n_seqs == last_seq->n_seq_id;
-                    for (int32_t j = 0; same && j < n_seqs; ++j) {
-                        if (seq_ids[j] != last_seq->seq_id[j]) {
-                            same = false;
-                        }
-                    }
-                    if (same) {
-                        last_seq->length += 1;
-                        continue;
+        for (size_t i = 0; i < n_tokens; ++i) {
+            const size_t bi = ids[i];
+            const int32_t n_seqs = batch.n_seq_id[bi];
+            llama_seq_id * seq_ids = batch.seq_id[bi];
+            if (last_seq != nullptr) {
+                bool same = n_seqs == last_seq->n_seq_id;
+                for (int32_t j = 0; same && j < n_seqs; ++j) {
+                    if (seq_ids[j] != last_seq->seq_id[j]) {
+                        same = false;
                     }
                 }
-                llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id};
-                seq.push_back(new_seq);
-                last_seq = &seq.back();
+                if (same) {
+                    last_seq->length += 1;
+                    continue;
+                }
             }
-        } else {
-            llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
+            llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
             seq.push_back(new_seq);
+            last_seq = &seq.back();
         }
         // keep shared prompts first at the end, then sort by length descending.
         std::sort(seq.begin(), seq.end(),
@@ -21096,9 +21071,7 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
 
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
-                 int32_t   n_tokens,
-               llama_pos   pos_0,
-            llama_seq_id   seq_id) {
+                 int32_t   n_tokens) {
     return {
         /*n_tokens       =*/ n_tokens,
         /*tokens         =*/ tokens,
@@ -21107,9 +21080,6 @@ struct llama_batch llama_batch_get_one(
         /*n_seq_id       =*/ nullptr,
         /*seq_id         =*/ nullptr,
         /*logits         =*/ nullptr,
-        /*all_pos_0      =*/ pos_0,
-        /*all_pos_1      =*/ 1,
-        /*all_seq_id     =*/ seq_id,
     };
 }
 
@@ -21122,9 +21092,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_seq_id       =*/ nullptr,
         /*seq_id         =*/ nullptr,
         /*logits         =*/ nullptr,
-        /*all_pos_0      =*/ 0,
-        /*all_pos_1      =*/ 0,
-        /*all_seq_id     =*/ 0,
     };
 
     if (embd) {
@@ -21160,11 +21127,62 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits)   free(batch.logits);
 }
 
+// temporary allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t>         logits;
+    struct llama_batch          batch;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
+        batch = in_batch;
+        if (!batch.pos) {
+            // determine the last position in KV cache
+            llama_pos last_pos = -1;
+            for (const auto & cell : ctx->kv_self.cells) {
+                if (cell.has_seq_id(batch_default_seq_id)) {
+                    last_pos = std::max(last_pos, cell.pos);
+                }
+            }
+            last_pos++; // next position
+            pos.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                pos[i] = i+last_pos;
+            }
+            batch.pos = pos.data();
+        }
+        if (!batch.n_seq_id) {
+            n_seq_id.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                n_seq_id[i] = seq_id_0.size();
+            }
+            batch.n_seq_id = n_seq_id.data();
+        }
+        if (!batch.seq_id) {
+            seq_id.resize(batch.n_tokens + 1);
+            seq_id[batch.n_tokens] = NULL;
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                seq_id[i] = seq_id_0.data();
+            }
+            batch.seq_id = seq_id.data();
+        }
+        if (!batch.logits) {
+            logits.resize(batch.n_tokens);
+            logits[logits.size() - 1] = true;
+            batch.logits = logits.data();
+        }
+    }
+};
+
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    const int ret = llama_encode_internal(*ctx, batch);
-    if (ret < 0) {
+    llama_batch_allocr batch_allocr(ctx, batch);
+    const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+    if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }
 
@@ -21174,8 +21192,9 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
+    llama_batch_allocr batch_allocr(ctx, batch);
+    const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+    if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 

From 7cab2083c768dd92c20b105556c4165b59cd8a41 Mon Sep 17 00:00:00 2001
From: icppWorld <124377669+icppWorld@users.noreply.github.com>
Date: Sun, 20 Oct 2024 12:01:34 -0400
Subject: [PATCH 075/396] readme : update infra list (#9942)

llama_cpp_canister allows you to run llama.cpp as a Smart Contract on the Internet Computer. The smart contract runs as WebAssembly in a so-called 'canister'.
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 1088b3338..f1d8900c3 100644
--- a/README.md
+++ b/README.md
@@ -187,6 +187,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
+- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 
 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

From 45f097645efb11b6d09a5b4adbbfd7c312ac0126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Carr=C3=A8re?= <loic.carrere@gmail.com>
Date: Sun, 20 Oct 2024 18:25:41 +0200
Subject: [PATCH 076/396] readme : update bindings list (#9951)

Update the binding list by adding LM-Kit.NET (C# & VB.NET)
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index f1d8900c3..06c32a2b4 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,7 @@ Typically finetunes of the base models below are supported as well.
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)

From 1db8c84fc62857e1e45c1c7ea93bcd5344cb3d31 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Mon, 21 Oct 2024 14:26:09 +0800
Subject: [PATCH 077/396] fix mul_mat_vec_q and *_vec_q error (#9939)

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 ggml/src/ggml-sycl/mmvq.cpp | 136 ++++++++++++++++++------------------
 1 file changed, 69 insertions(+), 67 deletions(-)

diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 1b96925e1..7b10cf688 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -1,6 +1,6 @@
 #include "mmvq.hpp"
 #include "vecdotq.hpp"
-
+#include <cassert>
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
@@ -13,7 +13,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 
 // partial sum for each thread
     float tmp = 0.0f;
@@ -37,7 +38,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -61,7 +62,8 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 
 // partial sum for each thread
     float tmp = 0.0f;
@@ -85,7 +87,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -109,8 +111,8 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -133,7 +135,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -157,8 +159,8 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -181,7 +183,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -205,8 +207,8 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -229,7 +231,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -253,8 +255,8 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -277,7 +279,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -301,8 +303,8 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -325,7 +327,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -349,8 +351,8 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -373,7 +375,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -397,8 +399,8 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -421,7 +423,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -446,8 +448,8 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 // partial sum for each thread
     float tmp = 0.0f;
 
@@ -470,7 +472,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -487,7 +489,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -495,7 +497,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -511,7 +513,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -519,7 +521,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -535,7 +537,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -543,7 +545,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -559,7 +561,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -567,7 +569,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -583,7 +585,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -591,7 +593,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -607,7 +609,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -615,7 +617,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -631,7 +633,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -639,7 +641,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -655,7 +657,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -663,7 +665,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -679,7 +681,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -687,7 +689,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -703,7 +705,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -711,7 +713,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -728,13 +730,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -749,7 +751,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -759,7 +761,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -774,7 +776,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -784,7 +786,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -799,7 +801,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -809,7 +811,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -824,7 +826,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -833,7 +835,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -848,7 +850,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -858,7 +860,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -873,13 +875,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -894,14 +896,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_NL == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -916,14 +918,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });

From bc219750845a59166d79f0d4ee3da1993b369b8a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Oct 2024 09:37:12 +0300
Subject: [PATCH 078/396] speculative : fix handling of some input params
 (#9963)

* speculative : fix batch sizes at initialization

ggml-ci

* speculative : handle params.n_predict == -1

* speculative : limit batch size to llama_n_batch
---
 examples/speculative/speculative.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index b201bd714..8a6475415 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -39,6 +39,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_predict < -1) {
+        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
+        return 1;
+    }
+
     common_init();
 
     if (params.model_draft.empty()) {
@@ -190,8 +195,8 @@ int main(int argc, char ** argv) {
         drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
     }
 
-    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
+    llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
+    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
 
     const auto t_dec_start = ggml_time_us();
 
@@ -441,7 +446,7 @@ int main(int argc, char ** argv) {
             ++n_past_dft;
         }
 
-        if (n_predict > params.n_predict || has_eos) {
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
             break;
         }
 

From 55e47786e373c90fc7803e718e3e1dd6d53c3db6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Oct 2024 09:46:40 +0300
Subject: [PATCH 079/396] llama : default sampling changes + greedy update
 (#9897)

* llama : deprecate softmax sampler + fix dist sampler

ggml-ci

* tests : replace macros with functions

ggml-ci

* sampling : change temperature sampler logic

For t <= 0.0f, keep the max logit intact and set the rest to -inf

* cont : no need for special "greedy" logic

top-k == 1 is the same

* tests : init prob correctly

* llama : handle temp <= 0.0 in the temp_ext sampler too

ggml-ci

* cont : avoid extra loop in temperature sampler for sub-zero temp

ggml-ci
---
 common/sampling.cpp                           |  88 +++---
 .../llama.cpp.swift/LibLlama.swift            |   1 -
 examples/save-load-state/save-load-state.cpp  |   3 -
 examples/speculative/speculative.cpp          |   3 -
 include/llama.h                               |  10 +-
 src/llama-sampling.cpp                        |  41 ++-
 tests/test-sampling.cpp                       | 274 ++++++++----------
 7 files changed, 202 insertions(+), 218 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 56cd0df6b..4ab3eface 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -171,60 +171,46 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.penalize_nl,
                 params.ignore_eos));
 
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+    if (params.mirostat == 0) {
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TFS_Z:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
             }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
         }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
     } else {
-        if (params.n_probs > 0) {
-            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
-            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
-            //
-            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
-            // it is much faster, since we avoid sorting all tokens and should give a good approximation
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        GGML_ASSERT(false && "unknown mirostat version");
     }
 
     return result;
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index dcd9803a2..65cd4eb51 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -46,7 +46,6 @@ actor LlamaContext {
         let sparams = llama_sampler_chain_default_params()
         self.sampling = llama_sampler_chain_init(sparams)
         llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
         llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
     }
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 5f60a86cb..8c49a52a6 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -42,7 +42,6 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
 
     // tokenize prompt
@@ -107,7 +106,6 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
     llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
 
     printf("\nsecond run: %s", params.prompt.c_str());
@@ -171,7 +169,6 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
     llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 8a6475415..a40e755a2 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -185,8 +185,6 @@ int main(int argc, char ** argv) {
     // target model sampling context (reuse the llama_context's sampling instance)
     struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
 
-    struct llama_sampler * softmax = llama_sampler_init_softmax();
-
     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
 
@@ -629,7 +627,6 @@ int main(int argc, char ** argv) {
         common_sampler_free(drafts[s].smpl);
     }
 
-    llama_sampler_free(softmax);
     llama_batch_free(batch_dft);
 
     llama_free(ctx_tgt);
diff --git a/include/llama.h b/include/llama.h
index 2558e9267..d4059c8dd 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -217,6 +217,7 @@ extern "C" {
 
     typedef struct llama_token_data_array {
         // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -1069,12 +1070,13 @@ extern "C" {
 
     // available samplers:
 
-    LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist       (uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
+        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
@@ -1090,6 +1092,8 @@ extern "C" {
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
     LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);
+
+    /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
     LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);
 
     /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index bd750c40e..d71516153 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -63,6 +63,30 @@ static void llama_log_softmax(float * array, size_t size) {
 }
 */
 
+static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+    if (temp <= 0.0f) {
+        // find the token with the highest logit and set the rest to -inf
+        size_t max_i = 0;
+        float  max_l = cur_p->data[0].logit;
+
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            if (cur_p->data[i    ].logit > max_l) {
+                cur_p->data[max_i].logit = -INFINITY;
+                max_i = i;
+                max_l = cur_p->data[i].logit;
+            } else {
+                cur_p->data[i].logit = -INFINITY;
+            }
+        }
+
+        return;
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= temp;
+    }
+}
+
 static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
     GGML_ASSERT(cur_p->size > 0);
 
@@ -427,6 +451,9 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
 
 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }
 
@@ -912,9 +939,8 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*
 
 static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_temp *) smpl->ctx;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cur_p->data[i].logit /= ctx->temp;
-    }
+
+    llama_sampler_temp_impl(cur_p, ctx->temp);
 }
 
 static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
@@ -961,6 +987,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
     if (ctx->delta > 0) {
         const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
         const float max_temp = ctx->temp + ctx->delta;
+
         float exponent_val = ctx->exponent;
 
         // no need to do anything if there is only one (or zero) candidates
@@ -998,9 +1025,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
     #endif
 
         // Apply the dynamically calculated temperature scaling
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            cur_p->data[i].logit /= dyn_temp;
-        }
+        llama_sampler_temp_impl(cur_p, dyn_temp);
 
         // Re-compute softmax probabilities after scaling logits with dynamic temperature
         const double max_l_double = cur_p->data[0].logit;
@@ -1024,9 +1049,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         }
     #endif
     } else {
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            cur_p->data[i].logit /= ctx->temp;
-        }
+        llama_sampler_temp_impl(cur_p, ctx->temp);
     }
 }
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 1372bdf13..05600e6f5 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -18,203 +18,176 @@ static void dump(const llama_token_data_array * cur_p) {
 
 #define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
 
-#define APPLY(__cnstr, __cur_p) do { \
-    auto * cnstr = (__cnstr); \
-    llama_sampler_apply(cnstr, (__cur_p)); \
-    llama_sampler_free(cnstr); \
-} while(0)
+struct sampler_tester {
+    sampler_tester(size_t n_vocab) {
+        cur.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+            const float logit = logf(token_id);
+            cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        }
 
-static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
-    const size_t n_vocab = probs.size();
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
+
+    sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
+        cur.reserve(probs.size());
+        for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
+            const float logit = logf(probs[token_id]);
+            cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
+        }
+
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
+
+    void apply(llama_sampler * sampler) {
+        llama_sampler_apply(sampler, &cur_p);
+        llama_sampler_free(sampler);
+    }
+
+    void check() {
+        GGML_ASSERT(cur_p.size == probs_expected.size());
+        for (size_t i = 0; i < cur_p.size; i++) {
+            GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
+        }
+    }
+
+    llama_token_data_array cur_p;
+
+private:
+    const std::vector<float> probs_expected;
 
     std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+};
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_top_k(k), &cur_p);
-    DUMP(&cur_p);
+static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
+    sampler_tester tester(probs, probs_expected);
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp(temp));
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
-static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
-    const size_t n_vocab = probs.size();
+static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_k(k));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) {
-    const size_t n_vocab = probs.size();
+static void test_tfs(const std::vector<float> & probs, const std::vector<float> & probs_expected, float z) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_tail_free(z, 1));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
-    }
+    tester.check();
 }
 
-static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_min_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
-    DUMP(&cur_p);
+    tester.check();
+}
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_typical(p, 1));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
 static void test_penalties(
     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
+    const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
 ) {
-    GGML_ASSERT(probs.size() == expected_probs.size());
+    GGML_ASSERT(probs.size() == probs_expected.size());
+
+    sampler_tester tester(probs, probs_expected);
 
     const size_t n_vocab = probs.size();
-
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-
     auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
         llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(sampler, &cur_p);
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(token_id);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    sampler_tester tester(n_vocab);
 
           llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
+            case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
             case 'f': GGML_ABORT("tail_free test not implemented");
             case 'y': GGML_ABORT("typical test not implemented");
-            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
-            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
+            case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
+            case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
             case 't': GGML_ABORT("temperature test not implemented");
             default : GGML_ABORT("Unknown sampler");
         }
 
-        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests
+        tester.apply(llama_sampler_init_dist(0));
+
+        auto & cur_p = tester.cur_p;
 
         const int size = cur_p.size;
 
@@ -307,21 +280,26 @@ static void test_perf() {
     BENCH(llama_sampler_init_tail_free(0.5f, 1),                data, 32);
     BENCH(llama_sampler_init_typical  (0.5f, 1),                data, 32);
     BENCH(llama_sampler_init_xtc      (1.0f, 0.1f, 1, 1),       data, 32);
-    BENCH(llama_sampler_init_softmax  (),                       data, 32);
 }
 
 int main(void) {
     ggml_time_init();
 
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
+
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
+
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
 
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
 
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);

From d5ebd79c76abd4887f0283cd6f6f9689122094d0 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Mon, 21 Oct 2024 13:35:40 +0300
Subject: [PATCH 080/396] rpc : pack only RPC structs (#9959)

---
 ggml/src/ggml-rpc.cpp | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index f95233284..0e936b343 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -57,8 +57,9 @@ struct socket_t {
     }
 };
 
+// all RPC structures must be packed
+#pragma pack(push, 1)
 // ggml_tensor is serialized into rpc_tensor
-#pragma pack(1)
 struct rpc_tensor {
     uint64_t id;
     uint32_t type;
@@ -95,76 +96,64 @@ enum rpc_cmd {
     RPC_CMD_COUNT,
 };
 
-#pragma pack(1)
 struct rpc_msg_alloc_buffer_req {
     uint64_t size;
 };
 
-#pragma pack(1)
 struct rpc_msg_alloc_buffer_rsp {
     uint64_t remote_ptr;
     uint64_t remote_size;
 };
 
-#pragma pack(1)
 struct rpc_msg_get_alignment_rsp {
     uint64_t alignment;
 };
 
-#pragma pack(1)
 struct rpc_msg_get_max_size_rsp {
     uint64_t max_size;
 };
 
-#pragma pack(1)
 struct rpc_msg_buffer_get_base_req {
     uint64_t remote_ptr;
 };
 
-#pragma pack(1)
 struct rpc_msg_buffer_get_base_rsp {
     uint64_t base_ptr;
 };
 
-#pragma pack(1)
 struct rpc_msg_free_buffer_req {
     uint64_t remote_ptr;
 };
 
-#pragma pack(1)
 struct rpc_msg_buffer_clear_req {
     uint64_t remote_ptr;
     uint8_t value;
 };
 
-#pragma pack(1)
 struct rpc_msg_get_tensor_req {
     rpc_tensor tensor;
     uint64_t offset;
     uint64_t size;
 };
 
-#pragma pack(1)
 struct rpc_msg_copy_tensor_req {
     rpc_tensor src;
     rpc_tensor dst;
 };
 
-#pragma pack(1)
 struct rpc_msg_copy_tensor_rsp {
     uint8_t result;
 };
 
-#pragma pack(1)
 struct rpc_msg_graph_compute_rsp {
     uint8_t result;
 };
 
-#pragma pack(1)
 struct rpc_msg_get_device_memory_rsp {
     uint64_t free_mem;
     uint64_t total_mem;
 };
+#pragma pack(pop)
 
 // RPC data structures
 

From f594bc80baf683818f29d8f5d6fb52daab99e572 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Oct 2024 16:20:46 +0300
Subject: [PATCH 081/396] ggml : add asserts for type conversion in fattn
 kernels (#9971)

ggml-ci
---
 common/common.cpp | 4 ++--
 ggml/src/ggml.c   | 6 +++++-
 src/llama.cpp     | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 2bc0b8800..a8eebb68b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1035,7 +1035,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
         return GGML_TYPE_Q5_1;
     }
 
-    throw std::runtime_error("Invalid cache type: " + s);
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
@@ -1047,7 +1047,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
-                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7e24313ed..b16c462fa 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -324,8 +324,9 @@ struct ggml_logger_state {
 static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
 
 static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
-    if (format == NULL)
+    if (format == NULL) {
         return;
+    }
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
@@ -15723,6 +15724,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     ggml_vec_dot_t    const kq_vec_dot     = type_traits[k->type].vec_dot;
     ggml_to_float_t   const v_to_float     = type_traits[v->type].to_float;
 
+    GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+    GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
diff --git a/src/llama.cpp b/src/llama.cpp
index 1813dd29b..98ec123c1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19243,7 +19243,7 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
-    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+    if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
     }

From dbd5f2f5736aec6ff8fd63df3b351dae23c43e2f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Oct 2024 20:25:02 +0300
Subject: [PATCH 082/396] llama.vim : plugin for Neovim (#9787)

---
 examples/llama.vim | 706 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 706 insertions(+)
 create mode 100644 examples/llama.vim

diff --git a/examples/llama.vim b/examples/llama.vim
new file mode 100644
index 000000000..e75872cae
--- /dev/null
+++ b/examples/llama.vim
@@ -0,0 +1,706 @@
+" LLM-based text completion using llama.cpp
+"
+" requires:
+"
+"   - neovim
+"   - curl
+"   - llama.cpp server instance
+"   - FIM-compatible model
+"
+" sample config:
+"
+"   - Tab       - accept the current suggestion
+"   - Shift+Tab - accept just the first line of the segguestion
+"   - Ctrl+F    - toggle FIM completion manually
+"
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
+"
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"
+"   --batch-size [512, model max context]
+"
+"     adjust the batch size to control how much of the provided local context will be used during the inference
+"     lower values will use smaller part of the context around the cursor, which will result in faster processing
+"
+"   --ubatch-size [64, 2048]
+"
+"     chunks the batch into smaller chunks for faster processing
+"     depends on the specific hardware. use llama-bench to profile and determine the best size
+"
+"   --cache-reuse (ge:llama_config.n_predict, 1024]
+"
+"     this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
+"     using non-zero value enables context reuse on the server side which dramatically improves the performance at
+"     large contexts. a value of 256 should be good for all cases
+"
+" run this once to initialise llama.vim:
+"
+"   :call llama#init()
+"
+" more info: https://github.com/ggerganov/llama.cpp/pull/9787
+"
+
+" colors (adjust to your liking)
+highlight llama_hl_hint guifg=#ff772f
+highlight llama_hl_info guifg=#77ff2f
+
+" general parameters:
+"
+"   endpoint:         llama.cpp server endpoint
+"   n_prefix:         number of lines before the cursor location to include in the local prefix
+"   n_suffix:         number of lines after  the cursor location to include in the local suffix
+"   n_predict:        max number of tokens to predict
+"   t_max_prompt_ms:  max alloted time for the prompt processing (TODO: not yet supported)
+"   t_max_predict_ms: max alloted time for the prediction
+"   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
+"   auto_fim:         trigger FIM completion automatically on cursor movement
+"   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
+"
+" ring buffer of chunks, accumulated with time upon:
+"
+"  - completion request
+"  - yank
+"  - entering a buffer
+"  - leaving a buffer
+"  - writing a file
+"
+" parameters for the ring-buffer with extra context:
+"
+"   ring_n_chunks:    max number of chunks to pass as extra context to the server (0 to disable)
+"   ring_chunk_size:  max size of the chunks (in number of lines)
+"                     note: adjust these numbers so that you don't overrun your context
+"                           at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
+"   ring_scope:       the range around the cursor position (in number of lines) for gathering chunks after FIM
+"   ring_update_ms:   how often to process queued chunks in normal mode
+"
+let s:default_config = {
+    \ 'endpoint':         'http://127.0.0.1:8012/infill',
+    \ 'n_prefix':         256,
+    \ 'n_suffix':         64,
+    \ 'n_predict':        128,
+    \ 't_max_prompt_ms':  500,
+    \ 't_max_predict_ms': 1000,
+    \ 'show_info':        2,
+    \ 'auto_fim':         v:true,
+    \ 'max_line_suffix':  8,
+    \ 'ring_n_chunks':    64,
+    \ 'ring_chunk_size':  64,
+    \ 'ring_scope':       1024,
+    \ 'ring_update_ms':   1000,
+    \ }
+
+let g:llama_config = get(g:, 'llama_config', s:default_config)
+
+function! s:rand(i0, i1) abort
+    return a:i0 + rand() % (a:i1 - a:i0 + 1)
+endfunction
+
+function! llama#init()
+    if !executable('curl')
+        echohl WarningMsg
+        echo 'llama.vim requires the "curl" command to be available'
+        echohl None
+        return
+    endif
+
+    let s:pos_x = 0 " cursor position upon start of completion
+    let s:pos_y = 0
+
+    let s:line_cur = ''
+
+    let s:line_cur_prefix = ''
+    let s:line_cur_suffix = ''
+
+    let s:ring_chunks = [] " current set of chunks used as extra context
+    let s:ring_queued = [] " chunks that are queued to be sent for processing
+    let s:ring_n_evict = 0
+
+    let s:hint_shown = v:false
+    let s:pos_y_pick = -9999 " last y where we picked a chunk
+    let s:pos_dx = 0
+    let s:content = []
+    let s:can_accept = v:false
+
+    let s:timer_fim = -1
+    let s:t_fim_start = reltime() " used to measure total FIM time
+    let s:t_last_move = reltime() " last time the cursor moved
+
+    let s:current_job = v:null
+
+    augroup llama
+        autocmd!
+        autocmd InsertEnter     * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
+        autocmd InsertLeavePre  * call llama#fim_cancel()
+
+        autocmd CursorMoved     * call s:on_move()
+        autocmd CursorMovedI    * call s:on_move()
+        autocmd CompleteChanged * call llama#fim_cancel()
+
+        if g:llama_config.auto_fim
+            autocmd CursorMovedI * call llama#fim(v:true)
+        endif
+
+        " gather chunks upon yanking
+        autocmd TextYankPost    * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
+
+        " gather chunks upon entering/leaving a buffer
+        autocmd BufEnter        * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
+        autocmd BufLeave        * call                      s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
+
+        " gather chunk upon saving the file
+        autocmd BufWritePost    * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
+    augroup END
+
+    silent! call llama#fim_cancel()
+
+    " init background update of the ring buffer
+    if g:llama_config.ring_n_chunks > 0
+        call s:ring_update()
+    endif
+endfunction
+
+" compute how similar two chunks of text are
+" 0 - no similarity, 1 - high similarity
+" TODO: figure out something better
+function! s:chunk_sim(c0, c1)
+    let l:lines0 = len(a:c0)
+    let l:lines1 = len(a:c1)
+
+    let l:common = 0
+
+    for l:line0 in a:c0
+        for l:line1 in a:c1
+            if l:line0 == l:line1
+                let l:common += 1
+                break
+            endif
+        endfor
+    endfor
+
+    return 2.0 * l:common / (l:lines0 + l:lines1)
+endfunction
+
+" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
+"
+" no_mod   - do not pick chunks from buffers with pending changes
+" do_evict - evict chunks that are very similar to the new one
+"
+function! s:pick_chunk(text, no_mod, do_evict)
+    " do not pick chunks from buffers with pending changes or buffers that are not files
+    if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
+        return
+    endif
+
+    " if the extra context option is disabled - do nothing
+    if g:llama_config.ring_n_chunks <= 0
+        return
+    endif
+
+    " don't pick very small chunks
+    if len(a:text) < 3
+        return
+    endif
+
+    if len(a:text) + 1 < g:llama_config.ring_chunk_size
+        let l:chunk = a:text
+    else
+        let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
+        let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])
+
+        let l:chunk = a:text[l:l0:l:l1]
+    endif
+
+    let l:chunk_str = join(l:chunk, "\n") . "\n"
+
+    " check if this chunk is already added
+    let l:exist = v:false
+
+    for i in range(len(s:ring_chunks))
+        if s:ring_chunks[i].data == l:chunk
+            let l:exist = v:true
+            break
+        endif
+    endfor
+
+    for i in range(len(s:ring_queued))
+        if s:ring_queued[i].data == l:chunk
+            let l:exist = v:true
+            break
+        endif
+    endfor
+
+    if l:exist
+        return
+    endif
+
+    " evict queued chunks that are very similar to the new one
+    for i in range(len(s:ring_queued) - 1, 0, -1)
+        if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
+            if a:do_evict
+                call remove(s:ring_queued, i)
+                let s:ring_n_evict += 1
+            else
+                return
+            endif
+        endif
+    endfor
+
+    " also from s:ring_chunks
+    for i in range(len(s:ring_chunks) - 1, 0, -1)
+        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
+            if a:do_evict
+                call remove(s:ring_chunks, i)
+                let s:ring_n_evict += 1
+            else
+                return
+            endif
+        endif
+    endfor
+
+    " TODO: become parameter ?
+    if len(s:ring_queued) == 16
+        call remove(s:ring_queued, 0)
+    endif
+
+    call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
+
+    "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
+endfunction
+
+" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
+" called every g:llama_config.ring_update_ms
+function! s:ring_update()
+    call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
+
+    " update only if in normal mode or if the cursor hasn't moved for a while
+    if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
+        return
+    endif
+
+    if len(s:ring_queued) == 0
+        return
+    endif
+
+    " move the first queued chunk to the ring buffer
+    if len(s:ring_chunks) == g:llama_config.ring_n_chunks
+        call remove(s:ring_chunks, 0)
+    endif
+
+    call add(s:ring_chunks, remove(s:ring_queued, 0))
+
+    "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
+
+    " send asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    " no samplers needed here
+    let l:request = json_encode({
+        \ 'input_prefix':     "",
+        \ 'input_suffix':     "",
+        \ 'input_extra':      l:extra_context,
+        \ 'prompt':           "",
+        \ 'n_predict':        1,
+        \ 'temperature':      0.0,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["temperature"],
+        \ 'cache_prompt':     v:true,
+        \ 't_max_prompt_ms':  1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    " no callbacks because we don't need to process the response
+    call jobstart(l:curl_command, {})
+endfunction
+
+" necessary for 'inoremap <expr>'
+function! llama#fim_inline(is_auto) abort
+    call llama#fim(a:is_auto)
+    return ''
+endfunction
+
+" the main FIM call
+" takes local context around the cursor and sends it together with the extra context to the server for completion
+function! llama#fim(is_auto) abort
+    " we already have a suggestion for the current cursor position
+    if s:hint_shown && !a:is_auto
+        call llama#fim_cancel()
+        return
+    endif
+
+    call llama#fim_cancel()
+
+    " avoid sending repeated requests too fast
+    if reltimefloat(reltime(s:t_fim_start)) < 0.6
+        if s:timer_fim != -1
+            call timer_stop(s:timer_fim)
+            let s:timer_fim = -1
+        endif
+
+        let s:t_fim_start = reltime()
+        let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
+        return
+    endif
+
+    let s:t_fim_start = reltime()
+
+    let s:content = []
+    let s:can_accept = v:false
+
+    let s:pos_x = col('.') - 1
+    let s:pos_y = line('.')
+    let l:max_y = line('$')
+
+    let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
+    let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))
+
+    let s:line_cur = getline('.')
+
+    let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
+    let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)
+
+    if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
+        return
+    endif
+
+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+
+    let l:prompt = ""
+        \ . s:line_cur_prefix
+
+    let l:suffix = ""
+        \ . s:line_cur_suffix
+        \ . "\n"
+        \ . join(l:lines_suffix, "\n")
+        \ . "\n"
+
+    " prepare the extra context data
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    " the indentation of the current line
+    let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
+
+    let l:request = json_encode({
+        \ 'input_prefix':     l:prefix,
+        \ 'input_suffix':     l:suffix,
+        \ 'input_extra':      l:extra_context,
+        \ 'prompt':           l:prompt,
+        \ 'n_predict':        g:llama_config.n_predict,
+        \ 'n_indent':         l:indent,
+        \ 'top_k':            40,
+        \ 'top_p':            0.99,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["top_k", "top_p", "infill"],
+        \ 'cache_prompt':     v:true,
+        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
+        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
+        \ })
+
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    if s:current_job != v:null
+        call jobstop(s:current_job)
+    endif
+
+    " send the request asynchronously
+    let s:current_job = jobstart(l:curl_command, {
+        \ 'on_stdout': function('s:fim_on_stdout'),
+        \ 'on_exit':   function('s:fim_on_exit'),
+        \ 'stdout_buffered': v:true,
+        \ 'pos_x': s:pos_x,
+        \ 'pos_y': s:pos_y,
+        \ 'is_auto': a:is_auto
+        \ })
+
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " gather some extra context nearby and process it in the background
+    " only gather chunks if the cursor has moved a lot
+    " TODO: something more clever? reranking?
+    if a:is_auto && l:delta_y > 32
+        " expand the prefix even further
+        call s:pick_chunk(getline(max([1,       s:pos_y - g:llama_config.ring_scope]), max([1,       s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+
+        " pick a suffix chunk
+        call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]),   min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)
+
+        let s:pos_y_pick = s:pos_y
+    endif
+endfunction
+
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
+    " insert the suggestion at the cursor location
+    if s:can_accept && len(s:content) > 0
+        call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
+        if len(s:content) > 1
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
+        endif
+
+        " move the cursor to the end of the accepted text
+        if !a:first_line && len(s:content) > 1
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]))
+        endif
+    endif
+
+    call llama#fim_cancel()
+endfunction
+
+function! llama#fim_cancel()
+    let s:hint_shown = v:false
+
+    " clear the virtual text
+    let l:bufnr = bufnr('%')
+
+    let l:id_vt_fim  = nvim_create_namespace('vt_fim')
+    let l:id_vt_info = nvim_create_namespace('vt_info')
+
+    call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim,  0, -1)
+    call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1)
+
+    " remove the mappings
+    silent! iunmap <buffer> <Tab>
+    silent! iunmap <buffer> <S-Tab>
+    silent! iunmap <buffer> <Esc>
+endfunction
+
+function! s:on_move()
+    let s:t_last_move = reltime()
+
+    call llama#fim_cancel()
+endfunction
+
+" callback that processes the FIM result from the server and displays the suggestion
+function! s:fim_on_stdout(job_id, data, event) dict
+    let l:raw = join(a:data, "\n")
+    if len(l:raw) == 0
+        return
+    endif
+
+    if self.pos_x != col('.') - 1 || self.pos_y != line('.')
+        return
+    endif
+
+    " show the suggestion only in insert mode
+    if mode() !=# 'i'
+        return
+    endif
+
+    let s:pos_x = self.pos_x
+    let s:pos_y = self.pos_y
+
+    let s:can_accept = v:true
+    let l:has_info   = v:false
+
+    if s:can_accept && v:shell_error
+        if !self.is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
+    let l:n_prompt    = 0
+    let l:t_prompt_ms = 1.0
+    let l:s_prompt    = 0
+
+    let l:n_predict    = 0
+    let l:t_predict_ms = 1.0
+    let l:s_predict    = 0
+
+    " get the generated suggestion
+    if s:can_accept
+        let l:response = json_decode(l:raw)
+
+        for l:part in split(get(l:response, 'content', ''), "\n", 1)
+            call add(s:content, l:part)
+        endfor
+
+        " remove trailing new lines
+        while len(s:content) > 0 && s:content[-1] == ""
+            call remove(s:content, -1)
+        endwhile
+
+        let l:generation_settings = get(l:response, 'generation_settings', {})
+        let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
+
+        let l:n_cached  = get(l:response, 'tokens_cached', 0)
+        let l:truncated = get(l:response, 'truncated', v:false)
+
+        " if response.timings is available
+        if len(get(l:response, 'timings', {})) > 0
+            let l:has_info = v:true
+            let l:timings  = get(l:response, 'timings', {})
+
+            let l:n_prompt    = get(l:timings, 'prompt_n', 0)
+            let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
+            let l:s_prompt    = get(l:timings, 'prompt_per_second', 0)
+
+            let l:n_predict    = get(l:timings, 'predicted_n', 0)
+            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
+            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
+        endif
+    endif
+
+    if len(s:content) == 0
+        call add(s:content, "")
+        let s:can_accept = v:false
+    endif
+
+    if len(s:content) == 0
+        return
+    endif
+
+    " NOTE: the following is logic for discarding predictions that repeat existing text
+    "       the code is quite ugly and there is very likely a simpler and more canonical way to implement this
+    "
+    "       still, I wonder if there is some better way that avoids having to do these special hacks?
+    "       on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
+    "       start generating whatever we have given it via the extra context. but on the other hand, it's not very
+    "       helpful to re-generate the same code that is already there
+
+    " truncate the suggestion if the first line is empty
+    if len(s:content) == 1 && s:content[0] == ""
+        let s:content = [""]
+    endif
+
+    " ... and the next lines are repeated
+    if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
+        let s:content = [""]
+    endif
+
+    " truncate the suggestion if it repeats the suffix
+    if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
+        let s:content = [""]
+    endif
+
+    " find the first non-empty line (strip whitespace)
+    let l:cmp_y = s:pos_y + 1
+    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
+        let l:cmp_y += 1
+    endwhile
+
+    if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
+        " truncate the suggestion if it repeats the next line
+        if len(s:content) == 1
+            let s:content = [""]
+        endif
+
+        " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
+        if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
+            let s:content = [""]
+        endif
+
+        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
+        if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
+            let s:content = [""]
+        endif
+    endif
+
+    " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
+    "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
+    "for i in range(1, len(s:content) - 1)
+    "    if strlen(matchstr(s:content[i], '^\s*')) < l:indent
+    "        let s:content = s:content[:i - 1]
+    "        break
+    "    endif
+    "endfor
+
+    let s:pos_dx = len(s:content[-1])
+
+    let s:content[-1] .= s:line_cur_suffix
+
+    call llama#fim_cancel()
+
+    " display virtual text with the suggestion
+    let l:bufnr = bufnr('%')
+
+    let l:id_vt_fim  = nvim_create_namespace('vt_fim')
+    let l:id_vt_info = nvim_create_namespace('vt_info')
+
+    " construct the info message
+    if g:llama_config.show_info > 0 && l:has_info
+        " prefix the info string with whitespace in order to offset it to the right of the fim overlay
+        let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
+
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached,  l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
+                \ l:n_prompt,  l:t_prompt_ms,  l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
+
+        if g:llama_config.show_info == 1
+            "" display it in the statusline
+            let &statusline = l:info
+        elseif g:llama_config.show_info == 2
+            " display it to the right of the current line
+            call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
+                \ 'virt_text': [[l:info, 'llama_hl_info']],
+                \ 'virt_text_pos': 'eol',
+                \ })
+        endif
+    endif
+
+    " display the suggestion
+    call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
+        \ 'virt_text': [[s:content[0], 'llama_hl_hint']],
+        \ 'virt_text_win_col': virtcol('.') - 1
+        \ })
+
+    call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
+        \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
+        \ 'virt_text_win_col': virtcol('.')
+        \ })
+
+    " setup accept shortcuts
+    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
+    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>
+
+    let s:hint_shown = v:true
+endfunction
+
+function! s:fim_on_exit(job_id, exit_code, event) dict
+    if a:exit_code != 0
+        echom "Job failed with exit code: " . a:exit_code
+    endif
+
+    let s:current_job = v:null
+endfunction

From 94008cc76075fb4a29ee371e7ac255378d1bce6c Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 21 Oct 2024 20:12:52 +0200
Subject: [PATCH 083/396] arg : fix attention non-causal arg value hint (#9985)

This commit updates the argument value hint for the `--attention`
argument to `non-causal`.

The motivation for this change is that the only values for this argument
are `causal` and `non-causal`.
---
 common/arg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index d6a8e1f6f..168c2b1f3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1097,7 +1097,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
-        {"--attention"}, "{causal,non,causal}",
+        {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }

From 994cfb1acb9144bc95be0ab319175f30737cc92b Mon Sep 17 00:00:00 2001
From: Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
Date: Mon, 21 Oct 2024 20:20:59 +0200
Subject: [PATCH 084/396] readme : update UI list (#9972)

add PocketPal AI app
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 06c32a2b4..eeb3975eb 100644
--- a/README.md
+++ b/README.md
@@ -173,6 +173,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 

From e01c67affe450638162a1a457e2e57859ef6ebf0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Oct 2024 22:52:22 +0300
Subject: [PATCH 085/396] llama.vim : move info to the right of screen [no ci]
 (#9787)

'eol' messes up the rendering with nvim v0.10.2 for some reason
---
 examples/llama.vim | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index e75872cae..9af451fbd 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -668,13 +668,14 @@ function! s:fim_on_stdout(job_id, data, event) dict
         endif
 
         if g:llama_config.show_info == 1
-            "" display it in the statusline
+            " display it in the statusline
             let &statusline = l:info
         elseif g:llama_config.show_info == 2
             " display it to the right of the current line
             call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
                 \ 'virt_text': [[l:info, 'llama_hl_info']],
-                \ 'virt_text_pos': 'eol',
+               "\ 'virt_text_pos': 'eol',
+                \ 'virt_text_pos': 'right_align',
                 \ })
         endif
     endif

From e94a138d644a9b34da61805f7aeb8af595c61b53 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 22 Oct 2024 00:35:25 +0300
Subject: [PATCH 086/396] llama.vim : fix info text display [no ci] (#9787)

---
 examples/llama.vim | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 9af451fbd..7a60442ad 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -482,11 +482,9 @@ function! llama#fim_cancel()
     " clear the virtual text
     let l:bufnr = bufnr('%')
 
-    let l:id_vt_fim  = nvim_create_namespace('vt_fim')
-    let l:id_vt_info = nvim_create_namespace('vt_info')
+    let l:id_vt_fim = nvim_create_namespace('vt_fim')
 
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim,  0, -1)
-    call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1)
 
     " remove the mappings
     silent! iunmap <buffer> <Tab>
@@ -644,13 +642,11 @@ function! s:fim_on_stdout(job_id, data, event) dict
     " display virtual text with the suggestion
     let l:bufnr = bufnr('%')
 
-    let l:id_vt_fim  = nvim_create_namespace('vt_fim')
-    let l:id_vt_info = nvim_create_namespace('vt_info')
+    let l:id_vt_fim = nvim_create_namespace('vt_fim')
 
     " construct the info message
     if g:llama_config.show_info > 0 && l:has_info
-        " prefix the info string with whitespace in order to offset it to the right of the fim overlay
-        let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
+        let l:prefix = '   '
 
         if l:truncated
             let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
@@ -668,21 +664,15 @@ function! s:fim_on_stdout(job_id, data, event) dict
         endif
 
         if g:llama_config.show_info == 1
-            " display it in the statusline
+            " display the info in the statusline
             let &statusline = l:info
-        elseif g:llama_config.show_info == 2
-            " display it to the right of the current line
-            call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
-                \ 'virt_text': [[l:info, 'llama_hl_info']],
-               "\ 'virt_text_pos': 'eol',
-                \ 'virt_text_pos': 'right_align',
-                \ })
+            let l:info = ''
         endif
     endif
 
-    " display the suggestion
+    " display the suggestion and append the info to the end of the first line
     call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
-        \ 'virt_text': [[s:content[0], 'llama_hl_hint']],
+        \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
         \ 'virt_text_win_col': virtcol('.') - 1
         \ })
 

From 674804a99617b4f90292b4080ecab450ea3d30ba Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 22 Oct 2024 09:40:02 +0200
Subject: [PATCH 087/396] arg : fix typo in embeddings argument help [no ci]
 (#9994)

This commit fixes two typos in the help text for the `--embd-normalize`
and `--embd-separator` arguments. It also updates common.h which contain
the same typo in two comments.
---
 common/arg.cpp  | 4 ++--
 common/common.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 168c2b1f3..cd9d315dc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1695,7 +1695,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
         {"--embd-normalize"}, "N",
-        string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
@@ -1709,7 +1709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--embd-separator"}, "STRING",
-        "separator of embendings (default \\n) for example \"<#sep#>\"",
+        "separator of embeddings (default \\n) for example \"<#sep#>\"",
         [](common_params & params, const std::string & value) {
             params.embd_sep = value;
         }
diff --git a/common/common.h b/common/common.h
index 5ca8fd391..19d928777 100644
--- a/common/common.h
+++ b/common/common.h
@@ -274,9 +274,9 @@ struct common_params {
 
     // embedding
     bool embedding         = false; // get only sentence embedding
-    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep   = "\n";  // separator of embendings
+    std::string embd_sep   = "\n";  // separator of embeddings
     bool reranking         = false; // enable reranking support on server
 
     // server params

From 6b8447352df3d662b56280c8fc38d7f092885787 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Tue, 22 Oct 2024 16:16:01 +0800
Subject: [PATCH 088/396] [CANN] Adapt to dynamically loadable backends
 mechanism (#9970)

* [CANN] Adapt to dynamically loadable backends mechanism

* Fix the Bug: inference running result is garbled in debug running model for LM models who's type is Q4_0 class

* Handle the review comments of this pull request
---
 ggml/include/ggml-cann.h  |   2 +
 ggml/src/ggml-backend.cpp |   9 +-
 ggml/src/ggml-cann.cpp    | 354 +++++++++++++++++++++++++++-----------
 src/llama.cpp             |  51 +-----
 4 files changed, 267 insertions(+), 149 deletions(-)

diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h
index 95bdaf10d..528975493 100644
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@@ -34,6 +34,8 @@ extern "C" {
  */
 #define GGML_CANN_MAX_DEVICES 16
 
+GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+
 /**
  * @brief Initializes the CANN backend for a specified device.
  *
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 81d09cd8b..7d7b63a15 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -561,6 +561,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #  include "ggml-amx.h"
 #endif
 
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -587,8 +591,11 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_AMX
         register_backend(ggml_backend_amx_reg());
 #endif
+#ifdef GGML_USE_CANN
+        register_backend(ggml_backend_cann_reg());
+#endif
 
-        // TODO: kompute, cann
+        // TODO: kompute
 
         register_backend(ggml_backend_cpu_reg());
     }
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index ec3c0a688..af0fb603a 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -39,6 +39,8 @@
 
 #include "ggml-common.h"
 
+#define GGML_CANN_NAME "CANN"
+
 /**
  * @brief Handles CANN errors by printing an error message and aborting.
  *
@@ -851,13 +853,6 @@ static void ggml_backend_cann_buffer_set_tensor(
         void *transform_buffer = malloc(size);
         ggml_backend_cann_transform(tensor, data, transform_buffer);
 
-#ifndef NDEBUG
-        void *check_buffer = malloc(size);
-        ggml_backend_cann_transform_back(tensor, transform_buffer,
-                                         check_buffer);
-        GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
-        free(check_buffer);
-#endif
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
                               transform_buffer, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
@@ -969,7 +964,7 @@ static void ggml_backend_cann_buffer_clear(
  * This structure defines function pointers to operations that can be performed
  * on a CANN buffer within the backend.
  */
-static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
+static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
     /* .get_name        = */ ggml_backend_cann_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cann_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cann_buffer_get_base,
@@ -1105,19 +1100,25 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
 /**
  * @brief Interface for managing CANN buffer types in the GGML backend.
  *
  * Provides function pointers for allocating, querying properties, and managing
  * memory for CANN buffer types in the GGML backend.
  */
-static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
+static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cann_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cann_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cann_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL,  // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cann_buffer_type_get_alloc_size,
-    /* .is_host          = */ NULL,
+    /* .is_host          = */ ggml_backend_cann_buffer_type_is_host,
 };
 
 /**
@@ -1148,7 +1149,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
         for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_cann_buffer_type_interface,
-                /* .device    = */ nullptr,
+                /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
                 /* .context  = */
                  new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},
@@ -1264,7 +1265,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
-        /* .device   = */ nullptr,
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
         /* .context  = */ nullptr,
     };
 
@@ -1511,13 +1512,6 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
         void *transform_buffer = malloc(size);
         ggml_backend_cann_transform(tensor, data, transform_buffer);
 
-#ifndef NDEBUG
-        void *check_buffer = malloc(size);
-        ggml_backend_cann_transform_back(tensor, transform_buffer,
-                                         check_buffer);
-        GGML_ASSERT(memcmp(data, check_buffer, size));
-        free(check_buffer);
-#endif
         ACL_CHECK(aclrtMemcpyAsync(
             (char *)tensor->data + offset, size, transform_buffer, size,
             ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
@@ -1692,7 +1686,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
  * @return bool Returns true if the operation is supported by the backend,
  *              otherwise false.
  */
-static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
+static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                                                     const ggml_tensor* op) {
     switch (op->op) {
         case GGML_OP_UNARY:
@@ -1783,7 +1777,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
             return false;
     }
 
-    GGML_UNUSED(backend);
+    GGML_UNUSED(dev);
 }
 
 /**
@@ -1801,31 +1795,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Checks if the CANN backend supports a specific backend buffer type.
- *
- * This function determines whether the CANN backend supports the given backend
- * buffer type by comparing the device context of the backend and buffer type.
- * It returns true if the devices are same between the backend context and
- * buffer type context.
- *
- * @param backend Pointer to the CANN backend.
- * @param buft Pointer to the backend buffer type to check.
- * @return bool Returns true if the CANN backend supports the buffer type,
- *              otherwise false.
- */
-static bool ggml_backend_cann_supports_buft(
-    ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    if (ggml_backend_buft_is_cann(buft)) {
-        ggml_backend_cann_context * cann_ctx =
-                        (ggml_backend_cann_context *)backend->context;
-        ggml_backend_cann_buffer_type_context * buft_ctx =
-                        (ggml_backend_cann_buffer_type_context *)buft->context;
-        return buft_ctx->device == cann_ctx->device;
-    }
-    return false;
-}
-
 /**
  * @brief Determines if a tensor operation should be offloaded to the CANN
  * backend.
@@ -1840,54 +1809,14 @@ static bool ggml_backend_cann_supports_buft(
  * @return bool Returns true if the operation should be offloaded, otherwise
  * false.
  */
-static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
                                                    const ggml_tensor* op) {
     const int min_batch_size = 32;
-    GGML_UNUSED(backend);
+    GGML_UNUSED(dev);
 
     return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
 }
 
-/**
- * @brief Creates a new event for the CANN backend.
- *
- * This function initializes a new event for the CANN backend by setting the
- * device and creating an ACL runtime event. The created event is then wrapped
- * in a ggml_backend_event structure and returned.
- *
- * @param backend Pointer to the CANN backend.
- * @return ggml_backend_event_t Returns a pointer to the new event structure.
- */
-static ggml_backend_event_t ggml_backend_cann_event_new(
-    ggml_backend_t backend) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
-
-    ggml_cann_set_device(cann_ctx->device);
-
-    aclrtEvent event;
-    ACL_CHECK(aclrtCreateEvent(&event));
-
-    return new ggml_backend_event{
-        /* .device = */ nullptr,
-        /* .context = */ event,
-    };
-}
-
-/**
- * @brief Frees a CANN backend event.
- *
- * This function destroys the ACL runtime event associated with the given CANN
- * backend event and then deletes the event structure itself.
- *
- * @param event Pointer to the event structure to be freed.
- */
-static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
-    ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
-
-    delete event;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -1924,17 +1853,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
     }
 }
 
-/**
- * @brief Synchronizes the given event on the CANN backend.
- *
- * This function waits for the specified event to complete on the ACL runtime.
- *
- * @param event Pointer to the event structure to be synchronized.
- */
-static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
-    ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
-}
-
 /**
  * @brief Structure defining the interface for the CANN backend.
  *
@@ -1942,7 +1860,7 @@ static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
  * supported by the CANN backend, including name retrieval, memory
  * management, tensor operations, synchronization, and event handling.
  */
-static ggml_backend_i ggml_backend_cann_interface = {
+static const ggml_backend_i ggml_backend_cann_interface = {
     /* .get_name                = */ ggml_backend_cann_name,
     /* .free                    = */ ggml_backend_cann_free,
     /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
@@ -1955,9 +1873,9 @@ static ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cann_graph_compute,
-    /* .supports_op             = */ ggml_backend_cann_supports_op,
-    /* .supports_buft           = */ ggml_backend_cann_supports_buft,
-    /* .offload_op              = */ ggml_backend_cann_offload_op,
+    /* .supports_op             = */ NULL, // moved to device
+    /* .supports_buft           = */ NULL, // moved to device
+    /* .offload_op              = */ NULL, // moved to device
     /* .event_record            = */ ggml_backend_cann_event_record,
     /* .event_wait              = */ ggml_backend_cann_event_wait,
 };
@@ -1976,6 +1894,234 @@ static ggml_guid_t ggml_backend_cann_guid() {
     return &guid;
 }
 
+// backend device
+struct ggml_backend_cann_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    ggml_backend_cann_get_device_memory(ctx->device, free, total);
+}
+
+static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+}
+
+static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_cann_device_get_name(dev);
+    props->description = ggml_backend_cann_device_get_description(dev);
+    props->type        = ggml_backend_cann_device_get_type(dev);
+    ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
+
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ true,
+    };
+}
+
+static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ggml_backend_cann_init(ctx->device);
+}
+
+/**
+ * @brief Checks if the CANN backend supports a specific backend buffer type.
+ *
+ * This function determines whether the CANN backend supports the given backend
+ * buffer type by comparing the device context of the backend and buffer type.
+ * It returns true if the devices are same between the backend context and
+ * buffer type context.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param buft Pointer to the backend buffer type to check.
+ * @return bool Returns true if the CANN backend supports the buffer type,
+ *              otherwise false.
+ */
+static bool ggml_backend_cann_supports_buft(
+    ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cann(buft)) {
+        ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+        ggml_backend_cann_buffer_type_context * buft_ctx =
+                        (ggml_backend_cann_buffer_type_context *)buft->context;
+        return buft_ctx->device == dev_ctx->device;
+    }
+    return false;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ggml_backend_cann_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_cann_host_buffer_type();
+}
+
+/**
+ * @brief Creates a new event for the CANN backend device.
+ *
+ * This function initializes a new event for the CANN backend by setting the
+ * device and creating an ACL runtime event. The created event is then wrapped
+ * in a ggml_backend_event structure and returned.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @return ggml_backend_event_t Returns a pointer to the new event structure.
+ */
+static ggml_backend_event_t ggml_backend_cann_device_event_new(
+    ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    ggml_cann_set_device(dev_ctx->device);
+
+    aclrtEvent event;
+    ACL_CHECK(aclrtCreateEvent(&event));
+
+    return new ggml_backend_event{
+        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
+        /* .context = */ event,
+    };
+}
+
+/**
+ * @brief Frees a CANN backend event.
+ *
+ * This function destroys the ACL runtime event associated with the given CANN
+ * backend event and then deletes the event structure itself.
+ *
+ * @param event Pointer to the event structure to be freed.
+ */
+static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
+
+    delete event;
+    GGML_UNUSED(dev);
+}
+
+/**
+ * @brief Synchronizes the given event on the CANN backend.
+ *
+ * This function waits for the specified event to complete on the ACL runtime.
+ *
+ * @param event Pointer to the event structure to be synchronized.
+ */
+static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
+
+    GGML_UNUSED(dev);
+}
+
+static const ggml_backend_device_i ggml_backend_cann_device_interface = {
+    /* .get_name                = */ ggml_backend_cann_device_get_name,
+    /* .get_description         = */ ggml_backend_cann_device_get_description,
+    /* .get_memory              = */ ggml_backend_cann_device_get_memory,
+    /* .get_type                = */ ggml_backend_cann_device_get_type,
+    /* .get_props               = */ ggml_backend_cann_device_get_props,
+    /* .init_backend            = */ ggml_backend_cann_device_init,    // called for every card
+    /* .get_buffer_type         = */ ggml_backend_cann_device_get_buffer_type,
+    /* .get_host_buffer_type    = */ ggml_backend_cann_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr    = */ NULL, // not supported for CANN
+    /* .supports_op             = */ ggml_backend_cann_supports_op,
+    /* .supports_buft           = */ ggml_backend_cann_supports_buft,
+    /* .offload_op              = */ ggml_backend_cann_offload_op,
+    /* .event_new               = */ ggml_backend_cann_device_event_new,
+    /* .event_free              = */ ggml_backend_cann_device_event_free,
+    /* .event_synchronize       = */ ggml_backend_cann_device_event_synchronize,
+};
+
+
+// backend reg
+struct ggml_backend_cann_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return GGML_CANN_NAME;
+}
+
+static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
+    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+    return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+}
+
+static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+    // reserved for future use
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
+    /* .get_name          = */ ggml_backend_cann_reg_get_name,
+    /* .get_device_count  = */ ggml_backend_cann_reg_get_device_count,
+    /* .get_device_get    = */ ggml_backend_cann_reg_get_device,
+    /* .get_proc_address  = */ ggml_backend_cann_reg_get_proc_address,
+};
+
+// backend registry, called only once for cann backend
+ggml_backend_reg_t ggml_backend_cann_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            aclInit(nullptr);
+            ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+
+            for (int i = 0; i < ggml_cann_info().device_count; i++) {
+                ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
+                dev_ctx->description = aclrtGetSocName();
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+                ggml_cann_set_device(i);
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .interface = */ ggml_backend_cann_device_interface,
+                    /* .reg       = */ &reg,
+                    /* .context   = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .interface = */ ggml_backend_cann_reg_interface,
+                /* .context   = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}
+
 ggml_backend_t ggml_backend_cann_init(int32_t device) {
     aclInit(nullptr);
     if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
@@ -1992,7 +2138,7 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
     ggml_backend_t cann_backend =
         new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                          /* .interface = */ ggml_backend_cann_interface,
-                         /* .device    = */ nullptr,
+                         /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
                          /* .context   = */ ctx};
 
     return cann_backend;
diff --git a/src/llama.cpp b/src/llama.cpp
index 98ec123c1..e1ca478ec 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10,8 +10,6 @@
 
 #if defined(GGML_USE_KOMPUTE)
 #   include "ggml-kompute.h"
-#elif defined(GGML_USE_CANN)
-#   include "ggml-cann.h"
 #endif
 
 #ifndef __AMX_INT8__
@@ -3399,10 +3397,6 @@ static int llama_get_device_count(const llama_model & model) {
     count += (int) model.rpc_servers.size();
 #endif
 
-#if defined(GGML_USE_CANN)
-    count += ggml_backend_cann_get_device_count();
-#endif
-
     return count;
 
     GGML_UNUSED(model);
@@ -3420,11 +3414,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
         }
     }
 
-#if defined(GGML_USE_CANN)
-    if (host_buffer) {
-        buft = ggml_backend_cann_host_buffer_type();
-    }
-#elif defined(GGML_USE_CPU_HBM)
+#if defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #endif
 
@@ -3446,8 +3436,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 
 #if defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(device);
-#elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(device);
 #endif
 
     if (buft == nullptr) {
@@ -3491,14 +3479,13 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
         return free;
     }
 
-#if defined(GGML_USE_CANN)
-    size_t total;
-    size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
-    return free;
-#else
+    if (model.devices.size() > 0) {
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(model.devices[0]);
+        LLAMA_LOG_WARN("%s: failed to get free memmory of device:%d of backend:%s, for device id is out of range.\n", __func__, device, ggml_backend_reg_name(reg));
+    } else {
+        LLAMA_LOG_WARN("%s: failed to get free memmory of device, no devices in inputted model.\n", __func__);
+    }
     return 1;
-#endif
 
     GGML_UNUSED(model);
     GGML_UNUSED(device);
@@ -19396,30 +19383,6 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
-#elif defined(GGML_USE_CANN)
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-        // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cann_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
 #endif
 
         // add other backends (such as BLAS)

From 4ff7fe1fb36b04ddd158b2de881c348c5f0ff5e4 Mon Sep 17 00:00:00 2001
From: Molly Sophia <mollysophia379@gmail.com>
Date: Tue, 22 Oct 2024 18:33:37 +0800
Subject: [PATCH 089/396] llama : add chat template for RWKV-World + fix EOT
 (#9968)

* Add chat template for RWKV-World

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* RWKV: Fix the chat template not being used

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* RWKV v6: Set EOT token to ``\n\n``

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* readme: add rwkv into supported model list

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

---------

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
---
 README.md                    | 1 +
 convert_hf_to_gguf.py        | 2 ++
 src/llama.cpp                | 9 +++++++++
 tests/test-chat-template.cpp | 4 ++++
 4 files changed, 16 insertions(+)

diff --git a/README.md b/README.md
index eeb3975eb..8fe1f4b4b 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
+- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index da5feb25b..e0b1b2bf9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2864,6 +2864,8 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        special_vocab._set_special_token("eot", 261)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
diff --git a/src/llama.cpp b/src/llama.cpp
index e1ca478ec..73190c88f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21697,6 +21697,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
+    } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world") || tmpl_contains("'User: ' + message['content'] + '\n\nAssistant:'")) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << "User: " << message->content << "\n\nAssistant:";
+            } else {
+                ss << message->content << "\n\n";
+            }
+        }
     } else {
         // template not supported
         return -1;
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 6f046249f..fdc4a9bc3 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -65,6 +65,8 @@ int main(void) {
         u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
         // DeepSeek-V2
         "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+        // RWKV-World
+        "{% for message in messages %}{% if message['role'] == 'user' %}{{'User: ' + message['content'] + '\n\nAssistant:'}}{% else %}{{message['content'] + '\n\n'}}{% endif %}{% endfor %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -109,6 +111,8 @@ int main(void) {
         u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
         // DeepSeek-V2
         u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
+        // RWKV-World
+        "You are a helpful assistant\n\nUser: Hello\n\nAssistant:Hi there\n\nUser: Who are you\n\nAssistant:   I am an assistant   \n\nUser: Another question\n\nAssistant:",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;

From c421ac072d46172ab18924e1e8be53680b54ed3b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 22 Oct 2024 13:08:41 +0200
Subject: [PATCH 090/396] lora : warn user if new token is added in the adapter
 (#9948)

---
 convert_lora_to_gguf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 439a78de1..bc68f68af 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -348,6 +348,9 @@ if __name__ == '__main__':
                         if ".base_layer.weight" in name:
                             continue
                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+                            logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
                         sys.exit(1)
 
                     if base_name in tensor_map:

From 11d47057a51f3d9b9231e6b57d0ca36020c0ee99 Mon Sep 17 00:00:00 2001
From: Molly Sophia <mollysophia379@gmail.com>
Date: Tue, 22 Oct 2024 21:22:26 +0800
Subject: [PATCH 091/396] Rwkv chat template fix (#10001)

* llama: remove useless template matching for rwkv-world

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* converter: Add comment about the hack for rwkv models

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

---------

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
---
 convert_hf_to_gguf.py        | 1 +
 src/llama.cpp                | 3 ++-
 tests/test-chat-template.cpp | 4 ----
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e0b1b2bf9..7e552a71b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2865,6 +2865,7 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
         special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
         special_vocab.add_to_gguf(self.gguf_writer)
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 73190c88f..6a5c56a77 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21697,7 +21697,8 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
-    } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world") || tmpl_contains("'User: ' + message['content'] + '\n\nAssistant:'")) {
+    } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
+        // this template requires the model to have "\n\n" as EOT token
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "user") {
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index fdc4a9bc3..6f046249f 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -65,8 +65,6 @@ int main(void) {
         u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
         // DeepSeek-V2
         "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
-        // RWKV-World
-        "{% for message in messages %}{% if message['role'] == 'user' %}{{'User: ' + message['content'] + '\n\nAssistant:'}}{% else %}{{message['content'] + '\n\n'}}{% endif %}{% endfor %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -111,8 +109,6 @@ int main(void) {
         u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
         // DeepSeek-V2
         u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
-        // RWKV-World
-        "You are a helpful assistant\n\nUser: Hello\n\nAssistant:Hi there\n\nUser: Who are you\n\nAssistant:   I am an assistant   \n\nUser: Another question\n\nAssistant:",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;

From 19d900a7565b8f6b0a708836a57d26966cb9efe2 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 22 Oct 2024 15:31:06 +0200
Subject: [PATCH 092/396] llama : rename batch to ubatch (#9950)

This commit renames the member field batch in llm_build_context to
ubatch, and also the parameter batch in llama_build_graph, and
llama_set_inputs to ubatch.

The motivation for this change is to make the code more readable
(considering there are the structs llama_batch and llama_sbatch), and
consistent with other parts of the code base where parameters/fields of
type llama_ubatch are named ubatch.
---
 src/llama.cpp | 218 +++++++++++++++++++++++++-------------------------
 1 file changed, 109 insertions(+), 109 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 6a5c56a77..7a5a46dce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10017,7 +10017,7 @@ struct llm_build_context {
           llama_context  & lctx;
     const llama_hparams  & hparams;
     const llama_cparams  & cparams;
-    const llama_ubatch   & batch;
+    const llama_ubatch   & ubatch;
     const llama_kv_cache & kv_self;
 
     const int64_t n_embd;
@@ -10063,14 +10063,14 @@ struct llm_build_context {
     // TODO: consider making the entire interface noexcept
     llm_build_context(
         llama_context  & lctx,
-    const llama_ubatch & batch,
+    const llama_ubatch & ubatch,
     const llm_build_cb & cb,
                   bool   worst_case) :
         model            (lctx.model),
         lctx             (lctx),
         hparams          (model.hparams),
         cparams          (lctx.cparams),
-        batch            (batch),
+        ubatch           (ubatch),
         kv_self          (lctx.kv_self),
         n_embd           (hparams.n_embd),
         n_layer          (hparams.n_layer),
@@ -10092,7 +10092,7 @@ struct llm_build_context {
         beta_slow        (cparams.yarn_beta_slow),
         norm_eps         (hparams.f_norm_eps),
         norm_rms_eps     (hparams.f_norm_rms_eps),
-        n_tokens         (batch.n_tokens),
+        n_tokens         (ubatch.n_tokens),
         n_kv             (worst_case ? kv_self.size : kv_self.n),
         n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
         n_outputs_enc    (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
@@ -10461,7 +10461,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -10621,7 +10621,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
@@ -10736,7 +10736,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -10840,7 +10840,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -10962,7 +10962,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // multiply by embedding_multiplier_scale of 78.38367176906169
         inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
@@ -11120,7 +11120,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -11242,7 +11242,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -11345,7 +11345,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -11447,7 +11447,7 @@ struct llm_build_context {
         }
 
         // construct input embeddings (token, type, position)
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // token types are hardcoded to zero ("Sentence A")
         struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
@@ -11634,7 +11634,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -11736,7 +11736,7 @@ struct llm_build_context {
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -11874,7 +11874,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12024,7 +12024,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12137,7 +12137,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12252,7 +12252,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12397,7 +12397,7 @@ struct llm_build_context {
         struct ggml_tensor * ffn_output;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12516,7 +12516,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12644,7 +12644,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12749,7 +12749,7 @@ struct llm_build_context {
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12854,7 +12854,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -12964,7 +12964,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -13082,7 +13082,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -13209,7 +13209,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // scale the input embeddings
         inpL = ggml_scale(ctx0, inpL, scale_embd);
@@ -13353,7 +13353,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // scale the input embeddings
         inpL = ggml_scale(ctx0, inpL, scale_embd);
@@ -13554,7 +13554,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
         cb(inpL, "inp_scaled", -1);
@@ -13662,7 +13662,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
         cb(inpL, "inp_scaled", -1);
@@ -13800,7 +13800,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -13916,7 +13916,7 @@ struct llm_build_context {
         struct ggml_tensor * inpL;
 
         // {n_embd, n_tokens}
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         struct ggml_tensor * state_copy = build_inp_s_copy();
         struct ggml_tensor * state_mask = build_inp_s_mask();
@@ -13928,7 +13928,7 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
-            cur = llm_build_mamba(ctx0, lctx, batch, gf, cur,
+            cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur,
                     state_copy, state_mask,
                     kv_head, n_kv, cb, il);
 
@@ -13974,7 +13974,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -14131,7 +14131,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -14259,7 +14259,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -14378,7 +14378,7 @@ struct llm_build_context {
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -14505,7 +14505,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -14650,7 +14650,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -14791,7 +14791,7 @@ struct llm_build_context {
         struct ggml_tensor * inpL;
 
         // {n_embd, n_tokens}
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -15006,7 +15006,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -15160,7 +15160,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         GGML_ASSERT(lctx.is_encoding);
         struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
@@ -15292,7 +15292,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         GGML_ASSERT(!lctx.is_encoding);
         GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
@@ -15494,7 +15494,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -15586,7 +15586,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -15700,7 +15700,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -15824,7 +15824,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -15944,11 +15944,11 @@ struct llm_build_context {
         // Token shift state dimensions should be 2 * n_emb
         GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
 
-        const int64_t n_seqs = batch.n_seqs;
-        const int64_t n_seq_tokens = batch.n_seq_tokens;
-        const int64_t n_tokens = batch.n_tokens;
+        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
         GGML_ASSERT(n_seqs != 0);
-        GGML_ASSERT(batch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
 
         struct ggml_tensor * cur;
@@ -15956,7 +15956,7 @@ struct llm_build_context {
         struct ggml_tensor * state_copy = build_inp_s_copy();
         struct ggml_tensor * state_mask = build_inp_s_mask();
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
         inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
 
         for (int il = 0; il < n_layer; ++il) {
@@ -16070,7 +16070,7 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
@@ -16266,7 +16266,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
 
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
-    const llama_ubatch & batch,
+    const llama_ubatch & ubatch,
                   bool   worst_case) {
     const auto & model = lctx.model;
 
@@ -16288,7 +16288,7 @@ static struct ggml_cgraph * llama_build_graph(
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
         const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        if (batch.n_tokens < 32 || full_offload) {
+        if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
                     if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
@@ -16303,7 +16303,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_cgraph * result = NULL;
 
-    struct llm_build_context llm(lctx, batch, cb, worst_case);
+    struct llm_build_context llm(lctx, ubatch, cb, worst_case);
 
     llm.init();
 
@@ -16554,7 +16554,7 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t
     return relative_bucket;
 }
 
-static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
     //
     // set input data
     //
@@ -16563,28 +16563,28 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     const auto & cparams = lctx.cparams;
     const auto & kv_self = lctx.kv_self;
 
-    if (batch.token) {
-        const int64_t n_tokens = batch.n_tokens;
+    if (ubatch.token) {
+        const int64_t n_tokens = ubatch.n_tokens;
 
-        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+        ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
     }
 
-    if (batch.embd) {
+    if (ubatch.embd) {
         const int64_t n_embd   = hparams.n_embd;
-        const int64_t n_tokens = batch.n_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
 
-        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+        ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
     }
 
-    if (batch.pos && lctx.inp_pos) {
-        const int64_t n_tokens = batch.n_tokens;
+    if (ubatch.pos && lctx.inp_pos) {
+        const int64_t n_tokens = ubatch.n_tokens;
 
-        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
         GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = batch.n_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
         int32_t * data = (int32_t *) lctx.inp_out_ids->data;
@@ -16593,10 +16593,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             for (int i = 0; i < n_tokens; ++i) {
                 data[i] = i;
             }
-        } else if (batch.output) {
+        } else if (ubatch.output) {
             int32_t n_outputs = 0;
             for (int i = 0; i < n_tokens; ++i) {
-                if (batch.output[i]) {
+                if (ubatch.output[i]) {
                     data[n_outputs++] = i;
                 }
             }
@@ -16621,9 +16621,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
         if (cparams.causal_attn && !lctx.is_encoding) {
             const int64_t n_kv         = kv_self.n;
-            const int64_t n_tokens     = batch.n_tokens;
-            const int64_t n_seq_tokens = batch.n_seq_tokens;
-            const int64_t n_seqs       = batch.n_seqs;
+            const int64_t n_tokens     = ubatch.n_tokens;
+            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+            const int64_t n_seqs       = ubatch.n_seqs;
 
 
             float * data     = nullptr;
@@ -16640,14 +16640,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             }
 
             // For causal attention, use only the previous KV cells
-            // of the correct sequence for each token of the batch.
+            // of the correct sequence for each token of the ubatch.
             // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
             for (int h = 0; h < 1; ++h) {
                 for (int s = 0; s < n_seqs; ++s) {
-                    const llama_seq_id seq_id = batch.seq_id[s][0];
+                    const llama_seq_id seq_id = ubatch.seq_id[s][0];
 
                     for (int j = 0; j < n_seq_tokens; ++j) {
-                        const llama_pos pos = batch.pos[s*n_seq_tokens + j];
+                        const llama_pos pos = ubatch.pos[s*n_seq_tokens + j];
 
                         for (int i = 0; i < n_kv; ++i) {
                             float f;
@@ -16693,9 +16693,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
                 }
             }
         } else {
-            const int64_t n_tokens     = batch.n_tokens;
-            const int64_t n_seq_tokens = batch.n_seq_tokens;
-            const int64_t n_seqs       = batch.n_seqs;
+            const int64_t n_tokens     = ubatch.n_tokens;
+            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+            const int64_t n_seqs       = ubatch.n_seqs;
             // when using kv cache, the mask needs to match the kv cache size
             const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;
 
@@ -16705,7 +16705,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
             for (int h = 0; h < 1; ++h) {
                 for (int s1 = 0; s1 < n_seqs; ++s1) {
-                    const llama_seq_id seq_id = batch.seq_id[s1][0];
+                    const llama_seq_id seq_id = ubatch.seq_id[s1][0];
 
                     for (int j = 0; j < n_seq_tokens; ++j) {
                         const int32_t tj = s1*n_seq_tokens + j;
@@ -16715,10 +16715,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
                                 const int32_t ti = s0*n_seq_tokens + i;
                                 float f = -INFINITY;
 
-                                for (int s = 0; s < batch.n_seq_id[s0]; ++s) {
-                                    if (batch.seq_id[s0][s] == seq_id) {
+                                for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) {
+                                    if (ubatch.seq_id[s0][s] == seq_id) {
                                         if (hparams.use_alibi) {
-                                            f = -std::abs(batch.pos[ti] - batch.pos[tj]);
+                                            f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]);
                                         } else {
                                             f = 0.0f;
                                         }
@@ -16740,9 +16740,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     }
 
     if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
-        const int64_t n_tokens     = batch.n_tokens;
-        const int64_t n_seq_tokens = batch.n_seq_tokens;
-        const int64_t n_seqs       = batch.n_seqs;
+        const int64_t n_tokens     = ubatch.n_tokens;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs       = ubatch.n_seqs;
 
         GGML_ASSERT(lctx.inp_mean);
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -16753,12 +16753,12 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         std::vector<uint64_t> sum(n_tokens, 0);
 
         for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
 
-            // TODO: adapt limits to n_seqs when batch.equal_seqs is true
+            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
             GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
 
-            sum[seq_id] += batch.n_seq_tokens;
+            sum[seq_id] += ubatch.n_seq_tokens;
         }
 
         std::vector<float> div(n_tokens, 0.0f);
@@ -16770,7 +16770,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         }
 
         for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
 
             for (int i = 0; i < n_seq_tokens; ++i) {
                 data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
@@ -16781,9 +16781,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     if (cparams.embeddings && (
                 cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
                 cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
-        const int64_t n_tokens     = batch.n_tokens;
-        const int64_t n_seq_tokens = batch.n_seq_tokens;
-        const int64_t n_seqs       = batch.n_seqs;
+        const int64_t n_tokens     = ubatch.n_tokens;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs       = ubatch.n_seqs;
 
         GGML_ASSERT(lctx.inp_cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -16792,13 +16792,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
 
         for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
 
-            // TODO: adapt limits to n_seqs when batch.equal_seqs is true
+            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
             GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
 
             for (int i = 0; i < n_seq_tokens; ++i) {
-                const llama_pos pos = batch.pos[s*n_seq_tokens + i];
+                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];
 
                 if (pos == 0) {
                     data[seq_id] = s*n_seq_tokens + i;
@@ -16808,9 +16808,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     }
 
     if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
-        const int64_t n_tokens     = batch.n_tokens;
-        const int64_t n_seq_tokens = batch.n_seq_tokens;
-        const int64_t n_seqs       = batch.n_seqs;
+        const int64_t n_tokens     = ubatch.n_tokens;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs       = ubatch.n_seqs;
 
         GGML_ASSERT(lctx.inp_cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -16822,13 +16822,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         std::vector<int> last_row(n_tokens, -1);
 
         for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
 
-            // TODO: adapt limits to n_seqs when batch.equal_seqs is true
+            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
             GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
 
             for (int i = 0; i < n_seq_tokens; ++i) {
-                const llama_pos pos = batch.pos[s*n_seq_tokens + i];
+                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];
 
                 if (pos >= last_pos[seq_id]) {
                     last_pos[seq_id] = pos;
@@ -16890,10 +16890,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     }
 
     if (lctx.inp_pos_bucket) {
-        const int64_t n_tokens = batch.n_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
-        GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing
+        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing
 
         int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;
 
@@ -16902,7 +16902,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             for (int h = 0; h < 1; ++h) {
                 for (int j = 0; j < n_tokens; ++j) {
                     for (int i = 0; i < n_kv; ++i) {
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                     }
                 }
             }
@@ -16910,7 +16910,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             for (int h = 0; h < 1; ++h) {
                 for (int j = 0; j < n_tokens; ++j) {
                     for (int i = 0; i < n_tokens; ++i) {
-                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                     }
                 }
             }
@@ -16926,10 +16926,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
     if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
         const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
-        const int64_t n_tokens = batch.n_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
-        GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing
+        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing
 
         float * data = (float *) lctx.inp_KQ_mask_cross->data;
 
@@ -16937,8 +16937,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
             for (int j = 0; j < n_tokens; ++j) {
                 for (int i = 0; i < n_output_enc; ++i) {
                     float f = -INFINITY;
-                    for (int s = 0; s < batch.n_seq_id[j]; ++s) {
-                        const llama_seq_id seq_id = batch.seq_id[j][s];
+                    for (int s = 0; s < ubatch.n_seq_id[j]; ++s) {
+                        const llama_seq_id seq_id = ubatch.seq_id[j][s];
                         if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
                             f = 0.0f;
                         }

From c8c07d658a6cefc5a50cfdf6be7d726503612303 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 22 Oct 2024 16:59:02 +0200
Subject: [PATCH 093/396] llama : fix empty batch causing llama_batch_allocr to
 crash (#9966)

* llama : fix empty batch cause llama_batch_allocr to crash

* move batch_allocr inside decode/encode_internal

* fix build

* add GGML_ASSERT

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 src/llama.cpp | 128 ++++++++++++++++++++++++++------------------------
 1 file changed, 67 insertions(+), 61 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7a5a46dce..24e1f1f01 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5177,6 +5177,57 @@ struct llama_model_loader {
     }
 };
 
+// temporary allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t>         logits;
+    struct llama_batch          batch;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
+        batch = in_batch;
+        GGML_ASSERT(batch.n_tokens > 0);
+        if (!batch.pos) {
+            // determine the last position in KV cache
+            llama_pos last_pos = -1;
+            for (const auto & cell : ctx.kv_self.cells) {
+                if (cell.has_seq_id(batch_default_seq_id)) {
+                    last_pos = std::max(last_pos, cell.pos);
+                }
+            }
+            last_pos++; // next position
+            pos.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                pos[i] = i+last_pos;
+            }
+            batch.pos = pos.data();
+        }
+        if (!batch.n_seq_id) {
+            n_seq_id.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                n_seq_id[i] = seq_id_0.size();
+            }
+            batch.n_seq_id = n_seq_id.data();
+        }
+        if (!batch.seq_id) {
+            seq_id.resize(batch.n_tokens + 1);
+            seq_id[batch.n_tokens] = NULL;
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                seq_id[i] = seq_id_0.data();
+            }
+            batch.seq_id = seq_id.data();
+        }
+        if (!batch.logits) {
+            logits.resize(batch.n_tokens);
+            logits[logits.size() - 1] = true;
+            batch.logits = logits.data();
+        }
+    }
+};
+
 template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
     uint32_t tmp;
@@ -17095,16 +17146,20 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch   batch) {
+           llama_batch   inp_batch) {
 
     lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch.n_tokens;
 
-    if (n_tokens_all == 0) {
+    if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
+    // temporary allocate memory for the input batch if needed
+    llama_batch_allocr batch_allocr(lctx, inp_batch);
+    const llama_batch & batch = batch_allocr.batch;
+    const uint32_t n_tokens_all = batch.n_tokens;
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -17409,17 +17464,20 @@ static int llama_decode_internal(
 //
 static int llama_encode_internal(
          llama_context & lctx,
-           llama_batch   batch) {
+           llama_batch   inp_batch) {
 
     lctx.is_encoding = true;
 
-    const uint32_t n_tokens = batch.n_tokens;
-
-    if (n_tokens == 0) {
+    if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
+    // temporary allocate memory for the input batch if needed
+    llama_batch_allocr batch_allocr(lctx, inp_batch);
+    const llama_batch & batch = batch_allocr.batch;
+    const uint32_t n_tokens = batch.n_tokens;
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -21090,61 +21148,10 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits)   free(batch.logits);
 }
 
-// temporary allocate memory for the input batch if needed
-static const llama_seq_id batch_default_seq_id = 0;
-struct llama_batch_allocr {
-    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id *> seq_id;
-    std::vector<int8_t>         logits;
-    struct llama_batch          batch;
-    // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
-        batch = in_batch;
-        if (!batch.pos) {
-            // determine the last position in KV cache
-            llama_pos last_pos = -1;
-            for (const auto & cell : ctx->kv_self.cells) {
-                if (cell.has_seq_id(batch_default_seq_id)) {
-                    last_pos = std::max(last_pos, cell.pos);
-                }
-            }
-            last_pos++; // next position
-            pos.resize(batch.n_tokens);
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                pos[i] = i+last_pos;
-            }
-            batch.pos = pos.data();
-        }
-        if (!batch.n_seq_id) {
-            n_seq_id.resize(batch.n_tokens);
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                n_seq_id[i] = seq_id_0.size();
-            }
-            batch.n_seq_id = n_seq_id.data();
-        }
-        if (!batch.seq_id) {
-            seq_id.resize(batch.n_tokens + 1);
-            seq_id[batch.n_tokens] = NULL;
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                seq_id[i] = seq_id_0.data();
-            }
-            batch.seq_id = seq_id.data();
-        }
-        if (!batch.logits) {
-            logits.resize(batch.n_tokens);
-            logits[logits.size() - 1] = true;
-            batch.logits = logits.data();
-        }
-    }
-};
-
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    llama_batch_allocr batch_allocr(ctx, batch);
-    const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+    const int ret = llama_encode_internal(*ctx, batch);
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }
@@ -21155,8 +21162,7 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    llama_batch_allocr batch_allocr(ctx, batch);
-    const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+    const int ret = llama_decode_internal(*ctx, batch);
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

From 873279b1592e433c4d9eb5065091cc98473c7bee Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Sun, 20 Oct 2024 00:22:59 +0000
Subject: [PATCH 094/396] flake.lock: Update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/5633bcff0c6162b9e4b5f1264264611e950c8ec7?narHash=sha256-9UTxR8eukdg%2BXZeHgxW5hQA9fIKHsKCdOIUycTryeVw%3D' (2024-10-09)
  → 'github:NixOS/nixpkgs/4c2fcb090b1f3e5b47eaa7bd33913b574a11e0a0?narHash=sha256-/uilDXvCIEs3C9l73JTACm4quuHUsIHcns1c%2BcHUJwA%3D' (2024-10-18)
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 702527028..1f8defab7 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1728492678,
-        "narHash": "sha256-9UTxR8eukdg+XZeHgxW5hQA9fIKHsKCdOIUycTryeVw=",
+        "lastModified": 1729256560,
+        "narHash": "sha256-/uilDXvCIEs3C9l73JTACm4quuHUsIHcns1c+cHUJwA=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "5633bcff0c6162b9e4b5f1264264611e950c8ec7",
+        "rev": "4c2fcb090b1f3e5b47eaa7bd33913b574a11e0a0",
         "type": "github"
       },
       "original": {

From 4c9388fb96ac2415fbb1239b7ba8346616606e2e Mon Sep 17 00:00:00 2001
From: Jun Hee Yoo <contact.jhyoo@gmail.com>
Date: Wed, 23 Oct 2024 19:33:45 +0900
Subject: [PATCH 095/396] metal : add POOL2D and fix IM2COL (#9943)

* add pool_2d

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* fix im2col and add unittest for N>=1024

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* add tests for N % 1024 != 0

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* remove trailing whitespaces

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* apply suggestions

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* apply more optimization

- original IM2COL kernel + _ext with MIN()

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* apply review: change kernel name of pool_2d

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* apply review

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

* fix more formatting and enhance readability

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>

---------

Signed-off-by: Junhee Yoo <junhee.yoo@navercorp.com>
---
 ggml/src/ggml-metal.m      | 128 ++++++++++++++++++++++----
 ggml/src/ggml-metal.metal  | 178 +++++++++++++++++++++++++++++++++++++
 tests/test-backend-ops.cpp |  10 +++
 3 files changed, 298 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 172a0f925..e9541441c 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -241,6 +241,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
     GGML_METAL_KERNEL_TYPE_IM2COL_F16,
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
+    GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
+    GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
@@ -272,6 +274,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_SIN,
     GGML_METAL_KERNEL_TYPE_COS,
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
+    GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
+    GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
 
     GGML_METAL_KERNEL_TYPE_COUNT
 };
@@ -685,6 +689,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                 rope_neox_f16,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                    im2col_f16,                     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,                im2col_ext_f32,                 true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,        timestep_embedding_f32,         true);
@@ -716,6 +722,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                           sin,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                           cos,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                      sum_rows,                       true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,               pool_2d_avg_f32,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,               pool_2d_max_f32,                true);
     }
 
     [metal_library release];
@@ -844,8 +852,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_IM2COL:
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
-        case GGML_OP_POOL_2D:
             return false;
+        case GGML_OP_POOL_2D:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
@@ -2545,6 +2553,8 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_IM2COL:
             {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
                 GGML_ASSERT(src0->type == GGML_TYPE_F16);
                 GGML_ASSERT(src1->type == GGML_TYPE_F32);
                 GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
@@ -2574,30 +2584,54 @@ static void ggml_metal_encode_node(
                 const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
                 const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
 
-                id<MTLComputePipelineState> pipeline = nil;
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline;
+
+                const bool is_gt_mttpt = ((size_t)(N * KH * KW)) > pipeline.maxTotalThreadsPerThreadgroup;
 
                 switch (dst->type) {
-                    case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
-                    case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
+                    case GGML_TYPE_F32: {
+                        pipeline = (is_gt_mttpt ?
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32].pipeline
+                                    :
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline);
+                    } break;
+                    case GGML_TYPE_F16: {
+                        pipeline = (is_gt_mttpt ?
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16].pipeline
+                                    :
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline);
+                    } break;
                     default: GGML_ABORT("fatal error");
                 };
 
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
-                [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
-                [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
-                [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
-                [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
-                [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
-                [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
-                [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
-                [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
-                [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
-                [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
+                [encoder setBuffer:id_src1 offset:offs_src1       atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
+                [encoder setBytes:&ofs0    length:sizeof(int32_t) atIndex:2];
+                [encoder setBytes:&ofs1    length:sizeof(int32_t) atIndex:3];
+                [encoder setBytes:&IW      length:sizeof(int32_t) atIndex:4];
+                [encoder setBytes:&IH      length:sizeof(int32_t) atIndex:5];
+                [encoder setBytes:&CHW     length:sizeof(int32_t) atIndex:6];
+                [encoder setBytes:&s0      length:sizeof(int32_t) atIndex:7];
+                [encoder setBytes:&s1      length:sizeof(int32_t) atIndex:8];
+                [encoder setBytes:&p0      length:sizeof(int32_t) atIndex:9];
+                [encoder setBytes:&p1      length:sizeof(int32_t) atIndex:10];
+                [encoder setBytes:&d0      length:sizeof(int32_t) atIndex:11];
+                [encoder setBytes:&d1      length:sizeof(int32_t) atIndex:12];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
+                if (is_gt_mttpt) {
+                    [encoder setBytes:&N   length:sizeof(int32_t) atIndex:13];
+                    [encoder setBytes:&KH  length:sizeof(int32_t) atIndex:14];
+                    [encoder setBytes:&KW  length:sizeof(int32_t) atIndex:15];
+
+                    const uint64_t n_threads = MIN(pipeline.maxTotalThreadsPerThreadgroup, (uint64_t)N);
+
+                    const int64_t  quotient  = N / n_threads + (N % n_threads > 0 ? 1 : 0);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(quotient * CHW, OH, OW) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
+                } else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
+                }
             } break;
         case GGML_OP_UPSCALE:
             {
@@ -3001,6 +3035,64 @@ static void ggml_metal_encode_node(
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
+        case GGML_OP_POOL_2D:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(src0t == GGML_TYPE_F32 && src0t == dstt);
+
+                const int32_t * opts = dst->op_params;
+                enum ggml_op_pool op = opts[0];
+
+                id<MTLComputePipelineState> pipeline = nil;
+                switch (src0t) {
+                    case GGML_TYPE_F32: {
+                        switch(op) {
+                            case GGML_OP_POOL_AVG:
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32].pipeline; break;
+                            case GGML_OP_POOL_MAX:
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32].pipeline; break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        }
+                    } break;
+                    default: GGML_ASSERT(false && "not implemented");
+                }
+
+                const int32_t k0 = opts[1];
+                const int32_t k1 = opts[2];
+                const int32_t s0 = opts[3];
+                const int32_t s1 = opts[4];
+                const int32_t p0 = opts[5];
+                const int32_t p1 = opts[6];
+
+                const int64_t IH = src0->ne[1];
+                const int64_t IW = src0->ne[0];
+
+                const int64_t N  = dst->ne[3];
+                const int64_t OC = dst->ne[2];
+                const int64_t OH = dst->ne[1];
+                const int64_t OW = dst->ne[0];
+
+                const int64_t parallel_elements = N * OC * OH * OW;
+                const int64_t n_threads = MIN((int64_t)[pipeline maxTotalThreadsPerThreadgroup], parallel_elements);
+                const int64_t n_tg = (parallel_elements + n_threads - 1) / n_threads;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0       atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
+                [encoder setBytes:&k0      length:sizeof(int32_t) atIndex:2];
+                [encoder setBytes:&k1      length:sizeof(int32_t) atIndex:3];
+                [encoder setBytes:&s0      length:sizeof(int32_t) atIndex:4];
+                [encoder setBytes:&s1      length:sizeof(int32_t) atIndex:5];
+                [encoder setBytes:&p0      length:sizeof(int32_t) atIndex:6];
+                [encoder setBytes:&p1      length:sizeof(int32_t) atIndex:7];
+                [encoder setBytes:&IH      length:sizeof(int64_t) atIndex:8];
+                [encoder setBytes:&IW      length:sizeof(int64_t) atIndex:9];
+                [encoder setBytes:&OH      length:sizeof(int64_t) atIndex:10];
+                [encoder setBytes:&OW      length:sizeof(int64_t) atIndex:11];
+                [encoder setBytes:&parallel_elements length:sizeof(int64_t) atIndex:12];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
+            } break;
        default:
             {
                 GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 2b2000323..71b58be1f 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -1933,6 +1933,85 @@ kernel void kernel_im2col(
 template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
 template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
 
+typedef void (im2col_ext_t)(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        constant   int32_t & N,
+        constant   int32_t & KH,
+        constant   int32_t & KW,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]);
+
+template <typename T>
+kernel void kernel_im2col_ext(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        constant   int32_t & N,
+        constant   int32_t & KH,
+        constant   int32_t & KW,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
+    const int32_t KHW = KH * KW;             // KHW == ntg[1] * ntg[2], KW == ntg[2]
+
+    const int32_t d = tgpig[0] / CHW;
+    const int32_t chw = tgpig[0] % CHW;
+    const int32_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
+    const int32_t HW = tgpig[0] % KHW;
+
+    const int32_t tpitg_0 = (d * ntg[0]) + tpitg[0];
+    if (tpitg_0 >= N) {
+        return;
+    }
+
+    const int32_t tpitg_1 = HW / KW;
+    const int32_t tpitg_2 = HW % KW;
+
+    const int32_t iiw = tgpig[2] * s0 + tpitg_2 * d0 - p0;
+    const int32_t iih = tgpig[1] * s1 + tpitg_1 * d1 - p1;
+
+    const int32_t offset_dst =
+        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
+        (tgpig_0 * KHW + tpitg_1 * KW + tpitg_2);
+
+    device T * pdst = (device T *) (dst);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        pdst[offset_dst] = 0.0f;
+    } else {
+        const int32_t offset_src = tpitg_0 * ofs0 + tgpig_0 * ofs1;
+        pdst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
+template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
+template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
+
 kernel void kernel_upscale_f32(
     device  const char * src0,
     device        char * dst,
@@ -6372,3 +6451,102 @@ template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
+
+kernel void kernel_pool_2d_max_f32(
+        device  const float * src0,
+        device        float * dst,
+        constant    int32_t & k0,
+        constant    int32_t & k1,
+        constant    int32_t & s0,
+        constant    int32_t & s1,
+        constant    int32_t & p0,
+        constant    int32_t & p1,
+        constant    int64_t & IH,
+        constant    int64_t & IW,
+        constant    int64_t & OH,
+        constant    int64_t & OW,
+        constant    int64_t & parallel_elements,
+        uint        gid[[thread_position_in_grid]]) {
+
+    if (gid >= parallel_elements) {
+        return;
+    }
+
+    const int idx = gid;
+    const int I_HW = IH * IW;
+    const int O_HW = OH * OW;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / OW;
+    const int cur_ow = idx % O_HW % OW;
+
+    device const float * i_ptr = src0 + nc * I_HW;
+    device       float * o_ptr = dst  + nc * O_HW;
+
+    const int start_h = cur_oh * s1 - p1;
+    const int bh = MAX(0,  start_h);
+    const int eh = MIN(IH, start_h + k1);
+    const int start_w = cur_ow * s0 - p0;
+    const int bw = MAX(0,  start_w);
+    const int ew = MIN(IW, start_w + k0);
+
+    float res = -INFINITY;
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+            res = MAX(res, i_ptr[i * IW + j]);
+        }
+    }
+
+    o_ptr[cur_oh * OW + cur_ow] = res;
+}
+
+kernel void kernel_pool_2d_avg_f32(
+        device  const float * src0,
+        device        float * dst,
+        constant    int32_t & k0,
+        constant    int32_t & k1,
+        constant    int32_t & s0,
+        constant    int32_t & s1,
+        constant    int32_t & p0,
+        constant    int32_t & p1,
+        constant    int64_t & IH,
+        constant    int64_t & IW,
+        constant    int64_t & OH,
+        constant    int64_t & OW,
+        constant    int64_t & parallel_elements,
+        uint        gid[[thread_position_in_grid]]) {
+
+    if (gid >= parallel_elements) {
+        return;
+    }
+
+    const int idx = gid;
+    const int I_HW = IH * IW;
+    const int O_HW = OH * OW;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / OW;
+    const int cur_ow = idx % O_HW % OW;
+
+    device const float * i_ptr = src0 + nc * I_HW;
+    device       float * o_ptr = dst  + nc * O_HW;
+
+    const int start_h = cur_oh * s1 - p1;
+    const int bh = MAX(0,  start_h);
+    const int eh = MIN(IH, start_h + k1);
+    const int start_w = cur_ow * s0 - p0;
+    const int bw = MAX(0,  start_w);
+    const int ew = MIN(IW, start_w + k0);
+    // const float scale = 1. / ((eh - bh) * (ew - bw));
+    const float scale = 1. / (k0 * k1);
+
+    float res = 0;
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+            float cur = i_ptr[i * IW + j];
+            res += cur * scale;
+        }
+    }
+
+    o_ptr[cur_oh * OW + cur_ow] = res;
+}
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index ee1a8877e..e087f7ba5 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3316,6 +3316,16 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
 
+    // test cases for 2D im2col
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
+
     // sycl backend will limit task global_range < MAX_INT
     // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
     // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)

From ac113a0feee0935b2018312f7bc8d7a646b117ed Mon Sep 17 00:00:00 2001
From: Michael Coppola <m18coppola@gmail.com>
Date: Wed, 23 Oct 2024 07:09:26 -0400
Subject: [PATCH 096/396] llama.vim : add classic vim support (#9995)

* added classic vim support

* fixed ring update, removed blank line

* minor

* minor

* minor doc update

* removed uneeded var

* minor

* minor

* fixed job_start creating new scratch buffers

* fixed job_start creating new scratch buffers

* fixed ghost text indenting when expandtab is on

* removed unused code

* minor

* unified fim_on_exit

* minor

* vim ghost text rendering now uses pos_x and pos_y parameters

* renamed *_hlgroup to hlgroup_*

* renamed *_ghost_text to ghost_text_*, moved nvim/vim detection to llama#init()

* minor

---------

Co-authored-by: Michael Coppola <info@michaeljcoppola.com>
---
 examples/llama.vim | 168 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 127 insertions(+), 41 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 7a60442ad..4bc26d4e9 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -2,7 +2,7 @@
 "
 " requires:
 "
-"   - neovim
+"   - neovim or vim
 "   - curl
 "   - llama.cpp server instance
 "   - FIM-compatible model
@@ -10,7 +10,7 @@
 " sample config:
 "
 "   - Tab       - accept the current suggestion
-"   - Shift+Tab - accept just the first line of the segguestion
+"   - Shift+Tab - accept just the first line of the suggestion
 "   - Ctrl+F    - toggle FIM completion manually
 "
 " make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
@@ -43,8 +43,8 @@
 "
 
 " colors (adjust to your liking)
-highlight llama_hl_hint guifg=#ff772f
-highlight llama_hl_info guifg=#77ff2f
+highlight llama_hl_hint guifg=#ff772f ctermfg=202
+highlight llama_hl_info guifg=#77ff2f ctermfg=119
 
 " general parameters:
 "
@@ -93,6 +93,18 @@ let s:default_config = {
 
 let g:llama_config = get(g:, 'llama_config', s:default_config)
 
+function! s:get_indent(str)
+    let l:count = 0
+    for i in range(len(a:str))
+        if a:str[i] == "\t"
+            let l:count += &tabstop - 1
+        else
+            break
+        endif
+    endfor
+    return l:count
+endfunction
+
 function! s:rand(i0, i1) abort
     return a:i0 + rand() % (a:i1 - a:i0 + 1)
 endfunction
@@ -129,6 +141,21 @@ function! llama#init()
 
     let s:current_job = v:null
 
+    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
+    let s:ghost_text_vim = has('textprop')
+
+    if s:ghost_text_vim
+        let s:hlgroup_hint = 'llama_hl_hint'
+        let s:hlgroup_info = 'llama_hl_info'
+
+        if empty(prop_type_get(s:hlgroup_hint))
+            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
+        endif
+        if empty(prop_type_get(s:hlgroup_info))
+            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
+        endif
+    endif
+
     augroup llama
         autocmd!
         autocmd InsertEnter     * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
@@ -317,13 +344,22 @@ function! s:ring_update()
         \ 't_max_predict_ms': 1
         \ })
 
-    let l:curl_command = printf(
-        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
-        \ g:llama_config.endpoint, shellescape(l:request)
-        \ )
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", l:request
+        \ ]
 
     " no callbacks because we don't need to process the response
-    call jobstart(l:curl_command, {})
+    if s:ghost_text_nvim
+        call jobstart(l:curl_command, {})
+    elseif s:ghost_text_vim
+        call job_start(l:curl_command, {})
+    endif
 endfunction
 
 " necessary for 'inoremap <expr>'
@@ -418,24 +454,37 @@ function! llama#fim(is_auto) abort
         \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
         \ })
 
-    let l:curl_command = printf(
-        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
-        \ g:llama_config.endpoint, shellescape(l:request)
-        \ )
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", l:request
+        \ ]
 
     if s:current_job != v:null
-        call jobstop(s:current_job)
+        if s:ghost_text_nvim
+            call jobstop(s:current_job)
+        elseif s:ghost_text_vim
+            call job_stop(s:current_job)
+        endif
     endif
 
     " send the request asynchronously
-    let s:current_job = jobstart(l:curl_command, {
-        \ 'on_stdout': function('s:fim_on_stdout'),
-        \ 'on_exit':   function('s:fim_on_exit'),
-        \ 'stdout_buffered': v:true,
-        \ 'pos_x': s:pos_x,
-        \ 'pos_y': s:pos_y,
-        \ 'is_auto': a:is_auto
-        \ })
+    if s:ghost_text_nvim
+        let s:current_job = jobstart(l:curl_command, {
+            \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
+            \ 'on_exit':   function('s:fim_on_exit'),
+            \ 'stdout_buffered': v:true
+            \ })
+    elseif s:ghost_text_vim
+        let s:current_job = job_start(l:curl_command, {
+            \ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
+            \ 'exit_cb':   function('s:fim_on_exit')
+            \ })
+    endif
 
     " TODO: per-file location
     let l:delta_y = abs(s:pos_y - s:pos_y_pick)
@@ -482,9 +531,13 @@ function! llama#fim_cancel()
     " clear the virtual text
     let l:bufnr = bufnr('%')
 
-    let l:id_vt_fim = nvim_create_namespace('vt_fim')
-
-    call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim,  0, -1)
+    if s:ghost_text_nvim
+        let l:id_vt_fim = nvim_create_namespace('vt_fim')
+        call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim,  0, -1)
+    elseif s:ghost_text_vim
+        call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
+        call prop_remove({'type': s:hlgroup_info, 'all': v:true})
+    endif
 
     " remove the mappings
     silent! iunmap <buffer> <Tab>
@@ -499,13 +552,18 @@ function! s:on_move()
 endfunction
 
 " callback that processes the FIM result from the server and displays the suggestion
-function! s:fim_on_stdout(job_id, data, event) dict
-    let l:raw = join(a:data, "\n")
+function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
+    if s:ghost_text_nvim
+        let l:raw = join(a:data, "\n")
+    elseif s:ghost_text_vim
+        let l:raw = a:data
+    endif
+
     if len(l:raw) == 0
         return
     endif
 
-    if self.pos_x != col('.') - 1 || self.pos_y != line('.')
+    if a:pos_x != col('.') - 1 || a:pos_y != line('.')
         return
     endif
 
@@ -514,14 +572,14 @@ function! s:fim_on_stdout(job_id, data, event) dict
         return
     endif
 
-    let s:pos_x = self.pos_x
-    let s:pos_y = self.pos_y
+    let s:pos_x = a:pos_x
+    let s:pos_y = a:pos_y
 
     let s:can_accept = v:true
     let l:has_info   = v:false
 
     if s:can_accept && v:shell_error
-        if !self.is_auto
+        if !a:is_auto
             call add(s:content, "<| curl error: is the server on? |>")
         endif
         let s:can_accept = v:false
@@ -642,7 +700,9 @@ function! s:fim_on_stdout(job_id, data, event) dict
     " display virtual text with the suggestion
     let l:bufnr = bufnr('%')
 
-    let l:id_vt_fim = nvim_create_namespace('vt_fim')
+    if s:ghost_text_nvim
+        let l:id_vt_fim = nvim_create_namespace('vt_fim')
+    endif
 
     " construct the info message
     if g:llama_config.show_info > 0 && l:has_info
@@ -671,15 +731,41 @@ function! s:fim_on_stdout(job_id, data, event) dict
     endif
 
     " display the suggestion and append the info to the end of the first line
-    call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
-        \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
-        \ 'virt_text_win_col': virtcol('.') - 1
-        \ })
+    if s:ghost_text_nvim
+        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
+            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
+            \ 'virt_text_win_col': virtcol('.') - 1
+            \ })
 
-    call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
-        \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
-        \ 'virt_text_win_col': virtcol('.')
-        \ })
+        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
+            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
+            \ 'virt_text_win_col': virtcol('.')
+            \ })
+    elseif s:ghost_text_vim
+        let l:new_suffix = s:content[0]
+        if !empty(l:new_suffix)
+            call prop_add(s:pos_y, s:pos_x + 1, {
+                        \ 'type': s:hlgroup_hint,
+                        \ 'text': l:new_suffix
+                        \ })
+        endif
+        for line in s:content[1:]
+            call prop_add(s:pos_y, 0, {
+                        \ 'type': s:hlgroup_hint,
+                        \ 'text': line,
+                        \ 'text_padding_left': s:get_indent(line),
+                        \ 'text_align': 'below'
+                        \ })
+        endfor
+        if !empty(l:info)
+            call prop_add(s:pos_y, 0, {
+                        \ 'type': s:hlgroup_info,
+                        \ 'text': l:info,
+                        \ 'text_padding_left': col('$'),
+                        \ 'text_wrap': 'truncate'
+                        \ })
+        endif
+    endif
 
     " setup accept shortcuts
     inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
@@ -688,7 +774,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let s:hint_shown = v:true
 endfunction
 
-function! s:fim_on_exit(job_id, exit_code, event) dict
+function! s:fim_on_exit(job_id, exit_code, event = v:null)
     if a:exit_code != 0
         echom "Job failed with exit code: " . a:exit_code
     endif

From c19af0acb1fe6d0fdbecadd8483c1fbe5d68d095 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 16 Oct 2024 20:10:01 +0200
Subject: [PATCH 097/396] ggml : remove redundant set of contexts used field
 (ggml/978)

This commit removes the setting of the `used` field of the contexts in
the global state (g_state) in `ggml_init`.

The motivation for this change is that I believe that this additional
initialization might not be required after the changes in Commit
45fc4fed0b9fb5b1af4a8525cbebb95e11208732 ("sync : latest changes from
whisper.cpp"), which changed the initialization of the contexts field
from `{ 0 }` to `{ { 0 } }`:

```console
             g_state = (struct ggml_state) {
-                /*.contexts =*/ { 0 },
+                /*.contexts =*/ { { 0 } },
             };
```
My understanding is that the `{0}` initialization might not have
zero-initialized all the nested fields in every array element because of
compiler differences, and might have been the reason for having the
explicit setting of the `used` fields to false.
---
 ggml/src/ggml.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b16c462fa..1741d3338 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3852,10 +3852,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 },
             };
 
-            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
-                g_state.contexts[i].used = false;
-            }
-
             const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);

From 80273a306d07ed95059d6130389deacb3b2d7196 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 18 Oct 2024 09:24:44 +0200
Subject: [PATCH 098/396] CUDA: fix 1D im2col, add tests (ggml/993)

---
 ggml/src/ggml-cuda.cu        |  1 -
 ggml/src/ggml-cuda/im2col.cu |  6 +++---
 tests/test-backend-ops.cpp   | 36 +++++++++++++++++++++++++++++++-----
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 1338bd458..fa280b529 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3141,7 +3141,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
-            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu
index 16463ab0f..86a54e42b 100644
--- a/ggml/src/ggml-cuda/im2col.cu
+++ b/ggml/src/ggml-cuda/im2col.cu
@@ -91,9 +91,9 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t OH = is_2D ? dst->ne[2] : 1;
     const int64_t OW =         dst->ne[1];
 
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const int64_t batch = src1->ne[3];
-    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+    const size_t  delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch        = src1->ne[is_2D ? 3 : 2];
+    const size_t  batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
 
     if(dst->type == GGML_TYPE_F16) {
         im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index e087f7ba5..7e769a91a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3308,15 +3308,41 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
 
-    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
-    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
-    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
-    // test cases for 1D im2col
+    // im2col 1D
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    for (int s0 : {1, 3}) {
+        for (int p0 : {0, 3}) {
+            for (int d0 : {1, 3}) {
+                test_cases.emplace_back(new test_im2col(
+                    GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1},
+                    s0, 0, p0, 0, d0, 0, false));
+            }
+        }
+    }
 
-    // test cases for 2D im2col
+    // im2col 2D
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
+    for (int s0 : {1, 3}) {
+        for (int s1 : {1, 3}) {
+            for (int p0 : {0, 3}) {
+                for (int p1 : {0, 3}) {
+                    for (int d0 : {1, 3}) {
+                        for (int d1 : {1, 3}) {
+                            test_cases.emplace_back(new test_im2col(
+                                GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2},
+                                s0, s1, p0, p1, d0, d1, true));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // extra tests for im2col 2D
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));

From 2d3aba9ee8da9c026d54e8a912a1d64f56809be3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 23 Oct 2024 17:16:56 +0300
Subject: [PATCH 099/396] llama.vim : bump generation time limit to 3s [no ci]

---
 examples/llama.vim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 4bc26d4e9..57eb2a977 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -81,7 +81,7 @@ let s:default_config = {
     \ 'n_suffix':         64,
     \ 'n_predict':        128,
     \ 't_max_prompt_ms':  500,
-    \ 't_max_predict_ms': 1000,
+    \ 't_max_predict_ms': 3000,
     \ 'show_info':        2,
     \ 'auto_fim':         v:true,
     \ 'max_line_suffix':  8,

From 190a37d7977eb5bd6a729299bd1e371208c87149 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 23 Oct 2024 17:23:55 +0300
Subject: [PATCH 100/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 6d31b21b9..7f689f632 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b
+6dccc647264f5429df2624f36138f601e7ce23e5

From 0a1c750c80147687df267114c81956757cc14382 Mon Sep 17 00:00:00 2001
From: wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
Date: Wed, 23 Oct 2024 13:27:51 -0600
Subject: [PATCH 101/396] server : samplers accept the prompt correctly
 (#10019)

---
 examples/server/server.cpp | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3992108e7..51f30ffea 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2163,17 +2163,10 @@ struct server_context {
                                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                             }
 
-                            common_sampler_reset(slot.smpl);
-
                             if (slot.params.cache_prompt) {
                                 // reuse any previously computed tokens that are common with the new prompt
                                 slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
 
-                                // push the prompt into the sampling context (do not apply grammar)
-                                for (int i = 0; i < slot.n_past; ++i) {
-                                    common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                                }
-
                                 // reuse chunks from the cached prompt by shifting their KV cache in the new position
                                 if (params.n_cache_reuse > 0) {
                                     size_t head_c = slot.n_past; // cache
@@ -2206,8 +2199,6 @@ struct server_context {
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
 
-                                                common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
-
                                                 slot.n_past++;
                                             }
 
@@ -2259,8 +2250,6 @@ struct server_context {
 
                         // there is no common part left
                         slot.n_past = 0;
-
-                        common_sampler_reset(slot.smpl);
                     }
 
                     SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
@@ -2288,6 +2277,13 @@ struct server_context {
 
                         GGML_ASSERT(batch.n_tokens > 0);
 
+                        common_sampler_reset(slot.smpl);
+
+                        // Process all prompt tokens through sampler system
+                        for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                            common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+                        }
+
                         // extract the logits only for the last token
                         batch.logits[batch.n_tokens - 1] = true;
 

From c39665f589091903396a442a6ee56613303e0350 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 24 Oct 2024 11:09:36 +0200
Subject: [PATCH 102/396] CUDA: fix MMQ for non-contiguous src0, add tests
 (#10021)

* CUDA: fix MMQ for non-contiguous src0, add tests

* revise test code
---
 ggml/src/ggml-cuda.cu      | 18 +++++----
 ggml/src/ggml-cuda/mmq.cu  |  4 +-
 ggml/src/ggml.c            |  2 +-
 tests/test-backend-ops.cpp | 78 +++++++++++++++++++++++++++++---------
 4 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index fa280b529..4a0329a63 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1151,8 +1151,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
 
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
-    char * src_ptr = (char *) src->data;
-    char * dst_ptr = (char *) dst;
+    const char * src_ptr = (const char *) src->data;
+    char       * dst_ptr = (char       *) dst;
 
     const int64_t ne0 = src->ne[0];
     const int64_t nb0 = src->nb[0];
@@ -1162,7 +1162,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const enum ggml_type type = src->type;
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
+    const int64_t i1_diff = i1_high - i1_low;
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
@@ -1479,13 +1479,17 @@ static void ggml_cuda_op_mul_mat(
         if (src0_is_contiguous) {
             dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
         } else {
-            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
+            // If src0 is not contiguous it will be copied to a temporary buffer, it may then be necessary to clear padding.
+            const size_t nbytes_data    = ggml_nbytes(src0);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
         }
 
-        // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
+        // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
-            const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
-            const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
         }
 
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 4935f8818..ae5c68ab3 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -8,8 +8,6 @@ void ggml_cuda_op_mul_mat_q(
 
     const int64_t ne00 = src0->ne[0];
 
-    const int64_t nb01 = src0->nb[1];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     GGML_ASSERT(ne10 % QK8_1 == 0);
@@ -17,7 +15,7 @@ void ggml_cuda_op_mul_mat_q(
     const int64_t ne0 = dst->ne[0];
 
     const int64_t row_diff = row_high - row_low;
-    const int64_t stride00 = nb01 / ggml_type_size(src0->type);
+    const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
 
     int id = ggml_cuda_get_device();
     const int compute_capability = ggml_cuda_info().devices[id].cc;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1741d3338..66df9a9c1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3464,7 +3464,7 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     size_t nbytes;
-    size_t blck_size = ggml_blck_size(tensor->type);
+    const size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
         nbytes = ggml_type_size(tensor->type);
         for (int i = 0; i < GGML_MAX_DIMS; ++i) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 7e769a91a..2e3ad79f0 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1650,11 +1650,12 @@ struct test_mul_mat : public test_case {
     const int64_t m;
     const int64_t n;
     const int64_t k;
-    const std::array<int64_t, 2> bs; // dims 3 and 4
-    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
+    const std::array<int64_t, 2> bs;  // dims 3 and 4
+    const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
+    const std::array<int64_t, 4> per; // permutation of dimensions
 
     std::string vars() override {
-        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
+        return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
     }
 
     double max_nmse_err() override {
@@ -1669,17 +1670,44 @@ struct test_mul_mat : public test_case {
     test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int64_t m = 32, int64_t n = 32, int64_t k = 32,
             std::array<int64_t, 2> bs = {10, 10},
-            std::array<int64_t, 2> nr = {2, 2})
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
+            std::array<int64_t, 2> nr = {2, 2},
+            std::array<int64_t, 4> per = {0, 1, 2, 3})
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0]      , bs[1]);
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-        ggml_set_param(ctx, a);
-        ggml_set_param(ctx, b);
-        ggml_set_name(a, "a");
-        ggml_set_name(b, "b");
+        ggml_tensor * a;
+        ggml_tensor * b;
+
+        const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
+        if (npermuted > 0) {
+            GGML_ASSERT(npermuted == 2);
+            GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
+            GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
+
+            // Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k.
+            const int64_t ne_a[4] = {k, m, bs[0],       bs[1]};
+            const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]};
+
+            a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
+            b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
+            ggml_set_param(ctx, a);
+            ggml_set_param(ctx, b);
+            ggml_set_name(a, "a");
+            ggml_set_name(b, "b");
+
+            a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]);
+            b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]);
+            ggml_set_name(a, "a_permuted");
+            ggml_set_name(b, "b_permuted");
+        } else {
+            a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0],       bs[1]);
+            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+            ggml_set_param(ctx, a);
+            ggml_set_param(ctx, b);
+            ggml_set_name(a, "a");
+            ggml_set_name(b, "b");
+        }
 
         ggml_tensor * out = ggml_mul_mat(ctx, a, b);
         ggml_set_name(out, "out");
@@ -3478,13 +3506,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 #if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10,  1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
+            // test cases without permutation
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, { 1,  1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10,  1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10,  1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {2, 2}));
 
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1,  1}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {1, 1}));
@@ -3493,6 +3522,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
+
+            // test cases with permutation
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
         }
     }
     for (ggml_type type_a : other_types) {

From 167a515651a4b065a16225ffc69564c5674f3d0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 24 Oct 2024 14:40:23 +0200
Subject: [PATCH 103/396] CUDA: fix insufficient buffer clearing for MMQ
 (#10032)

---
 ggml/src/ggml-cuda.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 4a0329a63..21c9f5e38 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1479,11 +1479,12 @@ static void ggml_cuda_op_mul_mat(
         if (src0_is_contiguous) {
             dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
         } else {
-            // If src0 is not contiguous it will be copied to a temporary buffer, it may then be necessary to clear padding.
+            // If src0 is not contiguous it will be copied to a temporary buffer.
+            // This buffer needs to be cleared entirely because multiple regions will function as padding.
             const size_t nbytes_data    = ggml_nbytes(src0);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
-            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
         }
 
         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:

From 40f2555797f97314de749873cdc29dc102be66e2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 24 Oct 2024 21:23:33 +0300
Subject: [PATCH 104/396] ci : fix cmake flags for SYCL

---
 ci/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/run.sh b/ci/run.sh
index e06778219..dc26d94ee 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -53,7 +53,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_VULKAN} ]; then

From 958367bf530d943a902afa1ce1c342476098576b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 24 Oct 2024 21:51:22 +0200
Subject: [PATCH 105/396] server : refactor slot input data, move tokenizer to
 HTTP thread (#10023)

* server : refactor slot input data, move tokenizer to HTTP thread

* move prompt_tokens.empty() check

* fix incorrect if branch

* fix infinite generation loop

* bring back infill validation

* add infill test

* try fixing format_infill

* fix test

* remove redundant code

* rename completion to inference

* update docs

* use llama_tokens everywhere
---
 examples/server/README.md                     |  12 +
 examples/server/server.cpp                    | 466 +++++-------------
 examples/server/tests/features/infill.feature |  36 ++
 examples/server/tests/features/steps/steps.py |  46 ++
 examples/server/utils.hpp                     | 256 +++++++++-
 5 files changed, 468 insertions(+), 348 deletions(-)
 create mode 100644 examples/server/tests/features/infill.feature

diff --git a/examples/server/README.md b/examples/server/README.md
index 09f1aa249..8f00fcc79 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -319,6 +319,18 @@ node index.js
       - The prompt is a string or an array with the first element given as a string
       - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
 
+    These input shapes and data type are allowed for `prompt`:
+
+      - Single string: `"string"`
+      - Single sequence of tokens: `[12, 34, 56]`
+      - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
+
+    Multiple prompts are also supported. In this case, the completion result will be an array.
+
+      - Only strings: `["string1", "string2"]`
+      - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
+      - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
+
     `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
     `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 51f30ffea..58f93694f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -43,21 +43,6 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
-#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
-#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
-#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
-
-#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-
-#define QUE_INF(fmt, ...) LOG_INF("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define QUE_WRN(fmt, ...) LOG_WRN("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define QUE_ERR(fmt, ...) LOG_ERR("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define QUE_DBG(fmt, ...) LOG_DBG("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-
 using json = nlohmann::ordered_json;
 
 enum stop_type {
@@ -68,6 +53,7 @@ enum stop_type {
 // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
     SLOT_STATE_PROCESSING_PROMPT,
     SLOT_STATE_DONE_PROMPT,
     SLOT_STATE_GENERATING,
@@ -79,7 +65,7 @@ enum server_state {
 };
 
 enum server_task_type {
-    SERVER_TASK_TYPE_COMPLETION,
+    SERVER_TASK_TYPE_INFERENCE,
     SERVER_TASK_TYPE_CANCEL,
     SERVER_TASK_TYPE_NEXT_RESPONSE,
     SERVER_TASK_TYPE_METRICS,
@@ -89,21 +75,22 @@ enum server_task_type {
     SERVER_TASK_TYPE_SET_LORA,
 };
 
-enum server_task_cmpl_type {
-    SERVER_TASK_CMPL_TYPE_NORMAL,
-    SERVER_TASK_CMPL_TYPE_EMBEDDING,
-    SERVER_TASK_CMPL_TYPE_RERANK,
-    SERVER_TASK_CMPL_TYPE_INFILL,
+enum server_task_inf_type {
+    SERVER_TASK_INF_TYPE_COMPLETION,
+    SERVER_TASK_INF_TYPE_EMBEDDING,
+    SERVER_TASK_INF_TYPE_RERANK,
+    SERVER_TASK_INF_TYPE_INFILL,
 };
 
 struct server_task {
     int id        = -1; // to be filled by server_queue
     int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
 
+    llama_tokens prompt_tokens;
     server_task_type type;
     json data;
 
-    server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
+    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
 
     // utility function
     static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
@@ -161,26 +148,20 @@ struct server_slot {
     int32_t i_batch     = -1;
     int32_t n_predict   = -1; // TODO: disambiguate from params.n_predict
 
+    // n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated
     int32_t n_prompt_tokens           = 0;
     int32_t n_prompt_tokens_processed = 0;
 
-    json prompt; // can be either a string, array of strings or array of token ids
-
-    json input_prefix;
-    json input_suffix;
-    json input_extra;
-
-    // when a task is submitted, we first tokenize the prompt and store it here
-    std::vector<llama_token> prompt_tokens;
-    std::vector<llama_token> extra_tokens;
+    // input prompt tokens
+    llama_tokens prompt_tokens;
 
     size_t last_nl_pos = 0;
 
     std::string generated_text;
-    std::vector<llama_token> cache_tokens;
+    llama_tokens cache_tokens;
     std::vector<completion_token_output> generated_token_probs;
 
-    server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
+    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
 
     bool has_next_token = true;
     bool has_new_line   = false;
@@ -229,7 +210,7 @@ struct server_slot {
         n_past             = 0;
         n_sent_text        = 0;
         n_sent_token_probs = 0;
-        cmpl_type          = SERVER_TASK_CMPL_TYPE_NORMAL;
+        inf_type           = SERVER_TASK_INF_TYPE_COMPLETION;
 
         generated_token_probs.clear();
     }
@@ -734,42 +715,6 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
-        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
-
-        if (json_prompt.is_array()) {
-            bool first = true;
-            for (const auto & p : json_prompt) {
-                if (p.is_string()) {
-                    auto s = p.template get<std::string>();
-
-                    std::vector<llama_token> p;
-                    if (first) {
-                        p = common_tokenize(ctx, s, add_special, parse_special);
-                        first = false;
-                    } else {
-                        p = common_tokenize(ctx, s, false, parse_special);
-                    }
-
-                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                } else {
-                    if (first) {
-                        first = false;
-                    }
-
-                    prompt_tokens.push_back(p.template get<llama_token>());
-                }
-            }
-        } else {
-            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
-        }
-
-        return prompt_tokens;
-    }
-
     server_slot * get_slot_by_id(int id) {
         for (server_slot & slot : slots) {
             if (slot.id == id) {
@@ -794,22 +739,16 @@ struct server_context {
                     continue;
                 }
 
-                // skip the slot if it does not contains prompt
-                if (!slot.prompt.is_string()) {
+                // skip the slot if it does not contains cached tokens
+                if (slot.prompt_tokens.empty()) {
                     continue;
                 }
 
-                // current slot's prompt
-                std::string slot_prompt = slot.prompt.get<std::string>();
-
-                // length of the current slot's prompt
-                int slot_prompt_len = slot_prompt.size();
-
                 // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-                int lcp_len = longest_common_prefix(slot_prompt, prompt);
+                int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
 
                 // fraction of the common substring length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+                similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
 
                 // select the current slot if the criteria match
                 if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
@@ -914,57 +853,6 @@ struct server_context {
             SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
-        // infill
-        slot.input_prefix = json_value(data, "input_prefix", json());
-        slot.input_suffix = json_value(data, "input_suffix", json());
-        slot.input_extra  = json_value(data, "input_extra",  json());
-
-        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
-        for (const auto & chunk : slot.input_extra) {
-            // { "text": string, "filename": string }
-            if (!chunk.contains("text") || !chunk["text"].is_string()) {
-                send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-
-            // filename is optional
-            if (chunk.contains("filename") && !chunk["filename"].is_string()) {
-                send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-
-            SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
-        }
-
-        // get prompt
-        {
-            const auto & prompt = data.find("prompt");
-            if (prompt == data.end()) {
-                send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-
-            if ((prompt->is_string()) ||
-                (prompt->is_array() &&  prompt->size() == 1 && prompt->at(0).is_string()) ||
-                (prompt->is_array() && !prompt->empty()     && prompt->at(0).is_number_integer())) {
-                slot.prompt = *prompt;
-            } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
-                slot.prompt = prompt->at(0);
-            } else if (prompt->is_array() && prompt->size() > 1) {
-                // array of strings
-                for (const auto & el : *prompt) {
-                    if (!el.is_string()) {
-                        send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
-                        return false;
-                    }
-                }
-                slot.prompt = *prompt;
-            } else {
-                send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-        }
-
         {
             slot.sparams.logit_bias.clear();
 
@@ -1044,8 +932,7 @@ struct server_context {
             }
         }
 
-        slot.state = SLOT_STATE_PROCESSING_PROMPT;
-        slot.prompt_tokens.clear();
+        slot.state = SLOT_STATE_STARTED;
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -1297,7 +1184,7 @@ struct server_context {
         };
 
         if (slot.sparams.n_probs > 0) {
-            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+            const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
 
@@ -1333,7 +1220,7 @@ struct server_context {
             {"tokens_predicted",    slot.n_decoded},
             {"tokens_evaluated",    slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
-            {"prompt",              slot.prompt},
+            {"prompt",              common_detokenize(ctx, slot.prompt_tokens)},
             {"has_new_line",        slot.has_new_line},
             {"truncated",           slot.truncated},
             {"stopped_eos",         slot.stopped_eos},
@@ -1348,7 +1235,7 @@ struct server_context {
         if (slot.sparams.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+                const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
 
                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(
@@ -1457,19 +1344,17 @@ struct server_context {
     // Functions to create new task(s) and receive result(s)
     //
 
-    std::vector<server_task> create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) {
+    // break the input "prompt" into multiple tasks if needed, then format and tokenize the input prompt(s)
+    std::vector<server_task> create_tasks_inference(json data, server_task_inf_type inf_type) {
         std::vector<server_task> tasks;
-        auto create_task = [&](json & task_data, bool replace_prompt, json prompt) {
+        auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
+            SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
             server_task task;
-            task.id        = queue_tasks.get_new_id();
-            task.cmpl_type = cmpl_type;
-            task.type      = SERVER_TASK_TYPE_COMPLETION;
-            if (replace_prompt) {
-                task.data  = task_data;
-                task.data["prompt"] = std::move(prompt);
-            } else {
-                task.data  = std::move(task_data);
-            }
+            task.id            = queue_tasks.get_new_id();
+            task.inf_type      = inf_type;
+            task.type          = SERVER_TASK_TYPE_INFERENCE;
+            task.data          = task_data;
+            task.prompt_tokens = std::move(prompt_tokens);
             tasks.push_back(std::move(task));
         };
 
@@ -1478,41 +1363,49 @@ struct server_context {
             throw std::runtime_error(error_msg);
         }
 
-        json prompt = data.at("prompt");
-
-        // if the prompt is a singleton (i.e. a string or a list of tokens), we only need to create single task
-        if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
-            data["index"] = 0;
-            create_task(data, false, nullptr);
-        } else if (prompt.is_array()) {
-            // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-            std::vector<json> prompts = prompt;
-            if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                // prompts[0] is the question
-                // the rest are the answers/documents
-                SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1);
-                for (size_t i = 1; i < prompts.size(); i++) {
-                    json qd;
-                    qd.push_back(prompts[0]);
-                    qd.push_back(prompts[i]);
-                    data["index"] = i - 1;
-                    create_task(data, true, qd);
-                }
-            } else {
-                SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size());
-                for (size_t i = 0; i < prompts.size(); i++) {
-                    const auto & e = prompts[i];
-                    if (e.is_string() || json_is_array_of_numbers(e)) {
+        // because llama_tokenize api is thread-safe, we can tokenize the prompt from HTTP thread
+        bool add_special = inf_type != SERVER_TASK_INF_TYPE_RERANK && inf_type != SERVER_TASK_INF_TYPE_INFILL;
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx, data.at("prompt"), add_special, true);
+        switch (inf_type) {
+            case SERVER_TASK_INF_TYPE_RERANK:
+                {
+                    // prompts[0] is the question
+                    // the rest are the answers/documents
+                    GGML_ASSERT(tokenized_prompts.size() > 1);
+                    SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) tokenized_prompts.size() - 1);
+                    for (size_t i = 1; i < tokenized_prompts.size(); i++) {
+                        data["index"] = i - 1;
+                        auto tokens = format_rerank(model, tokenized_prompts[0], tokenized_prompts[i]);
+                        create_task(data, tokens);
+                    }
+                } break;
+            case SERVER_TASK_INF_TYPE_INFILL:
+                {
+                    SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
+                    for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                         data["index"] = i;
-                        create_task(data, true, e);
-                    } else {
-                        throw std::runtime_error(error_msg);
+                        auto tokens = format_infill(
+                            ctx,
+                            data.at("input_prefix"),
+                            data.at("input_suffix"),
+                            data.at("input_extra"),
+                            params.n_batch,
+                            params.n_predict,
+                            slots[0].n_ctx, // TODO: there should be a better way
+                            params.spm_infill,
+                            tokenized_prompts[i]
+                        );
+                        create_task(data, tokens);
+                    }
+                } break;
+            default:
+                {
+                    SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
+                    for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+                        data["index"] = i;
+                        create_task(data, tokenized_prompts[i]);
                     }
                 }
-            }
-        } else {
-            // invalid case
-            throw std::runtime_error(error_msg);
         }
 
         return tasks;
@@ -1534,7 +1427,7 @@ struct server_context {
         queue_tasks.post(cancel_tasks, true);
     }
 
-    // receive the results from task(s) created by create_tasks_cmpl
+    // receive the results from task(s) created by create_tasks_inference
     void receive_cmpl_results(
             const std::unordered_set<int> & id_tasks,
             const std::function<void(std::vector<server_task_result>&)> & result_handler,
@@ -1558,7 +1451,7 @@ struct server_context {
         result_handler(results);
     }
 
-    // receive the results from task(s) created by create_tasks_cmpl, in stream mode
+    // receive the results from task(s) created by create_tasks_inference, in stream mode
     void receive_cmpl_results_stream(
             const std::unordered_set<int> & id_tasks, const
             std::function<bool(server_task_result&)> & result_handler, const
@@ -1591,7 +1484,7 @@ struct server_context {
 
     void process_single_task(const server_task & task) {
         switch (task.type) {
-            case SERVER_TASK_TYPE_COMPLETION:
+            case SERVER_TASK_TYPE_INFERENCE:
                 {
                     const int id_slot = json_value(task.data, "id_slot", -1);
 
@@ -1623,9 +1516,10 @@ struct server_context {
 
                     slot->reset();
 
-                    slot->id_task   = task.id;
-                    slot->cmpl_type = task.cmpl_type;
-                    slot->index     = json_value(task.data, "index", 0);
+                    slot->id_task       = task.id;
+                    slot->inf_type      = task.inf_type;
+                    slot->index         = json_value(task.data, "index", 0);
+                    slot->prompt_tokens = std::move(task.prompt_tokens);
 
                     if (!launch_slot_with_task(*slot, task)) {
                         SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
@@ -1658,7 +1552,7 @@ struct server_context {
                         slot_data["id"]         = slot.id;
                         slot_data["id_task"]    = slot.id_task;
                         slot_data["state"]      = slot.state;
-                        slot_data["prompt"]     = slot.prompt;
+                        slot_data["prompt"]     = common_detokenize(ctx, slot.prompt_tokens);
                         slot_data["next_token"] = {
                             {"has_next_token", slot.has_next_token},
                             {"has_new_line",   slot.has_new_line},
@@ -1785,9 +1679,6 @@ struct server_context {
                     }
                     slot->cache_tokens.resize(token_count);
 
-                    // TODO: maybe detokenize the slot->cache_tokens instead?
-                    slot->prompt = string_format("[restored %d tokens from file]", (int) token_count);
-
                     const int64_t t_end = ggml_time_us();
                     const double t_restore_ms = (t_end - t_start) / 1000.0;
 
@@ -1954,142 +1845,18 @@ struct server_context {
         if (params.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
+                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                     auto & prompt_tokens = slot.prompt_tokens;
 
-                    // we haven't tokenized the prompt yet - do it now:
-                    if (prompt_tokens.empty()) {
-                        SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size());
-
+                    // TODO: maybe move branch to outside of this loop in the future
+                    if (slot.state == SLOT_STATE_STARTED) {
                         slot.t_start_process_prompt = ggml_time_us();
                         slot.t_start_generation = 0;
-
-                        switch (slot.cmpl_type) {
-                            case SERVER_TASK_CMPL_TYPE_NORMAL:
-                            case SERVER_TASK_CMPL_TYPE_EMBEDDING:
-                                {
-                                    prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
-                                } break;
-                            case SERVER_TASK_CMPL_TYPE_RERANK:
-                                {
-                                    // require slot.prompt to be array of 2 strings
-                                    if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
-                                        SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
-                                        slot.release();
-                                        send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
-                                        continue;
-                                    }
-
-                                    // prompt: [BOS]query[EOS][SEP]doc[EOS]
-                                    prompt_tokens.clear();
-                                    prompt_tokens.push_back(llama_token_bos(model));
-                                    {
-                                        const auto part = tokenize(slot.prompt[0], false, false);
-                                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                                    }
-                                    prompt_tokens.push_back(llama_token_eos(model));
-                                    prompt_tokens.push_back(llama_token_sep(model));
-                                    {
-                                        const auto part = tokenize(slot.prompt[1], false, false);
-                                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                                    }
-                                    prompt_tokens.push_back(llama_token_eos(model));
-                                } break;
-                            case SERVER_TASK_CMPL_TYPE_INFILL:
-                                {
-                                    // TODO: optimize this block by reducing memory allocations and movement
-
-                                    // use FIM repo-level pattern:
-                                    // ref: https://arxiv.org/pdf/2409.12186
-                                    //
-                                    // [FIM_REP]myproject
-                                    // [FIM_SEP]filename0
-                                    // extra chunk 0
-                                    // [FIM_SEP]filename1
-                                    // extra chunk 1
-                                    // ...
-                                    // [FIM_SEP]filename
-                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
-                                    //
-                                    auto tokens_prefix = tokenize(slot.input_prefix, false, false);
-                                    auto tokens_suffix = tokenize(slot.input_suffix, false, false);
-                                    auto tokens_prompt = tokenize(slot.prompt,       false, false);
-
-                                    slot.extra_tokens.clear();
-                                    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
-                                        static const auto k_fim_repo = tokenize("myproject\n", false, false);
-
-                                        slot.extra_tokens.push_back(llama_token_fim_rep(model));
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
-                                    }
-
-                                    for (const auto & chunk : slot.input_extra) {
-                                        // { "text": string, "filename": string }
-                                        const std::string text     = chunk.value("text", "");
-                                        const std::string filename = chunk.value("filename", "tmp");
-
-                                        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
-                                            const auto k_fim_file = tokenize(filename + "\n", false, false);
-
-                                            slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
-                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
-                                        } else {
-                                            // chunk separator in binary form to avoid confusing the AI
-                                            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
-                                            static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
-
-                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
-                                        }
-
-                                        const auto chunk_tokens = tokenize(text, false, false);
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
-                                    }
-
-                                    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
-                                        // TODO: current filename
-                                        static const auto k_fim_file = tokenize("filename\n", false, false);
-
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
-                                    }
-
-                                    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                                    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
-                                    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
-
-                                    // fill the rest of the context with extra chunks
-                                    const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
-
-                                    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
-                                    tokens_suffix.resize(n_suffix_take);
-
-                                    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
-                                    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
-                                    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
-
-                                    auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
-                                    auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
-
-                                    if (llama_add_bos_token(model)) {
-                                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                                    }
-
-                                    SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
-
-                                    // put the extra context before the FIM prefix
-                                    embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
-
-                                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-                                    embd_inp.push_back(llama_token_fim_mid(model));
-
-                                    prompt_tokens = std::move(embd_inp);
-                                } break;
-                        }
-
                         slot.n_past = 0;
                         slot.n_prompt_tokens = prompt_tokens.size();
+                        slot.state = SLOT_STATE_PROCESSING_PROMPT;
 
-                        SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
+                        SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
                         // print prompt tokens (for debugging)
                         if (1) {
@@ -2114,7 +1881,7 @@ struct server_context {
                             continue;
                         }
 
-                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                        if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) {
                             // this prompt is too large to process - discard it
                             if (slot.n_prompt_tokens > n_ubatch) {
                                 slot.release();
@@ -2144,7 +1911,7 @@ struct server_context {
                                 const int n_block_size = n_left / 2;
                                 const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                                std::vector<llama_token> new_tokens(
+                                llama_tokens new_tokens(
                                         prompt_tokens.begin(),
                                         prompt_tokens.begin() + slot.params.n_keep);
 
@@ -2225,7 +1992,7 @@ struct server_context {
                     }
 
                     // non-causal tasks require to fit the entire prompt in the physical batch
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                    if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) {
                         // cannot fit the prompt in the current batch - will try next iter
                         if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                             continue;
@@ -2234,8 +2001,8 @@ struct server_context {
 
                     // check that we are in the right batch_type, if not defer the slot
                     const bool slot_type =
-                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
-                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK     ? 1 : 0;
+                        slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING ||
+                        slot.inf_type == SERVER_TASK_INF_TYPE_RERANK     ? 1 : 0;
 
                     if (batch_type == -1) {
                         batch_type = slot_type;
@@ -2353,7 +2120,7 @@ struct server_context {
                 }
 
                 if (slot.state == SLOT_STATE_DONE_PROMPT) {
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
+                    if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING) {
                         // prompt evaluated for embedding
                         send_embedding(slot, batch_view);
                         slot.release();
@@ -2361,7 +2128,7 @@ struct server_context {
                         continue; // continue loop of slots
                     }
 
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                    if (slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) {
                         send_rerank(slot, batch_view);
                         slot.release();
                         slot.i_batch = -1;
@@ -2915,13 +2682,13 @@ int main(int argc, char ** argv) {
         res_ok(res, {{ "success", true }});
     };
 
-    const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
+    const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
         if (ctx_server.params.embedding || ctx_server.params.reranking) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
-        std::vector<server_task> tasks = ctx_server.create_tasks_cmpl(data, cmpl_type);
+        std::vector<server_task> tasks = ctx_server.create_tasks_inference(data, inf_type);
         ctx_server.queue_results.add_waiting_tasks(tasks);
         ctx_server.queue_tasks.post(tasks);
 
@@ -2967,10 +2734,11 @@ int main(int argc, char ** argv) {
 
     const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
         json data = json::parse(req.body);
-        return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
+        return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res);
     };
 
     const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+        // check model compatibility
         std::string err;
         if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
             err += "prefix token is missing. ";
@@ -2981,14 +2749,42 @@ int main(int argc, char ** argv) {
         if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
             err += "middle token is missing. ";
         }
-
         if (!err.empty()) {
             res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
         json data = json::parse(req.body);
-        return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
+
+        // validate input
+        if (!data.contains("input_prefix")) {
+            res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST));
+        }
+
+        if (!data.contains("input_suffix")) {
+            res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST));
+        }
+
+        if (data.contains("input_extra") && !data.at("input_extra").is_array()) {
+            res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        json input_extra = json_value(data, "input_extra", json::array());
+        for (const auto & chunk : input_extra) {
+            // { "text": string, "filename": string }
+            if (!chunk.contains("text") || !chunk.at("text").is_string()) {
+                res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+            // filename is optional
+            if (chunk.contains("filename") && !chunk.at("filename").is_string()) {
+                res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+        data["input_extra"] = input_extra; // default to empty array if it's not exist
+
+        return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res);
     };
 
     // TODO: maybe merge this function with "handle_completions_generic"
@@ -3000,7 +2796,7 @@ int main(int argc, char ** argv) {
 
         json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
 
-        std::vector<server_task> tasks = ctx_server.create_tasks_cmpl(data, SERVER_TASK_CMPL_TYPE_NORMAL);
+        std::vector<server_task> tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION);
         ctx_server.queue_results.add_waiting_tasks(tasks);
         ctx_server.queue_tasks.post(tasks);
 
@@ -3073,7 +2869,7 @@ int main(int argc, char ** argv) {
             const bool add_special = json_value(body, "add_special", false);
             const bool with_pieces = json_value(body, "with_pieces", false);
 
-            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
 
             if (with_pieces) {
                 for (const auto& token : tokens) {
@@ -3110,7 +2906,7 @@ int main(int argc, char ** argv) {
 
         std::string content;
         if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body.at("tokens");
+            const llama_tokens tokens = body.at("tokens");
             content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
         }
 
@@ -3144,7 +2940,7 @@ int main(int argc, char ** argv) {
         json responses = json::array();
         bool error = false;
         {
-            std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_EMBEDDING);
+            std::vector<server_task> tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_EMBEDDING);
             ctx_server.queue_results.add_waiting_tasks(tasks);
             ctx_server.queue_tasks.post(tasks);
 
@@ -3221,7 +3017,7 @@ int main(int argc, char ** argv) {
         json responses = json::array();
         bool error = false;
         {
-            std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK);
+            std::vector<server_task> tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_RERANK);
             ctx_server.queue_results.add_waiting_tasks(tasks);
             ctx_server.queue_tasks.post(tasks);
 
diff --git a/examples/server/tests/features/infill.feature b/examples/server/tests/features/infill.feature
new file mode 100644
index 000000000..a0bbfef77
--- /dev/null
+++ b/examples/server/tests/features/infill.feature
@@ -0,0 +1,36 @@
+@llama.cpp
+@infill
+Feature: llama.cpp server
+
+  # The current model is made by adding FIM tokens to the existing stories260K
+  # We may want to use a better model in the future, maybe something like SmolLM 360M
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
+    And   a model file test-model-infill.gguf
+    And   a model alias tinyllama-infill
+    And   42 as server seed
+    And   1024 as batch size
+    And   1024 as ubatch size
+    And   2048 KV cache size
+    And   64 max tokens to predict
+    And   0.0 temperature
+    Then  the server is starting
+    Then  the server is healthy
+
+  Scenario: Infill without input_extra
+    Given a prompt "Complete this"
+    And   an infill input extra none none
+    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_"
+    And   an infill input suffix "}\n"
+    And   an infill request with no api error
+    Then  64 tokens are predicted matching One|day|she|saw|big|scary|bird
+
+  Scenario: Infill with input_extra
+    Given a prompt "Complete this"
+    And   an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
+    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_"
+    And   an infill input suffix "}\n"
+    And   an infill request with no api error
+    Then  64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 540a2ecd5..2e418d8aa 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -80,6 +80,11 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.lora_file = None
     context.disable_ctx_shift = False
 
+    # infill
+    context.infill_input_extra = None
+    context.infill_input_suffix = ''
+    context.infill_input_prefix = ''
+
     context.tasks_result = []
     context.concurrent_tasks = []
     context.prompts = []
@@ -291,6 +296,28 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
         assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"
 
 
+@step('an infill request with {api_error} api error')
+@async_run_until_complete
+async def step_request_completion(context, api_error: Literal['raised'] | str):
+    if api_error != 'no':
+        raise ValueError(f'api_error={api_error} is not yet implemented')
+    payload = {
+        "prompt": context.prompts[0],
+        "input_suffix": context.infill_input_suffix,
+        "input_prefix": context.infill_input_prefix,
+        "n_predict": context.n_predict,
+        "seed": context.seed,
+        "temperature": context.temperature,
+    }
+    if context.infill_input_extra is not None:
+        payload['input_extra'] = context.infill_input_extra
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
+        async with session.post(f'{context.base_url}/infill',
+                                json=payload) as response:
+            assert response.status == 200
+            context.tasks_result = [await response.json()]
+
+
 @step('{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
     context.completion = context.tasks_result.pop()
@@ -539,6 +566,25 @@ def step_a_prompt_prompt(context, prompt):
     context.n_prompts = len(context.prompts)
 
 
+# TODO: allow this to be repeated
+@step('an infill input extra {filename} {text}')
+def step_infill_input_extra(context, filename, text):
+    if filename == 'none':
+        context.infill_input_extra = None
+    else:
+        context.infill_input_extra = [{'filename': filename, 'text': text}]
+
+
+@step('an infill input suffix {text}')
+def step_infill_input_suffix(context, text):
+    context.infill_input_suffix = text
+
+
+@step('an infill input prefix {text}')
+def step_infill_input_prefix(context, text):
+    context.infill_input_prefix = text
+
+
 @step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
 def step_many_prompts(context, num_prompts, prompt, seed):
     if context.seed is None:
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 69519ef95..811242062 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -24,6 +24,22 @@
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
 using json = nlohmann::ordered_json;
+using llama_tokens = std::vector<llama_token>;
+
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+
+#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+#define QUE_INF(fmt, ...) LOG_INF("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
@@ -52,9 +68,235 @@ static T json_value(const json & body, const std::string & key, const T & defaul
 }
 
 //
-// chat template utils
+// tokenizer and input processing utils
 //
 
+static bool json_is_array_of_numbers(const json & data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number_integer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
+// is array having BOTH numbers & strings?
+static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+    bool seen_string = false;
+    bool seen_number = false;
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            seen_string |= e.is_string();
+            seen_number |= e.is_number_integer();
+            if (seen_number && seen_string) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+    // or the first element of the json_prompt array is a string.
+    llama_tokens prompt_tokens;
+
+    if (json_prompt.is_array()) {
+        bool first = true;
+        for (const auto & p : json_prompt) {
+            if (p.is_string()) {
+                auto s = p.template get<std::string>();
+
+                llama_tokens p;
+                if (first) {
+                    p = common_tokenize(ctx, s, add_special, parse_special);
+                    first = false;
+                } else {
+                    p = common_tokenize(ctx, s, false, parse_special);
+                }
+
+                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+            } else {
+                if (first) {
+                    first = false;
+                }
+
+                prompt_tokens.push_back(p.template get<llama_token>());
+            }
+        }
+    } else {
+        auto s = json_prompt.template get<std::string>();
+        prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+    }
+
+    return prompt_tokens;
+}
+
+/**
+ * break the input "prompt" object into multiple prompt if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+ */
+static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<llama_tokens> result;
+    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+        // string or mixed
+        result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+    } else if (json_is_array_of_numbers(json_prompt)) {
+        // array of tokens
+        result.push_back(json_prompt.get<llama_tokens>());
+    } else if (json_prompt.is_array()) {
+        // array of prompts
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+                result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+            } else if (json_is_array_of_numbers(p)) {
+                // array of tokens
+                result.push_back(p.get<llama_tokens>());
+            } else {
+                throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
+            }
+        }
+    } else {
+        throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
+    }
+    return result;
+}
+
+//
+// template utils
+//
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+    llama_tokens result;
+    result.reserve(doc.size() + query.size() + 4);
+    result.push_back(llama_token_bos(model));
+    result.insert(result.end(), query.begin(), query.end());
+    result.push_back(llama_token_eos(model));
+    result.push_back(llama_token_sep(model));
+    result.insert(result.end(), doc.begin(), doc.end());
+    result.push_back(llama_token_eos(model));
+    return result;
+}
+
+// format infill task
+static llama_tokens format_infill(
+        const llama_context * ctx,
+        const json & input_prefix,
+        const json & input_suffix,
+        const json & input_extra,
+        const int n_batch,
+        const int n_predict,
+        const int n_ctx,
+        const bool spm_infill,
+        const llama_tokens & tokens_prompt
+    ) {
+    // TODO: optimize this block by reducing memory allocations and movement
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
+
+    auto model = llama_get_model(ctx);
+    auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+
+    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: make project name an input
+        static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+
+        extra_tokens.push_back(llama_token_fim_rep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+    }
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text     = json_value(chunk, "text",     std::string());
+        const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+            static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
+
+    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+
+        extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+    if (llama_add_bos_token(model)) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+
+    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_token_fim_mid(model));
+
+    return embd_inp;
+}
+
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
     std::vector<common_chat_msg> chat;
@@ -229,18 +471,6 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
     return std::string::npos;
 }
 
-static bool json_is_array_of_numbers(const json & data) {
-    if (data.is_array()) {
-        for (const auto & e : data) {
-            if (!e.is_number()) {
-                return false;
-            }
-        }
-        return true;
-    }
-    return false;
-}
-
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

From bc5ba007b2c83ac95875e68724dabfc12159fc61 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 25 Oct 2024 10:13:46 +0300
Subject: [PATCH 106/396] server : check that the prompt fits in the slot's
 context (#10030)

ggml-ci
---
 convert_hf_to_gguf.py        | 3 +++
 convert_hf_to_gguf_update.py | 1 +
 examples/server/server.cpp   | 7 ++++++-
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7e552a71b..a34dabe23 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -573,6 +573,9 @@ class Model:
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 022354a3b..28cd02e5a 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -72,6 +72,7 @@ models = [
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 58f93694f..2821877b2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1882,12 +1882,17 @@ struct server_context {
                         }
 
                         if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) {
-                            // this prompt is too large to process - discard it
                             if (slot.n_prompt_tokens > n_ubatch) {
                                 slot.release();
                                 send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                                 continue;
                             }
+
+                            if (slot.n_prompt_tokens > slot.n_ctx) {
+                                slot.release();
+                                send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_SERVER);
+                                continue;
+                            }
                         } else {
                             if (!params.ctx_shift) {
                                 // if context shift is disabled, we make sure prompt size is smaller than KV size

From 2f8bd2b90133cf37ae752015e1bfd738cc6d0112 Mon Sep 17 00:00:00 2001
From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Date: Fri, 25 Oct 2024 12:57:41 +0530
Subject: [PATCH 107/396] llamafile : extend sgemm.cpp support for Q5_0 models
 (#10010)

---
 ggml/src/llamafile/sgemm.cpp | 57 ++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp
index 0193a463a..9eead3f61 100644
--- a/ggml/src/llamafile/sgemm.cpp
+++ b/ggml/src/llamafile/sgemm.cpp
@@ -942,6 +942,36 @@ class tinyBLAS_Q0_AVX {
         return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
     }
 
+    inline __m256i load(const block_q5_0 *b) {
+        return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh));
+    }
+
+    inline __m128i load0(const block_q5_0* b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        uint32_t x32;
+        memcpy(&x32, b->qh, sizeof(uint32_t));
+        __m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x);
+        __m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
+                                        _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
+                                                     _mm_shuffle_epi8(_mm_set1_epi32(x32),
+                                                                      _mm_set_epi64x(0x0101010101010101, 0x0000000000000000))));
+        bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0));
+        return _mm_or_si128(qxl, bytesl);
+    }
+
+    inline __m128i load1(const block_q5_0* b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        uint32_t x32;
+        memcpy(&x32, b->qh, sizeof(uint32_t));
+        __m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4));
+        __m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
+                                        _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
+                                                     _mm_shuffle_epi8(_mm_set1_epi32(x32),
+                                                                      _mm_set_epi64x(0x0303030303030303, 0x0202020202020202))));
+        bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0));
+        return _mm_or_si128(qxh, bytesh);
+    }
+
     inline __m256i load(const block_iq4_nl *b) {
         return MM256_SET_M128I(load1(b), load0(b));
     }
@@ -973,6 +1003,17 @@ class tinyBLAS_Q0_AVX {
                                                         _mm_srli_epi16(x, 4), 1));
     }
 
+    static inline __m256i bittobyte(const uint8_t *p) {
+        uint32_t x32;
+        memcpy(&x32, p, sizeof(uint32_t));
+        __m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
+                                          _mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe),
+                                                          _mm256_shuffle_epi8(_mm256_set1_epi32(x32),
+                                                                              _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
+                                                                                                0x0101010101010101, 0x0000000000000000))));
+        return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0));
+    }
+
     const TA *const A;
     const TB *const B;
     TC *const C;
@@ -1182,6 +1223,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }
 
+    case GGML_TYPE_Q5_0: {
+        if (Btype != GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
+            k, (const block_q5_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
     case GGML_TYPE_IQ4_NL: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;

From d80fb71f8b8bf69ec095ba281f8248d136d21c76 Mon Sep 17 00:00:00 2001
From: Michael Podvitskiy <podvitskiymichael@gmail.com>
Date: Fri, 25 Oct 2024 17:57:54 +0200
Subject: [PATCH 108/396] llama: string_split fix (#10022)

* llama: Refactor string_split to use template specialization,  fixes parsing strings with spaces

* llama: Add static_assert in the string_split template to ensure the correct template specialization is used for std::string
---
 common/arg.cpp             | 10 +++++-----
 common/common.cpp          | 13 -------------
 common/common.h            | 19 +++++++++++++++++--
 examples/server/server.cpp |  2 +-
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index cd9d315dc..608e46e02 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -128,13 +128,13 @@ static void common_params_handle_model_default(common_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
+            params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
-            auto f = string_split(params.model_url, '#').front();
-            f = string_split(f, '?').front();
-            params.model = fs_get_cache_file(string_split(f, '/').back());
+            auto f = string_split<std::string>(params.model_url, '#').front();
+            f = string_split<std::string>(f, '?').front();
+            params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
     } else if (params.model.empty()) {
         params.model = DEFAULT_MODEL_PATH;
@@ -879,7 +879,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--samplers"}, "SAMPLERS",
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
         [](common_params & params, const std::string & value) {
-            const auto sampler_names = string_split(value, ';');
+            const auto sampler_names = string_split<std::string>(value, ';');
             params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
diff --git a/common/common.cpp b/common/common.cpp
index a8eebb68b..faaa420d9 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -416,19 +416,6 @@ std::string string_format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }
 
-std::vector<std::string> string_split(std::string input, char separator) {
-    std::vector<std::string> parts;
-    size_t separator_pos = input.find(separator);
-    while (separator_pos != std::string::npos) {
-        std::string part = input.substr(0, separator_pos);
-        parts.emplace_back(part);
-        input = input.substr(separator_pos + 1);
-        separator_pos = input.find(separator);
-    }
-    parts.emplace_back(input);
-    return parts;
-}
-
 std::string string_strip(const std::string & str) {
     size_t start = 0;
     size_t end = str.size();
diff --git a/common/common.h b/common/common.h
index 19d928777..f9333395c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -380,8 +380,6 @@ bool set_process_priority(enum ggml_sched_priority prio);
 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
 std::string string_format(const char * fmt, ...);
 
-std::vector<std::string> string_split(std::string input, char separator);
-
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
@@ -389,6 +387,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -401,6 +400,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2821877b2..3c12ef6f0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2380,7 +2380,7 @@ int main(int argc, char ** argv) {
     auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
         if (current_state == SERVER_STATE_LOADING_MODEL) {
-            auto tmp = string_split(req.path, '.');
+            auto tmp = string_split<std::string>(req.path, '.');
             if (req.path == "/" || tmp.back() == "html") {
                 res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
                 res.status = 503;

From ff252ea48e90e6552010fd74584334fb41bdd387 Mon Sep 17 00:00:00 2001
From: wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
Date: Fri, 25 Oct 2024 10:07:34 -0600
Subject: [PATCH 109/396] llama : add DRY sampler (#9702)

* sampling : add DRY sampler (post-refactor)

* DRY: Trying to fix coauthors, removed unneeded line

* DRY: Fixed redundant code

* DRY: Fixed crash issue due to DRY being in chain but uninitialized

---------

Co-authored-by: l3utterfly <gc.pthzfoldr@gmail.com>
Co-authored-by: pi6am <34464159+pi6am@users.noreply.github.com>
---
 common/arg.cpp                        |  61 ++++
 common/common.cpp                     |   4 +
 common/common.h                       |  70 +++--
 common/sampling.cpp                   |  17 ++
 examples/main/README.md               |  24 ++
 examples/server/README.md             |  15 +
 examples/server/public/index-new.html |  14 +-
 examples/server/public/index.html     |   8 +
 examples/server/public/style.css      |   0
 examples/server/server.cpp            |  87 ++++--
 include/llama.h                       |  10 +
 src/llama-sampling.cpp                | 391 ++++++++++++++++++++++++++
 src/llama-sampling.h                  |  18 ++
 src/llama-vocab.cpp                   |  16 ++
 src/llama-vocab.h                     |   5 +
 src/llama.cpp                         |   4 +
 tests/test-sampling.cpp               |  32 +++
 17 files changed, 713 insertions(+), 63 deletions(-)
 mode change 100755 => 100644 examples/server/public/style.css

diff --git a/common/arg.cpp b/common/arg.cpp
index 608e46e02..e1e933934 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -251,6 +251,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & antiprompt : params.antiprompt) {
             string_process_escapes(antiprompt);
         }
+        for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+            string_process_escapes(seq_breaker);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -997,6 +1000,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sparams.penalty_freq = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-multiplier"}, "N",
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
+        [](common_params & params, const std::string & value) {
+            params.sparams.dry_multiplier = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-base"}, "N",
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
+        [](common_params & params, const std::string & value) {
+            float potential_base = std::stof(value);
+            if (potential_base >= 1.0f)
+            {
+                params.sparams.dry_base = potential_base;
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-allowed-length"}, "N",
+        string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
+        [](common_params & params, int value) {
+            params.sparams.dry_allowed_length = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-penalty-last-n"}, "N",
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
+        [](common_params & params, int value) {
+            params.sparams.dry_penalty_last_n = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-sequence-breaker"}, "STRING",
+        string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
+            params.sparams.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
+                params.sparams.dry_sequence_breakers.end(),
+                std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
+                [](const std::string& a, const std::string& b) {
+                    std::string formatted_b = (b == "\n") ? "\\n" : b;
+                    return a + ", '" + formatted_b + "'";
+                }).c_str()),
+        [](common_params & params, const std::string & value) {
+            static bool defaults_cleared = false;
+
+            if (!defaults_cleared) {
+                params.sparams.dry_sequence_breakers.clear();
+                defaults_cleared = true;
+            }
+
+            if (value == "none") {
+                params.sparams.dry_sequence_breakers.clear();
+            } else {
+                params.sparams.dry_sequence_breakers.emplace_back(value);
+            }
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
         string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
diff --git a/common/common.cpp b/common/common.cpp
index faaa420d9..ff8cc4076 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2006,6 +2006,10 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
+    fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
+    fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
+    fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
     fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
diff --git a/common/common.h b/common/common.h
index f9333395c..18b2121ed 100644
--- a/common/common.h
+++ b/common/common.h
@@ -84,14 +84,15 @@ enum llama_example {
 
 enum common_sampler_type {
     COMMON_SAMPLER_TYPE_NONE        = 0,
-    COMMON_SAMPLER_TYPE_TOP_K       = 1,
-    COMMON_SAMPLER_TYPE_TOP_P       = 2,
-    COMMON_SAMPLER_TYPE_MIN_P       = 3,
-    COMMON_SAMPLER_TYPE_TFS_Z       = 4,
-    COMMON_SAMPLER_TYPE_TYPICAL_P   = 5,
-    COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
-    COMMON_SAMPLER_TYPE_XTC         = 7,
-    COMMON_SAMPLER_TYPE_INFILL      = 8,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+    COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -104,32 +105,39 @@ enum dimre_method {
 struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   xtc_probability   = 0.00f; // 0.0 = disabled
-    float   xtc_threshold     = 0.10f; // > 0.5 disables XTC
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-    bool    no_perf           = false; // disable performance metrics
+    int32_t n_prev             = 64;    // number of previous tokens to remember
+    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k              = 40;    // <= 0 to use vocab size
+    float   top_p              = 0.95f; // 1.0 = disabled
+    float   min_p              = 0.05f; // 0.0 = disabled
+    float   xtc_probability    = 0.00f; // 0.0 = disabled
+    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
+    float   tfs_z              = 1.00f; // 1.0 = disabled
+    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
+    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range     = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat     = 1.00f; // 1.0 = disabled
+    float   penalty_freq       = 0.00f; // 0.0 = disabled
+    float   penalty_present    = 0.00f; // 0.0 = disabled
+    float   dry_multiplier     = 0.0f;  // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
+    float   dry_base           = 1.75f; // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
+    bool    penalize_nl        = false; // consider newlines as a repeatable token
+    bool    ignore_eos         = false;
+    bool    no_perf            = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TFS_Z,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 4ab3eface..48a9df8ba 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -130,9 +130,11 @@ std::string common_sampler_params::print() const {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
+            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
             "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
             top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
@@ -174,6 +176,17 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char*> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto& str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                        break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                     break;
@@ -358,6 +371,7 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
 
 char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
     switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
         case COMMON_SAMPLER_TYPE_TFS_Z:       return 'f';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
@@ -372,6 +386,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
 
 std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
     switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
@@ -386,6 +401,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
 
 std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
     std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry",         COMMON_SAMPLER_TYPE_DRY },
         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -434,6 +450,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
 
 std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
     std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z),       COMMON_SAMPLER_TYPE_TFS_Z },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
diff --git a/examples/main/README.md b/examples/main/README.md
index 7e192b9f2..c7c823171 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -187,6 +187,30 @@ Use the `--no-penalize-nl` option to disable newline penalization when applying
 
 Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
 
+### DRY Repetition Penalty
+
+DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).
+
+- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled).
+- `--dry-base N`: Set the DRY sampling base value (default: 1.75).
+- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2).
+- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size).
+- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used.
+
+The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8.
+
+The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions.
+
+The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words.
+
+The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens.
+
+The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied.
+
+DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence.
+
+Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"`
+
 ### Top-K Sampling
 
 -   `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
diff --git a/examples/server/README.md b/examples/server/README.md
index 8f00fcc79..bc737237e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -114,6 +114,11 @@ The project is under active development, and we are [looking for feedback and co
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
 | `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
 | `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
+| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
+| `--dry-base N` | DRY sampling base value (default: 1.75) |
+| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
+| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
+| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
 | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
 | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
 | `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
@@ -369,6 +374,16 @@ node index.js
 
     `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
 
+    `dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
+
+    `dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
+
+    `dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
+
+    `dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
+
+    `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
+
     `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
     `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html
index ad4183cd9..cb3995abe 100644
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -40,6 +40,10 @@
       repeat_last_n: 0, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.0, // 1.0 = disabled
       penalize_nl: false, // true only useful for infinite completion
+      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
+      dry_base: 1.75,     // 0.0 = disabled
+      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
+      dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
       top_k: 0, // <= 0 to use vocab size
       top_p: 1.0, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
@@ -833,13 +837,17 @@ return html`
         <fieldset class="params">
           ${IntField({ label: "Top-K", title: "Limits the selection of the next token to the K most probable tokens. 1 means no randomness = greedy sampling. If set to 0, it means the entire vocabulary size is considered.", max: 100, min: 0, step: 1, name: "top_k", value: params.value.top_k })}
           ${IntField({ label: "Penalize Last N", title: "The last n tokens that are taken into account to penalise repetitions. A value of 0 means that this function is deactivated and -1 means that the entire size of the context is taken into account.", max: 2048, min: 0, step: 16, name: "repeat_last_n", value: params.value.repeat_last_n })}
-          ${FloatField({ label: "Top-P", title: "Limits the selection of the next token to a subset of tokens whose combined probability reaches a threshold value P = top-P. If set to 1, it means the entire vocabulary size is considered.", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
           ${FloatField({ label: "Presence Penalty", title: "A penalty that is applied if certain tokens appear repeatedly in the generated text. A higher value leads to fewer repetitions.", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
-          ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
           ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+          ${FloatField({ label: "Top-P", title: "Limits the selection of the next token to a subset of tokens whose combined probability reaches a threshold value P = top-P. If set to 1, it means the entire vocabulary size is considered.", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
           ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
           ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
           ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
+          ${FloatField({ label: "DRY Penalty Multiplier", title: "Set the DRY repetition penalty multiplier. Default is 0.0, which disables DRY.", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })}
+          ${FloatField({ label: "DRY Base", title: "Set the DRY repetition penalty base value. Default is 1.75", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
+          ${IntField({ label: "DRY Allowed Length", title: "Tokens that extend repetition beyond this receive exponentially increasing penalty. Default is 2", max: 10, min: 1, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
+          ${IntField({ label: "DRY Penalty Last N", title: "How many tokens to scan for repetitions. Default is -1, where 0 is disabled and -1 is context size", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
+          ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
           ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
         </fieldset>
 
@@ -1144,6 +1152,8 @@ document.addEventListener('DOMContentLoaded', (event) => {
     repeat_penalty: { snapValue: 1.0, snapRangeMultiplier: 4 },
     presence_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 },
     frequency_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    dry_multiplier: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    dry_base: { snapValue: 1.75, snapRangeMultiplier: 4 },
   };
   // add an event listener for each slider
   Object.keys(snapSettings).forEach(sliderName => {
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 88065705f..7f9b02bfb 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -304,6 +304,10 @@
       repeat_last_n: 256, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.18, // 1.0 = disabled
       penalize_nl: false,
+      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
+      dry_base: 1.75,     // 0.0 = disabled
+      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
+      dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
@@ -1015,6 +1019,10 @@
               ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
               ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
               ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+              ${FloatField({ label: "DRY Penalty Multiplier", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })}
+              ${FloatField({ label: "DRY Base", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
+              ${IntField({ label: "DRY Allowed Length", max: 10, min: 2, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
+              ${IntField({ label: "DRY Penalty Last N", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
               ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
               ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
             </fieldset>
diff --git a/examples/server/public/style.css b/examples/server/public/style.css
old mode 100755
new mode 100644
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3c12ef6f0..ff1d9b03c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -800,35 +800,58 @@ struct server_context {
             slot.oaicompat_model = "";
         }
 
-        slot.params.stream             = json_value(data, "stream",            false);
-        slot.params.cache_prompt       = json_value(data, "cache_prompt",      false);
-        slot.params.n_predict          = json_value(data, "n_predict",         json_value(data, "max_tokens", default_params.n_predict));
-        slot.params.n_indent           = json_value(data, "n_indent",          default_params.n_indent);
-        slot.sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
-        slot.sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
-        slot.sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
-        slot.sparams.xtc_probability   = json_value(data, "xtc_probability",   default_sparams.xtc_probability);
-        slot.sparams.xtc_threshold     = json_value(data, "xtc_threshold",     default_sparams.xtc_threshold);
-        slot.sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot.sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
-        slot.sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
-        slot.sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
-        slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
-        slot.sparams.penalty_last_n    = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
-        slot.sparams.penalty_repeat    = json_value(data, "repeat_penalty",    default_sparams.penalty_repeat);
-        slot.sparams.penalty_freq      = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot.sparams.penalty_present   = json_value(data, "presence_penalty",  default_sparams.penalty_present);
-        slot.sparams.mirostat          = json_value(data, "mirostat",          default_sparams.mirostat);
-        slot.sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
-        slot.sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
-        slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
-        slot.params.n_keep             = json_value(data, "n_keep",            default_params.n_keep);
-        slot.params.n_discard          = json_value(data, "n_discard",         default_params.n_discard);
-        slot.sparams.seed              = json_value(data, "seed",              default_sparams.seed);
-        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
-        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
-      //slot.params.t_max_prompt_ms    = json_value(data, "t_max_prompt_ms",   default_params.t_max_prompt_ms); // TODO: implement
-        slot.params.t_max_predict_ms   = json_value(data, "t_max_predict_ms",  default_params.t_max_predict_ms);
+        slot.params.stream              = json_value(data, "stream",             false);
+        slot.params.cache_prompt        = json_value(data, "cache_prompt",       false);
+        slot.params.n_predict           = json_value(data, "n_predict",          json_value(data, "max_tokens", default_params.n_predict));
+        slot.params.n_indent            = json_value(data, "n_indent",           default_params.n_indent);
+        slot.sparams.top_k              = json_value(data, "top_k",              default_sparams.top_k);
+        slot.sparams.top_p              = json_value(data, "top_p",              default_sparams.top_p);
+        slot.sparams.min_p              = json_value(data, "min_p",              default_sparams.min_p);
+        slot.sparams.xtc_probability    = json_value(data, "xtc_probability",    default_sparams.xtc_probability);
+        slot.sparams.xtc_threshold      = json_value(data, "xtc_threshold",      default_sparams.xtc_threshold);
+        slot.sparams.tfs_z              = json_value(data, "tfs_z",              default_sparams.tfs_z);
+        slot.sparams.typ_p              = json_value(data, "typical_p",          default_sparams.typ_p);
+        slot.sparams.temp               = json_value(data, "temperature",        default_sparams.temp);
+        slot.sparams.dynatemp_range     = json_value(data, "dynatemp_range",     default_sparams.dynatemp_range);
+        slot.sparams.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  default_sparams.dynatemp_exponent);
+        slot.sparams.penalty_last_n     = json_value(data, "repeat_last_n",      default_sparams.penalty_last_n);
+        slot.sparams.penalty_repeat     = json_value(data, "repeat_penalty",     default_sparams.penalty_repeat);
+        slot.sparams.penalty_freq       = json_value(data, "frequency_penalty",  default_sparams.penalty_freq);
+        slot.sparams.penalty_present    = json_value(data, "presence_penalty",   default_sparams.penalty_present);
+        slot.sparams.dry_multiplier     = json_value(data, "dry_multiplier",     default_sparams.dry_multiplier);
+        slot.sparams.dry_base           = json_value(data, "dry_base",           default_sparams.dry_base);
+        slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
+        slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
+        slot.sparams.mirostat           = json_value(data, "mirostat",           default_sparams.mirostat);
+        slot.sparams.mirostat_tau       = json_value(data, "mirostat_tau",       default_sparams.mirostat_tau);
+        slot.sparams.mirostat_eta       = json_value(data, "mirostat_eta",       default_sparams.mirostat_eta);
+        slot.sparams.penalize_nl        = json_value(data, "penalize_nl",        default_sparams.penalize_nl);
+        slot.params.n_keep              = json_value(data, "n_keep",             default_params.n_keep);
+        slot.params.n_discard           = json_value(data, "n_discard",          default_params.n_discard);
+        slot.sparams.seed               = json_value(data, "seed",               default_sparams.seed);
+        slot.sparams.n_probs            = json_value(data, "n_probs",            default_sparams.n_probs);
+        slot.sparams.min_keep           = json_value(data, "min_keep",           default_sparams.min_keep);
+      //slot.params.t_max_prompt_ms     = json_value(data, "t_max_prompt_ms",    default_params.t_max_prompt_ms); // TODO: implement
+        slot.params.t_max_predict_ms    = json_value(data, "t_max_predict_ms",   default_params.t_max_predict_ms);
+
+        if (slot.sparams.dry_base < 1.0f)
+        {
+           slot.sparams.dry_base = default_sparams.dry_base;
+        }
+
+        // sequence breakers for DRY
+        {
+            // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format
+            // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
+
+            if (data.contains("dry_sequence_breakers")) {
+                slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+                if (slot.sparams.dry_sequence_breakers.empty()) {
+                    send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
+                    return false;
+                }
+            }
+        }
 
         // process "json_schema" and "grammar"
         if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
@@ -1132,6 +1155,11 @@ struct server_context {
             {"repeat_penalty",            slot.sparams.penalty_repeat},
             {"presence_penalty",          slot.sparams.penalty_present},
             {"frequency_penalty",         slot.sparams.penalty_freq},
+            {"dry_multiplier",            slot.sparams.dry_multiplier},
+            {"dry_base",                  slot.sparams.dry_base},
+            {"dry_allowed_length",        slot.sparams.dry_allowed_length},
+            {"dry_penalty_last_n",        slot.sparams.dry_penalty_last_n},
+            {"dry_sequence_breakers",     slot.sparams.dry_sequence_breakers},
             {"mirostat",                  slot.sparams.mirostat},
             {"mirostat_tau",              slot.sparams.mirostat_tau},
             {"mirostat_eta",              slot.sparams.mirostat_eta},
@@ -1970,7 +1998,6 @@ struct server_context {
 
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
-
                                                 slot.n_past++;
                                             }
 
diff --git a/include/llama.h b/include/llama.h
index d4059c8dd..b2d1e7d5a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1141,6 +1141,16 @@ extern "C" {
                                 bool   penalize_nl,     // consider newlines as a repeatable token
                                 bool   ignore_eos);     // ignore the end-of-sequence token
 
+    ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+    LLAMA_API struct llama_sampler *    llama_sampler_init_dry(
+            const struct llama_model *  model,
+                               float    dry_multiplier,
+                               float    dry_base,
+                             int32_t    dry_allowed_length,
+                             int32_t    dry_penalty_last_n,
+                          const char ** seq_breakers,
+                              size_t    num_breakers);
+
     LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
                              int32_t   n_vocab,
                              int32_t   n_logit_bias,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index d71516153..25536eb6c 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1683,6 +1683,397 @@ struct llama_sampler * llama_sampler_init_penalties(
     };
 }
 
+// DRY
+
+struct llama_sampler_dry {
+    int32_t total_context_size;
+
+    const float   dry_multiplier;
+    const float   dry_base;
+    const int32_t dry_allowed_length;
+    const int32_t dry_penalty_last_n;
+
+    std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
+    std::vector<int> dry_repeat_count;
+    std::unordered_map<llama_token, int> dry_max_token_repeat;
+    ring_buffer<llama_token> last_tokens;
+};
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
+    for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
+        std::string word = llama_detokenize(vocab, {token_id}, true);
+        if (word.find(str) != std::string::npos) {
+            token_sequences.emplace(token_id, std::vector<llama_token>());
+        } else {
+            size_t word_len = word.size(), str_len = str.size();
+            size_t pos = -1;
+            while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
+                bool match = true;
+                size_t i;
+                for (i = 1; i < str_len && i + pos < word_len; ++i) {
+                    if (word[pos + i] != str[i]) {
+                        match = false;
+                        break;
+                    }
+                }
+                if (match) {
+                    std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
+                    if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
+                        tokenization.resize(max_tail_len);
+                    }
+
+                    // Ensure we don't already have a duplicate matching tokenization
+                    auto its = token_sequences.equal_range(token_id);
+                    bool found = false;
+                    for (auto it = its.first; it != its.second; ++it) {
+                        if (tokenization == it->second) {
+                            found = true;
+                            break;
+                        }
+                    }
+                    if (!found) {
+                        token_sequences.emplace(token_id, tokenization);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
+    return "dry";
+}
+
+static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_dry *) smpl->ctx;
+    if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+        return;
+    }
+
+    ctx->last_tokens.push_back(token);
+}
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+    if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+        return;
+    }
+
+    int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
+    int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
+
+    if (last_n_repeat <= ctx->dry_allowed_length) {
+        return;
+    }
+
+    ctx->dry_repeat_count.assign(last_n_repeat, 0);
+    ctx->dry_max_token_repeat.clear();
+
+    // Step 1: Look for restart sequences to limit the maximum repetition length.
+    // Work backwards through the context looking for any token that begins a restart sequence.
+    //
+    // The collection `restart_sequences` is a mapping from a "head" token to all "tail"
+    // sequences that together comprise a restart sequence. This allows us to quickly check
+    // whether each token is the head of a complete sequence. Most restart sequences are actually
+    // a single token, and for these the "tail" is an empty vector.
+    //
+    // If the token is a "head", test all restart sequences that begin with this token
+    // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
+    // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
+    // longest matching sequence (if any) is used to limit the maximum repetition length.
+    //
+    // Note that in the case case of a short sequence contained in a longer one, this might fail to
+    // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
+    // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
+    // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
+    //
+    // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
+    // have already clamped the maximum tail sequence length when generating `restart_sequences`.
+    // With clamping, this scan is O(N) in the context length.
+
+    int rep_limit = last_n_repeat;
+    for (int i = 0; i < last_n_repeat; ++i) {
+        llama_token token = ctx->last_tokens.rat(i);
+        auto its = ctx->dry_processed_breakers.equal_range(token);
+        if (its.first == ctx->dry_processed_breakers.end()) {
+            continue;
+        }
+        int longest_match = -1;
+        for (auto it = its.first; it != its.second; ++it) {
+            // Note that (*it) does not contain the head character, so seq_len will be
+            // the restart sequence length minus 1.
+            // In the common case of a single-token restart sequence, (*it) will be empty
+            // and we will trivially match.
+            int seq_len = (int)it->second.size();
+            if (seq_len > longest_match && seq_len <= (int)i) {
+                bool match = true;
+                for (int offset = 0; offset < seq_len; ++offset) {
+                    // The -1 when indexing `last_tokens` is because we already matched the head.
+                    if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
+                        match = false;
+                        break;
+                    }
+                }
+                if (match) {
+                    longest_match = seq_len;
+                }
+            }
+        }
+        if (longest_match >= 0) {
+            // We found a restart sequence starting `i` tokens from the end and continuing for
+            // `longest_match` tokens.
+            rep_limit = i - longest_match;
+            break;
+        }
+    }
+    if (rep_limit < ctx->dry_allowed_length) {
+        return;
+    }
+
+    // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
+    // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
+    // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
+    //
+    // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
+    // https://ivanyu.me/blog/2014/10/15/z-algorithm/
+    //
+    // The code below is adapted from the public domain implementation by the same author here:
+    // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
+    //
+    // Example:
+    // Last N tokens: a b c c b c y a b c
+    // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+    //                    ^
+    //   This `3` means that the last three tokens of the context (a b c) also appear here.
+    //
+    // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
+    // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
+    // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
+    // ensure that the inner while loops only examine each token in the context once as the outer
+    // for loop iterates over the context.
+
+    {
+        const int last = last_n_repeat - 1;
+        int rt = 0, lt = 0;
+
+        for (int k = 1; k < last_n_repeat; ++k) {
+            if (k > rt) {
+                // If k is outside the current Z-box, do naive computation.
+                int n = 0;
+                while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
+                    ++n;
+                }
+                ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
+                if (n > 0) {
+                    lt = k;
+                    rt = k+n-1;
+                }
+            } else {
+                // If k is inside the current Z-box, consider two cases.
+
+                int p = k - lt; // Pair index.
+                int right_part_len = rt - k + 1;
+
+                if (ctx->dry_repeat_count[last - p] < right_part_len) {
+                    int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
+                    ctx->dry_repeat_count[last - k] = n;
+                } else {
+                    int i = rt + 1;
+                    while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
+                        i += 1;
+                    }
+
+                    int n = std::min(i - k, rep_limit);
+                    ctx->dry_repeat_count[last - k] = n;
+                    lt = k;
+                    rt = i - 1;
+                }
+            }
+        }
+    }
+
+    // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
+    // that would be generated by emitting each new token that would extend a sequence.
+    //
+    // Following the same example as above:
+    // Last N tokens: a b c c b c y a b c
+    // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+    //
+    // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
+    // c: 3 -> 4 (from `a b c` to `a b c c`)
+    // b: 1 -> 2 (from `c` to `c b`)
+    // y: 2 -> 3 (from `b c` to `b c y`)
+
+    for (int i = 0; i < last_n_repeat - 1; ++i) {
+        int repeat_len = ctx->dry_repeat_count[i];
+        if (repeat_len >= ctx->dry_allowed_length) {
+            // This token ends a repeat, so the next token would continue one.
+            // By convention, the value of `repeat_len` only includes the tokens currently
+            // in the context, not the new token that would be added.
+            llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
+            // Track the maximum sequence ending in this token.
+            const auto& it = ctx->dry_max_token_repeat.find(token);
+            if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
+                ctx->dry_max_token_repeat[token] = repeat_len;
+            }
+        }
+    }
+
+    // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
+
+    // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
+    // Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
+    const float FLOAT_MAX_LOG = 88.7228391f;
+    int max_exponent = 0;
+    if (ctx->dry_base > 1.000001f) {
+        max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
+        if (af_kvp != ctx->dry_max_token_repeat.end()) {
+            // Check all sequence breakers starting with this token
+            auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
+            bool is_single_token_breaker = false;
+
+            for (auto it = range.first; it != range.second; ++it) {
+                if (it->second.empty()) {
+                    is_single_token_breaker = true;
+                    break;
+                }
+            }
+
+            // Apply penalty only if it's not a single-token sequence breaker
+            if (!is_single_token_breaker) {
+                int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
+                if (max_exponent > 0 && repeat_exp > max_exponent) {
+                    repeat_exp = max_exponent;
+                }
+                float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
+                cur_p->data[i].logit -= penalty;
+            }
+        }
+    }
+
+    cur_p->sorted = false;
+}
+
+static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_dry *) smpl->ctx;
+    ctx->last_tokens.clear();
+    ctx->dry_repeat_count.clear();
+    ctx->dry_max_token_repeat.clear();
+}
+
+static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+    // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
+    auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+    // Copy the state, including the processed breakers
+    {
+        auto * result_ctx = (llama_sampler_dry *) result->ctx;
+        result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
+        result_ctx->dry_repeat_count = ctx->dry_repeat_count;
+        result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
+        result_ctx->last_tokens = ctx->last_tokens;
+    }
+
+    return result;
+}
+
+static void llama_sampler_dry_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_dry *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_dry_i = {
+    /* .name   = */ llama_sampler_dry_name,
+    /* .accept = */ llama_sampler_dry_accept,
+    /* .apply  = */ llama_sampler_dry_apply,
+    /* .reset  = */ llama_sampler_dry_reset,
+    /* .clone  = */ llama_sampler_dry_clone,
+    /* .free   = */ llama_sampler_dry_free,
+};
+
+struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
+    std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
+    const int MAX_CHAR_LEN = 40;
+    const int MAX_SEQ_LEN = 20;
+
+    const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
+
+    if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
+        // Process sequence breakers
+        for (size_t i = 0; i < num_breakers; ++i) {
+            if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
+                LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
+                continue;
+            }
+
+            std::string sequence_break(seq_breakers[i]);
+            if (sequence_break.empty()) {
+                LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n");
+                continue;
+            }
+
+            if (sequence_break.size() > MAX_CHAR_LEN) {
+                LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
+                sequence_break.resize(MAX_CHAR_LEN);
+            }
+
+            get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
+        }
+    }
+
+    return new llama_sampler {
+        /* .iface = */ &llama_sampler_dry_i,
+        /* .ctx   = */ new llama_sampler_dry {
+            /* .total_context_size     = */ context_size,
+            /* .dry_multiplier         = */ dry_multiplier,
+            /* .dry_base               = */ dry_base,
+            /* .dry_allowed_length     = */ dry_allowed_length,
+            /* .dry_penalty_last_n     = */ dry_penalty_last_n,
+            /* .dry_processed_breakers = */ std::move(processed_breakers),
+            /* .dry_repeat_count       = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
+            /* .dry_max_token_repeat   = */ {},
+            /* .last_tokens            = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
+        },
+    };
+}
+
+// wrapper for test-sampling.cpp
+struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
+    llama_vocab dummy_vocab;
+    auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
+    auto * ctx = (llama_sampler_dry *) result->ctx;
+
+    // Process the token-based sequence breakers
+    ctx->dry_processed_breakers.clear();
+    if (seq_breakers.empty()) {
+        LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
+    } else {
+        for (const auto& breaker : seq_breakers) {
+            if (breaker.empty()) {
+                LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
+                continue;
+            }
+            llama_token head_token = breaker[0];
+            std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
+            ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
+        }
+
+        if (ctx->dry_processed_breakers.empty()) {
+            LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
+        }
+    }
+
+    return result;
+}
+
 // logit-bias
 
 struct llama_sampler_logit_bias {
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
index 2683f1b92..919f6fdfc 100644
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ -28,3 +28,21 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
 
 struct llama_sampler * llama_sampler_init_infill_impl(
         const struct llama_vocab & vocab);
+
+struct llama_sampler * llama_sampler_init_dry_impl(
+        const struct llama_vocab &  vocab,
+                         int32_t    context_size,
+                           float    dry_multiplier,
+                           float    dry_base,
+                         int32_t    dry_allowed_length,
+                         int32_t    dry_penalty_last_n,
+                      const char ** seq_breakers,
+                          size_t    num_breakers);
+
+struct llama_sampler * llama_sampler_init_dry_testing(
+                         int32_t   context_size,
+                           float   dry_multiplier,
+                           float   dry_base,
+                         int32_t   dry_allowed_length,
+                         int32_t   dry_penalty_last_n,
+  const std::vector<std::vector<llama_token>>& seq_breakers);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0a49ddbe3..d1dc96276 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1966,3 +1966,19 @@ int32_t llama_detokenize_impl(
 
     return total <= text_len_max ? total : -total;
 }
+
+std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
+    }
+
+    text.resize(n_chars);
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index d958d0073..4bb16d2e4 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -163,3 +163,8 @@ int32_t llama_detokenize_impl(
                          int32_t   text_len_max,
                             bool   remove_special,
                             bool   unparse_special);
+
+std::string llama_detokenize(
+        const struct llama_vocab & vocab,
+  const std::vector<llama_token> & tokens,
+                            bool   special);
diff --git a/src/llama.cpp b/src/llama.cpp
index 24e1f1f01..50eebc2c2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21775,6 +21775,10 @@ struct llama_sampler * llama_sampler_init_infill(const struct llama_model * mode
     return llama_sampler_init_infill_impl(model->vocab);
 }
 
+struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+    return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
+}
+
 //
 // model split
 //
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 05600e6f5..eb39661c3 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -10,6 +10,8 @@
 #include <string>
 #include <vector>
 
+extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers);
+
 static void dump(const llama_token_data_array * cur_p) {
     for (size_t i = 0; i < cur_p->size; i++) {
         printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
@@ -167,6 +169,29 @@ static void test_penalties(
     tester.check();
 }
 
+static void test_dry(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float dry_multiplier, float dry_base,
+    int dry_allowed_length, int dry_penalty_last_n,
+    const std::vector<std::vector<llama_token>> & seq_breakers
+) {
+    GGML_ASSERT(probs.size() == expected_probs.size());
+
+    sampler_tester tester(probs, expected_probs);
+
+    auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
+    }
+
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+    tester.check();
+}
+
 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
     sampler_tester tester(n_vocab);
@@ -333,6 +358,13 @@ int main(void) {
     test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
     test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
 
+
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
+
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
     test_sampler_queue(10000, "k",     1, 1.0f, 1.0f);
     test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);

From 668750357e66bfa3d1504b65699f5a0dfe3cb7cb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 25 Oct 2024 22:26:15 +0300
Subject: [PATCH 110/396] metal : support permuted matrix multiplicaions
 (#10033)

* metal : support permuted matrix multiplicaions

ggml-ci

* cont : use nb01 directly for row steps

ggml-ci

* cont : add comments [no ci]

* metal : minor refactor

* metal : minor
---
 ggml/src/ggml-metal.m     |  75 ++---
 ggml/src/ggml-metal.metal | 578 +++++++++++++++++++++++++-------------
 2 files changed, 423 insertions(+), 230 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index e9541441c..80c08f15b 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1015,19 +1015,21 @@ static void ggml_metal_encode_node(
     id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
     id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
 
-    //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-    //if (src0) {
-    //    GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-    //            ggml_is_contiguous(src0), src0->name);
-    //}
-    //if (src1) {
-    //    GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-    //            ggml_is_contiguous(src1), src1->name);
-    //}
-    //if (dst) {
-    //    GGML_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-    //            dst->name);
-    //}
+#if 0
+    GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+    if (src0) {
+        GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
+                ggml_is_contiguous(src0), src0->name);
+    }
+    if (src1) {
+        GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
+                ggml_is_contiguous(src1), src1->name);
+    }
+    if (dst) {
+        GGML_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
+                dst->name);
+    }
+#endif
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -1810,14 +1812,16 @@ static void ggml_metal_encode_node(
                             [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
                             [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
                             [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
-                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
-                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
+                            [encoder setBytes:&nb03    length:sizeof(nb03) atIndex:7];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:9];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:10];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:11];
+                            [encoder setBytes:&nb13    length:sizeof(nb13) atIndex:12];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
+                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:15];
+                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:16];
                             [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                         } else {
@@ -1986,20 +1990,22 @@ static void ggml_metal_encode_node(
                             [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
                             [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
                             [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
-                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
-                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
+                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:13];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:14];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:15];
+                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:16];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:18];
+                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:19];
+                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:20];
 
                             if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
-                                    src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
-                                    src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                             }
                             else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@@ -2048,6 +2054,9 @@ static void ggml_metal_encode_node(
 
                 GGML_ASSERT(src1t == GGML_TYPE_F32);
 
+                GGML_ASSERT(ne03 == 1);
+                GGML_ASSERT(ne13 == 1);
+
                 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                 // to the matrix-vector kernel
                 // ne20 = n_used_experts
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 71b58be1f..defde6246 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -777,10 +777,10 @@ kernel void kernel_ssm_conv_f32(
     const int64_t i3 = tgpig.z;
 
     const int64_t nc  = ne10;
-    const int64_t ncs = ne00;
-    const int64_t nr  = ne01;
-    const int64_t n_t = ne1;
-    const int64_t n_s = ne2;
+  //const int64_t ncs = ne00;
+  //const int64_t nr  = ne01;
+  //const int64_t n_t = ne1;
+  //const int64_t n_s = ne2;
 
     device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02);
     device const float * c = (device const float *) ((device const char *) src1 + ir*nb11);
@@ -834,9 +834,9 @@ kernel void kernel_ssm_scan_f32(
     const int64_t i3 = tgpig.y;
 
     const int64_t nc  = d_state;
-    const int64_t nr  = d_inner;
+  //const int64_t nr  = d_inner;
     const int64_t n_t = n_seq_tokens;
-    const int64_t n_s = n_seqs;
+  //const int64_t n_s = n_seqs;
 
     for (int64_t i2 = 0; i2 < n_t; ++i2) {
         device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02);
@@ -1064,17 +1064,18 @@ kernel void kernel_group_norm(
 inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
 
-    float2 acc = 0.f;
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+    device const uint16_t * qs = ((device const uint16_t *) qb_curr + 1 + il/2);
 
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
+    for (int i = 0; i < 8; i += 2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0);
+        acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000);
     }
-    return d * (sumy * -8.f + acc[0] + acc[1]);
+
+    return d * (sumy * -8.f + acc[0] + acc[1] + acc[2] + acc[3]);
 }
 
 // function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -1085,17 +1086,18 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
     float d = qb_curr->d;
     float m = qb_curr->m;
 
-    float2 acc = 0.f;
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+    device const uint16_t * qs = ((device const uint16_t *) qb_curr + 2 + il/2);
 
     for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0);
+        acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000);
     }
-    return d * (acc[0] + acc[1]) + sumy * m;
+
+    return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m;
 }
 
 // function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -1105,18 +1107,19 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
 inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
 
-    float2 acc = 0.f;
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 
     device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 3 + il/2);
            const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
 
     for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010));
+        acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100));
+        acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
     }
-    return d * (sumy * -16.f + acc[0] + acc[1]);
+
+    return d * (sumy * -16.f + acc[0] + acc[1] + acc[2] + acc[3]);
 }
 
 // function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -1127,18 +1130,19 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
     float d = qb_curr->d;
     float m = qb_curr->m;
 
-    float2 acc = 0.f;
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 
     device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
            const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
 
     for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010));
+        acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100));
+        acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
     }
-    return d * (acc[0] + acc[1]) + sumy * m;
+
+    return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m;
 }
 
 // putting them in the kernel cause a significant performance penalty
@@ -1156,14 +1160,22 @@ void mul_vec_q_n_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
                    uint      r3,
         threadgroup int8_t * shared_values,
-                   uint3 tgpig, uint tiisg, uint sgitg) {
+                     uint3   tgpig,
+                     uint    tiisg,
+                     uint    sgitg) {
     const int nb = ne00/QK4_0;
 
     const int r0 = tgpig.x;
@@ -1175,10 +1187,19 @@ void mul_vec_q_n_f32_impl(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+  //const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+  //device const block_q_type * x = (device const block_q_type *) ((device char *) src0 + offset0);
+    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
+
+    // pointers to src0 rows
+    device const block_q_type * ax[nr];
+    for (int row = 0; row < nr; ++row) {
+        const uint offset0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+        ax[row] = (device const block_q_type *) ((device char *) src0 + offset0);
+    }
 
     float yl[16]; // src1 vector cache
     float sumf[nr] = {0.f};
@@ -1190,19 +1211,22 @@ void mul_vec_q_n_f32_impl(
 
     // each thread in a SIMD group deals with half a block.
     for (int ib = ix; ib < nb; ib += nw/2) {
-        float sumy = 0;
-        for (int i = 0; i < 8; i += 2) {
-            sumy += yb[i] + yb[i+1];
-            yl[i+0] = yb[i+ 0];
-            yl[i+1] = yb[i+ 1]/256.f;
+        float sumy[2] = { 0.f, 0.f };
 
-            sumy += yb[i+16] + yb[i+17];
-            yl[i+8] = yb[i+16]/16.f;
-            yl[i+9] = yb[i+17]/4096.f;
+#pragma unroll
+        for (int i = 0; i < 8; i += 2) {
+            sumy[0]  += yb[i +  0] + yb[i +  1];
+            yl[i + 0] = yb[i +  0];
+            yl[i + 1] = yb[i +  1]/256.f;
+
+            sumy[1]  += yb[i + 16] + yb[i + 17];
+            yl[i + 8] = yb[i + 16]/16.f;
+            yl[i + 9] = yb[i + 17]/4096.f;
         }
 
+#pragma unroll
         for (int row = 0; row < nr; row++) {
-            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
+            sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy[0] + sumy[1], yl, il);
         }
 
         yb += QK4_0 * 16;
@@ -1226,12 +1250,14 @@ kernel void kernel_mul_mv_q4_0_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1239,7 +1265,7 @@ kernel void kernel_mul_mv_q4_0_f32(
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mv_q4_1_f32(
@@ -1252,12 +1278,14 @@ kernel void kernel_mul_mv_q4_1_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1265,7 +1293,7 @@ kernel void kernel_mul_mv_q4_1_f32(
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
-     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mv_q5_0_f32(
@@ -1278,12 +1306,14 @@ kernel void kernel_mul_mv_q5_0_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1291,7 +1321,7 @@ kernel void kernel_mul_mv_q5_0_f32(
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mv_q5_1_f32(
@@ -1304,12 +1334,14 @@ kernel void kernel_mul_mv_q5_1_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1317,7 +1349,7 @@ kernel void kernel_mul_mv_q5_1_f32(
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
 }
 
 
@@ -1330,8 +1362,14 @@ void kernel_mul_mv_q8_0_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -1354,10 +1392,19 @@ void kernel_mul_mv_q8_0_f32_impl(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+  //const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+  //device const block_q8_0 * x = (device const block_q8_0 *) ((device char *) src0 + offset0);
+    device const float      * y = (device const float      *) ((device char *) src1 + offset1);
+
+    // pointers to src0 rows
+    device const block_q8_0 * ax[nr];
+    for (int row = 0; row < nr; ++row) {
+        const uint offset0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+        ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0);
+    }
 
     float yl[NB_Q8_0];
     float sumf[nr]={0.f};
@@ -1374,12 +1421,12 @@ void kernel_mul_mv_q8_0_f32_impl(
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
+            device const int8_t * qs = ax[row][ib].qs + NB_Q8_0*il;
             float sumq = 0.f;
             for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
-            sumf[row] += sumq*x[ib+row*nb].d;
+            sumf[row] += sumq*ax[row][ib].d;
         }
 
         yb += NB_Q8_0 * nw;
@@ -1404,12 +1451,14 @@ kernel void kernel_mul_mv_q8_0_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1417,7 +1466,7 @@ kernel void kernel_mul_mv_q8_0_f32(
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+    kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
 }
 
 #define N_MV_T_T 4
@@ -1433,12 +1482,14 @@ void kernel_mul_mv_impl(
                   uint64_t   nb00,
                   uint64_t   nb01,
                   uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne11,
                    int64_t   ne12,
                   uint64_t   nb10,
                   uint64_t   nb11,
                   uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -1452,7 +1503,7 @@ void kernel_mul_mv_impl(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
 
     device const T0 * x = (device const T0 *) (src0 + offset0);
 
@@ -1463,7 +1514,9 @@ void kernel_mul_mv_impl(
                 break;
             }
 
-            device const T1 * y = (device const T1 *) (src1 + r1*nb11 + im*nb12);
+            const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            device const T1 * y = (device const T1 *) (src1 + offset1);
 
             float sumf = 0;
             for (int i = tiisg; i < ne00; i += 32) {
@@ -1483,7 +1536,9 @@ void kernel_mul_mv_impl(
                 break;
             }
 
-            device const T1  * y  = (device const T1  *) (src1 + r1*nb11 + im*nb12);
+            const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            device const T1  * y  = (device const T1  *) (src1 + offset1);
             device const T14 * y4 = (device const T14 *) y;
 
             float sumf = 0;
@@ -1511,12 +1566,14 @@ kernel void kernel_mul_mv(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1533,12 +1590,14 @@ kernel void kernel_mul_mv(
         nb00,
         nb01,
         nb02,
+        nb03,
         ne10,
         ne11,
         ne12,
         nb10,
         nb11,
         nb12,
+        nb13,
         ne0,
         ne1,
         r2,
@@ -1564,12 +1623,14 @@ kernel void kernel_mul_mv_1row(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1584,10 +1645,11 @@ kernel void kernel_mul_mv_1row(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
     device const T     * x = (device const T     *) (src0 + offset0);
-    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+    device const float * y = (device const float *) (src1 + offset1);
 
     float sumf = 0;
     if (ne00 < 128) {
@@ -1631,12 +1693,14 @@ kernel void kernel_mul_mv_l4(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -1651,12 +1715,14 @@ kernel void kernel_mul_mv_l4(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
 
     device const T4 * x4 = (device const T4 *) (src0 + offset0);
 
     for (int r1 = 0; r1 < nrows; ++r1) {
-        device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
+        const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+        device const float4 * y4 = (device const float4 *) (src1 + offset1);
 
         float sumf = 0;
         for (int i = tiisg; i < ne00/4; i += 32) {
@@ -3416,8 +3482,14 @@ void kernel_mul_mv_q2_K_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -3433,21 +3505,19 @@ void kernel_mul_mv_q2_K_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_q2_K * x = (device const block_q2_K *) ((device char *) src0 + offset0);
+    device const float      * y = (device const float      *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
-    const int step = sizeof(block_q2_K) * nb;
-
     const int ix = tiisg/8;  // 0...3
     const int it = tiisg%8;  // 0...7
     const int iq = it/4;     // 0 or 1
@@ -3492,9 +3562,9 @@ void kernel_mul_mv_q2_K_f32_impl(
                                  (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
                          dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
 
-            qs += step/2;
-            sc += step;
-            dh += step/2;
+            qs += nb01/2;
+            sc += nb01;
+            dh += nb01/2;
         }
 
         y4 += 4 * QK_K;
@@ -3519,12 +3589,14 @@ kernel void kernel_mul_mv_q2_K_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -3533,7 +3605,7 @@ kernel void kernel_mul_mv_q2_K_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_q3_K_f32_impl(
@@ -3543,8 +3615,14 @@ void kernel_mul_mv_q3_K_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -3565,10 +3643,11 @@ void kernel_mul_mv_q3_K_f32_impl(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_q3_K * x = (device const block_q3_K *) ((device char *) src0 + offset0);
+    device const float     * yy = (device const float      *) ((device char *) src1 + offset1);
 
     float yl[32];
 
@@ -3608,8 +3687,6 @@ void kernel_mul_mv_q3_K_f32_impl(
     const int q_offset = 32*ip + l0;
     const int y_offset = 128*ip + 32*il + l0;
 
-    const int step = sizeof(block_q3_K) * nb / 2;
-
     device const float * y1 = yy + ix*QK_K + y_offset;
 
     uint32_t scales32, aux32;
@@ -3619,7 +3696,6 @@ void kernel_mul_mv_q3_K_f32_impl(
     float sumf1[2] = {0.f};
     float sumf2[2] = {0.f};
     for (int i = ix; i < nb; i += 4) {
-
         for (int l = 0; l < 8; ++l) {
             yl[l+ 0] = y1[l+ 0];
             yl[l+ 8] = y1[l+16];
@@ -3633,7 +3709,6 @@ void kernel_mul_mv_q3_K_f32_impl(
         device const half * dh = &x[i].d;
 
         for (int row = 0; row < 2; ++row) {
-
             const float d_all = (float)dh[0];
 
             scales16[0] = a[4];
@@ -3673,15 +3748,13 @@ void kernel_mul_mv_q3_K_f32_impl(
             sumf1[row] += d1 * (scales[1] - 32);
             sumf2[row] += d2 * (scales[3] - 32);
 
-            q  += step;
-            h  += step;
-            a  += step;
-            dh += step;
-
+            q  += nb01/2;
+            h  += nb01/2;
+            a  += nb01/2;
+            dh += nb01/2;
         }
 
         y1 += 4 * QK_K;
-
     }
 
     for (int row = 0; row < 2; ++row) {
@@ -3706,12 +3779,14 @@ kernel void kernel_mul_mv_q3_K_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -3720,7 +3795,7 @@ kernel void kernel_mul_mv_q3_K_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_q4_K_f32_impl(
@@ -3730,8 +3805,14 @@ void kernel_mul_mv_q4_K_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -3756,29 +3837,26 @@ void kernel_mul_mv_q4_K_f32_impl(
     const int im = tgpig.z;
     //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
     const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_q4_K * x = (device const block_q4_K *) ((device char *) src0 + offset0);
+    device const float      * y = (device const float      *) ((device char *) src1 + offset1);
 
     float yl[16];
     float yh[16];
     float sumf[N_DST]={0.f}, all_sum;
 
-    const int step = sizeof(block_q4_K) * nb / 2;
-
     device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
 
     uint16_t sc16[4];
     thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
 
     for (int ib = ix; ib < nb; ib += 4) {
-
         float4 sumy = {0.f, 0.f, 0.f, 0.f};
         for (int i = 0; i < 8; ++i) {
             yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
@@ -3792,7 +3870,6 @@ void kernel_mul_mv_q4_K_f32_impl(
         device const half     * dh = &x[ib].d;
 
         for (int row = 0; row < N_DST; row++) {
-
             sc16[0] = sc[0] & kmask1;
             sc16[1] = sc[2] & kmask1;
             sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
@@ -3821,9 +3898,9 @@ void kernel_mul_mv_q4_K_f32_impl(
                                  (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
                          dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
 
-            q1 += step;
-            sc += step;
-            dh += step;
+            q1 += nb01/2;
+            sc += nb01/2;
+            dh += nb01/2;
         }
 
         y4 += 4 * QK_K;
@@ -3848,12 +3925,14 @@ kernel void kernel_mul_mv_q4_K_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -3862,7 +3941,7 @@ kernel void kernel_mul_mv_q4_K_f32(
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_q5_K_f32_impl(
@@ -3872,8 +3951,14 @@ void kernel_mul_mv_q5_K_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -3894,15 +3979,14 @@ void kernel_mul_mv_q5_K_f32_impl(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_q5_K * x = (device const block_q5_K *) ((device char *) src0 + offset0);
+    device const float     * yy = (device const float      *) ((device char *) src1 + offset1);
 
     float sumf[2]={0.f};
 
-    const int step = sizeof(block_q5_K) * nb;
-
     float yl[16], yh[16];
 
     const uint16_t kmask1 = 0x3f3f;
@@ -3930,7 +4014,6 @@ void kernel_mul_mv_q5_K_f32_impl(
     device const float * y1 = yy + ix*QK_K + y_offset;
 
     for (int i = ix; i < nb; i += 4) {
-
         device const uint8_t * q1 = x[i].qs + q_offset;
         device const uint8_t * qh = x[i].qh + l0;
         device const half * dh = &x[i].d;
@@ -3946,7 +4029,6 @@ void kernel_mul_mv_q5_K_f32_impl(
         }
 
         for (int row = 0; row < 2; ++row) {
-
             device const uint8_t * q2 = q1 + 64;
 
             sc16[0] = a[0] & kmask1;
@@ -3975,15 +4057,13 @@ void kernel_mul_mv_q5_K_f32_impl(
                                  sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
                          dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
 
-            q1 += step;
-            qh += step;
-            dh += step/2;
-            a  += step/2;
-
+            q1 += nb01;
+            qh += nb01;
+            dh += nb01/2;
+            a  += nb01/2;
         }
 
         y1 += 4 * QK_K;
-
     }
 
     for (int row = 0; row < 2; ++row) {
@@ -4005,12 +4085,14 @@ kernel void kernel_mul_mv_q5_K_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4019,7 +4101,7 @@ kernel void kernel_mul_mv_q5_K_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_q6_K_f32_impl(
@@ -4029,8 +4111,14 @@ void kernel_mul_mv_q6_K_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4056,10 +4144,11 @@ void kernel_mul_mv_q6_K_f32_impl(
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =  r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_q6_K * x = (device const block_q6_K *) ((device char *) src0 + offset0);
+    device const float     * yy = (device const float      *) ((device char *) src1 + offset1);
 
     float sumf = 0;
 
@@ -4115,12 +4204,14 @@ kernel void kernel_mul_mv_q6_K_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4129,7 +4220,7 @@ kernel void kernel_mul_mv_q6_K_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 // ======================= "True" 2-bit
@@ -4141,8 +4232,14 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4158,15 +4255,15 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0;
-    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_iq2_xxs * x = (device const block_iq2_xxs *) ((device char *) src0 + offset0);
+    device const float         * y = (device const float         *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4219,8 +4316,8 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
             }
             sumf[row] += d * sum;
 
-            dh += nb*sizeof(block_iq2_xxs)/2;
-            q2 += nb*sizeof(block_iq2_xxs)/2;
+            dh += nb01/2;
+            q2 += nb01/2;
         }
 
         y4 += 32 * 32;
@@ -4245,12 +4342,14 @@ kernel void kernel_mul_mv_iq2_xxs_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4260,7 +4359,7 @@ kernel void kernel_mul_mv_iq2_xxs_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_iq2_xs_f32_impl(
@@ -4270,8 +4369,14 @@ void kernel_mul_mv_iq2_xs_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4287,15 +4392,15 @@ void kernel_mul_mv_iq2_xs_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_iq2_xs * x = (device const block_iq2_xs *) ((device char *) src0 + offset0);
+    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4357,9 +4462,9 @@ void kernel_mul_mv_iq2_xs_f32_impl(
             }
             sumf[row] += d1 * sum1 + d2 * sum2;
 
-            dh += nb*sizeof(block_iq2_xs)/2;
-            q2 += nb*sizeof(block_iq2_xs)/2;
-            sc += nb*sizeof(block_iq2_xs);
+            dh += nb01/2;
+            q2 += nb01/2;
+            sc += nb01;
         }
 
         y4 += 32 * 32;
@@ -4384,12 +4489,14 @@ kernel void kernel_mul_mv_iq2_xs_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4399,7 +4506,7 @@ kernel void kernel_mul_mv_iq2_xs_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_iq3_xxs_f32_impl(
@@ -4409,8 +4516,14 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4426,15 +4539,15 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0;
-    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_iq3_xxs * x = (device const block_iq3_xxs *) ((device char *) src0 + offset0);
+    device const float         * y = (device const float         *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4489,9 +4602,9 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
             }
             sumf[row] += d * (sum[0] + sum[1]);
 
-            dh  += nb*sizeof(block_iq3_xxs)/2;
-            q3  += nb*sizeof(block_iq3_xxs);
-            gas += nb*sizeof(block_iq3_xxs)/2;
+            dh  += nb01/2;
+            q3  += nb01;
+            gas += nb01/2;
         }
 
         y4 += 32 * 32;
@@ -4516,12 +4629,14 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4531,7 +4646,7 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_iq3_s_f32_impl(
@@ -4541,8 +4656,14 @@ void kernel_mul_mv_iq3_s_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4558,15 +4679,15 @@ void kernel_mul_mv_iq3_s_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_iq3_s * x = (device const block_iq3_s *) ((device char *) src0 + offset0);
+    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4619,11 +4740,11 @@ void kernel_mul_mv_iq3_s_f32_impl(
             }
             sumf[row] += d * (sum[0] + sum[1]);
 
-            dh  += nb*sizeof(block_iq3_s)/2;
-            qs  += nb*sizeof(block_iq3_s);
-            qh  += nb*sizeof(block_iq3_s);
-            sc  += nb*sizeof(block_iq3_s);
-            signs += nb*sizeof(block_iq3_s);
+            dh    += nb01/2;
+            qs    += nb01;
+            qh    += nb01;
+            sc    += nb01;
+            signs += nb01;
         }
 
         y4 += 32 * 32;
@@ -4648,12 +4769,14 @@ kernel void kernel_mul_mv_iq3_s_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4663,7 +4786,7 @@ kernel void kernel_mul_mv_iq3_s_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_iq2_s_f32_impl(
@@ -4673,8 +4796,14 @@ void kernel_mul_mv_iq2_s_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4690,15 +4819,15 @@ void kernel_mul_mv_iq2_s_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
 
-    device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+    device const block_iq2_s * x = (device const block_iq2_s *) ((device char *) src0 + offset0);
+    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4752,11 +4881,11 @@ void kernel_mul_mv_iq2_s_f32_impl(
             }
             sumf[row] += d1 * sum[0] + d2 * sum[1];
 
-            dh  += nb*sizeof(block_iq2_s)/2;
-            qs  += nb*sizeof(block_iq2_s);
-            qh  += nb*sizeof(block_iq2_s);
-            sc  += nb*sizeof(block_iq2_s);
-            signs += nb*sizeof(block_iq2_s);
+            dh    += nb01/2;
+            qs    += nb01;
+            qh    += nb01;
+            sc    += nb01;
+            signs += nb01;
         }
 
         y4 += 32 * 32;
@@ -4781,12 +4910,14 @@ kernel void kernel_mul_mv_iq2_s_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -4796,7 +4927,7 @@ kernel void kernel_mul_mv_iq2_s_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 void kernel_mul_mv_iq1_s_f32_impl(
@@ -4806,8 +4937,14 @@ void kernel_mul_mv_iq1_s_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4823,14 +4960,15 @@ void kernel_mul_mv_iq1_s_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+    device const block_iq1_s * x = (device const block_iq1_s *) ((device char *) src0 + offset0);
+    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4873,9 +5011,9 @@ void kernel_mul_mv_iq1_s_f32_impl(
             }
             sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1);
 
-            dh += nb*sizeof(block_iq1_s)/2;
-            qs += nb*sizeof(block_iq1_s);
-            qh += nb*sizeof(block_iq1_s)/2;
+            dh += nb01/2;
+            qs += nb01;
+            qh += nb01/2;
         }
 
         y4 += 32 * 32;
@@ -4896,8 +5034,14 @@ void kernel_mul_mv_iq1_m_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -4913,14 +5057,15 @@ void kernel_mul_mv_iq1_m_f32_impl(
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq1_m * x = (device const block_iq1_m *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+    device const block_iq1_m * x = (device const block_iq1_m *) ((device char *) src0 + offset0);
+    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4972,9 +5117,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
             sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
                                              (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
 
-            sc += nb*sizeof(block_iq1_m)/2;
-            qs += nb*sizeof(block_iq1_m);
-            qh += nb*sizeof(block_iq1_m);
+            sc += nb01/2;
+            qs += nb01;
+            qh += nb01;
         }
 
         y4 += 32 * 32;
@@ -4995,8 +5140,14 @@ void kernel_mul_mv_iq4_nl_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -5012,14 +5163,15 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     const int r1 = tgpig.y;
     const int im = tgpig.z;
     const int first_row = (r0 * 2 + sgitg) * 2;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+    device const block_iq4_nl * x = (device const block_iq4_nl *) ((device char *) src0 + offset0);
+    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
 
     const int ix = tiisg/2;  // 0...15
     const int it = tiisg%2;  // 0 or 1
@@ -5089,8 +5241,14 @@ void kernel_mul_mv_iq4_xs_f32_impl(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -5106,14 +5264,15 @@ void kernel_mul_mv_iq4_xs_f32_impl(
     const int r1 = tgpig.y;
     const int im = tgpig.z;
     const int first_row = (r0 * 2 + sgitg) * 2;
-    const int ib_row = first_row * nb;
 
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+    device const block_iq4_xs * x = (device const block_iq4_xs *) ((device char *) src0 + offset0);
+    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
 
     const int ix = tiisg/16;  // 0 or 1
     const int it = tiisg%16;  // 0...15
@@ -5188,12 +5347,14 @@ kernel void kernel_mul_mv_iq1_s_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -5202,7 +5363,7 @@ kernel void kernel_mul_mv_iq1_s_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 [[host_name("kernel_mul_mv_iq1_m_f32")]]
@@ -5216,12 +5377,14 @@ kernel void kernel_mul_mv_iq1_m_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -5230,7 +5393,7 @@ kernel void kernel_mul_mv_iq1_m_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
 [[host_name("kernel_mul_mv_iq4_nl_f32")]]
@@ -5244,12 +5407,14 @@ kernel void kernel_mul_mv_iq4_nl_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -5259,7 +5424,7 @@ kernel void kernel_mul_mv_iq4_nl_f32(
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 [[host_name("kernel_mul_mv_iq4_xs_f32")]]
@@ -5273,12 +5438,14 @@ kernel void kernel_mul_mv_iq4_xs_f32(
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
         constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         constant   uint    & r2,
@@ -5288,7 +5455,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
 //============================= templates and their specializations =============================
@@ -5833,10 +6000,12 @@ kernel void kernel_mul_mm(device const  uchar * src0,
                           constant    int64_t & ne02,
                           constant   uint64_t & nb01,
                           constant   uint64_t & nb02,
+                          constant   uint64_t & nb03,
                           constant    int64_t & ne12,
                           constant   uint64_t & nb10,
                           constant   uint64_t & nb11,
                           constant   uint64_t & nb12,
+                          constant   uint64_t & nb13,
                           constant    int64_t & ne0,
                           constant    int64_t & ne1,
                           constant       uint & r2,
@@ -5873,12 +6042,13 @@ kernel void kernel_mul_mm(device const  uchar * src0,
     const uint i12 = im%ne12;
     const uint i13 = im/ne12;
 
-    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
+    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*nb03;
     ushort offset1 = il/nl;
 
     device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
     device const float   * y = (device const float   *)(src1
-        + nb12 * im
+        + nb13 * i13
+        + nb12 * i12
         + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
         + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
@@ -6257,12 +6427,14 @@ typedef void (kernel_mul_mv_impl_t)(
                   uint64_t   nb00,
                   uint64_t   nb01,
                   uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne11,
                    int64_t   ne12,
                   uint64_t   nb10,
                   uint64_t   nb11,
                   uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -6277,8 +6449,14 @@ typedef void (kernel_mul_mv2_impl_t)(
                    int64_t   ne00,
                    int64_t   ne01,
                    int64_t   ne02,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                  uint64_t   nb03,
                    int64_t   ne10,
                    int64_t   ne12,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                  uint64_t   nb13,
                    int64_t   ne0,
                    int64_t   ne1,
                    uint      r2,
@@ -6299,6 +6477,7 @@ void mmv_fn(
                     uint64_t   nb00,
                     uint64_t   nb01,
                     uint64_t   nb02,
+                    uint64_t   nb03,
                      int64_t   ne10,
                      int64_t   ne11,
                      int64_t   ne12,
@@ -6306,6 +6485,7 @@ void mmv_fn(
                     uint64_t   nb10,
                     uint64_t   nb11,
                     uint64_t   nb12,
+                    uint64_t   nb13,
                      int64_t   ne0,
                      int64_t   ne1,
                     uint64_t   nb1,
@@ -6316,7 +6496,7 @@ void mmv_fn(
         uint                   tiitg,
         uint                   tiisg,
         uint                   sgitg) {
-    impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,ne10,ne11,ne12,nb10,nb11,nb12,ne0,ne1,r2,r3,tgpig,tiisg);
+    impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,nb03,ne10,ne11,ne12,nb10,nb11,nb12,nb13,ne0,ne1,r2,r3,tgpig,tiisg);
 }
 
 template<kernel_mul_mv2_impl_t impl_fn>
@@ -6330,6 +6510,7 @@ void mmv_fn(
                     uint64_t   nb00,
                     uint64_t   nb01,
                     uint64_t   nb02,
+                    uint64_t   nb03,
                      int64_t   ne10,
                      int64_t   ne11,
                      int64_t   ne12,
@@ -6337,6 +6518,7 @@ void mmv_fn(
                     uint64_t   nb10,
                     uint64_t   nb11,
                     uint64_t   nb12,
+                    uint64_t   nb13,
                      int64_t   ne0,
                      int64_t   ne1,
                     uint64_t   nb1,
@@ -6347,7 +6529,7 @@ void mmv_fn(
         uint                   tiitg,
         uint                   tiisg,
         uint                   sgitg) {
-    impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg);
+    impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg);
 }
 
 typedef decltype(mmv_fn<kernel_mul_mv_impl<half, half4, half, half4>>) mul_mv_impl_fn_t;
@@ -6396,8 +6578,8 @@ kernel void kernel_mul_mv_id(
     const int64_t i2 = i12;
 
     device const char * src0_cur = src0s + i02*nb02;
-    device const char * src1_cur = src1 + i11*nb11 + i12*nb12;
-    device      float * dst_cur  = dst + i1*ne0 + i2*ne1*ne0;
+    device const char * src1_cur = src1  + i11*nb11 + i12*nb12;
+    device      float *  dst_cur = dst   + i1*ne0   + i2*ne1*ne0;
 
     impl_fn(
         /* src0 */ src0_cur,
@@ -6405,19 +6587,21 @@ kernel void kernel_mul_mv_id(
         /* dst  */ dst_cur,
         /* ne00 */ ne00,
         /* ne01 */ ne01,
-        /* ne02 */ 1,//ne02,
+        /* ne02 */ 1, // ne02,
         /* nb00 */ nb00,
         /* nb01 */ nb01,
         /* nb02 */ nb02,
+        /* nb03 */ nb02, // ne02 == 1
         /* ne10 */ ne10,
-        /* ne11 */ 1,//ne11,
-        /* ne12 */ 1,//ne12,
-        /* ne13 */ 1,//ne13,
+        /* ne11 */ 1, // ne11,
+        /* ne12 */ 1, // ne12,
+        /* ne13 */ 1, // ne13,
         /* nb10 */ nb10,
         /* nb11 */ nb11,
         /* nb12 */ nb12,
+        /* ne13 */ nb12, // ne12 == 1
         /* ne0  */ ne0,
-        /* ne1  */ 1,//ne1,
+        /* ne1  */ 1, // ne1,
         /* nb1  */ nb1,
         /* r2   */ 1,
         /* r3   */ 1,

From 9e4a2563eadf34e9432d248224d4f43e8495e8fe Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 26 Oct 2024 10:33:31 +0300
Subject: [PATCH 111/396] scripts : fix amx sync [no ci]

---
 scripts/sync-ggml-am.sh | 7 +++++++
 scripts/sync-ggml.sh    | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index ffce2aab0..fba29b935 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -76,6 +76,7 @@ while read c; do
         src/ggml*.m \
         src/ggml*.metal \
         src/ggml*.cu \
+        src/ggml-amx/* \
         src/ggml-cann/* \
         src/ggml-cuda/* \
         src/ggml-sycl/* \
@@ -121,6 +122,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-aarch64.c      -> ggml/src/ggml-aarch64.c
     # src/ggml-aarch64.h      -> ggml/src/ggml-aarch64.h
     # src/ggml-alloc.c        -> ggml/src/ggml-alloc.c
+    # src/ggml-amx/*          -> ggml/src/ggml-amx/
+    # src/ggml-amx.cpp        -> ggml/src/ggml-amx.cpp
     # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
     # src/ggml-backend.cpp    -> ggml/src/ggml-backend.cpp
     # src/ggml-cann/*         -> ggml/src/ggml-cann/
@@ -141,6 +144,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     #
     # include/ggml.h         -> ggml/include/ggml.h
     # include/ggml-alloc.h   -> ggml/include/ggml-alloc.h
+    # include/ggml-amx.h     -> ggml/include/ggml-amx.h
     # include/ggml-backend.h -> ggml/include/ggml-backend.h
     # include/ggml-blas.h    -> ggml/include/ggml-blas.h
     # include/ggml-cann.h    -> ggml/include/ggml-cann.h
@@ -168,6 +172,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
@@ -187,6 +193,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index f6ff5e683..f5d87324a 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -8,6 +8,8 @@ cp -rpv ../ggml/src/ggml.c              ./ggml/src/ggml.c
 cp -rpv ../ggml/src/ggml-aarch64.c      ./ggml/src/ggml-aarch64.c
 cp -rpv ../ggml/src/ggml-aarch64.h      ./ggml/src/ggml-aarch64.h
 cp -rpv ../ggml/src/ggml-alloc.c        ./ggml/src/ggml-alloc.c
+cp -rpv ../ggml/src/ggml-amx/*          ./ggml/src/ggml-amx/
+cp -rpv ../ggml/src/ggml-amx.cpp        ./ggml/src/ggml-amx.cpp
 cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
 cp -rpv ../ggml/src/ggml-backend.cpp    ./ggml/src/ggml-backend.cpp
 cp -rpv ../ggml/src/ggml-cann/*         ./ggml/src/ggml-cann/
@@ -29,6 +31,7 @@ cp -rpv ../ggml/src/vulkan-shaders/*    ./ggml/src/vulkan-shaders/
 
 cp -rpv ../ggml/include/ggml.h         ./ggml/include/ggml.h
 cp -rpv ../ggml/include/ggml-alloc.h   ./ggml/include/ggml-alloc.h
+cp -rpv ../ggml/include/ggml-amx.h     ./ggml/include/ggml-amx.h
 cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
 cp -rpv ../ggml/include/ggml-blas.h    ./ggml/include/ggml-blas.h
 cp -rpv ../ggml/include/ggml-cann.h    ./ggml/include/ggml-cann.h

From 8c60a8a46261ffb92b6d23a78acfac2fcb6fe233 Mon Sep 17 00:00:00 2001
From: bssrdf <merlintiger@hotmail.com>
Date: Wed, 23 Oct 2024 14:34:00 -0400
Subject: [PATCH 112/396] increase cuda_cpy block size (ggml/996)

Co-authored-by: bssrdf <bssrdf@gmail.com>
---
 ggml/src/ggml-cuda/cpy.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
index 796167426..28b06cdda 100644
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -1,6 +1,6 @@
 #include "common.cuh"
 
-#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_CPY_BLOCK_SIZE 64
 
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
 

From cc2983d3753c94a630ca7257723914d4c4f6122b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 26 Oct 2024 10:34:08 +0300
Subject: [PATCH 113/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 7f689f632..da40927e1 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-6dccc647264f5429df2624f36138f601e7ce23e5
+162e232411ee98ceb0cccfa84886118d917d2123

From 8841ce3f439de6e770f70319b7e08b6613197ea7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 27 Oct 2024 20:59:58 +0200
Subject: [PATCH 114/396] llama : switch KQ multiplication to F32 precision by
 default (#10015)

ggml-ci
---
 src/llama.cpp | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 50eebc2c2..53979e83f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9618,20 +9618,16 @@ static struct ggml_tensor * llm_build_kqv(
         cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2) {
-            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
-        }
+        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM) {
-            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
-            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
-            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-        }
+        // note: this op tends to require high floating point range
+        //       while for some models F16 is enough, for others it is not, so we default to F32 here
+        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
 
         if (model.arch == LLM_ARCH_GROK) {
             // need to do the following:
@@ -9640,9 +9636,6 @@ static struct ggml_tensor * llm_build_kqv(
             // kq = 30 * tanh(kq / 30)
             // before the softmax below
 
-            //try from phi2
-            //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-
             kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
             kq = ggml_scale(ctx, kq, 30);
         }

From 8125e6cbfcf2b3b9066e4d923aca9295526730f5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 28 Oct 2024 08:49:32 +0200
Subject: [PATCH 115/396] server : don't overfill the batch during infill
 (#10018)

ggml-ci
---
 examples/server/server.cpp | 1 +
 examples/server/utils.hpp  | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ff1d9b03c..077c7ad1a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1880,6 +1880,7 @@ struct server_context {
                     if (slot.state == SLOT_STATE_STARTED) {
                         slot.t_start_process_prompt = ggml_time_us();
                         slot.t_start_generation = 0;
+
                         slot.n_past = 0;
                         slot.n_prompt_tokens = prompt_tokens.size();
                         slot.state = SLOT_STATE_PROCESSING_PROMPT;
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 811242062..562635555 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -266,8 +266,10 @@ static llama_tokens format_infill(
     }
 
     // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
-    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(),                3*(n_batch/4));
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
 
     // fill the rest of the context with extra chunks
     const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());

From 524afeec9dad7d765ce91f5cf30c73703867cb47 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Mon, 28 Oct 2024 17:02:48 +0800
Subject: [PATCH 116/396] musa: workaround for Guilty Lockup in cleaning src0
 (#10042)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 ggml/src/ggml-cuda.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 21c9f5e38..217df968a 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1484,14 +1484,19 @@ static void ggml_cuda_op_mul_mat(
             const size_t nbytes_data    = ggml_nbytes(src0);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+        // TODO: remove this for MUSA once the Guilty Lockup issue is resolved
+#ifndef GGML_USE_MUSA
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
+#else // GGML_USE_MUSA
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+#endif // !GGML_USE_MUSA
         }
 
         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
             const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
-            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
         }
 
         if (src1_on_device && src1_is_contiguous) {

From 07028f9d74d895da2ca4a1956624e3f07e04e620 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 28 Oct 2024 17:41:24 +0200
Subject: [PATCH 117/396] flake.lock: Update (#10063)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/4c2fcb090b1f3e5b47eaa7bd33913b574a11e0a0?narHash=sha256-/uilDXvCIEs3C9l73JTACm4quuHUsIHcns1c%2BcHUJwA%3D' (2024-10-18)
  → 'github:NixOS/nixpkgs/2768c7d042a37de65bb1b5b3268fc987e534c49d?narHash=sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4%3D' (2024-10-23)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 1f8defab7..732c7539c 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1729256560,
-        "narHash": "sha256-/uilDXvCIEs3C9l73JTACm4quuHUsIHcns1c+cHUJwA=",
+        "lastModified": 1729665710,
+        "narHash": "sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "4c2fcb090b1f3e5b47eaa7bd33913b574a11e0a0",
+        "rev": "2768c7d042a37de65bb1b5b3268fc987e534c49d",
         "type": "github"
       },
       "original": {

From 61715d5cc83a28181df6a641846e4f6a740f3c74 Mon Sep 17 00:00:00 2001
From: arch-btw <57669023+arch-btw@users.noreply.github.com>
Date: Mon, 28 Oct 2024 10:45:33 -0700
Subject: [PATCH 118/396] llama : Add IBM granite template (#10013)

* Add granite template to llama.cpp

* Add granite template to test-chat-template.cpp

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* Update tests/test-chat-template.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* Added proper template and expected output

* Small change to \n

Small change to \n

* Add code space &

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* Fix spacing

* Apply suggestions from code review

* Update src/llama.cpp

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
---
 src/llama.cpp                | 10 ++++++++++
 tests/test-chat-template.cpp |  4 ++++
 2 files changed, 14 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 53979e83f..4cb669bcf 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21706,6 +21706,16 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "\n\n";
             }
         }
+    } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
+        // IBM Granite template
+        for (const auto & message : chat) {
+            std::string role(message->role);
+            ss << "<|start_of_role|>" << role << "<|end_of_role|>"
+               << message->content << "<|end_of_text|>\n";
+        }
+        if (add_ass) {
+            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 6f046249f..03e897e66 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -65,6 +65,8 @@ int main(void) {
         u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
         // DeepSeek-V2
         "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+        // ibm-granite/granite-3.0-8b-instruct
+        "{%- if tools %}\n    {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n    {%- for tool in tools %}\n    {{- tool | tojson(indent=4) }}\n    {%- if not loop.last %}\n        {{- '\n\n' }}\n    {%- endif %}\n    {%- endfor %}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'user' %}\n    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>'  + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'tool_response' %}\n    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -109,6 +111,8 @@ int main(void) {
         u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
         // DeepSeek-V2
         u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
+        // ibm-granite/granite-3.0-8b-instruct
+        "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;

From 8d8ff715367480b856ad86ac3888e9742b13a6fa Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 29 Oct 2024 10:42:05 +0200
Subject: [PATCH 119/396] llama : remove Tail-Free sampling (#10071)

ggml-ci
---
 common/arg.cpp                                |  9 +-
 common/common.cpp                             |  1 -
 common/common.h                               |  4 +-
 common/sampling.cpp                           | 13 +--
 examples/main/README.md                       |  8 --
 examples/server/README.md                     | 11 +--
 examples/server/public/index-new.html         |  3 -
 examples/server/public/index.html             |  2 -
 examples/server/server.cpp                    |  2 -
 examples/server/themes/buttons-top/index.html |  2 -
 examples/server/themes/wild/index.html        |  2 -
 examples/server/utils.hpp                     |  2 +-
 include/llama.h                               |  3 -
 scripts/run-with-preset.py                    |  2 +-
 src/llama-sampling.cpp                        | 97 +------------------
 tests/test-sampling.cpp                       | 26 +----
 16 files changed, 15 insertions(+), 172 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index e1e933934..7c5c5e5cd 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -943,13 +943,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sparams.min_p = std::stof(value);
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"--tfs"}, "N",
-        string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
-        [](common_params & params, const std::string & value) {
-            params.sparams.tfs_z = std::stof(value);
-        }
-    ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
@@ -1074,7 +1067,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
-        string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
         "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
         [](common_params & params, int value) {
             params.sparams.mirostat = value;
diff --git a/common/common.cpp b/common/common.cpp
index ff8cc4076..7656843b1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2090,7 +2090,6 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
     fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
diff --git a/common/common.h b/common/common.h
index 18b2121ed..cd5a8e051 100644
--- a/common/common.h
+++ b/common/common.h
@@ -88,7 +88,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TOP_K       = 2,
     COMMON_SAMPLER_TYPE_TOP_P       = 3,
     COMMON_SAMPLER_TYPE_MIN_P       = 4,
-    COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
     COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
@@ -113,7 +113,6 @@ struct common_sampler_params {
     float   min_p              = 0.05f; // 0.0 = disabled
     float   xtc_probability    = 0.00f; // 0.0 = disabled
     float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
-    float   tfs_z              = 1.00f; // 1.0 = disabled
     float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
     float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float   dynatemp_range     = 0.00f; // 0.0 = disabled
@@ -139,7 +138,6 @@ struct common_sampler_params {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
-        COMMON_SAMPLER_TYPE_TFS_Z,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
         COMMON_SAMPLER_TYPE_MIN_P,
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 48a9df8ba..7922fde47 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -131,11 +131,11 @@ std::string common_sampler_params::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -199,9 +199,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_XTC:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                     break;
-                case COMMON_SAMPLER_TYPE_TFS_Z:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                    break;
                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                     break;
@@ -373,7 +370,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
     switch (cnstr) {
         case COMMON_SAMPLER_TYPE_DRY:         return 'd';
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
-        case COMMON_SAMPLER_TYPE_TFS_Z:       return 'f';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
@@ -388,7 +384,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
     switch (cnstr) {
         case COMMON_SAMPLER_TYPE_DRY:         return "dry";
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case COMMON_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
@@ -406,7 +401,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       COMMON_SAMPLER_TYPE_TFS_Z },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
@@ -423,8 +417,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       COMMON_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         COMMON_SAMPLER_TYPE_TFS_Z },
         { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
     };
 
@@ -452,7 +444,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
     std::unordered_map<char, common_sampler_type> sampler_name_map = {
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z),       COMMON_SAMPLER_TYPE_TFS_Z },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
diff --git a/examples/main/README.md b/examples/main/README.md
index c7c823171..5357ac2e2 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -235,14 +235,6 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e
 
 Example usage: `--min-p 0.05`
 
-### Tail-Free Sampling (TFS)
-
--   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
-
-Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
-
-Example usage: `--tfs 0.95`
-
 ### Locally Typical Sampling
 
 -   `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
diff --git a/examples/server/README.md b/examples/server/README.md
index bc737237e..1629e456b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -99,7 +99,7 @@ The project is under active development, and we are [looking for feedback and co
 
 | Argument | Explanation |
 | -------- | ----------- |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
@@ -108,7 +108,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
-| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) |
 | `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
@@ -121,7 +120,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
 | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
 | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
-| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
+| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
 | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
 | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
 | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
@@ -360,8 +359,6 @@ node index.js
     `stop`: Specify a JSON array of stopping strings.
     These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
 
-    `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.
-
     `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
 
     `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
@@ -412,7 +409,7 @@ node index.js
 
     `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
 
-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
 
 **Response format**
 
@@ -738,7 +735,6 @@ Example:
         "repeat_penalty": 1.100000023841858,
         "samplers": [
             "top_k",
-            "tfs_z",
             "typical_p",
             "top_p",
             "min_p",
@@ -752,7 +748,6 @@ Example:
         "stream": false,
         "task_id": 0,
         "temperature": 0.0,
-        "tfs_z": 1.0,
         "top_k": 40,
         "top_p": 0.949999988079071,
         "typical_p": 1.0
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html
index cb3995abe..8bfa380e5 100644
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -49,7 +49,6 @@
       min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
       xtc_probability: 0.0, // 0 = disabled;
       xtc_threshold: 0.1, // > 0.5 disables XTC;
-      tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
       frequency_penalty: 0.0, // 0.0 = disabled
@@ -847,7 +846,6 @@ return html`
           ${FloatField({ label: "DRY Base", title: "Set the DRY repetition penalty base value. Default is 1.75", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
           ${IntField({ label: "DRY Allowed Length", title: "Tokens that extend repetition beyond this receive exponentially increasing penalty. Default is 2", max: 10, min: 1, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
           ${IntField({ label: "DRY Penalty Last N", title: "How many tokens to scan for repetitions. Default is -1, where 0 is disabled and -1 is context size", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
-          ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
           ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
         </fieldset>
 
@@ -1147,7 +1145,6 @@ document.addEventListener('DOMContentLoaded', (event) => {
     xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
     xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
     top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
-    tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
     typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
     repeat_penalty: { snapValue: 1.0, snapRangeMultiplier: 4 },
     presence_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 },
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 7f9b02bfb..a95f5c6df 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -313,7 +313,6 @@
       min_p: 0.05, // 0 = disabled
       xtc_probability: 0.0, // 0 = disabled;
       xtc_threshold: 0.1, // > 0.5 disables XTC;
-      tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
       frequency_penalty: 0.0, // 0.0 = disabled
@@ -1015,7 +1014,6 @@
           <details>
             <summary>More options</summary>
             <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
               ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
               ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
               ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 077c7ad1a..7953b5065 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -809,7 +809,6 @@ struct server_context {
         slot.sparams.min_p              = json_value(data, "min_p",              default_sparams.min_p);
         slot.sparams.xtc_probability    = json_value(data, "xtc_probability",    default_sparams.xtc_probability);
         slot.sparams.xtc_threshold      = json_value(data, "xtc_threshold",      default_sparams.xtc_threshold);
-        slot.sparams.tfs_z              = json_value(data, "tfs_z",              default_sparams.tfs_z);
         slot.sparams.typ_p              = json_value(data, "typical_p",          default_sparams.typ_p);
         slot.sparams.temp               = json_value(data, "temperature",        default_sparams.temp);
         slot.sparams.dynatemp_range     = json_value(data, "dynatemp_range",     default_sparams.dynatemp_range);
@@ -1149,7 +1148,6 @@ struct server_context {
             {"min_p",                     slot.sparams.min_p},
             {"xtc_probability",           slot.sparams.xtc_probability},
             {"xtc_threshold",             slot.sparams.xtc_threshold},
-            {"tfs_z",                     slot.sparams.tfs_z},
             {"typical_p",                 slot.sparams.typ_p},
             {"repeat_last_n",             slot.sparams.penalty_last_n},
             {"repeat_penalty",            slot.sparams.penalty_repeat},
diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html
index 8334bcde5..2797c37c9 100644
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
@@ -226,7 +226,6 @@
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
-      tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
       frequency_penalty: 0.0, // 0.0 = disabled
@@ -788,7 +787,6 @@
           <details>
             <summary>More options</summary>
             <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
               ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
               ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
               ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html
index 8361c5774..dbe23c402 100644
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
@@ -229,7 +229,6 @@
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
-      tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
       frequency_penalty: 0.0, // 0.0 = disabled
@@ -791,7 +790,6 @@
           <details>
             <summary>More options</summary>
             <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
               ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
               ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
               ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 562635555..58f5a5684 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -607,7 +607,7 @@ static json oaicompat_completion_params_parse(
     }
 
     // Copy remaining properties to llama_params
-    // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
+    // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
     // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
     for (const auto & item : body.items()) {
         // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
diff --git a/include/llama.h b/include/llama.h
index b2d1e7d5a..4076d34a7 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1087,9 +1087,6 @@ extern "C" {
     /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, size_t min_keep);
 
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free  (float   z, size_t min_keep);
-
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
     LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);
 
diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py
index 47cacb432..8f0bf8ca8 100755
--- a/scripts/run-with-preset.py
+++ b/scripts/run-with-preset.py
@@ -20,7 +20,7 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
     "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
     "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
     "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
-    "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
+    "simple-io", "tensor-split", "threads", "temp", "top-k", "top-p", "typical",
     "verbose-prompt"
 ]
 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 25536eb6c..c2cfe0a77 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -113,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
 }
 
 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
-    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
     // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }
@@ -733,101 +733,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
     };
 }
 
-// tail-free
-
-struct llama_sampler_tail_free {
-    const float  z;
-    const size_t min_keep;
-};
-
-static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
-    return "tail-free";
-}
-
-static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
-
-    if (ctx->z >= 1.0f || cur_p->size <= 2) {
-        return;
-    }
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(cur_p->size - 1);
-    std::vector<float> second_derivatives(cur_p->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = cur_p->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->z && i >= ctx->min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    cur_p->size = last_idx;
-}
-
-static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
-    return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
-}
-
-static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
-    delete (llama_sampler_tail_free *) smpl->ctx;
-}
-
-static struct llama_sampler_i llama_sampler_tail_free_i = {
-    /* .name   = */ llama_sampler_tail_free_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_tail_free_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_tail_free_clone,
-    /* .free   = */ llama_sampler_tail_free_free,
-};
-
-struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
-    return new llama_sampler {
-        /* .iface = */ &llama_sampler_tail_free_i,
-        /* .ctx   = */ new llama_sampler_tail_free {
-            /* .z        = */ z,
-            /*. min_keep = */ min_keep,
-        },
-    };
-}
-
 // typical
 
 struct llama_sampler_typical {
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index eb39661c3..be370044d 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -105,16 +105,6 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
     tester.check();
 }
 
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & probs_expected, float z) {
-    sampler_tester tester(probs, probs_expected);
-
-    DUMP(&tester.cur_p);
-    tester.apply(llama_sampler_init_tail_free(z, 1));
-    DUMP(&tester.cur_p);
-
-    tester.check();
-}
-
 static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
     sampler_tester tester(probs, probs_expected);
 
@@ -202,7 +192,6 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
     for (auto s : samplers_sequence) {
         switch (s){
             case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
-            case 'f': GGML_ABORT("tail_free test not implemented");
             case 'y': GGML_ABORT("typical test not implemented");
             case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
             case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
@@ -299,12 +288,11 @@ static void test_perf() {
         data.emplace_back(llama_token_data{i, logit, 0.0f});
     }
 
-    BENCH(llama_sampler_init_top_k    (40),                     data, 32);
-    BENCH(llama_sampler_init_top_p    (0.8f, 1),                data, 32);
-    BENCH(llama_sampler_init_min_p    (0.2f, 1),                data, 32);
-    BENCH(llama_sampler_init_tail_free(0.5f, 1),                data, 32);
-    BENCH(llama_sampler_init_typical  (0.5f, 1),                data, 32);
-    BENCH(llama_sampler_init_xtc      (1.0f, 0.1f, 1, 1),       data, 32);
+    BENCH(llama_sampler_init_top_k  (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p  (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p  (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc    (1.0f, 0.1f, 1, 1),       data, 32);
 }
 
 int main(void) {
@@ -343,10 +331,6 @@ int main(void) {
     printf("XTC should not:\n");
     test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
 
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
-
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
 

From 8f275a7c4593aa34147595a90282cf950a853690 Mon Sep 17 00:00:00 2001
From: Changyeon Kim <cyzero.kim@samsung.com>
Date: Tue, 29 Oct 2024 17:52:56 +0900
Subject: [PATCH 120/396] ggml: Add POOL2D OP for GPU acceleration to the
 Vulkan backend in the MobileVLM model. (#9763)

* ggml: Add POOL2D OP for GPU ACC to the Vulkan.

- The MobileVLM model now supports inference acceleration through GPU by utilizing the Vulkan backend.
- A GGML_OP_POOL_2D shader has been added. (Pooling)
- The encoding performance of the CLIP model improved from 2.8s on the CPU to 0.7s on the GPU.

Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>

* [fix] Correct the incorrect order of the parameters.

fix casting to int.

Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>

---------

Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
---
 ggml/src/ggml-vulkan.cpp                      | 72 ++++++++++++++++++
 ggml/src/vulkan-shaders/pool2d.comp           | 74 +++++++++++++++++++
 .../src/vulkan-shaders/vulkan-shaders-gen.cpp |  4 +
 3 files changed, 150 insertions(+)
 create mode 100644 ggml/src/vulkan-shaders/pool2d.comp

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index e749bbe70..94175a782 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -213,6 +213,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_sum_rows_f32;
     vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
+    vk_pipeline pipeline_pool2d_f32;
 
     std::unordered_map<std::string, vk_pipeline_ref> pipelines;
     std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
@@ -403,6 +404,17 @@ struct vk_op_timestep_embedding_push_constants {
     uint32_t max_period;
 };
 
+struct vk_op_pool2d_push_constants {
+    uint32_t IW; uint32_t IH;
+    uint32_t OW; uint32_t OH;
+    uint32_t OC;
+    uint32_t pelements;
+    uint32_t op;
+    int32_t k0; int32_t k1;
+    int32_t s0; int32_t s1;
+    int32_t p0; int32_t p1;
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
     vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -1803,6 +1815,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
+
     for (auto &c : compiles) {
         c.wait();
     }
@@ -4234,6 +4248,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_timestep_embedding_f32;
         }
         return nullptr;
+    case GGML_OP_POOL_2D:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_pool2d_f32;
+        }
+        return nullptr;
     case GGML_OP_LEAKY_RELU:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_leaky_relu_f32;
@@ -4464,6 +4483,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             uint32_t half_ceil = (dim + 1) / 2;
             elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
         } break;
+    case GGML_OP_POOL_2D:
+        {
+            const uint32_t N = dst->ne[3];
+            const uint32_t OC = dst->ne[2];
+            const uint32_t OH = dst->ne[1];
+            const uint32_t OW = dst->ne[0];
+            elements = { N * OC * OH * OW, 1, 1};
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_DIV:
     case GGML_OP_MUL:
@@ -4914,6 +4941,34 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
     }, dryrun);
 }
 
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
+    const int32_t k1 = dst->op_params[1];
+    const int32_t k0 = dst->op_params[2];
+    const int32_t s1 = dst->op_params[3];
+    const int32_t s0 = dst->op_params[4];
+    const int32_t p1 = dst->op_params[5];
+    const int32_t p0 = dst->op_params[6];
+
+    const uint32_t IH = src0->ne[1];
+    const uint32_t IW = src0->ne[0];
+
+    const uint32_t N = dst->ne[3];
+
+    const uint32_t OC = dst->ne[2];
+    const uint32_t OH = dst->ne[1];
+    const uint32_t OW = dst->ne[0];
+
+    const uint32_t parallel_elements = N * OC * OH * OW;
+
+    ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
+        IW, IH, OW, OH, OC,
+        parallel_elements,
+        op,
+        k0, k1, s0, s1, p0, p1,
+    }, dryrun);
+}
+
 static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const float * op_params = (const float *)dst->op_params;
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
@@ -5792,6 +5847,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
     case GGML_OP_LEAKY_RELU:
         break;
     default:
@@ -5927,6 +5983,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_TIMESTEP_EMBEDDING:
         ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_POOL_2D:
+        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
+
         break;
     case GGML_OP_LEAKY_RELU:
         ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
@@ -6018,6 +6078,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
         buf = tensor->buffer;
@@ -6821,6 +6882,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_SUM_ROWS:
         case GGML_OP_IM2COL:
         case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_POOL_2D:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
@@ -7334,6 +7396,16 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t dim = tensor->op_params[0];
         const int32_t max_period = tensor->op_params[1];
         tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+    } else if (tensor->op == GGML_OP_POOL_2D) {
+        enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
+        const int32_t k0 = tensor->op_params[1];
+        const int32_t k1 = tensor->op_params[2];
+        const int32_t s0 = tensor->op_params[3];
+        const int32_t s1 = tensor->op_params[4];
+        const int32_t p0 = tensor->op_params[5];
+        const int32_t p1 = tensor->op_params[6];
+
+        tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
diff --git a/ggml/src/vulkan-shaders/pool2d.comp b/ggml/src/vulkan-shaders/pool2d.comp
new file mode 100644
index 000000000..b6124411a
--- /dev/null
+++ b/ggml/src/vulkan-shaders/pool2d.comp
@@ -0,0 +1,74 @@
+#version 450
+
+#include "types.comp"
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(push_constant) uniform parameter {
+    uint IW; uint IH;
+    uint OW; uint OH;
+    uint OC;
+    uint pelements;
+    uint op;
+    int k0; int k1;
+    int s0; int s1;
+    int p0; int p1;
+} p;
+
+#define BLOCK_SIZE 512
+#define FLT_MAX 3.402823466e+38F
+#define OP_POOL_MAX 0u
+#define OP_POOL_AVG 1u
+
+layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.x;
+    if (idx >= p.pelements) {
+        return;
+    }
+
+    const uint O_HW = p.OW * p.OH;
+
+    const uint nc = idx / O_HW;
+    const uint cur_oh = (idx % O_HW) / p.OW;
+    const uint cur_ow = (idx % O_HW) % p.OW;
+
+    const int start_h = int(cur_oh) * p.s0 - p.p0;
+    const uint bh = max(start_h, 0);
+    const uint eh = min(start_h + p.k0, p.IH);
+
+    const int start_w = int(cur_ow) * p.s1 - p.p1;
+    const uint bw = max(start_w, 0);
+    const uint ew = min(start_w + p.k1, p.IW);
+
+    const float scale = 1.0 / float(p.k0 * p.k1);
+    float res;
+
+    if (p.op == OP_POOL_AVG) {
+        res = 0.0;
+    } else if (p.op == OP_POOL_MAX) {
+        res = -FLT_MAX;
+    } else {
+        return;
+    }
+
+    #pragma unroll
+    for (uint i = bh; i < eh; i++) {
+        #pragma unroll
+        for (uint j = bw; j < ew; j++) {
+            const float cur = D_TYPE(data_a[nc * p.IH * p.IW + i * p.IW + j]);
+
+            if (p.op == OP_POOL_AVG) {
+                res += cur * scale;
+            } else if (p.op == OP_POOL_MAX) {
+                res = max(res, cur);
+            }
+        }
+    }
+
+    data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;
+}
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index 1bd1b6f67..49759c593 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -493,6 +493,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [=] {
         string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     }));
+
+    tasks.push_back(std::async(std::launch::async, [=] {
+        string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    }));
 }
 
 void write_output_files() {

From c5b0f4b5d90297f3e729fca7f78ddb25fcab5ddc Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 30 Oct 2024 02:01:23 +0100
Subject: [PATCH 121/396] llama : refactor model loader with backend registry
 (#10026)

---
 examples/llama-bench/llama-bench.cpp |  132 +-
 ggml/include/ggml-backend.h          |   19 +-
 ggml/include/ggml-cuda.h             |    2 +-
 ggml/src/ggml-amx.cpp                |   33 +-
 ggml/src/ggml-backend-impl.h         |   19 +-
 ggml/src/ggml-backend.cpp            |  235 +-
 ggml/src/ggml-blas.cpp               |   20 +-
 ggml/src/ggml-cann.cpp               |   50 +-
 ggml/src/ggml-cuda.cu                |  142 +-
 ggml/src/ggml-kompute.cpp            |   15 -
 ggml/src/ggml-metal.m                |   44 +-
 ggml/src/ggml-rpc.cpp                |   20 +-
 ggml/src/ggml-sycl.cpp               |   54 +-
 ggml/src/ggml-vulkan.cpp             |   26 +-
 ggml/src/ggml.c                      |    4 +-
 include/llama.h                      |    7 +-
 scripts/compare-llama-bench.py       |    2 +-
 src/llama.cpp                        | 3098 +++++++++++++-------------
 18 files changed, 1903 insertions(+), 2019 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 4a8ea9676..e7873a143 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -21,12 +21,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
 }
 
 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
-        }
-        fclose(f);
-    }
-#elif defined(_WIN32)
-    HKEY hKey;
-    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                     TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                     0,
-                     KEY_READ,
-                     &hKey) != ERROR_SUCCESS) {
-        // fail to open registry key
-        return "";
-    }
-    char cpu_brand[256];
-    DWORD cpu_brand_size = sizeof(cpu_brand);
-    if (RegQueryValueExA(hKey,
-                        TEXT("ProcessorNameString"),
-                        NULL,
-                        NULL,
-                        (LPBYTE)cpu_brand,
-                        &cpu_brand_size) == ERROR_SUCCESS) {
-        id.assign(cpu_brand, cpu_brand_size);
-        if (id.find('\0') != std::string::npos) {
-            id.resize(id.find('\0'));
+    std::vector<std::string> cpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            cpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-    RegCloseKey(hKey);
-#endif
-    // TODO: other platforms
-    return id;
+    return join(cpu_list, ", ");
 }
 
 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUDA
-    int count = ggml_backend_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-#endif
-#ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_CANN
-    uint32_t count = ggml_backend_cann_get_device_count();
-    for (uint32_t i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-    // TODO: other backends
-    return id;
+    return join(gpu_list, ", ");
 }
 
 // command line params
@@ -938,29 +864,15 @@ struct test {
     }
 
     static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto * reg = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
         }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
-        }
-
-        return "CPU";
+        return backends.empty() ? "CPU" : join(backends, ",");
     }
 
     static const std::vector<std::string> & get_fields() {
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 5933b8e8f..c11eb4183 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -114,11 +114,12 @@ extern "C" {
     //
 
     enum ggml_backend_dev_type {
+        // CPU device using system memory
         GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
-        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
-        GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
-        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
 
     // functionality supported by the device
@@ -167,10 +168,14 @@ extern "C" {
     GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
     GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
 
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
 
-    // Functions that may be obtained using ggml_backend_reg_get_proc_address
-    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
-    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
 
     //
     // Backend registry
@@ -192,7 +197,7 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
     GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
 
     //
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index f44d8f4e6..305d0b636 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx.cpp
index ac6ec2342..144dc9d8a 100644
--- a/ggml/src/ggml-amx.cpp
+++ b/ggml/src/ggml-amx.cpp
@@ -16,12 +16,6 @@
 #if defined(__AMX_INT8__)
 
 // AMX buffer interface
-static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "AMX";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .get_name        = */ ggml_backend_amx_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_amx_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
@@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
         /* .iface = */ {
-        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
-        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
-        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
-        /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
-        /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
         },
-        /* .device  = */ NULL,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
         /* .context = */ NULL,
     };
 
@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
 
@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
 static struct ggml_backend_i ggml_backend_amx_i = {
     /* .get_name                = */ ggml_backend_amx_name,
     /* .free                    = */ ggml_backend_amx_free,
-    /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_amx_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
 
     GGML_UNUSED(dev);
 }
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index fd3deae00..fa8d5b7fb 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -22,7 +22,7 @@ extern "C" {
         size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
         // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
         size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
     };
 
@@ -37,7 +37,6 @@ extern "C" {
     //
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name)     (ggml_backend_buffer_t buffer);
         // (optional) free the buffer
         void         (*free_buffer)  (ggml_backend_buffer_t buffer);
         // base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
 
         void (*free)(ggml_backend_t backend);
 
-        // Will be moved to the device interface
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        // (optional) complete all pending operations
+        // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
 
-        // (optional) compute graph with a plan (not used currently)
+        // (optional) graph plans (not used currently)
+        // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
         // compute graph (always async if supported by the backend)
         enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-        //            new backends should implement the device interface instead
-        // These functions are being moved to the device interface
-        bool (*supports_op)  (ggml_backend_t backend, const struct ggml_tensor * op);
-        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-        bool (*offload_op)   (ggml_backend_t backend, const struct ggml_tensor * op);
-
         // (optional) event synchronization
         // record an event on this stream
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 7d7b63a15..fd574887f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name(buffer);
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
     }
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
@@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }
@@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->iface.get_default_buffer_type(backend);
+    return ggml_backend_dev_buffer_type(backend->device);
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
@@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-    return backend->iface.supports_buft(backend, buft);
+    return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
@@ -582,6 +579,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_VULKAN
         register_backend(ggml_backend_vk_reg());
 #endif
+#ifdef GGML_USE_CANN
+        register_backend(ggml_backend_cann_reg());
+#endif
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif
@@ -591,9 +591,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_AMX
         register_backend(ggml_backend_amx_reg());
 #endif
-#ifdef GGML_USE_CANN
-        register_backend(ggml_backend_cann_reg());
-#endif
 
         // TODO: kompute
 
@@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
 }
 
 ggml_backend_t ggml_backend_init_best(void) {
-    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
     if (!dev) {
-        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
+        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     }
     if (!dev) {
         return NULL;
@@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) {
     return ggml_backend_dev_init(dev, NULL);
 }
 
-// backend CPU
-
-static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "CPU";
-
-    GGML_UNUSED(buffer);
-}
+// CPU backend - buffer
 
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     uintptr_t data = (uintptr_t)buffer->context;
@@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
@@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_get_name,
     /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
@@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
     /* .reset           = */ NULL,
 };
 
+// CPU backend - buffer type
+
 static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
 
@@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    auto alloc_size = size;
-    if (alloc_size == 0) {
-        alloc_size = 1;
-    }
-
-    void * data = ggml_aligned_malloc(alloc_size);
+    void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
-        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
 
-    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
 
 static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     return &ggml_backend_cpu_buffer_type;
 }
 
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
 #ifdef GGML_USE_CPU_HBM
 
 // buffer type HBM
@@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buf);
-}
-
 static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {
@@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
     buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
 
     return buffer;
@@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+    static ggml_backend_buffer_type_t bufts[] = {
+#ifdef GGML_USE_CPU_HBM
+        ggml_backend_cpu_hbm_buffer_type(),
+#endif
+        NULL
+    };
+
+    return bufts;
+
+    GGML_UNUSED(device);
+}
+
+// CPU backend - backend (stream)
+
 struct ggml_backend_cpu_context {
     int                 n_threads;
     ggml_threadpool_t   threadpool;
@@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 struct ggml_backend_plan_cpu {
     struct ggml_cplan cplan;
     struct ggml_cgraph cgraph;
@@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
 static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .get_name                = */ ggml_backend_cpu_get_name,
     /* .free                    = */ ggml_backend_cpu_free,
-    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
 
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
-    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
 
-////////////////////////
+// CPU backend - device
 
 struct ggml_backend_cpu_device_context {
     std::string description = "CPU";
@@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
 
     GGML_UNUSED(dev);
 }
@@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
     };
 }
 
-static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     return ggml_backend_cpu_init();
 
     GGML_UNUSED(dev);
@@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b
     GGML_UNUSED(dev);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     return ggml_backend_cpu_buffer_from_ptr(ptr, size);
 
     GGML_UNUSED(dev);
@@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
     /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
     /* .get_type             = */ ggml_backend_cpu_device_get_type,
     /* .get_props            = */ ggml_backend_cpu_device_get_props,
-    /* .init_backend         = */ ggml_backend_cpu_device_init,
+    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
     /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
     /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
     /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
     /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
     /* .offload_op           = */ NULL,
@@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
     /* .event_synchronize    = */ NULL,
 };
 
-////////////////////////
+// CPU backend - backend (reg)
 
 static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
     return "CPU";
@@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
         return (void *)ggml_backend_cpu_set_n_threads;
     }
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+        return (void *)ggml_backend_cpu_get_extra_bufts;
+    }
+
     return NULL;
 
     GGML_UNUSED(reg);
@@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .get_name        = */ ggml_backend_multi_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
     /* .get_base        = */ NULL,
     /* .init_tensor     = */ NULL,
@@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
 }
 
 void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
@@ -1460,7 +1463,7 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
-    bool debug;
+    int debug;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     return -1;
 }
 
-#if 0
+#if 1
 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
@@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src == NULL) {
             continue;
         }
-        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+        // not an ideal solution
+        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
@@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
+        if (sched->debug > 1) {
+            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
             }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+            GGML_LOG_DEBUG("\n");
         }
-        GGML_LOG_DEBUG("\n");
     }
 }
 
@@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     if (src == NULL) {
                         continue;
                     }
-                    // check if a weight is on a different backend
+                    // check if a weight is on a different and incompatible backend
                     // by starting a new split, the memory of the previously offloaded weights can be reused
                     if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                         int src_backend_id = tensor_backend_id(src);
-                        if (src_backend_id != cur_backend_id) {
+                        if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                             need_new_split = true;
                             break;
                         }
@@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                         int src_backend_id = sched->hv_tensor_backend_ids[id];
                         bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                         if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
-                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                             need_new_split = true;
                             break;
                         }
@@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
-    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index 7875ec86d..8d96220b9 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
 
@@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
 static struct ggml_backend_i blas_backend_i = {
     /* .get_name                = */ ggml_backend_blas_get_name,
     /* .free                    = */ ggml_backend_blas_free,
-    /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
 
     GGML_UNUSED(dev);
 }
@@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg
     };
 }
 
-static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     return ggml_backend_blas_init();
 
     GGML_UNUSED(dev);
@@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_
     GGML_UNUSED(dev);
 }
 
-static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     return ggml_backend_cpu_buffer_from_ptr(ptr, size);
 
     GGML_UNUSED(dev);
@@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
     /* .get_memory           = */ ggml_backend_blas_device_get_memory,
     /* .get_type             = */ ggml_backend_blas_device_get_type,
     /* .get_props            = */ ggml_backend_blas_device_get_props,
-    /* .init_backend         = */ ggml_backend_blas_device_init,
+    /* .init_backend         = */ ggml_backend_blas_device_init_backend,
     /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
     /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
     /* .supports_op          = */ ggml_backend_blas_device_supports_op,
     /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
     /* .offload_op           = */ NULL,
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index af0fb603a..f8ac11e41 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
     ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
 };
 
-/**
- * @brief Retrieve the name associated with a CANN buffer.
- *
- * This function returns the name of a CANN buffer, which is stored in the
- * context of the buffer.
- *
- * @param buffer The CANN buffer whose name is to be retrieved.
- * @return A pointer to a C-string containing the name of the buffer.
- */
-
-static const char* ggml_backend_cann_buffer_get_name(
-    ggml_backend_buffer_t buffer) {
-    return "CANN";
-
-    GGML_UNUSED(buffer);
-}
-
 /**
  * @brief Check if a buffer is a CANN buffer.
  *
@@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
  * @param buffer The buffer to check.
  * @return true if the buffer is a CANN buffer, false otherwise.
  */
+static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
 static bool ggml_backend_buffer_is_cann(
     ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
+    return ggml_backend_buft_is_cann(buffer->buft);
 }
 
 /**
@@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
  * on a CANN buffer within the backend.
  */
 static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cann_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cann_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cann_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cann_buffer_init_tensor,
@@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
  */
 static const char* ggml_backend_cann_buffer_type_name(
     ggml_backend_buffer_type_t buft) {
-    return "CANN";
+    ggml_backend_cann_buffer_type_context* buft_ctx =
+        (ggml_backend_cann_buffer_type_context*)buft->context;
 
-    GGML_UNUSED(buft);
+    return buft_ctx->name.c_str();
 }
 
 /**
@@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
     delete backend;
 }
 
-/**
- * @brief Retrieves the default buffer type associated with the CANN backend.
- *
- * This function returns the buffer type specific to the device associated
- * with the CANN backend. It is used to allocate buffers for computations
- * performed by the backend.
- *
- * @param backend Pointer to the CANN backend structure.
- * @return Pointer to the buffer type structure for the CANN backend.
- */
-static ggml_backend_buffer_type_t
-ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
-
-    return ggml_backend_cann_buffer_type(cann_ctx->device);
-}
-
 /**
  * @brief Sets tensor data asynchronously in the CANN backend.
  *
@@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
 static const ggml_backend_i ggml_backend_cann_interface = {
     /* .get_name                = */ ggml_backend_cann_name,
     /* .free                    = */ ggml_backend_cann_free,
-    /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_cann_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_cann_get_tensor_async,
     /* .cpy_tensor_async        = */ ggml_backend_cann_cpy_tensor_async,
@@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cann_graph_compute,
-    /* .supports_op             = */ NULL, // moved to device
-    /* .supports_buft           = */ NULL, // moved to device
-    /* .offload_op              = */ NULL, // moved to device
     /* .event_record            = */ ggml_backend_cann_event_record,
     /* .event_wait              = */ ggml_backend_cann_event_wait,
 };
@@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 217df968a..087091516 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -421,20 +421,15 @@ struct ggml_backend_cuda_buffer_context {
     }
 };
 
-static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
-static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
-}
-
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
 }
 
+static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+    return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
+}
+
 static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->dev_ptr;
@@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
@@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
@@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
 }
 
 struct ggml_backend_cuda_split_buffer_type_context {
+    int main_device;
     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+    std::string name;
 };
 
 struct ggml_backend_cuda_split_buffer_context {
@@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context {
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 };
 
-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Split";
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-    GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
-}
 
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
 }
 
 static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
@@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 // cuda split buffer type
 
 static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return GGML_CUDA_NAME "_Split";
+    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
 
-    GGML_UNUSED(buft);
+    return ctx->name.c_str();
 }
 
 static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
@@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
 
-    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+    static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
 
     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
 
@@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
         }
     }
 
-    auto it = buft_map.find(tensor_split_arr);
+    auto it = buft_map.find({main_device, tensor_split_arr});
     if (it != buft_map.end()) {
         return &it->second;
     }
+    auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
+        main_device,
+        tensor_split_arr,
+        GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
+    };
 
     struct ggml_backend_buffer_type buft {
         /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
-        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
+        /* .context = */ ctx,
     };
 
-    auto result = buft_map.emplace(tensor_split_arr, buft);
+    auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
     return &result.first->second;
 }
 
@@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Host";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
@@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
@@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat(
 
     const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
-    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
@@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 }
 
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
     bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
+    GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
 
     cudaStream_t stream = ctx.stream();
 
@@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
 static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
     // why is this here instead of mul_mat?
-    if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
+    if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
         ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
     }
 
@@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 continue;
             }
 
-            if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+            if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
@@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     if (node->src[j] != nullptr) {
                         assert(node->src[j]->buffer);
-                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
+                               ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
                     }
                 }
 #endif
@@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
+            GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name                = */ ggml_backend_cuda_get_name,
     /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
     /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
@@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ NULL, // moved to device
-    /* .supports_buft           = */ NULL, // moved to device
-    /* .offload_op              = */ NULL, // moved to device
     /* .event_record            = */ ggml_backend_cuda_event_record,
     /* .event_wait              = */ ggml_backend_cuda_event_wait,
 };
@@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     };
 }
 
-static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     return ggml_backend_cuda_init(ctx->device);
@@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
     return ggml_backend_cuda_host_buffer_type();
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    GGML_UNUSED(dev);
-    GGML_UNUSED(ptr);
-    GGML_UNUSED(size);
-    GGML_UNUSED(max_tensor_size);
-    return nullptr;
-}
-
 // TODO: move these functions here
 static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
 
+    // split buffers can only be used with GGML_OP_MUL_MAT
+    if (op->op != GGML_OP_MUL_MAT) {
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
+                return false;
+            }
+        }
+    }
+
+    // check if all the sources are allocated on this device
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
+            ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
+            if (buft_ctx->device != dev_ctx->device) {
+                return false;
+            }
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    if (ggml_backend_buft_is_cuda_split(buft)) {
-        return true;
-    }
+    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+}
 
-    if (ggml_backend_buft_is_cuda(buft)) {
-        ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
-        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-        return buft_ctx->device == dev_ctx->device;
+static int64_t get_op_batch_size(const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_GET_ROWS:
+            return 0;
+        case GGML_OP_MUL_MAT:
+            return op->ne[1];
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_ROPE:
+            return op->ne[2];
+        default:
+            return ggml_nrows(op);
     }
-
-    return false;
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+    return get_op_batch_size(op) >= min_batch_size;
 
     GGML_UNUSED(dev);
 }
@@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
     /* .get_memory              = */ ggml_backend_cuda_device_get_memory,
     /* .get_type                = */ ggml_backend_cuda_device_get_type,
     /* .get_props               = */ ggml_backend_cuda_device_get_props,
-    /* .init_backend            = */ ggml_backend_cuda_device_init,
+    /* .init_backend            = */ ggml_backend_cuda_device_init_backend,
     /* .get_buffer_type         = */ ggml_backend_cuda_device_get_buffer_type,
     /* .get_host_buffer_type    = */ ggml_backend_cuda_device_get_host_buffer_type,
-    /* .buffer_from_host_ptr    = */ ggml_backend_cuda_device_buffer_from_host_ptr,
+    /* .buffer_from_host_ptr    = */ NULL,
     /* .supports_op             = */ ggml_backend_cuda_device_supports_op,
     /* .supports_buft           = */ ggml_backend_cuda_device_supports_buft,
     /* .offload_op              = */ ggml_backend_cuda_device_offload_op,
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 2c926aaee..1f2220234 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
     }
 }
 
-static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
-    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
-    return ctx->name.c_str();
-}
-
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     auto * memory = (ggml_vk_memory *)buffer->context;
     if (ggml_vk_has_device()) {
@@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
-    /* .get_name        = */ ggml_backend_kompute_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-    return ggml_backend_kompute_buffer_type(ctx->device);
-}
-
 static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
@@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name                = */ ggml_backend_kompute_name,
     /* .free                    = */ ggml_backend_kompute_free,
-    /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
-    /* .supports_op             = */ ggml_backend_kompute_supports_op,
-    /* .supports_buft           = */ ggml_backend_kompute_supports_buft,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 80c08f15b..a2b4d49d5 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3247,12 +3247,6 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // backend interface
 
-static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "Metal";
-
-    UNUSED(buffer);
-}
-
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
@@ -3307,7 +3301,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
-    /* .get_name        = */ ggml_backend_metal_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -3432,6 +3425,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "Metal_Mapped";
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+            /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
+        },
+        /* .device  = */ &g_ggml_backend_metal_device,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_from_ptr_type_metal;
+}
+
 // TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
 ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
     struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
@@ -3508,7 +3524,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
-    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
 
 // backend
@@ -3529,12 +3545,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_metal_buffer_type();
-
-    UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return ggml_metal_graph_compute(backend, cgraph);
 }
@@ -3601,7 +3611,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name                = */ ggml_backend_metal_name,
     /* .free                    = */ ggml_backend_metal_free,
-    /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -3611,9 +3620,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -3708,7 +3714,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t
 }
 
 static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 
     GGML_UNUSED(dev);
 }
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index 0e936b343..2778009e4 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context {
     std::shared_ptr<socket_t> sock;
     std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
     uint64_t remote_ptr;
-    std::string name;
 };
 
 // RPC helper functions
@@ -409,11 +408,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
     return sock;
 }
 
-static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
 static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_free_buffer_req request = {ctx->remote_ptr};
@@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
-    /* .get_name        = */ ggml_backend_rpc_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_rpc_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_rpc_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_rpc_buffer_init_tensor,
@@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
     if (response.remote_ptr != 0) {
         ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
             ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
+            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
             response.remote_size);
         return buffer;
     } else {
@@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
-    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
-}
-
 static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
     // this is no-op because we don't have any async operations
@@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name                = */ ggml_backend_rpc_name,
     /* .free                    = */ ggml_backend_rpc_free,
-    /* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_rpc_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
     // TODO: obtain value from the server
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 
     UNUSED(dev);
 }
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 4d91ee460..a62c67f4f 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context {
     }
 };
 
-static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
+static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
 
 static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
+    return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
 }
 
 static void
@@ -440,7 +437,6 @@ catch (sycl::exception const &exc) {
 }
 
 static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
@@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context {
     std::vector<queue_ptr> streams;
 };
 
-static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Split";
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
-   return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
-}
-
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u
 }
 
 static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_split_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
@@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+   return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
+}
+
 static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
@@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Host";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
 }
@@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
 
     return buffer;
@@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) {
     delete backend;
 }
 
-
-static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return ggml_backend_sycl_buffer_type(sycl_ctx->device);
-}
-
 static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                                ggml_tensor *tensor,
                                                const void *data, size_t offset,
@@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_get_name,
     /* .free                    = */ ggml_backend_sycl_free,
-    /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
     /* .cpy_tensor_async        = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
@@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
-    /* .supports_op             = */ NULL, // moved to device
-    /* .supports_buft           = */ NULL, // moved to device
-    /* .offload_op              = */ NULL, // moved to device
     /* .event_record            = */ ggml_backend_sycl_event_record,
     /* .event_wait              = */ ggml_backend_sycl_event_wait,
 };
@@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
     return ctx->devices[index];
 }
 
-static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
-{
+static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
     GGML_UNUSED(reg);
-    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
-        return (void *)ggml_backend_sycl_split_buffer_type;
-    }
+
+    // TODO: update to the current function signature
+    //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+    //    return (void *)ggml_backend_sycl_split_buffer_type;
+    //}
+
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 94175a782..83c37ea9c 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_
 
 // device backend
 
-static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
 static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
+    return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
 }
 
 static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v
 }
 
 static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
-    /* .get_name        = */ ggml_backend_vk_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_vk_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_vk_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_vk_buffer_init_tensor,
@@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
 
     return buffer;
@@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
     /* .free                    = */ ggml_backend_vk_free,
-    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
     /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
     /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
@@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
 //////////////////////////
 
 struct ggml_backend_vk_device_context {
-    int device;
+    size_t device;
     std::string name;
     std::string description;
 };
@@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
 
 static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
     UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->type        = ggml_backend_vk_device_get_type(dev);
     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
-        /* async       */ false,
-        /* host_buffer */ true,
-        /* events      */ false,
+        /* .async                 = */ false,
+        /* .host_buffer           = */ true,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
     };
 }
 
@@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
-            for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+            for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
                 char desc[256];
                 ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 66df9a9c1..a8da10d79 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4028,7 +4028,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
         GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
-        assert(false);
+#ifndef NDEBUG
+        GGML_ABORT("not enough space in the context's memory pool");
+#endif
         return NULL;
     }
 
diff --git a/include/llama.h b/include/llama.h
index 4076d34a7..24005548d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -205,7 +205,7 @@ extern "C" {
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -274,10 +274,7 @@ extern "C" {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_MODE_LAYER: ignored
+        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index e45e83ce8..4ac6b5fc0 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -20,7 +20,7 @@ logger = logging.getLogger("compare-llama-bench")
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
     "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
-    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
+    "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
     "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 4cb669bcf..ef1b8ee59 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8,18 +8,6 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#if defined(GGML_USE_KOMPUTE)
-#   include "ggml-kompute.h"
-#endif
-
-#ifndef __AMX_INT8__
-#undef GGML_USE_AMX
-#endif
-
-#ifdef GGML_USE_AMX
-#  include "ggml-amx.h"
-#endif
-
 // TODO: replace with ggml API call
 #define QK_K 256
 
@@ -1558,44 +1546,52 @@ static llm_arch llm_arch_from_string(const std::string & name) {
 //   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
 //   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
 //
+struct LLM_TN_IMPL {
+    const llm_arch arch;
+    const llm_tensor tensor;
+    const char * const suffix;
+    const int bid;
+    const int xid;
+
+    std::string str() const {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
+            return "__missing__";
+        }
+
+        std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid);
+
+        if (suffix != nullptr) {
+            name += ".";
+            name += suffix;
+        }
+
+        return name;
+    }
+
+    operator std::string() const {
+        return str();
+    }
+
+    friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
+        return str == tn.str();
+    }
+
+    friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
+        return str != tn.str();
+    }
+};
+
 struct LLM_TN {
     LLM_TN(llm_arch arch) : arch(arch) {}
 
     llm_arch arch;
 
-    std::string operator()(llm_tensor tensor) const {
-        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
-            return "__missing__";
-        }
-        return LLM_TENSOR_NAMES.at(arch).at(tensor);
+    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
+        return { arch, tensor, suffix, bid, xid };
     }
 
-    std::string operator()(llm_tensor tensor, const char * suffix) const {
-        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
-            return "__missing__";
-        }
-        return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
-    }
-
-    std::string operator()(llm_tensor tensor, int bid) const {
-        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
-            return "__missing__";
-        }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
-    }
-
-    std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
-        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
-            return "__missing__";
-        }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
-    }
-
-    std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
-        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
-            return "__missing__";
-        }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
+    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
+        return { arch, tensor, nullptr, bid, xid };
     }
 };
 
@@ -2587,6 +2583,11 @@ struct llama_cparams {
 
 // TODO: separate into "llama_layer_enc" and "llama_layer_dec"
 struct llama_layer {
+    llama_layer() {
+        // initialize all pointers to NULL
+        std::memset(this, 0, sizeof(*this));
+    }
+
     // normalization
     struct ggml_tensor * attn_norm;
     struct ggml_tensor * attn_norm_b;
@@ -2667,9 +2668,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_shexp;
 
     // ff bias
-    struct ggml_tensor * ffn_gate_b = nullptr;
-    struct ggml_tensor * ffn_down_b = nullptr; // b2
-    struct ggml_tensor * ffn_up_b   = nullptr; // b3
+    struct ggml_tensor * ffn_gate_b;
+    struct ggml_tensor * ffn_down_b; // b2
+    struct ggml_tensor * ffn_up_b; // b3
     struct ggml_tensor * ffn_act;
 
     // mamba proj
@@ -2860,22 +2861,21 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab   vocab;
 
-    // TODO: should init all tensors to nullptr
-    struct ggml_tensor * tok_embd;
-    struct ggml_tensor * type_embd;
-    struct ggml_tensor * pos_embd;
-    struct ggml_tensor * tok_norm;
-    struct ggml_tensor * tok_norm_b;
+    struct ggml_tensor * tok_embd = nullptr;
+    struct ggml_tensor * type_embd = nullptr;
+    struct ggml_tensor * pos_embd = nullptr;
+    struct ggml_tensor * tok_norm = nullptr;
+    struct ggml_tensor * tok_norm_b = nullptr;
 
-    struct ggml_tensor * output_norm;
-    struct ggml_tensor * output_norm_b;
-    struct ggml_tensor * output;
-    struct ggml_tensor * output_b;
-    struct ggml_tensor * output_norm_enc;
+    struct ggml_tensor * output_norm = nullptr;
+    struct ggml_tensor * output_norm_b = nullptr;
+    struct ggml_tensor * output = nullptr;
+    struct ggml_tensor * output_b = nullptr;
+    struct ggml_tensor * output_norm_enc = nullptr;
 
     // classifier
-    struct ggml_tensor * cls;
-    struct ggml_tensor * cls_b;
+    struct ggml_tensor * cls = nullptr;
+    struct ggml_tensor * cls_b = nullptr;
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
@@ -2888,24 +2888,24 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // list of devices used in this model
     std::vector<ggml_backend_dev_t> devices;
 
-    std::vector<std::string> rpc_servers;
 
-    // layer -> buffer type mapping
-    struct layer_buft {
-        layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
-        layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
-        layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
+    // lists of buffer types used for each layer
+    using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+    buft_list_t cpu_buft_list;
+    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
 
-        ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
-        ggml_backend_buffer_type_t buft;        // everything else
+    struct layer_dev {
+        ggml_backend_dev_t dev;
+        buft_list_t * buft_list;
     };
-
-    layer_buft buft_input;
-    layer_buft buft_output;
-    std::vector<layer_buft> buft_layer;
+    layer_dev dev_input = {};
+    layer_dev dev_output = {};
+    std::vector<layer_dev> dev_layer;
 
     // contexts where the model tensors metadata is stored
     std::vector<struct ggml_context *> ctxs;
@@ -3391,104 +3391,47 @@ struct llama_lora_adapter {
 };
 
 static int llama_get_device_count(const llama_model & model) {
-    int count = (int) model.devices.size();
-
-#if defined(GGML_USE_RPC)
-    count += (int) model.rpc_servers.size();
-#endif
-
-    return count;
-
-    GGML_UNUSED(model);
+    return (int) model.devices.size();
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
-    ggml_backend_buffer_type_t buft = nullptr;
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
 
-    if (host_buffer) {
-        for (auto * dev : model.devices) {
-            buft = ggml_backend_dev_host_buffer_type(dev);
-            if (buft != nullptr) {
-                break;
-            }
+    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);
+    ggml_tensor * op_tensor = fn(ctx);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf;
         }
     }
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
 
-#if defined(GGML_USE_CPU_HBM)
-    buft = ggml_backend_cpu_hbm_buffer_type();
-#endif
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buf);
 
-    if (buft == nullptr) {
-        buft = ggml_backend_cpu_buffer_type();
-    }
-    return buft;
-
-    GGML_UNUSED(host_buffer);
+    return op_supported;
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-    if (device < (int)model.devices.size()) {
-        return ggml_backend_dev_buffer_type(model.devices[device]);
-    }
-    device -= (int)model.devices.size();
-
-#if defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(device);
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(model, true);
-    }
-    return buft;
-
-    GGML_UNUSED(model);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-    // find a backend that supports split buffers
-    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
-        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-
-        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
-        if (ggml_backend_split_buffer_type_fn) {
-            buft = ggml_backend_split_buffer_type_fn(tensor_split);
-            if (buft != nullptr) {
-                break;
-            }
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
         }
     }
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(model, fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_memory(const llama_model & model, int device) {
-    if (device < (int)model.devices.size()) {
-        ggml_backend_dev_t dev = model.devices[device];
-        size_t total;
-        size_t free;
-        ggml_backend_dev_memory(dev, &free, &total);
-        return free;
-    }
-
-    if (model.devices.size() > 0) {
-        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(model.devices[0]);
-        LLAMA_LOG_WARN("%s: failed to get free memmory of device:%d of backend:%s, for device id is out of range.\n", __func__, device, ggml_backend_reg_name(reg));
-    } else {
-        LLAMA_LOG_WARN("%s: failed to get free memmory of device, no devices in inputted model.\n", __func__);
-    }
-    return 1;
-
-    GGML_UNUSED(model);
-    GGML_UNUSED(device);
+    throw std::runtime_error(format("no suitable buffer type found"));
 }
 
 //
@@ -3524,33 +3467,24 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(kv_size);
 
-    // count used buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    if (offload) {
-        for (int64_t i = 0; i < n_layer; ++i) {
-            buft_layer_count[model.buft_layer[i].buft]++;
-        }
-    } else {
-        buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
-    }
-
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (auto & it : buft_layer_count) {
-        int n_layers = it.second;
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ 2u*n_layers*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-        if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
-            return false;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        if (ctx_map.count(buft) == 0) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = ctx;
+            cache.ctxs.push_back(ctx);
         }
-        ctx_map[it.first] = ctx;
-        cache.ctxs.push_back(ctx);
-    }
+        return ctx_map.at(buft);
+    };
 
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
@@ -3559,7 +3493,28 @@ static bool llama_kv_cache_init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-        struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+        const llama_model::buft_list_t * buft_list;
+        if (offload) {
+            buft_list = model.dev_layer.at(i).buft_list;
+        } else {
+            buft_list = &model.cpu_buft_list;
+        }
+        ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+            [&](ggml_context * ctx) {
+                ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+                if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+                    return k;
+                }
+                ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+                return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+            });
+        ggml_context * ctx = ctx_for_buft(buft);
+
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+            return false;
+        }
+
         ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
         ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
         ggml_format_name(k, "cache_k_l%d", i);
@@ -3570,8 +3525,9 @@ static bool llama_kv_cache_init(
 
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
+        auto * buft = it.first;
+        auto * ctx  = it.second;
+
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
@@ -4765,10 +4721,10 @@ struct llama_model_loader {
         return weight->tensor;
     }
 
-    struct ggml_tensor * require_tensor_meta(const char * name) const {
-        struct ggml_tensor * tensor = get_tensor_meta(name);
+    struct ggml_tensor * require_tensor_meta(const std::string & name) const {
+        struct ggml_tensor * tensor = get_tensor_meta(name.c_str());
         if (!tensor) {
-            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
         }
         return tensor;
     }
@@ -4777,19 +4733,6 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
-        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
-        ggml_set_name(tensor, ggml_get_name(cur));
-
-        if (duplicated) {
-            size_data += ggml_nbytes(cur);
-        } else {
-            n_created++;
-        }
-
-        return tensor;
-    }
-
     const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
@@ -4830,7 +4773,19 @@ struct llama_model_loader {
             return NULL;
         }
 
-        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
+        bool duplicated = flags & TENSOR_DUPLICATED;
+
+        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+        ggml_set_name(tensor, ggml_get_name(cur));
+
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }
+
+        return tensor;
+
     }
 
     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
@@ -4962,7 +4917,7 @@ struct llama_model_loader {
         std::vector<ggml_backend_event_t> events;
         std::vector<void *> host_ptrs;
         size_t buffer_idx = 0; // buffer to use for async loads
-        ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
+        ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
             if (use_mmap || check_tensors) {
                 return nullptr;
             }
@@ -4970,20 +4925,20 @@ struct llama_model_loader {
             // First determine if the backend supports the necessary features for async uploads.
             auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
             if (!buf) {
-                LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
+                LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
                 return nullptr;
             }
 
             auto * buft = ggml_backend_buffer_get_type(buf);
             auto * dev = ggml_backend_buft_get_device(buft);
             if (!dev) {
-                LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
+                LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
                     ggml_backend_buft_name(buft));
                 return nullptr;
             }
 
             if (buft != ggml_backend_dev_buffer_type(dev)) {
-                LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
+                LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
                     ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
                 return nullptr;
             }
@@ -4991,14 +4946,14 @@ struct llama_model_loader {
             ggml_backend_dev_props props;
             ggml_backend_dev_get_props(dev, &props);
             if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
-                LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
+                LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
                     ggml_backend_dev_name(dev));
                 return nullptr;
             }
 
             auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
             if (!host_buft) {
-                LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
+                LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
                     ggml_backend_dev_name(dev));
                 return nullptr;
             }
@@ -5007,7 +4962,7 @@ struct llama_model_loader {
             for (size_t idx = 0; idx < n_buffers; ++idx) {
                 auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
                 if (!buf) {
-                    LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
+                    LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                         ggml_backend_dev_name(dev));
                     return nullptr;
                 }
@@ -5017,7 +4972,7 @@ struct llama_model_loader {
 
                 auto * event = ggml_backend_event_new(dev);
                 if (!event) {
-                    LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
+                    LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
                         ggml_backend_dev_name(dev));
                     return nullptr;
                 }
@@ -5027,7 +4982,7 @@ struct llama_model_loader {
 
             ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
             if (!backend) {
-                LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
+                LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
                     ggml_backend_dev_name(dev));
                 return nullptr;
             }
@@ -7000,6 +6955,338 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 }
 
+enum llm_tensor_layer {
+    LLM_TENSOR_LAYER_INPUT,
+    LLM_TENSOR_LAYER_REPEATING,
+    LLM_TENSOR_LAYER_OUTPUT,
+};
+
+struct llm_tensor_info {
+    llm_tensor_layer layer;
+    ggml_op op;
+};
+
+static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+    {LLM_TENSOR_TOKEN_EMBD,                 {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_POS_EMBD,                   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_NORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_OUTPUT_NORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_ENC_OUTPUT_NORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_ROPE_FREQS,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+    {LLM_TENSOR_ROPE_FACTORS_LONG,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+    {LLM_TENSOR_ROPE_FACTORS_SHORT,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+    {LLM_TENSOR_ATTN_Q,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_QKV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_QKV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_OUT,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_Q,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_K,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_V,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_OUT,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_FFN_GATE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_FFN_DOWN,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_FFN_UP,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_OUT,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_FFN_GATE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_FFN_DOWN,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_FFN_UP,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_INP_SHEXP,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_INP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_IN,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_X,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_DT,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_OUT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_W1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_W2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_DECAY_W1,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_DECAY_W2,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_KEY,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_VALUE,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_RECEPTANCE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_GATE,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_OUTPUT,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CHANNEL_MIX_KEY,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CHANNEL_MIX_VALUE,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_ACT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
+    {LLM_TENSOR_SSM_CONV1D,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+    {LLM_TENSOR_SSM_A,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_D,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_LERP_X,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CHANNEL_MIX_LERP_R,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_LERP_W,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_K,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_V,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_R,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_G,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_DECAY,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_FIRST,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV}},
+    {LLM_TENSOR_ATTN_NORM,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_NORM_2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_OUT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_POST_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_NORM,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_POST_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_NORM_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_Q_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_K_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_LAYER_OUT_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_Q_A_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_KV_A_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_SUB_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_SUB_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_ATTN_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_NORM,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_FFN_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ENC_ATTN_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ENC_FFN_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_ATTN_REL_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_ENC_ATTN_REL_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_FFN_DOWN_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_GATE_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    // this tensor is loaded for T5, but never used
+    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+};
+
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+    GGML_ASSERT(w != nullptr);
+
+    if (op == GGML_OP_NONE) {
+        return true;
+    }
+
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_tensor * op_tensor = nullptr;
+
+    switch (op) {
+        case GGML_OP_GET_ROWS:
+            {
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+                op_tensor = ggml_get_rows(ctx, w, b);
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                op_tensor = ggml_mul_mat(ctx, w, b);
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+            } break;
+        case GGML_OP_ADD:
+            {
+                ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                op_tensor = ggml_add(ctx, a, w);
+            } break;
+        case GGML_OP_MUL:
+            {
+                ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                op_tensor = ggml_mul(ctx, a, w);
+            } break;
+        case GGML_OP_DIV:
+            {
+                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+                op_tensor = ggml_div(ctx, a, w);
+            } break;
+        case GGML_OP_ROPE:
+            {
+                int n_embd_head = hparams.n_embd_head_v;
+                int n_head = hparams.n_head();
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+                op_tensor = ggml_rope_ext(
+                    ctx, a, b, w,
+                    0, 0, 0, 0, 0,
+                    0, 0, 0, 0
+                );
+
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                // TODO: ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
+                op_tensor = ggml_ssm_conv(ctx, nullptr, w);
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                // TODO: ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
+                op_tensor = ggml_ssm_scan(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+            } break;
+        case GGML_OP_RWKV_WKV:
+            {
+                // TODO: ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+                op_tensor = ggml_rwkv_wkv(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+            } break;
+        default:
+            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+    }
+
+    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+    GGML_ASSERT(w->buffer == nullptr);
+    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+    ggml_backend_buffer_free(w->buffer);
+    w->buffer = nullptr;
+
+    ggml_free(ctx);
+
+    return op_supported;
+}
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) {
+    GGML_ASSERT(!buft_list.empty());
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
+            return cur_buft;
+        }
+    }
+    return nullptr;
+}
+
+// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
+    llama_model::buft_list_t buft_list;
+
+    // add ACCEL buffer types
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            auto * buft = ggml_backend_dev_buffer_type(dev);
+            // skip
+            if (buft != ggml_backend_cpu_buffer_type()) {
+                buft_list.emplace_back(dev, buft);
+            }
+        }
+    }
+
+    // add extra buffer types
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_cpu_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
+    // add a host buffer type
+    // storing the tensors in a host buffer is useful when the processing of large batches
+    // is offloaded to a GPU device, since it reduces the time spent on data transfers
+    // generally, this will be done using the first device in the list
+    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
+    // function of the device to determine if it would benefit from being stored in a host buffer
+    for (auto * dev : model.devices) {
+        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+        if (buft) {
+            buft_list.emplace_back(dev, buft);
+            break;
+        }
+    }
+
+    // add the CPU buffer type
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+        }
+    }
+
+    return buft_list;
+}
+
+// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
+static llama_model::buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
+    llama_model::buft_list_t buft_list;
+
+    // add the device split buffer type if requested and available
+    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
+            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+        if (ggml_backend_split_buffer_type_fn) {
+            size_t dev_index = [&]() {
+                auto * reg = ggml_backend_dev_backend_reg(dev);
+                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
+                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
+                        return i;
+                    }
+                }
+                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
+            }();
+            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
+            if (buft != nullptr) {
+                buft_list.emplace_back(dev, buft);
+            }
+        }
+    }
+
+    // add the device default buffer type
+    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+
+    return buft_list;
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors(
         llama_model_loader & ml,
@@ -7013,135 +7300,96 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
-    // check if the value of main_gpu is valid
-    if (llama_get_device_count(model) > 0 &&
-        split_mode != LLAMA_SPLIT_MODE_LAYER &&
-        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
-        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
-    }
-
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
     const int n_layer     = hparams.n_layer;
-    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     bool use_mmap_buffer = true;
 
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(model, true);
-    //model.buft_input = llama_default_buffer_type_offload(main_gpu);
-
-    model.buft_layer.resize(n_layer);
-
-    // assign cpu layers
-    for (int i = 0; i < i_gpu_start; ++i) {
-#ifdef GGML_USE_AMX
-        model.buft_layer[i] = {
-            ggml_backend_amx_buffer_type(),
-            llama_default_buffer_type_cpu(model, true)
-        };
-#else
-        model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
-#endif
+    // build a list of buffer types for the CPU and GPU devices
+    model.cpu_buft_list = make_cpu_buft_list(model);
+    for (auto * dev : model.devices) {
+        llama_model::buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
+        // add CPU buffer types as a fallback
+        buft_list.insert(buft_list.end(), model.cpu_buft_list.begin(), model.cpu_buft_list.end());
+        model.gpu_buft_list.emplace(dev, std::move(buft_list));
     }
 
-    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
-        // calculate the split points
-        int device_count = llama_get_device_count(model);
-        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-        std::vector<float> splits(device_count);
-        if (all_zero) {
-            // default split, by free memory
-            for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(model, i);
-            }
-        } else {
-            std::copy(tensor_split, tensor_split + device_count, splits.begin());
-        }
-
-        // sum and normalize the splits to get the split points
-        float split_sum = 0.0f;
+    // calculate the split points
+    int device_count = llama_get_device_count(model);
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
+    std::vector<float> splits(device_count);
+    if (all_zero) {
+        // default split, by free memory
         for (int i = 0; i < device_count; ++i) {
-            split_sum += splits[i];
-            splits[i] = split_sum;
-        }
-        for (int i = 0; i < device_count; ++i) {
-            splits[i] /= split_sum;
-        }
-
-        // assign the repeating layers to the devices according to the splits
-        int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
-        for (int i = i_gpu_start; i < n_layer; ++i) {
-            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
-        }
-        // assign the output layer
-        if (n_gpu_layers > n_layer) {
-            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
-        } else {
-            model.buft_output = llama_default_buffer_type_cpu(model, true);
+            ggml_backend_dev_t dev = model.devices[i];
+            size_t total;
+            size_t free;
+            ggml_backend_dev_memory(dev, &free, &total);
+            splits[i] = free;
         }
     } else {
-        ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
-        } else {
-            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(model, main_gpu);
-        }
-        // assign the repeating layers
-        for (int i = i_gpu_start; i < n_layer; ++i) {
-            model.buft_layer[i] = {
-                split_buft,
-                llama_default_buffer_type_offload(model, main_gpu)
-            };
-        }
-        // assign the output layer
-        if (n_gpu_layers > n_layer) {
-            model.buft_output = {
-                split_buft,
-                llama_default_buffer_type_offload(model, main_gpu)
-            };
-        } else {
-            model.buft_output = llama_default_buffer_type_cpu(model, true);
-        }
+        std::copy(tensor_split, tensor_split + device_count, splits.begin());
     }
 
-    // count used buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    buft_layer_count[model.buft_input.buft]++;
-    buft_layer_count[model.buft_input.buft_matrix]++;
-    buft_layer_count[model.buft_output.buft]++;
-    buft_layer_count[model.buft_output.buft_matrix]++;
-    for (int i = 0; i < n_layer; ++i) {
-        buft_layer_count[model.buft_layer[i].buft]++;
-        buft_layer_count[model.buft_layer[i].buft_matrix]++;
+    // sum and normalize the splits to get the split points
+    float split_sum = 0.0f;
+    for (int i = 0; i < device_count; ++i) {
+        split_sum += splits[i];
+        splits[i] = split_sum;
+    }
+    for (int i = 0; i < device_count; ++i) {
+        splits[i] /= split_sum;
     }
 
-    // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+    const int act_gpu_layers = model.devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
+    auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
+        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            return {cpu_dev, &model.cpu_buft_list};
+        }
+        int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
+        auto * dev = model.devices.at(layer_gpu);
+        return {dev, &model.gpu_buft_list.at(dev)};
+    };
 
-    // for moe merged tensors
-    ctx_size += ggml_tensor_overhead()*n_layer*3;
+    // assign the input layer
+    // there is very little benefit to offloading the input layer, so always keep it on the CPU
+    model.dev_input = { cpu_dev, &model.cpu_buft_list };
+
+    // assign the repeating layers to the devices according to the splits
+    model.dev_layer.resize(n_layer);
+    for (int il = 0; il < n_layer; ++il) {
+        model.dev_layer[il] = get_layer_buft_list(il);
+    }
+    // assign the output layer
+    model.dev_output = get_layer_buft_list(n_layer);
+
+    // one ggml context per buffer type
+    int max_n_tensors = ml.n_tensors;
+    max_n_tensors += 1;         // duplicated output tensor
+    max_n_tensors += n_layer*2; // duplicated rope freq tensors
+    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (auto & it : buft_layer_count) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-        if (!ctx) {
-            throw std::runtime_error(format("failed to create context"));
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        if (ctx_map.count(buft) == 0) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ ctx_size,
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                throw std::runtime_error(format("failed to create ggml context"));
+            }
+            ctx_map[buft] = ctx;
+            model.ctxs.push_back(ctx);
         }
-        ctx_map[it.first] = ctx;
-        model.ctxs.push_back(ctx);
-    }
-
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
+        return ctx_map.at(buft);
+    };
 
     // create tensors for the weights
     {
@@ -7166,15 +7414,107 @@ static bool llm_load_tensors(
             throw std::runtime_error("model has expert layers but no expert layers are used");
         }
 
-        ggml_context * ctx_input        = ctx_map.at(model.buft_input.buft);
-        ggml_context * ctx_output       = ctx_map.at(model.buft_output.buft);
-        ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
+        int n_moved_tensors = 0;
+        ggml_tensor * first_moved_tensor = nullptr;
+        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+        ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
 
-        auto ctx_for_layer       = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
-        auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
+            ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
+
+            if (!t_meta) {
+                if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
+                    return nullptr;
+                }
+                throw std::runtime_error(format("missing tensor %s", tn.str().c_str()));
+            }
+
+            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
+            // the tensor is duplicated
+            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
+            llm_tensor tn_tensor = tn.tensor;
+            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & llama_model_loader::TENSOR_DUPLICATED) {
+                tn_tensor = LLM_TENSOR_OUTPUT;
+            }
+
+            auto it = llm_tensor_info_mapping.find(tn_tensor);
+            if (it == llm_tensor_info_mapping.end()) {
+                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+            }
+            const auto & info = it->second;
+
+            // tensors with "bias" suffix are always used with GGML_OP_ADD
+            ggml_op op;
+            bool bias = strcmp(tn.suffix, "bias") == 0;
+            if (bias) {
+                op = GGML_OP_ADD;
+            } else {
+                op = info.op;
+            }
+
+            // sanity checks
+            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+                if (tn.bid != -1) {
+                    GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+                }
+            } else {
+                if (tn.bid == -1) {
+                    GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
+                }
+            }
+
+            // select the buffer type for this tensor
+            llama_model::buft_list_t * buft_list;
+            switch (info.layer) {
+                case LLM_TENSOR_LAYER_INPUT:
+                    buft_list = model.dev_input.buft_list;
+                    break;
+                case LLM_TENSOR_LAYER_OUTPUT:
+                    buft_list = model.dev_output.buft_list;
+                    break;
+                case LLM_TENSOR_LAYER_REPEATING:
+                    buft_list = model.dev_layer.at(tn.bid).buft_list;
+                    break;
+                default:
+                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
+            }
+
+            ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
+            if (!buft) {
+                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+            }
+
+            // avoid using a host buffer when using mmap
+            auto * buft_dev = ggml_backend_buft_get_device(buft);
+            if (ml.use_mmap && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+            }
+
+            if (buft != buft_list->front().second) {
+                n_moved_tensors++;
+                if (!first_moved_tensor) {
+                    first_moved_tensor = t_meta;
+                    first_moved_from_buft = buft_list->front().second;
+                    first_moved_to_buft   = buft;
+                }
+            }
+
+            ggml_context * ctx = ctx_for_buft(buft);
+
+            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+            if (flags & llama_model_loader::TENSOR_DUPLICATED) {
+                ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+                if (t) {
+                    return t;
+                }
+            }
+            return ml.create_tensor(ctx, tn, ne, flags);
+        };
 
         model.layers.resize(n_layer);
 
+        // TODO: move to a separate function
         const auto tn = LLM_TN(model.arch);
         switch (model.arch) {
             case LLM_ARCH_LLAMA:
@@ -7183,82 +7523,51 @@ static bool llm_load_tensors(
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
                         // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 
                         if (n_expert == 0) {
-                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 
                             // optional MLP bias
-                            layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         } else {
-                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-
-                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            if (layer.ffn_gate_exps) {
-                                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
-                                layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
-                            } else {
-                                // merge split expert into a single tensor for compatibility with older models
-                                // requires disabling mmap
-                                use_mmap_buffer = false;
-
-                                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
-                                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
-                                ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, 0).c_str())->type;
-
-                                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd,   n_ff, n_expert);
-                                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down,   n_ff, n_embd, n_expert);
-                                layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd,   n_ff, n_expert);
-
-                                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
-                                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
-                                ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i).c_str());
-
-                                for (uint32_t x = 0; x < n_expert; ++x) {
-                                    // the individual experts are loaded into a view of the merged tensor
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
-                                }
-                            }
+                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
                         }
                     }
                 } break;
@@ -7269,45 +7578,40 @@ static bool llm_load_tensors(
 
                     const int64_t q_lora_rank  = hparams.n_lora_q;
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
 
-                        layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
 
-                        layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
-                        layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
 
-                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
-                        layer.wkv_b     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
-                        layer.wo        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd});
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 
-                        layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
                 } break;
             case LLM_ARCH_GROK:
@@ -7316,904 +7620,782 @@ static bool llm_load_tensors(
                         throw std::runtime_error("Grok model cannot have zero experts");
                     }
 
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
 
-                        if (layer.ffn_gate_exps) {
-                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
-                            layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
-                        } else {
-                            // merge split expert into a single tensor for compatibility with older models
-                            // requires disabling mmap
-                            use_mmap_buffer = false;
-
-                            ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
-                            ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
-                            ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, 0).c_str())->type;
-
-                            layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd,   n_ff, n_expert);
-                            layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down,   n_ff, n_embd, n_expert);
-                            layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd,   n_ff, n_expert);
-
-                            ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
-                            ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
-                            ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i).c_str());
-
-                            for (uint32_t x = 0; x < n_expert; ++x) {
-                                // the individual experts are loaded into a view of the merged tensor
-                                ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
-                                ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
-                                ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
-                            }
-                        }
-
-                        layer.layer_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_DBRX:
-            {
-                if (n_expert == 0) {
-                    throw std::runtime_error("DBRX model cannot have zero experts");
-                }
-
-                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                // output
                 {
-                    model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                }
+                    if (n_expert == 0) {
+                        throw std::runtime_error("DBRX model cannot have zero experts");
+                    }
 
-                for (int i = 0; i < n_layer; ++i) {
-                    ggml_context * ctx_layer = ctx_for_layer(i);
-                    ggml_context * ctx_split = ctx_for_layer_split(i);
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-                    auto & layer = model.layers[i];
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
-                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
 
-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                    layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                    layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                    layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert});
-                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert});
-                    layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert});
-                }
-            } break;
+                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                    }
+                } break;
             case LLM_ARCH_BAICHUAN:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                     {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_FALCON:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
                     {
-                        model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
 
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (!model.output) {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
+                            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                         }
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_STARCODER:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
 
                     // output
                     {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                        model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (!model.output) {
                             // needs to be on GPU
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                         }
 
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
-                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff});
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_BERT:
             case LLM_ARCH_NOMIC_BERT:
                 {
-                    model.tok_embd     = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
-                    model.type_embd    = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+                    model.tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+                    model.type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
 
                     if (model.arch == LLM_ARCH_BERT) {
-                        model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train});
+                        model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
 
-                        model.cls   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        model.cls_out   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
 
-                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
                         if (model.arch == LLM_ARCH_BERT) {
-                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd});
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd}, 0);
 
-                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa});
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa}, 0);
 
-                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa});
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa}, 0);
                         } else {
-                            layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                         }
 
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd});
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
-                        layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
+                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN,      "weight", i), {n_ff, n_embd});
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN,      "weight", i), {n_ff, n_embd}, 0);
 
                         if (model.arch == LLM_ARCH_BERT) {
-                            layer.bo         = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-                            layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff});
-                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                            layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
                         } else {
-                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         }
 
-                        layer.layer_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
-                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd});
+                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_JINA_BERT_V2:
                 {
-                    model.tok_embd  = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // word_embeddings
-                    model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); // token_type_embeddings
+                    model.tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
+                    model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
 
-                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
-                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}); //LayerNorm bias
+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); //LayerNorm bias
 
-                    model.cls   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         llama_model_loader::TENSOR_NOT_REQUIRED);
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i]; // JinaBertLayer
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
 
-                        layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa});
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
 
-                        layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa});
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
 
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}); //output_dens
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); //output_dens
 
-                        layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
-                        layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd});
+                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
+                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
 
-                        layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
 
-                        layer.layer_out_norm   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
-                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd});
+                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_BLOOM:
                 {
-                    model.tok_embd   = ml.create_tensor(ctx_input,  tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab});
-                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
+                    model.tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd});
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_MPT:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        if (!model.output) {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
-                        }
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (!model.output) {
+                        model.output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         // AWQ ScaleActivation layer
-                        layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
                 } break;
             case LLM_ARCH_STABLELM:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm =   ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                         // optional bias tensors, present in Stable LM 2 1.6B
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         // optional q and k layernorms, present in StableLM 2 12B
-                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_QWEN:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3});
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
                     }
                 } break;
             case LLM_ARCH_QWEN2:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                         // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_QWEN2MOE:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                         // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
 
-                        GGML_ASSERT(n_expert      > 0);
-                        GGML_ASSERT(n_expert_used > 0);
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
+                        }
 
                         // MoE branch
                         const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
 
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
-                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
-                        layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
 
                         // Shared expert branch
                         const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
 
-                        layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp});
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd});
-                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp});
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
+                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
                     }
                 } break;
             case LLM_ARCH_PHI2:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                        model.output_b      = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+                    model.output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         if (layer.wqkv == nullptr) {
-                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
 
-                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa});
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);
 
-                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa});
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
                         }
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_PHI3:
                 {
                     const int64_t n_embd_head = n_embd / n_head;
 
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
 
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
-                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
 
-                        layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
                 } break;
             case LLM_ARCH_PLAMO:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_GPT2:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_CODESHELL:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_ORION:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_INTERNLM2:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
 
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_GEMMA:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_GEMMA2:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-                        layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_STARCODER2:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
 
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                         // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 
                         // optional bias tensors
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff});
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_MAMBA:
@@ -8224,284 +8406,252 @@ static bool llm_load_tensors(
                     const int64_t dt_rank = hparams.ssm_dt_rank;
 
                     // only an expansion factor of 2 is supported for now
-                    GGML_ASSERT(2 * n_embd == d_inner);
+                    if (2 * n_embd != d_inner) {
+                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
+                    }
 
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
 
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
                         // norm
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner});
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
 
-                        layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner});
-                        layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner});
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
 
-                        layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state});
+                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
 
-                        layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner});
-                        layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner});
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
 
                         // no "weight" suffix for these
-                        layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner});
-                        layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner});
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
 
                         // out_proj
-                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_XVERSE:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_COMMAND_R:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        // init output from the input tok embed
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    // init output from the input tok embed
+                    model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
                         if (n_layer >= 64){
-                            layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
-                            layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
                         }
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_OLMOE:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd});
-                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
 
-                        GGML_ASSERT(n_expert      > 0);
-                        GGML_ASSERT(n_expert_used > 0);
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0");
+                        }
 
                         // MoE branch
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert});
-                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert});
-                        layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert});
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
                     }
                 } break;
             case LLM_ARCH_OPENELM:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        // init output from the input tok embed
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    // init output from the input tok embed
+                    model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 
                     for (int i = 0; i < n_layer; ++i) {
                         const int64_t n_head      =   hparams.n_head(i);
                         const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
                         const int64_t n_ff        =   hparams.n_ff(i);
 
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k});
-                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k});
-                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_GPTNEOX:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_ARCTIC:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-                        layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
-                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
-                        layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
                     }
                 } break;
             case LLM_ARCH_DEEPSEEK2:
@@ -8517,349 +8667,313 @@ static bool llm_load_tensors(
                     const int64_t n_ff_exp        = hparams.n_ff_exp;
                     const int64_t n_expert_shared = hparams.n_expert_shared;
 
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                         if (!is_lite) {
-                            layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
                         }
 
-                        layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
 
                         if (!is_lite) {
-                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
-                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
                         } else {
-                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                         }
 
-                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
-                        layer.wkv_b     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
-                        layer.wo        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd});
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
                         if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                         } else {
-                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
 
-                            GGML_ASSERT(n_expert      > 0);
-                            GGML_ASSERT(n_expert_used > 0);
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
 
                             // MoE branch
-                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
-                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
-                            layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
 
                             // Shared expert branch
-                            layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
-                            layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd});
-                            layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared});
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                         }
                     }
                 } break;
             case LLM_ARCH_BITNET:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd});
-                        layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
+                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
+                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.wk       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.wv       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.wo       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd});
-                        layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
+                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
+                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
 
-                        layer.ffn_gate       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                        layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_down       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_up         = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_scale   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_scale   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
                 } break;
             case LLM_ARCH_T5:
                 {
                     const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
 
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm     = ml.create_tensor(ctx_output, tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
 
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm_enc  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd});
-                        layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
 
-                        layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up_enc   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 
-                        layer.attn_norm  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd});
-                        layer.attn_rel_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
 
-                        layer.attn_norm_cross  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd});
+                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
                         // this tensor seems to be unused in HF transformers implementation
-                        layer.attn_rel_b_cross = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wq_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wk_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_T5ENCODER:
                 {
                     const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
 
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm_enc  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd});
-                        layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
 
-                        layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up_enc   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_JAIS:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-                    // Output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    // output
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
 
-                        layer.ffn_gate   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff});
+                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_CHATGLM:
                 {
-                    model.tok_embd   = ml.create_tensor(ctx_input,  tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab});
+                    model.tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
 
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2});
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_NEMOTRON:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,   tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split,  tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                         // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
 
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
 
                         // optional MLP bias
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
                 } break;
             case LLM_ARCH_EXAONE:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_RWKV6:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // Block 0, LN0
-                    model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+                    model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
 
                     // output
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
 
                     const int time_mix_extra_dim = hparams.time_mix_extra_dim;
                     const int time_decay_extra_dim = hparams.time_decay_extra_dim;
@@ -8868,90 +8982,88 @@ static bool llm_load_tensors(
                     const int ffn_size = hparams.n_ff_arr[0];
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
 
-                        layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
-                        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd});
+                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
 
-                        layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
-                        layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
+                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
 
-                        layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
-                        layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
-                        layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
-                        layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
-                        layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
-                        layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
+                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0);
 
-                        layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
-                        layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
-                        layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
-                        layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
-                        layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
-                        layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
-                        layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
-                        layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
+                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
+                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
 
-                        layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
-                        layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
-                        layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
+                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
 
-                        layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
-                        layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
 
-                        layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
-                        layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
-                        layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
+                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
                     }
 
                 } break;
             case LLM_ARCH_CHAMELEON:
                 {
-                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                 model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                  // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
-                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
-                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
+
+        if (n_moved_tensors > 0) {
+            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+        }
     }
 
     ml.done_getting_tensors();
@@ -8964,27 +9076,29 @@ static bool llm_load_tensors(
     ctx_bufs.reserve(ctx_map.size());
 
     // Ensure we have enough capacity for the maximum backend buffer we will potentially create
-    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     model.bufs.reserve(n_max_backend_buffer);
 
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx              = it.second;
 
+        // skip contexts without tensors
+        if (ggml_get_first_tensor(ctx) == nullptr) {
+            continue;
+        }
+
         llama_buf_map bufs;
         bufs.reserve(n_max_backend_buffer);
 
-        // check if this backend device supports buffer_from_host_ptr
-        // when using a host buffer as the CPU bakcend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? ggml_backend_cpu_buffer_type() : buft);
-        bool buffer_from_host_ptr_supported = false;
-        if (dev) {
-            ggml_backend_dev_props props;
-            ggml_backend_dev_get_props(dev, &props);
-            buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
-        }
+        // check if it is possible to use buffer_from_host_ptr with this buffer type
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
                 // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
@@ -9027,7 +9141,7 @@ static bool llm_load_tensors(
 
         for (auto & buf : bufs) {
             // indicate that this buffer contains weights
-            // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
+            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }
 
@@ -9039,7 +9153,7 @@ static bool llm_load_tensors(
 
         LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
+            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
         }
 
         const int max_backend_supported_layers = hparams.n_layer + 1;
@@ -9048,9 +9162,9 @@ static bool llm_load_tensors(
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     }
 
-    // print memory requirements
+    // print memory requirements per buffer type
     for (ggml_backend_buffer_t buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
@@ -9115,23 +9229,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             return 0;
         }
 
-#ifdef GGML_USE_KOMPUTE
-        if (params.n_gpu_layers > 0 && (
-            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
-            || !(
-                model.ftype == LLAMA_FTYPE_ALL_F32 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
-            )
-        )) {
-            // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
-            LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
-            params.n_gpu_layers = 0;
-        }
-#endif
-
         if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
@@ -10210,7 +10307,7 @@ struct llm_build_context {
                 cb(tmp, "K_f32", il);
                 for (auto * backend : lctx.backends) {
                     // Figure out which backend KV cache belongs to
-                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) {
+                    if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
                         break;
                     }
@@ -15184,6 +15281,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
+        // FIXME: do not use model.tok_embd directly, duplicate as model.output
         cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
         cb(cur, "result_output", -1);
 
@@ -16334,11 +16432,12 @@ static struct ggml_cgraph * llama_build_graph(
         const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
+                const auto & dev_layer = lctx.model.dev_layer.at(il);
                 for (auto * backend : lctx.backends) {
-                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
-                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
-                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-                        break;
+                    if (ggml_backend_get_device(backend) == dev_layer.dev) {
+                        if (ggml_backend_supports_op(backend, cur)) {
+                            ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        }
                     }
                 }
             }
@@ -17041,7 +17140,22 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
             lctx.embd = nullptr;
         }
 
-        lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
+        auto * buft = ggml_backend_cpu_buffer_type();
+        // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
+        ggml_tensor * output_tensor = lctx.model.output;
+        if (!output_tensor) {
+            // bert models don't have an output tensor, use the last layer
+            output_tensor = lctx.model.layers.back().layer_out_norm;
+        }
+        if (output_tensor) {
+            auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
+            auto * output_dev = ggml_backend_buft_get_device(output_buft);
+            auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
+            if (output_dev_host_buft) {
+                buft = output_dev_host_buft;
+            }
+        }
+        lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
         if (lctx.buf_output == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
             return 0;
@@ -18832,7 +18946,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
 
     // contexts for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
@@ -18894,7 +19008,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
             ggml_free(ctx);
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
         }
-        struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
             gguf_free(ctx_gguf);
@@ -18953,7 +19067,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         }
     }
 
-    LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 
     // free ctx for reading gguf
     gguf_free(ctx_gguf);
@@ -19092,14 +19206,8 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_KOMPUTE)
-    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-    return true;
-#else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
-           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
            llama_supports_rpc();
-#endif
 }
 
 bool llama_supports_rpc(void) {
@@ -19189,8 +19297,7 @@ struct llama_model * llama_load_model_from_file(
             return nullptr;
         }
 
-        // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
-        using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+        typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
         ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
         if (!ggml_backend_rpc_add_device_fn) {
             LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
@@ -19217,22 +19324,34 @@ struct llama_model * llama_load_model_from_file(
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
         switch (ggml_backend_dev_type(dev)) {
             case GGML_BACKEND_DEVICE_TYPE_CPU:
-            case GGML_BACKEND_DEVICE_TYPE_CPU_FULL:
-                // skip CPU backends since they are `handled separately
+            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                // skip CPU backends since they are handled separately
                 break;
 
             case GGML_BACKEND_DEVICE_TYPE_GPU:
-            case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
-            {
-                size_t free, total; // NOLINT
-                ggml_backend_dev_memory(dev, &free, &total);
-                LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
                 model->devices.push_back(dev);
                 break;
-            }
         }
     }
 
+    // if using single GPU mode, remove all except the main GPU
+    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
+            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
+            llama_free_model(model);
+            return nullptr;
+        }
+        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+        model->devices.clear();
+        model->devices.push_back(main_gpu);
+    }
+
+    for (auto * dev : model->devices) {
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(dev, &free, &total);
+        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+    }
+
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
@@ -19393,53 +19512,21 @@ struct llama_context * llama_new_context_with_model(
     GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
 
     if (!hparams.vocab_only) {
-        // initialize backends
-        int main_gpu = model->main_gpu;
-
-        // with registry
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
-                ggml_backend_dev_t main_dev = model->devices[main_gpu];
-                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            for (auto * dev : model->devices) {
-                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-        if (main_gpu >= (int)model->devices.size()) {
-            main_gpu -= (int)model->devices.size();
-        }
-
-#if defined(GGML_USE_KOMPUTE)
-        if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(main_gpu);
+        // GPU backends
+        for (auto * dev : model->devices) {
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
         }
-#endif
 
-        // add other backends (such as BLAS)
+        // add ACCEL backends (such as BLAS)
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
                 ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
@@ -19450,6 +19537,7 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 
+        // add CPU backend
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -19513,12 +19601,16 @@ struct llama_context * llama_new_context_with_model(
             // buffer types used for the compute buffer of each backend
             std::vector<ggml_backend_buffer_type_t> backend_buft;
             for (auto * backend : ctx->backends) {
-                if (ggml_backend_is_cpu(backend)) {
-                    // use host buffers for the CPU backend compute buffer
-                    backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
-                } else {
-                    backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+                auto * buft = ggml_backend_get_default_buffer_type(backend);
+                if (ggml_backend_is_cpu(backend) && !model->devices.empty()) {
+                    // use the host buffer of the first device CPU for faster transfer of the intermediate state
+                    auto * dev = model->devices[0];
+                    auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+                    if (host_buft) {
+                        buft = host_buft;
+                    }
                 }
+                backend_buft.push_back(buft);
             }
 
             const size_t max_nodes = llama_model_max_nodes(*model);
@@ -19542,11 +19634,6 @@ struct llama_context * llama_new_context_with_model(
                         continue;
                     }
                     auto * dev = ggml_backend_get_device(backend);
-                    if (!dev) {
-                        // backend is using old interface, not supported
-                        pipeline_parallel = false;
-                        break;
-                    }
                     ggml_backend_dev_props props;
                     ggml_backend_dev_get_props(dev, &props);
                     if (!props.caps.async || !props.caps.events) {
@@ -19563,15 +19650,29 @@ struct llama_context * llama_new_context_with_model(
                 LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
             }
 
-            // build worst-case graph
+            // initialize scheduler with the worst-case graph
             uint32_t n_seqs = 1; // TODO: worst-case number of sequences
             uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
             llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-            llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-            ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true);
 
-            // initialize scheduler with the worst-case graph
-            if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
+            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+            ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
+
+            // reserve pp graph first so that buffers are only allocated once
+            ggml_backend_sched_reserve(ctx->sched, gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
+            int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
+
+            // reserve with tg graph to get the number of splits and nodes
+            llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+            ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
+            ggml_backend_sched_reserve(ctx->sched, gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
+            int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
+
+            // reserve again with pp graph to avoid ggml-alloc reallocations during inference
+            gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
+            if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
                 LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                 llama_free(ctx);
                 return nullptr;
@@ -19588,10 +19689,16 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
 
-            // note: the number of splits during measure is higher than during inference due to the kv shift
-            int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, ggml_graph_n_nodes(gf));
-            LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
+            if (n_nodes_pp == n_nodes_tg) {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, n_nodes_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+            }
+            if (n_splits_pp == n_splits_tg) {
+                LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+            }
         }
     }
 
@@ -19851,40 +19958,46 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     GGML_ASSERT(cvec.ctxs.empty());
     GGML_ASSERT(cvec.bufs.empty());
 
-    // count layer buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
-        buft_layer_count[model.buft_layer[i].buft]++;
-    }
-
-    // allocate contexts
+    // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (auto & it : buft_layer_count) {
-        int n_layers = it.second;
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-        if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
-            return 1;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        if (ctx_map.count(buft) == 0) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ model.hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = ctx;
+            cvec.ctxs.push_back(ctx);
         }
-        ctx_map[it.first] = ctx;
-    }
+        return ctx_map.at(buft);
+    };
+
 
     // make tensors
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
+            [&](ggml_context * ctx) {
+                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+                return ggml_add(ctx, cur, layer_dir);
+            });
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
         cvec.tensors.push_back(tensor);
     }
 
     // allocate tensors / buffers and zero
-    cvec.ctxs.reserve(ctx_map.size());
     cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
@@ -19895,7 +20008,6 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.ctxs.push_back(ctx);
         cvec.bufs.push_back(buf);
     }
 
@@ -21218,7 +21330,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
                 throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
             }
         } else if ((size_t) i >= ctx->output_ids.size()) {
-            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
         } else {
             j = ctx->output_ids[i];
         }

From fc83a9e58479e4dd70054daa7afe5184c1bbe545 Mon Sep 17 00:00:00 2001
From: xctan <axunlei@gmail.com>
Date: Wed, 30 Oct 2024 15:00:40 +0800
Subject: [PATCH 122/396] ggml : add Q4_0_8_8 RISC-V GEMV and GEMM kernels
 (#10029)

* ggml : RISC-V vector gemv for q4_0_8x8

* ggml : Added WIP rvv q4_0_8x8 gemm

* ggml : Added initial implementation of rvv gemm

* ggml : optimize gemm to avoid register spillover

* ggml : Fix GCC rvv load alignment issue

* ggml : Format gemm rvv code

* ggml : Fix a typo in RVV q4_0_8_8 GEMM
---
 ggml/src/ggml-aarch64.c | 268 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)

diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index b27f41147..eb30f8944 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -991,6 +991,73 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
     return;
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+            for (int l = 0; l < nb; l++) {
+                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+                const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+                const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+                const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
+                const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                // vector version needs Zvfhmin extension
+                const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
+                const float b_scales[8] = {
+                    GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                };
+                const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
+                sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
+            }
+            __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
+        }
+        return;
+    }
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
     {
         float sumf[8];
@@ -3171,6 +3238,207 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                 }
             }
         }
+        return;
+    }
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+                vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                for (int l = 0; l < nb; l++) {
+                    const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                    const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                    const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                    const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                    const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                    const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                    const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                    // vector version needs Zvfhmin extension
+                    const float a_scales[4] = {
+                        GGML_FP16_TO_FP32(a_ptr[l].d[0]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[1]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[2]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[3])
+                    };
+                    const float b_scales[8] = {
+                        GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                    };
+                    const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+
+                    const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
+                    const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
+                    const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
+                    const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l0;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l0 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
+                        sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
+                    const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
+                    const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
+                    const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l1;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l1 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
+                        sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
+                    const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
+                    const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
+                    const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l2;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l2 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
+                        sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
+                    const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
+                    const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
+                    const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l3;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l3 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+                    }
+                }
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+            }
+        }
+
         return;
     }
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)

From 79a2bc042dcacaad59306865208a8c8c3149e3ea Mon Sep 17 00:00:00 2001
From: Rich Dougherty <rich@rd.nz>
Date: Thu, 31 Oct 2024 01:22:21 +1300
Subject: [PATCH 123/396] convert : more detailed convert lora usage docs
 (#10065)

---
 convert_lora_to_gguf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index bc68f68af..915e21836 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -230,7 +230,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -257,11 +257,11 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--base", type=Path, required=True,
-        help="directory containing base model file",
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
     )
     parser.add_argument(
         "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
     )
 
     return parser.parse_args()

From 6763f713bb692910e9b2d9d1a82d6959cee2dcf3 Mon Sep 17 00:00:00 2001
From: Rich Dougherty <rich@rd.nz>
Date: Thu, 31 Oct 2024 01:22:39 +1300
Subject: [PATCH 124/396] readme : more lora detail in main example readme
 (#10064)

---
 examples/main/README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/main/README.md b/examples/main/README.md
index 5357ac2e2..145216938 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -333,6 +333,15 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
 
+## LoRA (Low-Rank Adaptation) adapters
+
+-   `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters.
+-   `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can repeated to use multiple adapters.
+
+You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`.
+
+LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed.
+
 ## Additional Options
 
 These options provide extra functionality and customization when running the LLaMA models:
@@ -341,6 +350,4 @@ These options provide extra functionality and customization when running the LLa
 -   `--verbose-prompt`: Print the prompt before generating text.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
--   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
--   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 -   `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable  or in an OS-specific local cache.

From b9e02e8184f5e6094a9e87eaf040becd404bfc90 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 30 Oct 2024 14:51:21 +0100
Subject: [PATCH 125/396] ggml : fix memory leaks when loading invalid gguf
 files (#10094)

* ggml : fix gguf string leak when reading kv pairs fails

* ggml : avoid crashing with GGML_ABORT when the KV has an invalid type

* ggml : avoid crashing on failed memory allocations when loading a gguf file
---
 ggml/src/ggml.c | 67 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 54 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a8da10d79..0d99b0791 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22136,7 +22136,11 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
         return false;
     }
 
-    p->data = GGML_CALLOC(p->n + 1, 1);
+    p->data = calloc(p->n + 1, 1);
+    if (!p->data) {
+        fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n);
+        return false;
+    }
 
     ok = ok && gguf_fread_el(file,  p->data, p->n, offset);
 
@@ -22170,7 +22174,11 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
+    struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
+    if (!ctx) {
+        fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
+        return NULL;
+    }
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version   = GGUF_VERSION;
@@ -22216,7 +22224,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
+    struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
+    if (!ctx) {
+        fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
+        fclose(file);
+        return NULL;
+    }
 
     // read the header
     {
@@ -22255,9 +22268,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         const uint64_t n_kv = ctx->header.n_kv;
 
-        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
-        ctx->header.n_kv = 0;
-        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+        ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
+        if (!ctx->kv) {
+            fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
 
         for (uint64_t i = 0; i < n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
@@ -22308,7 +22325,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
+                                    kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
+                                    if (!kv->value.arr.data) {
+                                        fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
+                                        fclose(file);
+                                        gguf_free(ctx);
+                                        return NULL;
+                                    }
 
                                     ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                                 } break;
@@ -22322,24 +22345,36 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
+                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
+                                    if (!kv->value.arr.data) {
+                                        fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
+                                        fclose(file);
+                                        gguf_free(ctx);
+                                        return NULL;
+                                    }
 
                                     for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                         ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                     }
                                 } break;
                             case GGUF_TYPE_ARRAY:
-                            default: GGML_ABORT("invalid type");
+                            default:
+                                {
+                                    fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type);
+                                    ok = false;
+                                } break;
                         }
                     } break;
-                default: GGML_ABORT("invalid type");
+                default:
+                    {
+                        fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type);
+                        ok = false;
+                    } break;
             }
 
             if (!ok) {
                 break;
             }
-
-            ctx->header.n_kv++;
         }
 
         if (!ok) {
@@ -22352,7 +22387,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     if (ctx->header.n_tensors > 0) {
-        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
+        ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
+        if (!ctx->infos) {
+            fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];

From 61408e7fad082dc44a11c8a9f1398da4837aad44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20L=C3=B3pez?= <slp@redhat.com>
Date: Wed, 30 Oct 2024 17:01:52 +0100
Subject: [PATCH 126/396] kompute: add backend registry / device interfaces
 (#10045)

Get in line with the other backends by supporting the newer
backend/device registry interfaces.

Signed-off-by: Sergio Lopez <slp@redhat.com>
---
 ggml/include/ggml-kompute.h |   4 +
 ggml/src/ggml-backend.cpp   |   9 +-
 ggml/src/ggml-kompute.cpp   | 251 ++++++++++++++++++++++++++++--------
 3 files changed, 206 insertions(+), 58 deletions(-)

diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h
index 171465456..c0c43521b 100644
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -11,6 +11,8 @@
 extern "C" {
 #endif
 
+#define GGML_KOMPUTE_MAX_DEVICES 16
+
 struct ggml_vk_device {
     int index;
     int type; // same as VkPhysicalDeviceType
@@ -41,6 +43,8 @@ GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 
 GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
 
+GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fd574887f..f397f6252 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -562,6 +562,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_KOMPUTE
+#include "ggml-kompute.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -591,8 +595,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_AMX
         register_backend(ggml_backend_amx_reg());
 #endif
-
-        // TODO: kompute
+#ifdef GGML_USE_KOMPUTE
+        register_backend(ggml_backend_kompute_reg());
+#endif
 
         register_backend(ggml_backend_cpu_reg());
     }
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 1f2220234..fea69fb04 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -42,6 +42,7 @@
 #include <cstring>
 #include <iostream>
 #include <memory>
+#include <mutex>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
@@ -273,18 +274,9 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
     return results;
 }
 
-// public API returns a C-style array
-ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) {
-    auto devices = ggml_vk_available_devices_internal(memoryRequired);
-    *count = devices.size();
-    if (devices.empty()) {
-        return nullptr;
-    }
-
-    size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
-    auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
-    memcpy(arr, devices.data(), nbytes);
-    return arr;
+static std::vector<ggml_vk_device>& ggml_vk_available_devices() {
+    static std::vector<ggml_vk_device> devices = ggml_vk_available_devices_internal(0);
+    return devices;
 }
 
 static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
@@ -341,7 +333,7 @@ ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
 
-    auto devices = ggml_vk_available_devices_internal(0);
+    auto devices = ggml_vk_available_devices();
     ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
     GGML_ASSERT(!devices.empty());
     return devices.front();
@@ -1323,17 +1315,7 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
     ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
 }
 
-static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
-    switch (op->type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            break;
-        default:
-            return false;
-    }
-
+static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -1410,6 +1392,8 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             ;
     }
     return false;
+
+    GGML_UNUSED(dev);
 }
 
 static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
@@ -1458,11 +1442,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 
             any_commands_recorded = true;
 
-            if (!ggml_vk_supports_op(dst)) {
-                 fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                 GGML_ABORT("unsupported op");
-            }
-
             const int32_t ne00 = src0 ? src0->ne[0] : 0;
             const int32_t ne01 = src0 ? src0->ne[1] : 0;
             const int32_t ne02 = src0 ? src0->ne[2] : 0;
@@ -1907,25 +1886,31 @@ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
 };
 
 ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
-    static std::vector<ggml_backend_buffer_type> bufts = []() {
-        std::vector<ggml_backend_buffer_type> vec;
-        auto devices = ggml_vk_available_devices_internal(0);
-        vec.reserve(devices.size());
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
 
-        for (const auto & dev : devices) {
-            vec.push_back({
-                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
-                /* .device  = */ nullptr,
-                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
-            });
+    auto devices = ggml_vk_available_devices();
+    int32_t device_count = (int32_t) devices.size();
+    GGML_ASSERT(device < device_count);
+    GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES);
+
+    static ggml_backend_buffer_type
+        ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES];
+
+    static bool ggml_backend_kompute_buffer_type_initialized = false;
+
+    if (!ggml_backend_kompute_buffer_type_initialized) {
+        for (int32_t i = 0; i < device_count; i++) {
+            ggml_backend_kompute_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_kompute_buffer_type_interface,
+                /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i),
+                /* .context  = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc },
+            };
         }
-        return vec;
-    }();
+        ggml_backend_kompute_buffer_type_initialized = true;
+    }
 
-    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
-        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
-    });
-    return it < bufts.end() ? &*it : nullptr;
+    return &ggml_backend_kompute_buffer_types[device];
 }
 
 // backend
@@ -1953,16 +1938,6 @@ static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, st
     return GGML_STATUS_SUCCESS;
 }
 
-static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    GGML_UNUSED(backend);
-    return ggml_vk_supports_op(op);
-}
-
-static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(backend);
-    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
-}
-
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name                = */ ggml_backend_kompute_name,
     /* .free                    = */ ggml_backend_kompute_free,
@@ -1991,7 +1966,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_kompute_guid(),
         /* .interface = */ kompute_backend_i,
-        /* .device    = */ nullptr,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device),
         /* .context   = */ s_kompute_context,
     };
 
@@ -2001,3 +1976,167 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
 bool ggml_backend_is_kompute(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
 }
+
+static size_t ggml_backend_kompute_get_device_count() {
+    auto devices = ggml_vk_available_devices();
+    return devices.size();
+}
+
+static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) {
+    auto devices = ggml_vk_available_devices();
+    GGML_ASSERT((size_t) device < devices.size());
+    snprintf(description, description_size, "%s", devices[device].name);
+}
+
+static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
+    auto devices = ggml_vk_available_devices();
+    GGML_ASSERT((size_t) device < devices.size());
+    *total = devices[device].heapSize;
+    *free = devices[device].heapSize;
+}
+
+//////////////////////////
+
+struct ggml_backend_kompute_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    ggml_backend_kompute_get_device_memory(ctx->device, free, total);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ggml_backend_kompute_buffer_type(ctx->device);
+}
+
+static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) {
+        return false;
+    }
+
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context;
+
+    return buft_ctx->device == ctx->device;
+}
+
+static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_kompute_device_get_name(dev);
+    props->description = ggml_backend_kompute_device_get_description(dev);
+    props->type        = ggml_backend_kompute_device_get_type(dev);
+    ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* async                  = */ false,
+        /* host_buffer            = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* events                 = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ggml_backend_kompute_init(ctx->device);
+}
+
+static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+
+    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_kompute_device_i = {
+    /* .get_name             = */ ggml_backend_kompute_device_get_name,
+    /* .get_description      = */ ggml_backend_kompute_device_get_description,
+    /* .get_memory           = */ ggml_backend_kompute_device_get_memory,
+    /* .get_type             = */ ggml_backend_kompute_device_get_type,
+    /* .get_props            = */ ggml_backend_kompute_device_get_props,
+    /* .init_backend         = */ ggml_backend_kompute_device_init,
+    /* .get_buffer_type      = */ ggml_backend_kompute_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op          = */ ggml_backend_kompute_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_kompute_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_kompute_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return "Kompute";
+}
+
+static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return ggml_backend_kompute_get_device_count();
+}
+
+static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) {
+                ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context;
+                char desc[256];
+                ggml_backend_kompute_get_device_description(i, desc, sizeof(desc));
+                ctx->device = i;
+                ctx->name = "Kompute" + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface   = */ ggml_backend_kompute_device_i,
+                    /* .reg     = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
+        }
+    }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
+    /* .get_name         = */ ggml_backend_kompute_reg_get_name,
+    /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_kompute_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_kompute_reg() {
+    static ggml_backend_reg reg = {
+        /* .iface   = */ ggml_backend_kompute_reg_i,
+        /* .context = */ nullptr,
+    };
+
+    return &reg;
+}

From 1329c0a75e6a7defc5c380eaf80d8e0f66d7da78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20L=C3=B3pez?= <slp@redhat.com>
Date: Thu, 31 Oct 2024 10:09:52 +0100
Subject: [PATCH 127/396] kompute: add mul_mat_q4_k shader (#10097)

This is a more or less direct translation from the Metal implementation
to GLSL.

Signed-off-by: Sergio Lopez <slp@redhat.com>
---
 ggml/src/CMakeLists.txt                       |   2 +
 ggml/src/ggml-kompute.cpp                     |  42 ++++++
 ggml/src/kompute-shaders/common.comp          |   9 ++
 ggml/src/kompute-shaders/op_mul_mat_q4_k.comp | 133 ++++++++++++++++++
 4 files changed, 186 insertions(+)
 create mode 100644 ggml/src/kompute-shaders/op_mul_mat_q4_k.comp

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index aa405e4d0..915568798 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -800,6 +800,7 @@ if (GGML_KOMPUTE)
             kompute-shaders/op_mul_mat_q8_0.comp
             kompute-shaders/op_mul_mat_q4_0.comp
             kompute-shaders/op_mul_mat_q4_1.comp
+            kompute-shaders/op_mul_mat_q4_k.comp
             kompute-shaders/op_mul_mat_q6_k.comp
             kompute-shaders/op_getrows_f32.comp
             kompute-shaders/op_getrows_f16.comp
@@ -833,6 +834,7 @@ if (GGML_KOMPUTE)
             shaderop_mul_mat_q8_0.h
             shaderop_mul_mat_q4_0.h
             shaderop_mul_mat_q4_1.h
+            shaderop_mul_mat_q4_k.h
             shaderop_mul_mat_q6_k.h
             shaderop_getrows_f32.h
             shaderop_getrows_f16.h
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index fea69fb04..2fea9e4cc 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -20,6 +20,7 @@
 #include "shaderop_mul_mat_q8_0.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_mul_mat_q4_k.h"
 #include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_mul_mat_mat_f32.h"
 #include "shaderop_getrows_f32.h"
@@ -1067,6 +1068,40 @@ static void ggml_vk_mul_mat_q8_0(Args&&... args) {
     ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+static void ggml_vk_mul_mat_q4_k(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10,
+    int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0,
+    int32_t ne1, int32_t r2, int32_t r3
+) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
+        kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
+    } pushConsts {
+        0, 0, 0,
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 static void ggml_vk_mul_mat_q6_k(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
@@ -1384,6 +1419,7 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q4_K:
                     return true;
                 default:
                     ;
@@ -1635,6 +1671,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                                     ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
                                 );
                                 break;
+                            case GGML_TYPE_Q4_K:
+                                ggml_vk_mul_mat_q4_k(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03
+                                );
+                                break;
                             case GGML_TYPE_Q6_K:
                                 ggml_vk_mul_mat_q6_k(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
diff --git a/ggml/src/kompute-shaders/common.comp b/ggml/src/kompute-shaders/common.comp
index 62d62b025..2aaddf704 100644
--- a/ggml/src/kompute-shaders/common.comp
+++ b/ggml/src/kompute-shaders/common.comp
@@ -15,6 +15,7 @@
 #define TWOPI_F 6.283185307179586f
 
 #define QK_K 256
+#define K_SCALE_SIZE 12
 
 #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
 #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
@@ -64,6 +65,14 @@ mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
     return reg;
 }
 
+#define sizeof_block_q4_k 144
+struct block_q4_k {
+    float16_t d;
+    float16_t dmin;
+    uint8_t scales[K_SCALE_SIZE];
+    uint8_t qs[QK_K/2];
+};
+
 #define sizeof_block_q6_k 210
 struct block_q6_k {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits
diff --git a/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp
new file mode 100644
index 000000000..fc8e45aa9
--- /dev/null
+++ b/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp
@@ -0,0 +1,133 @@
+#version 450
+
+#include "common.comp"
+
+#define N_DST 4
+#define SIZE_OF_BLOCK sizeof_block_q4_k
+
+layout(local_size_x = 4) in;
+layout(local_size_y = 8) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int ne02;
+    int ne12;
+    int r2;
+    int r3;
+} pcs;
+
+void main() {
+    const uint16_t kmask1 = uint16_t(0x3f3f);
+    const uint16_t kmask2 = uint16_t(0x0f0f);
+    const uint16_t kmask3 = uint16_t(0xc0c0);
+
+    const uint ix = gl_SubgroupInvocationID/8;  // 0...3
+    const uint it = gl_SubgroupInvocationID%8;  // 0...7
+    const uint iq = it/4;     // 0 or 1
+    const uint ir = it%4;     // 0...3
+
+    const uint nb = pcs.ne00/QK_K;
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+
+    const uint first_row = r0 * N_DST;
+    const uint ib_row = first_row * nb;
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+
+    const uint xblk = ib_row + offset0 + pcs.inAOff;
+    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
+
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
+    float all_sum = 0.f;
+
+    uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
+
+    for (uint ib = ix; ib < nb; ib += 4) {
+        const uint blk_idx = ib + xblk;
+
+        float sumy[4] = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = inB[y4+i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            uint row_idx = row * nb;
+
+            uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
+            uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
+            uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
+            uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
+            uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
+
+            uint16_t sc16[4];
+            sc16[0] = sc_0 & kmask1;
+            sc16[1] = sc_2 & kmask1;
+            sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
+            sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
+
+            float acc1[4] = {0.f, 0.f, 0.f, 0.f};
+            float acc2[4] = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
+                uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
+                acc1[0] += yl[i+0] * (q1 & 0x000F);
+                acc1[1] += yl[i+1] * (q1 & 0x0F00);
+                acc1[2] += yl[i+8] * (q1 & 0x00F0);
+                acc1[3] += yl[i+9] * (q1 & 0xF000);
+                acc2[0] += yh[i+0] * (q2 & 0x000F);
+                acc2[1] += yh[i+1] * (q2 & 0x0F00);
+                acc2[2] += yh[i+8] * (q2 & 0x00F0);
+                acc2[3] += yh[i+9] * (q2 & 0xF000);
+            }
+
+            uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
+            uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
+            uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
+            uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
+            uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
+            uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
+            uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
+            uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
+
+            float dall = float(inA[blk_idx + row_idx].d);
+            float dmin = float(inA[blk_idx + row_idx].dmin);
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
+                               (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
+                               (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
+                               (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
+                dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = subgroupAdd(sumf[row]);
+        if (subgroupElect()) {
+            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
+        }
+    }
+}

From dea5e86051aadcdf42f7db7a8855a78d8f5ff3d6 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 31 Oct 2024 11:40:59 +0100
Subject: [PATCH 128/396] ggml : check tensor name lengths in gguf files
 (#10100)

---
 ggml/src/ggml.c | 45 ++++++++++++++++++++++++++++++++++++---------
 src/llama.cpp   |  7 +++++--
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 0d99b0791..149d8f970 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22102,18 +22102,46 @@ static size_t gguf_type_size(enum gguf_type type) {
     return GGUF_TYPE_SIZE[type];
 }
 
-static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
-    GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
-    GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
+static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
+    if (info->n_dims > GGML_MAX_DIMS) {
+        fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
+        return false;
+    }
+
+    if (info->type < 0 || info->type >= GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
+        return false;
+    }
+
+    if (strlen(info->name.data) >= GGML_MAX_NAME) {
+        fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
+        return false;
+    }
 
     for (uint32_t i = 0; i < info->n_dims; ++i) {
-        GGML_ASSERT(info->ne[i] > 0);
+        if (info->ne[i] <= 0) {
+            fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
+            return false;
+        }
     }
 
     // prevent overflow for total number of elements
-    GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
-    GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
-    GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
+    if (INT64_MAX/info->ne[1] <= info->ne[0]) {
+        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
+        return false;
+    }
+
+    if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
+        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
+        return false;
+    }
+
+    if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
+        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
+        return false;
+    }
+
+    return true;
 }
 
 static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
@@ -22414,8 +22442,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
 
-            // TODO: return an error instead of crashing with GGML_ASSERT
-            gguf_tensor_info_sanitize(info);
+            ok = ok && gguf_tensor_info_sanitize(info);
 
             // make sure there is no duplicated tensor names
             for (uint64_t j = 0; j < i && ok; ++j) {
diff --git a/src/llama.cpp b/src/llama.cpp
index ef1b8ee59..60a0db29c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4273,8 +4273,11 @@ struct llama_model_loader {
 
         llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
-            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+            if (tensor_idx < 0) {
+                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+            }
 
+            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
                 throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
             }
@@ -7426,7 +7429,7 @@ static bool llm_load_tensors(
                 if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
                     return nullptr;
                 }
-                throw std::runtime_error(format("missing tensor %s", tn.str().c_str()));
+                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
             }
 
             // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops

From 0a683e8088d849626e7471f9e2ed381f7dbdf2e9 Mon Sep 17 00:00:00 2001
From: Kevin Gibbons <bakkot@gmail.com>
Date: Thu, 31 Oct 2024 06:02:35 -0700
Subject: [PATCH 129/396] server : include scheme when printing URL (#10106)

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7953b5065..f914ff88c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3259,7 +3259,7 @@ int main(int argc, char ** argv) {
         ctx_server.queue_tasks.terminate();
     };
 
-    LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
 
     ctx_server.queue_tasks.start_loop();
 

From ab3d71f97f5b2915a229099777af00d3eada1d24 Mon Sep 17 00:00:00 2001
From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Date: Fri, 1 Nov 2024 02:50:39 +0800
Subject: [PATCH 130/396] loader:  refactor tensor weights storage (#9935)

* loader: refactor tensor weights storage

* use sorted map, sort weights by layer

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 src/llama.cpp | 123 ++++++++++++++++++++++++++------------------------
 1 file changed, 65 insertions(+), 58 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 60a0db29c..bc94d7ff0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4271,20 +4271,34 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx,  ggml_get_name(tensor));
             if (tensor_idx < 0) {
-                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
             }
 
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
             }
         }
     };
-    std::vector<llama_tensor_weight> weights;
 
+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * meta = NULL;
@@ -4326,7 +4340,14 @@ struct llama_model_loader {
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
         }
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -4366,7 +4387,14 @@ struct llama_model_loader {
 
                 // Save tensors data offset info of the shard.
                 for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                    weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                    std::string tensor_name = std::string(cur->name);
+                    // make sure there is no duplicated tensor names
+                    if (weights_map.find(tensor_name) != weights_map.end()) {
+                        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                    }
+                    n_elements += ggml_nelements(cur);
+                    n_bytes    += ggml_nbytes(cur);
+                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
                 }
 
                 gguf_free(ctx_gguf);
@@ -4376,7 +4404,7 @@ struct llama_model_loader {
 
             // sanity check
             {
-                const int n_tensors_loaded = (int) weights.size();
+                const int n_tensors_loaded = (int) weights_map.size();
                 if (n_tensors != n_tensors_loaded) {
                     throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
                 }
@@ -4386,23 +4414,10 @@ struct llama_model_loader {
         }
 
         n_kv      = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes    += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                 __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
@@ -4414,8 +4429,10 @@ struct llama_model_loader {
             uint32_t n_type_max = 0;
             enum ggml_type type_max = GGML_TYPE_F32;
 
-            for (int i = 0; i < n_tensors; i++) {
-                const ggml_tensor * tensor = weights.at(i).tensor;
+            for (const auto & it : weights_map) {
+                const llama_tensor_weight & w = it.second;
+                const ggml_tensor * tensor = w.tensor;
+
                 enum ggml_type type = tensor->type;
 
                 n_type[type]++;
@@ -4426,8 +4443,8 @@ struct llama_model_loader {
                 }
 
                 if (trace > 0) {
-                    const uint16_t sid = weights.at(i).idx;
-                    LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                    const uint16_t sid = w.idx;
+                    LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
                 }
             }
 
@@ -4691,21 +4708,13 @@ struct llama_model_loader {
         return llm_kv.arch;
     }
 
-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
     const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
+        auto pos = weights_map.find(name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
         }
-        return nullptr;
-    }
 
-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
+        return nullptr;
     }
 
     const llama_tensor_weight & require_weight(const char * name) const {
@@ -4732,10 +4741,6 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
     const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
@@ -4842,8 +4847,8 @@ struct llama_model_loader {
         }
 
         // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (const auto & it : weights_map) {
+            size_data += ggml_nbytes(it.second.tensor);
         }
     }
 
@@ -18598,10 +18603,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (const auto & it : ml.weights_map) {
+        const struct ggml_tensor * tensor = it.second.tensor;
 
-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight")   != std::string::npos ||
@@ -18639,20 +18644,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
     // Assume split index is continuous
     if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        for (const auto & it : weights_map) {
+            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
         }
+
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (const auto & it : weights_map) {
+        uint16_t i_split = params->keep_split ? it.second.idx : 0;
+        struct ggml_tensor * tensor = it.second.tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
@@ -18699,12 +18706,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (const auto & it : weights_map) {
+        const auto & weight = it.second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
         }
 
         const std::string name = ggml_get_name(tensor);

From c02e5ab2a675c8bc1abc8b1e4cb6a93b26bdcce7 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 31 Oct 2024 22:54:23 +0100
Subject: [PATCH 131/396] llama : fix buffer checks for mamba and rwk (#10111)

* llama : fix buffer checks for mamba and rwk

* llama : fix missing worst case flag during reserve

* cuda : fix supports_op for norm

* disable sched SET_CAUSE
---
 ggml/src/ggml-backend.cpp |  2 +-
 ggml/src/ggml-cuda.cu     |  6 ++++--
 ggml/src/ggml.c           |  1 +
 src/llama.cpp             | 38 +++++++++++++++++++++++++++++---------
 4 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index f397f6252..c2afdf391 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1508,7 +1508,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     return -1;
 }
 
-#if 1
+#if 0
 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 087091516..b57f1b3b7 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3107,18 +3107,20 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 }
                 return false;
             } break;
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+            return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
+            break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
-        case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 149d8f970..6a7154920 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7272,6 +7272,7 @@ struct ggml_tensor * ggml_ssm_conv(
     const int64_t n_s     = sx->ne[2];
 
     // TODO: maybe support other strides than 1?
+    // FIXME: this is always true?
     GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
     GGML_ASSERT(sx->ne[1] == d_inner);
     GGML_ASSERT(n_t >= 0);
diff --git a/src/llama.cpp b/src/llama.cpp
index bc94d7ff0..e697c310c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7127,7 +7127,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);
             } break;
         case GGML_OP_MUL_MAT_ID:
@@ -7167,18 +7167,38 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_SSM_CONV:
             {
-                // TODO: ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
-                op_tensor = ggml_ssm_conv(ctx, nullptr, w);
+                // FIXME
+                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
+                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
             } break;
         case GGML_OP_SSM_SCAN:
             {
-                // TODO: ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
-                op_tensor = ggml_ssm_scan(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+                // FIXME
+                const int64_t d_state      = w->ne[0];
+                const int64_t d_inner      = w->ne[1];
+                const int64_t n_seq_tokens = 512;
+                const int64_t n_seqs       = 1;
+                ggml_tensor * s  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
+                ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
             } break;
         case GGML_OP_RWKV_WKV:
             {
-                // TODO: ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
-                op_tensor = ggml_rwkv_wkv(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+                // FIXME
+                const int64_t S = 123;
+                const int64_t H = 123;
+                const int64_t n_tokens = 123;
+                const int64_t n_seqs = 123;
+                ggml_tensor  * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens);
+                ggml_tensor  * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+                ggml_tensor  * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+                ggml_tensor  * tf = w;
+                ggml_tensor  * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+                ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+                op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
             } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
@@ -7453,7 +7473,7 @@ static bool llm_load_tensors(
 
             // tensors with "bias" suffix are always used with GGML_OP_ADD
             ggml_op op;
-            bool bias = strcmp(tn.suffix, "bias") == 0;
+            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
             if (bias) {
                 op = GGML_OP_ADD;
             } else {
@@ -19681,7 +19701,7 @@ struct llama_context * llama_new_context_with_model(
             int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
 
             // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-            gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
+            gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
             if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
                 LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                 llama_free(ctx);

From 1e9f94994ef908d964cf81069f03d9d3668beb7d Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 1 Nov 2024 00:45:34 +0100
Subject: [PATCH 132/396] quantize : fix --keep-split (#10114)

---
 src/llama.cpp | 53 +++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index e697c310c..ed3998a1f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4860,19 +4860,12 @@ struct llama_model_loader {
         *last  = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
             }
+            *first = std::min(*first, weight->offs);
+            *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
         }
     }
 
@@ -5049,7 +5042,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
@@ -18623,8 +18615,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;
 
         const std::string name = ggml_get_name(tensor);
 
@@ -18664,22 +18673,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;
 
     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;
 
     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
@@ -18726,8 +18733,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();

From 85679d37f34f66783cc04664a06c405b28e8e035 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 1 Nov 2024 00:49:53 +0100
Subject: [PATCH 133/396] llama : improve output buffer type selection (#10098)

---
 src/llama.cpp | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index ed3998a1f..ca0d259b2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17162,18 +17162,10 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
 
         auto * buft = ggml_backend_cpu_buffer_type();
         // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-        ggml_tensor * output_tensor = lctx.model.output;
-        if (!output_tensor) {
-            // bert models don't have an output tensor, use the last layer
-            output_tensor = lctx.model.layers.back().layer_out_norm;
-        }
-        if (output_tensor) {
-            auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
-            auto * output_dev = ggml_backend_buft_get_device(output_buft);
-            auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
-            if (output_dev_host_buft) {
-                buft = output_dev_host_buft;
-            }
+        auto * output_dev = lctx.model.dev_output.dev;
+        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
+        if (output_dev_host_buft) {
+            buft = output_dev_host_buft;
         }
         lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
         if (lctx.buf_output == nullptr) {

From e597e50794f07ec8dc24b9efb18f94ec6386fda0 Mon Sep 17 00:00:00 2001
From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Date: Fri, 1 Nov 2024 11:09:59 +0800
Subject: [PATCH 134/396] build: fix build error in Windows env with OneAPI
 setup (#10107)

---
 ggml/src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 915568798..7365ac91b 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1402,7 +1402,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
 
 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
-    if (NOT WIN32 OR NOT GGML_SYCL)
+    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
         list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
     endif()
 endif()

From f221d56220899f38f0126e683b2432bc79d1e3f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Nov 2024 10:23:05 +0200
Subject: [PATCH 135/396] ggml : alloc ggml_contexts on the heap (whisper/2525)

---
 ggml/include/ggml.h |  7 +++--
 ggml/src/ggml.c     | 67 +++++++++++++--------------------------------
 2 files changed, 23 insertions(+), 51 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index de3c706fc..e5862246c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -217,7 +217,6 @@
 
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
 #define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
@@ -657,6 +656,7 @@ extern "C" {
     };
 
     // scratch buffer
+    // TODO: deprecate and remove
     struct ggml_scratch {
         size_t offs;
         size_t size;
@@ -760,8 +760,9 @@ extern "C" {
 
     // main
 
-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void                  ggml_free(struct ggml_context * ctx);
+    GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+    GGML_API void                  ggml_reset(struct ggml_context * ctx);
+    GGML_API void                  ggml_free (struct ggml_context * ctx);
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 6a7154920..59f2ed043 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -306,6 +306,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 }
 
 #define GGML_DEBUG 0
+
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
 
@@ -2014,7 +2015,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
 
 struct ggml_context {
     size_t mem_size;
-    void* mem_buffer;
+    void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
     bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
@@ -3263,7 +3264,6 @@ struct ggml_numa_nodes {
 //
 
 struct ggml_state {
-    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
     struct ggml_numa_nodes numa;
 };
 
@@ -3845,7 +3845,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
             g_state = (struct ggml_state) {
-                /*.contexts =*/ { { 0 } },
                 /*.numa =*/ {
                     .n_nodes = 0,
                     .total_cpus = 0,
@@ -3864,26 +3863,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         is_first_call = false;
     }
 
-    // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
+    ggml_critical_section_end();
 
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (!g_state.contexts[i].used) {
-            g_state.contexts[i].used = true;
-            ctx = &g_state.contexts[i].context;
-
-            GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
-            break;
-        }
-    }
-
-    if (ctx == NULL) {
-        GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
-
-        ggml_critical_section_end();
-
-        return NULL;
-    }
+    struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
 
     // allow to call ggml_init with 0 size
     if (params.mem_size == 0) {
@@ -3911,42 +3893,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
-    ggml_critical_section_end();
-
     return ctx;
 }
 
+void ggml_reset(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    ctx->n_objects     = 0;
+    ctx->objects_begin = NULL;
+    ctx->objects_end   = NULL;
+    ctx->scratch       = (struct ggml_scratch) { 0, 0, NULL, };
+    ctx->scratch_save  = (struct ggml_scratch) { 0, 0, NULL, };
+}
+
 void ggml_free(struct ggml_context * ctx) {
     if (ctx == NULL) {
         return;
     }
 
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    bool found = false;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (&g_state.contexts[i].context == ctx) {
-            g_state.contexts[i].used = false;
-
-            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                    __func__, i, ggml_used_mem(ctx));
-
-            if (ctx->mem_buffer_owned) {
-                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
-            }
-
-            found = true;
-            break;
-        }
+    if (ctx->mem_buffer_owned) {
+        ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
     }
 
-    if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
-    }
-
-    ggml_critical_section_end();
+    GGML_FREE(ctx);
 }
 
 size_t ggml_used_mem(const struct ggml_context * ctx) {

From 815fe72adcea5ec79d358db6a4c479191f396b3c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Nov 2024 10:28:24 +0200
Subject: [PATCH 136/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index da40927e1..48863847c 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-162e232411ee98ceb0cccfa84886118d917d2123
+bb78a40dc60e04c626bac2b65840b509988e990d

From 1804adb0cfee4811eaf633741503d683a46e4c77 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Nov 2024 12:58:45 +0200
Subject: [PATCH 137/396] ggml : remove ggml_scratch (#10121)

ggml-ci
---
 ggml/include/ggml.h |  9 ------
 ggml/src/ggml.c     | 67 ++-------------------------------------------
 2 files changed, 2 insertions(+), 74 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e5862246c..41df85557 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -655,14 +655,6 @@ extern "C" {
         void *              abort_callback_data;
     };
 
-    // scratch buffer
-    // TODO: deprecate and remove
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
     struct ggml_init_params {
         // memory pool
         size_t mem_size;   // bytes
@@ -766,7 +758,6 @@ extern "C" {
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 59f2ed043..84f2c766b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2018,15 +2018,11 @@ struct ggml_context {
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
-    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
 
     int    n_objects;
 
     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
 };
 
 struct ggml_context_container {
@@ -3879,12 +3875,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
-        /*.no_alloc_save      =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
         /*.objects_begin      =*/ NULL,
         /*.objects_end        =*/ NULL,
-        /*.scratch            =*/ { 0, 0, NULL, },
-        /*.scratch_save       =*/ { 0, 0, NULL, },
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -3904,8 +3897,6 @@ void ggml_reset(struct ggml_context * ctx) {
     ctx->n_objects     = 0;
     ctx->objects_begin = NULL;
     ctx->objects_end   = NULL;
-    ctx->scratch       = (struct ggml_scratch) { 0, 0, NULL, };
-    ctx->scratch_save  = (struct ggml_scratch) { 0, 0, NULL, };
 }
 
 void ggml_free(struct ggml_context * ctx) {
@@ -3924,14 +3915,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx) {
     return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }
 
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
-    const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
-
-    ctx->scratch = scratch;
-
-    return result;
-}
-
 bool ggml_get_no_alloc(struct ggml_context * ctx) {
     return ctx->no_alloc;
 }
@@ -3959,27 +3942,6 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     return max_size;
 }
 
-// IMPORTANT:
-// when creating "opt" tensors, always save and load the scratch buffer
-// this is an error prone process, but it is necessary to support inplace
-// operators when using scratch buffers
-// TODO: implement a better way
-static void ggml_scratch_save(struct ggml_context * ctx) {
-    // this is needed to allow opt tensors to store their data
-    // TODO: again, need to find a better way
-    ctx->no_alloc_save = ctx->no_alloc;
-    ctx->no_alloc      = false;
-
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
-}
-
-static void ggml_scratch_load(struct ggml_context * ctx) {
-    ctx->no_alloc = ctx->no_alloc_save;
-
-    ctx->scratch = ctx->scratch_save;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
@@ -4060,29 +4022,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t obj_alloc_size = 0;
 
     if (view_src == NULL && !ctx->no_alloc) {
-        if (ctx->scratch.data != NULL) {
-            // allocate tensor data in the scratch buffer
-            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-                GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
-                assert(false);
-                return NULL;
-            }
-
-            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
-
-            ctx->scratch.offs += data_size;
-        } else {
-            // allocate tensor data in the context's memory pool
-            obj_alloc_size = data_size;
-        }
+        // allocate tensor data in the context's memory pool
+        obj_alloc_size = data_size;
     }
 
     struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
     GGML_ASSERT(obj_new);
 
-    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
-
     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
 
 #ifdef __clang__
@@ -4178,24 +4124,16 @@ struct ggml_tensor * ggml_new_tensor_4d(
 }
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    ggml_scratch_save(ctx);
-
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
 
-    ggml_scratch_load(ctx);
-
     ggml_set_i32(result, value);
 
     return result;
 }
 
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    ggml_scratch_save(ctx);
-
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 
-    ggml_scratch_load(ctx);
-
     ggml_set_f32(result, value);
 
     return result;
@@ -20263,7 +20201,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     uint64_t size_eval = 0;
 
     // compute size of intermediate results
-    // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
     }

From d865d1478cd4e403f82d793c2afcd0f943412f05 Mon Sep 17 00:00:00 2001
From: sasha0552 <admin@sasha0552.org>
Date: Fri, 1 Nov 2024 13:33:14 +0000
Subject: [PATCH 138/396] server : fix smart selection of available slot
 (#10120)

* Fix smart selection of available slot

* minor fix

* replace vectors of tokens with shorthands
---
 examples/server/server.cpp | 35 +++++++++----------------
 examples/server/utils.hpp  | 52 ++++++++++++++++++++++++++++++++++----
 2 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f914ff88c..54cdb4b72 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -725,12 +725,12 @@ struct server_context {
         return nullptr;
     }
 
-    server_slot * get_available_slot(const std::string & prompt) {
+    server_slot * get_available_slot(const server_task & task) {
         server_slot * ret = nullptr;
 
         // find the slot that has at least n% prompt similarity
-        if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
-            int max_lcp_len = 0;
+        if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+            int max_lcs_len = 0;
             float similarity = 0;
 
             for (server_slot & slot : slots) {
@@ -740,25 +740,25 @@ struct server_context {
                 }
 
                 // skip the slot if it does not contains cached tokens
-                if (slot.prompt_tokens.empty()) {
+                if (slot.cache_tokens.empty()) {
                     continue;
                 }
 
-                // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-                int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
+                // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
+                int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
 
-                // fraction of the common substring length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
+                // fraction of the common subsequence length compared to the current slot's prompt length
+                similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
 
                 // select the current slot if the criteria match
-                if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
-                    max_lcp_len = lcp_len;
+                if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
+                    max_lcs_len = lcs_len;
                     ret = &slot;
                 }
             }
 
             if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
+                SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
             }
         }
 
@@ -1514,18 +1514,7 @@ struct server_context {
                 {
                     const int id_slot = json_value(task.data, "id_slot", -1);
 
-                    server_slot * slot;
-
-                    if (id_slot != -1) {
-                        slot = get_slot_by_id(id_slot);
-                    } else {
-                        std::string prompt;
-                        if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
-                            prompt = json_value(task.data, "prompt", std::string());
-                        }
-
-                        slot = get_available_slot(prompt);
-                    }
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
 
                     if (slot == nullptr) {
                         // if no slot is available, we defer this task for processing later
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 58f5a5684..871a17a4f 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
 // other common utils
 //
 
-static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
     size_t i;
     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 
     return i;
 }
 
-static size_t longest_common_prefix(const std::string & a, const std::string & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
 
-    return i;
+    // get the lengths of the input sequences
+    int a_len = a.size();
+    int b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    int max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<int> prev_row(b_len + 1, 0);
+    std::vector<int> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (int i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (int j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
 }
 
 static bool ends_with(const std::string & str, const std::string & suffix) {

From ba6f62eb793d6617892d252f5c04d7685d908a38 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 1 Nov 2024 17:31:51 +0200
Subject: [PATCH 139/396] readme : update hot topics

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8fe1f4b4b..0378a674e 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
+- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----

From 418f5eef262cea07c2af4f45ee6a88d882221fcb Mon Sep 17 00:00:00 2001
From: Shupei Fan <dymarkfan@outlook.com>
Date: Sat, 2 Nov 2024 02:33:14 +0800
Subject: [PATCH 140/396] vulkan : improve ggml_vk_create_buffer error handling
 (#9898)

---
 ggml/src/ggml-vulkan.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 83c37ea9c..a8e78c4db 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1047,7 +1047,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
         return buf;
     }
 
-    buf->size = size;
     vk::BufferCreateInfo buffer_create_info{
         vk::BufferCreateFlags(),
         size,
@@ -1075,7 +1074,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
 
     if (memory_type_index == UINT32_MAX) {
         device->device.destroyBuffer(buf->buffer);
-        buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
     }
 
@@ -1092,13 +1090,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
             }
             catch (const vk::SystemError& e) {
                 device->device.destroyBuffer(buf->buffer);
-                buf->size = 0;
                 throw e;
             }
         } else {
             // Out of Host/Device memory, clean up buffer
             device->device.destroyBuffer(buf->buffer);
-            buf->size = 0;
             throw e;
         }
     }
@@ -1111,6 +1107,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
     device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
 
     buf->device = device;
+    buf->size = size;
 
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     device->memory_logger->log_allocation(buf, size);

From e991e3127ff71a29e61fe1de5dd1cbd2e1df1858 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 1 Nov 2024 23:48:26 +0100
Subject: [PATCH 141/396] llama : use smart pointers for ggml resources
 (#10117)

---
 ggml/include/ggml-cpp.h |  38 ++++
 ggml/src/CMakeLists.txt |   1 +
 spm-headers/ggml-cpp.h  |   1 +
 src/llama.cpp           | 424 +++++++++++++++++-----------------------
 4 files changed, 219 insertions(+), 245 deletions(-)
 create mode 100644 ggml/include/ggml-cpp.h
 create mode 120000 spm-headers/ggml-cpp.h

diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h
new file mode 100644
index 000000000..219361af4
--- /dev/null
+++ b/ggml/include/ggml-cpp.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include <memory>
+
+// Smart pointers for ggml types
+
+// ggml
+
+struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
+struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
+
+typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
+typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
+
+// ggml-alloc
+
+struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
+
+typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+
+// ggml-backend
+
+struct ggml_backend_deleter        { void operator()(ggml_backend_t backend)       { ggml_backend_free(backend); } };
+struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
+struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t event)   { ggml_backend_event_free(event); } };
+struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t sched)   { ggml_backend_sched_free(sched); } };
+
+typedef std::unique_ptr<ggml_backend,        ggml_backend_deleter>        ggml_backend_ptr;
+typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
+typedef std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>  ggml_backend_event_ptr;
+typedef std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>  ggml_backend_sched_ptr;
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 7365ac91b..0764a8d90 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1368,6 +1368,7 @@ add_library(ggml
             ../include/ggml.h
             ../include/ggml-alloc.h
             ../include/ggml-backend.h
+            ../include/ggml-cpp.h
             ggml.c
             ggml-alloc.c
             ggml-backend.cpp
diff --git a/spm-headers/ggml-cpp.h b/spm-headers/ggml-cpp.h
new file mode 120000
index 000000000..8a8604cc2
--- /dev/null
+++ b/spm-headers/ggml-cpp.h
@@ -0,0 +1 @@
+../ggml/include/ggml-cpp.h
\ No newline at end of file
diff --git a/src/llama.cpp b/src/llama.cpp
index ca0d259b2..0991c4089 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7,6 +7,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "ggml-cpp.h"
 
 // TODO: replace with ggml API call
 #define QK_K 256
@@ -2797,31 +2798,22 @@ struct llama_kv_cache {
     std::vector<struct ggml_tensor *> k_l; // per layer
     std::vector<struct ggml_tensor *> v_l;
 
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
-    size_t total_size() const {
+    size_t total_size() {
         size_t size = 0;
-        for (ggml_backend_buffer_t buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf);
+        for (auto & buf : bufs) {
+            size += ggml_backend_buffer_get_size(buf.get());
         }
         return size;
     }
-
-    ~llama_kv_cache() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-    }
 };
 
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
     int32_t layer_start = -1;
     int32_t layer_end   = -1;
@@ -2840,15 +2832,6 @@ struct llama_control_vector {
         }
         return cur;
     }
-
-    ~llama_control_vector() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-    }
 };
 
 struct llama_model {
@@ -2908,10 +2891,10 @@ struct llama_model {
     std::vector<layer_dev> dev_layer;
 
     // contexts where the model tensors metadata is stored
-    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_context_ptr> ctxs;
 
     // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
     // model memory mapped files
     llama_mmaps mappings;
@@ -2930,13 +2913,7 @@ struct llama_model {
     std::set<struct llama_lora_adapter *> lora_adapters;
 
     ~llama_model() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-        while (!lora_adapters.empty()) {
+       while (!lora_adapters.empty()) {
             llama_lora_adapter_free(*lora_adapters.begin());
         }
     }
@@ -3253,16 +3230,6 @@ struct llama_context {
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}
 
-    ~llama_context() {
-        ggml_backend_sched_free(sched);
-
-        for (ggml_backend_t backend : backends) {
-            ggml_backend_free(backend);
-        }
-
-        ggml_backend_buffer_free(buf_output);
-    }
-
     const struct llama_model & model;
 
     struct llama_cparams        cparams;
@@ -3272,7 +3239,7 @@ struct llama_context {
 
     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
-    std::vector<ggml_backend_t> backends;
+    std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
 
     ggml_backend_t backend_cpu = nullptr;
@@ -3294,7 +3261,7 @@ struct llama_context {
     mutable int32_t n_eval   = 0; // number of eval calls
 
     // host buffer for the model output (logits and embeddings)
-    ggml_backend_buffer_t buf_output = nullptr;
+    ggml_backend_buffer_ptr buf_output;
 
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
@@ -3324,7 +3291,7 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
-    ggml_backend_sched_t sched = nullptr;
+    ggml_backend_sched_ptr sched;
 
     ggml_abort_callback abort_callback      = nullptr;
     void *              abort_callback_data = nullptr;
@@ -3358,8 +3325,8 @@ struct llama_lora_adapter {
     struct llama_model * base_model;
     // map tensor name to lora_a_b
     std::unordered_map<std::string, struct llama_lora_weight> ab_map;
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
     float alpha;
 
@@ -3377,12 +3344,6 @@ struct llama_lora_adapter {
     }
 
     ~llama_lora_adapter() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
         auto pos = base_model->lora_adapters.find(this);
         if (pos != base_model->lora_adapters.end()) {
             base_model->lora_adapters.erase(pos);
@@ -3401,24 +3362,21 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ true,
     };
-    ggml_context * ctx = ggml_init(params);
+    ggml_context_ptr ctx { ggml_init(params) };
     if (!ctx) {
         throw std::runtime_error(format("failed to create ggml context"));
     }
 
-    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);
-    ggml_tensor * op_tensor = fn(ctx);
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (op_tensor->src[i] != nullptr) {
             assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf;
+            op_tensor->src[i]->buffer = buf.get();
         }
     }
     bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
 
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buf);
-
     return op_supported;
 }
 
@@ -3470,7 +3428,8 @@ static bool llama_kv_cache_init(
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
             struct ggml_init_params params = {
                 /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
                 /*.mem_buffer =*/ NULL,
@@ -3481,9 +3440,10 @@ static bool llama_kv_cache_init(
                 return nullptr;
             }
             ctx_map[buft] = ctx;
-            cache.ctxs.push_back(ctx);
+            cache.ctxs.emplace_back(ctx);
+            return ctx;
         }
-        return ctx_map.at(buft);
+        return it->second;
     };
 
     cache.k_l.reserve(n_layer);
@@ -3535,7 +3495,7 @@ static bool llama_kv_cache_init(
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        cache.bufs.push_back(buf);
+        cache.bufs.emplace_back(buf);
     }
 
     return true;
@@ -3788,7 +3748,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     cache.used = 0;
 
     for (auto & buf : cache.bufs) {
-        ggml_backend_buffer_clear(buf, 0);
+        ggml_backend_buffer_clear(buf.get(), 0);
     }
 }
 
@@ -4301,8 +4261,8 @@ struct llama_model_loader {
     std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
-    struct gguf_context * meta = NULL;
-    std::vector<ggml_context *> contexts;
+    gguf_context_ptr meta;
+    std::vector<ggml_context_ptr> contexts;
 
     std::string arch_name;
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
@@ -4325,7 +4285,7 @@ struct llama_model_loader {
             /*.ctx      = */ &ctx,
         };
 
-        meta = gguf_init_from_file(fname.c_str(), params);
+        meta.reset(gguf_init_from_file(fname.c_str(), params));
         if (!meta) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
@@ -4347,7 +4307,7 @@ struct llama_model_loader {
             }
             n_elements += ggml_nelements(cur);
             n_bytes    += ggml_nbytes(cur);
-            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
         }
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -4377,7 +4337,7 @@ struct llama_model_loader {
                     /*.no_alloc = */ true,
                     /*.ctx      = */ &ctx,
                 };
-                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path, split_params) };
                 if (!ctx_gguf) {
                     throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
                 }
@@ -4394,10 +4354,8 @@ struct llama_model_loader {
                     }
                     n_elements += ggml_nelements(cur);
                     n_bytes    += ggml_nbytes(cur);
-                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
+                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
                 }
-
-                gguf_free(ctx_gguf);
             }
 
             get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
@@ -4413,10 +4371,10 @@ struct llama_model_loader {
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
         }
 
-        n_kv      = gguf_get_n_kv(meta);
+        n_kv      = gguf_get_n_kv(meta.get());
         n_tensors = weights_map.size();
 
-        fver = (enum llama_fver) gguf_get_version(meta);
+        fver = (enum llama_fver) gguf_get_version(meta.get());
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                 __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
@@ -4487,23 +4445,23 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
             {
-                const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
+                const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV
                 if (kid >= 0) {
-                    ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
+                    ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid);
                 }
             }
 
             LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
             for (int i = 0; i < n_kv; i++) {
-                const char * name           = gguf_get_key(meta, i);
-                const enum gguf_type type   = gguf_get_kv_type(meta, i);
+                const char * name           = gguf_get_key(meta.get(), i);
+                const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
                 const std::string type_name =
                     type == GGUF_TYPE_ARRAY
-                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
+                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                     : gguf_type_name(type);
 
-                std::string value          = gguf_kv_to_str(meta, i);
+                std::string value          = gguf_kv_to_str(meta.get(), i);
                 const size_t MAX_VALUE_LEN = 40;
                 if (value.size() > MAX_VALUE_LEN) {
                     value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -4532,19 +4490,10 @@ struct llama_model_loader {
         this->check_tensors = check_tensors;
     }
 
-    ~llama_model_loader() {
-        if (meta) {
-            gguf_free(meta);
-        }
-        for (auto * ctx : contexts) {
-            ggml_free(ctx);
-        }
-    }
-
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
     get_arr_n(const std::string & key, T & result, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
 
         if (kid < 0) {
             if (required) {
@@ -4554,7 +4503,7 @@ struct llama_model_loader {
         }
 
         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
 
 
         result = arr_info.length;
@@ -4569,9 +4518,9 @@ struct llama_model_loader {
 
     template<typename T>
     bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
 
-        if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
+        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
             if (required) {
                 throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
             }
@@ -4579,7 +4528,7 @@ struct llama_model_loader {
         }
 
         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
 
         switch (arr_info.gt) {
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
@@ -4598,9 +4547,9 @@ struct llama_model_loader {
 
     template<typename T, size_t N_MAX>
     bool get_arr(const std::string & key, std::array<T, N_MAX> & result, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
 
-        if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
+        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
             if (required) {
                 throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
             }
@@ -4608,7 +4557,7 @@ struct llama_model_loader {
         }
 
         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
 
         switch (arr_info.gt) {
             case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
@@ -4640,7 +4589,7 @@ struct llama_model_loader {
         const struct llama_model_kv_override * override =
             it != kv_overrides.end() ? &it->second : nullptr;
 
-        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+        const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
 
         if (required && !found) {
             throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -4657,7 +4606,7 @@ struct llama_model_loader {
     // get array of n <= N_MAX elements, or a single element repeated n times
     template<typename T, size_t N_MAX>
     bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
 
         if (kid < 0) {
             if (required) {
@@ -4670,9 +4619,9 @@ struct llama_model_loader {
             throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
         }
 
-        if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
+        if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
             struct GGUFMeta::ArrayInfo arr_info =
-                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
 
             if (n != arr_info.length) {
                 throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
@@ -5342,7 +5291,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.meta;
+    const gguf_context * ctx = ml.meta.get();
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -6109,7 +6058,7 @@ static void llm_load_vocab(
         llama_model & model) {
     auto & vocab = model.vocab;
 
-    struct gguf_context * ctx = ml.meta;
+    struct gguf_context * ctx = ml.meta.get();
 
     const auto kv = LLM_KV(model.arch);
 
@@ -7104,10 +7053,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ true,
     };
-    ggml_context * ctx = ggml_init(params);
-    if (!ctx) {
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    if (!ctx_ptr) {
         throw std::runtime_error(format("failed to create ggml context"));
     }
+    ggml_context * ctx = ctx_ptr.get();
 
     ggml_tensor * op_tensor = nullptr;
 
@@ -7203,8 +7153,6 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
     ggml_backend_buffer_free(w->buffer);
     w->buffer = nullptr;
 
-    ggml_free(ctx);
-
     return op_supported;
 }
 
@@ -7395,7 +7343,8 @@ static bool llm_load_tensors(
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
             ggml_init_params params = {
                 /*.mem_size   =*/ ctx_size,
                 /*.mem_buffer =*/ NULL,
@@ -7406,9 +7355,10 @@ static bool llm_load_tensors(
                 throw std::runtime_error(format("failed to create ggml context"));
             }
             ctx_map[buft] = ctx;
-            model.ctxs.push_back(ctx);
+            model.ctxs.emplace_back(ctx);
+            return ctx;
         }
-        return ctx_map.at(buft);
+        return it->second;
     };
 
     // create tensors for the weights
@@ -9134,7 +9084,7 @@ static bool llm_load_tensors(
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
-                model.bufs.push_back(buf);
+                model.bufs.emplace_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
@@ -9143,7 +9093,7 @@ static bool llm_load_tensors(
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
-            model.bufs.push_back(buf);
+            model.bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 model.mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = model.mlock_bufs.back();
@@ -9183,13 +9133,13 @@ static bool llm_load_tensors(
     }
 
     // print memory requirements per buffer type
-    for (ggml_backend_buffer_t buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+    for (auto & buf : model.bufs) {
+        LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
-    for (ggml_context * ctx : model.ctxs) {
-        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+    for (auto & ctx : model.ctxs) {
+        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
             model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
         }
     }
@@ -10294,10 +10244,8 @@ struct llm_build_context {
     }
 
     void free() {
-        if (ctx0) {
-            ggml_free(ctx0);
-            ctx0 = nullptr;
-        }
+        ggml_free(ctx0);
+        ctx0 = nullptr;
     }
 
     struct ggml_cgraph * build_k_shift() {
@@ -10325,10 +10273,10 @@ struct llm_build_context {
                 // dequantize to f32 -> RoPE -> quantize back
                 tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
                 cb(tmp, "K_f32", il);
-                for (auto * backend : lctx.backends) {
+                for (auto & backend : lctx.backends) {
                     // Figure out which backend KV cache belongs to
-                    if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
-                        ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
+                    if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get());
                         break;
                     }
                 }
@@ -16443,7 +16391,7 @@ static struct ggml_cgraph * llama_build_graph(
         if (!lctx.cparams.offload_kqv) {
             if (strcmp(name, "kqv_merged_cont") == 0) {
                 // all nodes between the KV store and the attention output are run on the CPU
-                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
+                ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu);
             }
         }
 
@@ -16453,10 +16401,10 @@ static struct ggml_cgraph * llama_build_graph(
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 const auto & dev_layer = lctx.model.dev_layer.at(il);
-                for (auto * backend : lctx.backends) {
-                    if (ggml_backend_get_device(backend) == dev_layer.dev) {
-                        if (ggml_backend_supports_op(backend, cur)) {
-                            ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                for (auto & backend : lctx.backends) {
+                    if (ggml_backend_get_device(backend.get()) == dev_layer.dev) {
+                        if (ggml_backend_supports_op(backend.get(), cur)) {
+                            ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
                         }
                     }
                 }
@@ -17143,7 +17091,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
         lctx.output_ids.resize(n_batch);
     }
 
-    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
+    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
     const size_t new_size  = (logits_size + embd_size) * sizeof(float);
 
     // alloc only when more than the current capacity is required
@@ -17154,7 +17102,6 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
             // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
             LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
-            ggml_backend_buffer_free(lctx.buf_output);
             lctx.buf_output = nullptr;
             lctx.logits = nullptr;
             lctx.embd = nullptr;
@@ -17167,14 +17114,14 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
         if (output_dev_host_buft) {
             buft = output_dev_host_buft;
         }
-        lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
+        lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
         if (lctx.buf_output == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
             return 0;
         }
     }
 
-    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
+    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());
 
     lctx.logits = has_logits ? output_base               : nullptr;
     lctx.embd   = has_embd   ? output_base + logits_size : nullptr;
@@ -17186,7 +17133,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
 
-    ggml_backend_buffer_clear(lctx.buf_output, 0);
+    ggml_backend_buffer_clear(lctx.buf_output.get(), 0);
 
     lctx.n_outputs = 0;
 
@@ -17246,7 +17193,7 @@ static void llama_graph_compute(
         set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
     }
 
-    auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
     if (err != GGML_STATUS_SUCCESS) {
         LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
     }
@@ -17404,8 +17351,8 @@ static int llama_decode_internal(
 
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
-        ggml_backend_sched_reset(lctx.sched);
-        ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+        ggml_backend_sched_reset(lctx.sched.get());
+        ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
@@ -17433,7 +17380,7 @@ static int llama_decode_internal(
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        ggml_backend_sched_alloc_graph(lctx.sched, gf);
+        ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
 
         llama_set_inputs(lctx, ubatch);
 
@@ -17456,7 +17403,7 @@ static int llama_decode_internal(
 
         // extract logits
         if (res) {
-            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
+            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res);
             GGML_ASSERT(backend_res != nullptr);
             GGML_ASSERT(lctx.logits != nullptr);
 
@@ -17472,7 +17419,7 @@ static int llama_decode_internal(
 
         // extract embeddings
         if (embd) {
-            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
             GGML_ASSERT(backend_embd != nullptr);
 
             switch (cparams.pooling_type) {
@@ -17567,7 +17514,7 @@ static int llama_decode_internal(
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
-    ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_reset(lctx.sched.get());
 
     return 0;
 }
@@ -17645,8 +17592,8 @@ static int llama_encode_internal(
 
     GGML_ASSERT(n_threads > 0);
 
-    ggml_backend_sched_reset(lctx.sched);
-    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+    ggml_backend_sched_reset(lctx.sched.get());
+    ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
     ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
@@ -17670,7 +17617,7 @@ static int llama_encode_internal(
         }
     }
 
-    ggml_backend_sched_alloc_graph(lctx.sched, gf);
+    ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
 
     llama_set_inputs(lctx, ubatch);
 
@@ -17678,7 +17625,7 @@ static int llama_encode_internal(
 
     // extract embeddings
     if (embd) {
-        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
         GGML_ASSERT(backend_embd != nullptr);
 
         if (llama_model_has_decoder(&lctx.model)) {
@@ -17745,7 +17692,7 @@ static int llama_encode_internal(
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
-    ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_reset(lctx.sched.get());
 
     return 0;
 }
@@ -17959,7 +17906,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
     // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_reset(lctx.sched.get());
 
     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
 
@@ -17981,11 +17928,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         }
 
         {
-            ggml_backend_sched_reset(lctx.sched);
+            ggml_backend_sched_reset(lctx.sched.get());
 
             ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
 
-            ggml_backend_sched_alloc_graph(lctx.sched, gf);
+            ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
 
             llama_set_k_shift(lctx);
 
@@ -18025,8 +17972,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
 
         // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(lctx.sched);
-        if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
+        ggml_backend_sched_reset(lctx.sched.get());
+        if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
             LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         }
     }
@@ -18577,30 +18524,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
-    struct gguf_context * ctx_out = gguf_init_empty();
+    gguf_context_ptr ctx_out { gguf_init_empty() };
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+    gguf_set_kv     (ctx_out.get(), ml.meta.get());
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
 
     // Remove split metadata
-    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
-    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
-    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
 
     if (params->kv_overrides) {
         const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
-        for (auto & o : overrides) {
+        for (const auto & o : overrides) {
             if (o.key[0] == 0) break;
             if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-                gguf_set_val_f32(ctx_out, o.key, o.val_f64);
+                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-                gguf_set_val_i32(ctx_out, o.key, o.val_i64);
+                gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-                gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
-                gguf_set_val_str(ctx_out, o.key, o.val_str);
+                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
             } else {
                 LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
             }
@@ -18672,25 +18619,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
     }
-    std::vector<gguf_context*> ctx_outs(n_split, NULL);
-    ctx_outs[0] = ctx_out;
+    std::vector<gguf_context_ptr> ctx_outs(n_split);
+    ctx_outs[0] = std::move(ctx_out);
 
     // populate the original tensors so we get an initial meta data
     for (const auto * it : tensors) {
         uint16_t i_split = params->keep_split ? it->idx : 0;
         struct ggml_tensor * tensor = it->tensor;
-        if (ctx_outs[i_split] == NULL) {
-            ctx_outs[i_split] = gguf_init_empty();
+        if (!ctx_outs[i_split]) {
+            ctx_outs[i_split].reset(gguf_init_empty());
         }
-        gguf_add_tensor(ctx_outs[i_split], tensor);
+        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
     }
 
     // Set split info if needed
     if (n_split > 1) {
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
-            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
-            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
-            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
         }
     }
 
@@ -18700,8 +18647,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // Write metadata and close file handler
         if (fout.is_open()) {
             fout.seekp(0);
-            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
-            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
+            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
             fout.write((const char *) data.data(), data.size());
             fout.close();
         }
@@ -18718,7 +18665,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         fout = std::ofstream(fname, std::ios::binary);
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
         // placeholder for the meta data
         ::zeros(fout, meta_size);
     };
@@ -18903,17 +18850,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
     close_ofstream();
-    for (auto & c:ctx_outs) {
-        gguf_free(c);
-    }
 
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
@@ -18927,51 +18871,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
-    ggml_context * ctx = nullptr;
+    ggml_context * ctx_init;
     struct gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
-        /* .ctx      = */ &ctx,
+        /* .ctx      = */ &ctx_init,
     };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
+
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
     if (!ctx_gguf) {
         throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
     }
 
+    ggml_context_ptr ctx { ctx_init };
+
     // check metadata
     {
         auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = gguf_find_key(ctx_gguf, key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
         };
         auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = gguf_find_key(ctx_gguf, key.c_str());
-            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
         };
         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
         auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
         if (general_type != "adapter") {
-            gguf_free(ctx_gguf);
             throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
         }
 
         auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
         auto general_arch = llm_arch_from_string(general_arch_str);
         if (general_arch != model->arch) {
-            gguf_free(ctx_gguf);
             throw std::runtime_error("model arch and LoRA arch mismatch");
         }
 
         auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
         if (adapter_type != "lora") {
-            gguf_free(ctx_gguf);
             throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
         }
 
         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
     }
 
-    int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
 
     // contexts for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -18985,7 +18929,11 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                 /*.no_alloc   =*/ true,
             };
             ggml_context * buft_ctx = ggml_init(params);
+            if (!buft_ctx) {
+                return nullptr;
+            }
             ctx_map[buft] = buft_ctx;
+            adapter.ctxs.emplace_back(buft_ctx);
             return buft_ctx;
         };
         return it->second;
@@ -18996,7 +18944,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
-    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
         std::string name(cur->name);
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
@@ -19013,8 +18961,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                 ab_map[name].b = cur;
             }
         } else {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
     }
@@ -19025,28 +18971,20 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         llama_lora_weight & w = it.second;
 
         if (!w.a || !w.b) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }
 
         // device buft and device ctx
         auto * model_tensor = llama_get_model_tensor(model, name.c_str());
         if (!model_tensor) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
         }
         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
         if (w.a->ne[1] != w.b->ne[0]) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
         }
         // save tensor to adapter
@@ -19061,18 +18999,15 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     {
         adapter.ctxs.reserve(ctx_map.size());
         adapter.bufs.reserve(ctx_map.size());
-        for (auto it : ctx_map) {
+        for (auto & it : ctx_map) {
             ggml_backend_buffer_type_t buft = it.first;
             ggml_context * ctx_dev = it.second;
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
             if (!buf) {
-                gguf_free(ctx_gguf);
-                ggml_free(ctx);
                 throw std::runtime_error("failed to allocate buffer for lora adapter\n");
             }
-            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-            adapter.ctxs.push_back(ctx_dev);
-            adapter.bufs.push_back(buf);
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
         }
     }
 
@@ -19081,7 +19016,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         llama_file gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
-            size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = ggml_nbytes(orig);
             read_buf.resize(size);
             gguf_file.seek(offs, SEEK_SET);
@@ -19097,10 +19032,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     }
 
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
-
-    // free ctx for reading gguf
-    gguf_free(ctx_gguf);
-    ggml_free(ctx);
 }
 
 int32_t llama_lora_adapter_set(
@@ -19549,7 +19480,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ctx->backends.emplace_back(backend);
         }
 
         // add ACCEL backends (such as BLAS)
@@ -19562,7 +19493,7 @@ struct llama_context * llama_new_context_with_model(
                     llama_free(ctx);
                     return nullptr;
                 }
-                ctx->backends.push_back(backend);
+                ctx->backends.emplace_back(backend);
             }
         }
 
@@ -19573,16 +19504,16 @@ struct llama_context * llama_new_context_with_model(
             llama_free(ctx);
             return nullptr;
         }
-        ctx->backends.push_back(ctx->backend_cpu);
+        ctx->backends.emplace_back(ctx->backend_cpu);
 
         // create a list of the set_n_threads functions in the backends
-        for (auto * backend : ctx->backends) {
-            ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        for (auto & backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
             ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
             if (reg) {
                 auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
                 if (ggml_backend_set_n_threads_fn) {
-                    ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn);
+                    ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
                 }
             }
         }
@@ -19621,17 +19552,18 @@ struct llama_context * llama_new_context_with_model(
             }
 
             LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
-                    ggml_backend_buffer_name(ctx->buf_output),
-                    ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
+                    ggml_backend_buffer_name(ctx->buf_output.get()),
+                    ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0);
         }
 
         // scheduler and compute buffers
         {
             // buffer types used for the compute buffer of each backend
             std::vector<ggml_backend_buffer_type_t> backend_buft;
-            for (auto * backend : ctx->backends) {
-                auto * buft = ggml_backend_get_default_buffer_type(backend);
-                if (ggml_backend_is_cpu(backend) && !model->devices.empty()) {
+            std::vector<ggml_backend_t> backend_ptrs;
+            for (auto & backend : ctx->backends) {
+                auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+                if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
                     // use the host buffer of the first device CPU for faster transfer of the intermediate state
                     auto * dev = model->devices[0];
                     auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19640,6 +19572,7 @@ struct llama_context * llama_new_context_with_model(
                     }
                 }
                 backend_buft.push_back(buft);
+                backend_ptrs.push_back(backend.get());
             }
 
             const size_t max_nodes = llama_model_max_nodes(*model);
@@ -19657,12 +19590,12 @@ struct llama_context * llama_new_context_with_model(
 
             // pipeline parallelism requires support for async compute and events in all devices
             if (pipeline_parallel) {
-                for (auto * backend : ctx->backends) {
-                    if (ggml_backend_is_cpu(backend)) {
+                for (auto & backend : ctx->backends) {
+                    if (ggml_backend_is_cpu(backend.get())) {
                         // ignore CPU backend
                         continue;
                     }
-                    auto * dev = ggml_backend_get_device(backend);
+                    auto * dev = ggml_backend_get_device(backend.get());
                     ggml_backend_dev_props props;
                     ggml_backend_dev_get_props(dev, &props);
                     if (!props.caps.async || !props.caps.events) {
@@ -19673,10 +19606,10 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
 
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
+            ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
 
             if (pipeline_parallel) {
-                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
             }
 
             // initialize scheduler with the worst-case graph
@@ -19688,29 +19621,29 @@ struct llama_context * llama_new_context_with_model(
             ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
 
             // reserve pp graph first so that buffers are only allocated once
-            ggml_backend_sched_reserve(ctx->sched, gf_pp);
-            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get());
             int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
 
             // reserve with tg graph to get the number of splits and nodes
             llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
             ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
-            ggml_backend_sched_reserve(ctx->sched, gf_tg);
-            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
             int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
 
             // reserve again with pp graph to avoid ggml-alloc reallocations during inference
             gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
-            if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
+            if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) {
                 LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                 llama_free(ctx);
                 return nullptr;
             }
 
-            for (size_t i = 0; i < ctx->backends.size(); i++) {
-                ggml_backend_t backend = ctx->backends[i];
+            for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+                ggml_backend_t backend = backend_ptrs[i];
                 ggml_backend_buffer_type_t buft = backend_buft[i];
-                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
+                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend);
                 if (size > 1) {
                     LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                             ggml_backend_buft_name(buft),
@@ -19990,7 +19923,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
             struct ggml_init_params params = {
                 /*.mem_size   =*/ model.hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
@@ -20001,12 +19935,12 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
                 return nullptr;
             }
             ctx_map[buft] = ctx;
-            cvec.ctxs.push_back(ctx);
+            cvec.ctxs.emplace_back(ctx);
+            return ctx;
         }
-        return ctx_map.at(buft);
+        return it->second;
     };
 
-
     // make tensors
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
@@ -20037,7 +19971,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.push_back(buf);
+        cvec.bufs.emplace_back(buf);
     }
 
     return true;
@@ -21305,7 +21239,7 @@ int32_t llama_decode(
 }
 
 void llama_synchronize(struct llama_context * ctx) {
-    ggml_backend_sched_synchronize(ctx->sched);
+    ggml_backend_sched_synchronize(ctx->sched.get());
 
     // FIXME: if multiple single tokens are evaluated without a synchronization,
     // the stats will be added to the prompt evaluation stats

From a6744e43e80f4be6398fc7733a01642c846dce1d Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 1 Nov 2024 23:50:59 +0100
Subject: [PATCH 142/396] llama : add simple-chat example (#10124)

* llama : add simple-chat example

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
---
 Makefile                             |   6 +
 examples/CMakeLists.txt              |   1 +
 examples/simple-chat/CMakeLists.txt  |   5 +
 examples/simple-chat/README.md       |   7 +
 examples/simple-chat/simple-chat.cpp | 197 +++++++++++++++++++++++++++
 ggml/include/ggml.h                  |   8 +-
 6 files changed, 220 insertions(+), 4 deletions(-)
 create mode 100644 examples/simple-chat/CMakeLists.txt
 create mode 100644 examples/simple-chat/README.md
 create mode 100644 examples/simple-chat/simple-chat.cpp

diff --git a/Makefile b/Makefile
index 719f45d16..051436344 100644
--- a/Makefile
+++ b/Makefile
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-save-load-state \
 	llama-server \
 	llama-simple \
+	llama-simple-chat \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-simple-chat: examples/simple-chat/simple-chat.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-tokenize: examples/tokenize/tokenize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ead630661..6df318c19 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -49,6 +49,7 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
+    add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt
new file mode 100644
index 000000000..87723533b
--- /dev/null
+++ b/examples/simple-chat/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-simple-chat)
+add_executable(${TARGET} simple-chat.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/simple-chat/README.md b/examples/simple-chat/README.md
new file mode 100644
index 000000000..f0099ce3d
--- /dev/null
+++ b/examples/simple-chat/README.md
@@ -0,0 +1,7 @@
+# llama.cpp/example/simple-chat
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
+
+```bash
+./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
+...
diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp
new file mode 100644
index 000000000..14264cfcb
--- /dev/null
+++ b/examples/simple-chat/simple-chat.cpp
@@ -0,0 +1,197 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41df85557..2d93f31fa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -558,10 +558,10 @@ extern "C" {
 
     enum ggml_log_level {
         GGML_LOG_LEVEL_NONE  = 0,
-        GGML_LOG_LEVEL_INFO  = 1,
-        GGML_LOG_LEVEL_WARN  = 2,
-        GGML_LOG_LEVEL_ERROR = 3,
-        GGML_LOG_LEVEL_DEBUG = 4,
+        GGML_LOG_LEVEL_DEBUG = 1,
+        GGML_LOG_LEVEL_INFO  = 2,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_ERROR = 4,
         GGML_LOG_LEVEL_CONT  = 5, // continue previous log
     };
 

From 7554aa4655f44b33a29068f2b18c5976fae45f9d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Sat, 2 Nov 2024 12:53:17 +0100
Subject: [PATCH 143/396] convert-lora : make `--base` optional (#10110)

* convert-lora : make `--base` optional

* lint

* handle case where base_model_name_or_path is invalid

* do not include metadata from base model

* clarify unspecified --base

* add small comment [no ci]

* trigger ci
---
 convert_hf_to_gguf.py   | 27 +++++++++++------------
 convert_lora_to_gguf.py | 47 ++++++++++++++++++++++++++++++++---------
 2 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a34dabe23..76ee6cef5 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -72,7 +72,8 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -87,7 +88,7 @@ class Model:
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -1541,6 +1542,17 @@ class LlamaModel(Model):
             special_vocab._set_special_token("eot",    32010)
             special_vocab.add_to_gguf(self.gguf_writer)
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1557,17 +1569,6 @@ class LlamaModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 915e21836..ed1014cae 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -12,6 +12,7 @@ import json
 from math import prod
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+from transformers import AutoConfig
 
 import torch
 
@@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
         help="only print out what will be done, without writing any new files",
     )
     parser.add_argument(
-        "--base", type=Path, required=True,
-        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
+        "--base", type=Path,
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
     parser.add_argument(
         "lora_path", type=Path,
@@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+    # normally, adapter does not come with base model config, we need to load it from AutoConfig
+    config = AutoConfig.from_pretrained(hf_model_id)
+    return config.to_dict()
+
+
 if __name__ == '__main__':
     args = parse_args()
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -281,7 +288,7 @@ if __name__ == '__main__':
 
     ftype = ftype_map[args.outtype]
 
-    dir_base_model: Path = args.base
+    dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"
@@ -301,9 +308,29 @@ if __name__ == '__main__':
         input_model = os.path.join(dir_lora, "adapter_model.bin")
         lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
 
+    # load LoRA config
+    with open(lora_config, "r") as f:
+        lparams: dict[str, Any] = json.load(f)
+
     # load base model
-    logger.info(f"Loading base model: {dir_base_model.name}")
-    hparams = Model.load_hparams(dir_base_model)
+    if dir_base_model is None:
+        if "base_model_name_or_path" in lparams:
+            model_id = lparams["base_model_name_or_path"]
+            logger.info(f"Loading base model from Hugging Face: {model_id}")
+            try:
+                hparams = load_hparams_from_hf(model_id)
+            except OSError as e:
+                logger.error(f"Failed to load base model config: {e}")
+                logger.error("Please try downloading the base model and add its path to --base")
+                sys.exit(1)
+        else:
+            logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+            logger.error("Base model config is required. Please download the base model and add its path to --base")
+            sys.exit(1)
+    else:
+        logger.info(f"Loading base model: {dir_base_model.name}")
+        hparams = Model.load_hparams(dir_base_model)
+
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -323,13 +350,15 @@ if __name__ == '__main__':
                 self.dir_model_card = dir_lora_model
                 self.lora_alpha = float(lora_alpha)
 
+            def set_vocab(self):
+                pass
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
 
             def set_gguf_parameters(self):
                 self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
-                super().set_gguf_parameters()
 
             def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
@@ -350,7 +379,7 @@ if __name__ == '__main__':
                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                         if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                             logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
-                            logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
+                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                         sys.exit(1)
 
                     if base_name in tensor_map:
@@ -384,9 +413,6 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
 
-        with open(lora_config, "r") as f:
-            lparams: dict[str, Any] = json.load(f)
-
         alpha: float = lparams["lora_alpha"]
 
         model_instance = LoraModel(
@@ -399,6 +425,7 @@ if __name__ == '__main__':
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
+            hparams=hparams,
         )
 
         logger.info("Exporting model...")

From b634f8a26fef65210fd9fb2f87e83a2809535e89 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sat, 2 Nov 2024 13:08:53 +0100
Subject: [PATCH 144/396] simple-chat : only add bos on first prompt (#10129)

---
 examples/simple-chat/simple-chat.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp
index 14264cfcb..5f9973163 100644
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
         std::vector<llama_token> prompt_tokens(n_prompt_tokens);
-        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
             GGML_ABORT("failed to tokenize the prompt\n");
         }
 

From 1926d6e39d6f6358bc1a4c52316a560178be7233 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 2 Nov 2024 15:18:56 +0200
Subject: [PATCH 145/396] llama : adjust default context size + print warnings
 (#10136)

* llama : adjust default context size + print warnings

ggml-ci

* ggml-ci : add missing gpu-layers + adjust context sizes
---
 ci/run.sh       | 164 ++++++++++++++++++++++++------------------------
 common/common.h |   2 +-
 src/llama.cpp   |  26 ++++++--
 3 files changed, 103 insertions(+), 89 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index dc26d94ee..21b62dd1e 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -326,36 +326,36 @@ function gg_run_open_llama_7b_v2 {
     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
         qnt="$1"
@@ -460,34 +460,34 @@ function gg_run_pythia_1_4b {
     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/llama-cli --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
         qnt="$1"
@@ -591,36 +591,36 @@ function gg_run_pythia_2_8b {
     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
         qnt="$1"
@@ -706,8 +706,8 @@ function gg_run_embd_bge_small {
 
     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
 
-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
 
     set +e
 }
@@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"
 
     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16}  -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
 
     # sample output
     # rerank score 0:    0.029
diff --git a/common/common.h b/common/common.h
index cd5a8e051..727f85baa 100644
--- a/common/common.h
+++ b/common/common.h
@@ -155,7 +155,7 @@ struct common_sampler_params {
 
 struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =     0; // context size
+    int32_t n_ctx                 =  4096; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                =     0; // number of tokens to keep from initial prompt
diff --git a/src/llama.cpp b/src/llama.cpp
index 0991c4089..3f534596e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19440,12 +19440,26 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    LLAMA_LOG_INFO("%s: n_ctx      = %u\n",     __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_batch    = %u\n",     __func__, cparams.n_batch);
-    LLAMA_LOG_INFO("%s: n_ubatch   = %u\n",     __func__, cparams.n_ubatch);
-    LLAMA_LOG_INFO("%s: flash_attn = %d\n",     __func__, cparams.flash_attn);
-    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n",   __func__, cparams.rope_freq_base);
-    LLAMA_LOG_INFO("%s: freq_scale = %g\n",     __func__, cparams.rope_freq_scale);
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
+    LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
+
+    if (n_ctx_per_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
 
     ctx->abort_callback      = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;

From 45950415ed985830c59bf42cf9c9216b20cf08ef Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 2 Nov 2024 18:34:00 +0200
Subject: [PATCH 146/396] server : fix endpoint checks (#10135)

ggml-ci
---
 examples/server/server.cpp | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 54cdb4b72..5c1af549b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2703,8 +2703,8 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding || ctx_server.params.reranking) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+        if (ctx_server.params.embedding) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
@@ -2809,8 +2809,8 @@ int main(int argc, char ** argv) {
 
     // TODO: maybe merge this function with "handle_completions_generic"
     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding || ctx_server.params.reranking) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+        if (ctx_server.params.embedding) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
@@ -2935,11 +2935,6 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        // TODO: somehow clean up this checks in the future
-        if (!ctx_server.params.embedding || ctx_server.params.reranking) {
-            res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
         const json body = json::parse(req.body);
         bool is_openai = false;
 
@@ -2991,10 +2986,11 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.reranking) {
-            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+        if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
+
         const json body = json::parse(req.body);
 
         // TODO: implement

From 42cadc74bda60afafb45b71b1a39d150ede0ed4d Mon Sep 17 00:00:00 2001
From: sasha0552 <admin@sasha0552.org>
Date: Sat, 2 Nov 2024 16:34:56 +0000
Subject: [PATCH 147/396] server : fix slot selection by lru (#10126)

* server : fix slot selection by lru, migrate lcs to `size_t`

* minor debug log fix
---
 examples/server/server.cpp | 14 ++++++++------
 examples/server/utils.hpp  | 14 +++++++-------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5c1af549b..8531a784d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -247,6 +247,7 @@ struct server_slot {
         if (is_processing()) {
             SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
 
+            t_last_used = ggml_time_us();
             t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
             state = SLOT_STATE_IDLE;
             callback_on_release(id);
@@ -730,7 +731,7 @@ struct server_context {
 
         // find the slot that has at least n% prompt similarity
         if (ret == nullptr && slot_prompt_similarity != 0.0f) {
-            int max_lcs_len = 0;
+            int lcs_len = 0;
             float similarity = 0;
 
             for (server_slot & slot : slots) {
@@ -745,20 +746,21 @@ struct server_context {
                 }
 
                 // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-                int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+                int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
 
                 // fraction of the common subsequence length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
+                float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
 
                 // select the current slot if the criteria match
-                if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
-                    max_lcs_len = lcs_len;
+                if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
+                    lcs_len = cur_lcs_len;
+                    similarity = cur_similarity;
                     ret = &slot;
                 }
             }
 
             if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
+                SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
             }
         }
 
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 871a17a4f..c47ed3e47 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
     }
 
     // get the lengths of the input sequences
-    int a_len = a.size();
-    int b_len = b.size();
+    size_t a_len = a.size();
+    size_t b_len = b.size();
 
     // initialize the maximum length of the longest common subsequence (LCS)
-    int max_length = 0;
+    size_t max_length = 0;
 
     // use two rows instead of a 2D matrix to optimize space
-    std::vector<int> prev_row(b_len + 1, 0);
-    std::vector<int> curr_row(b_len + 1, 0);
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
 
     // iterate through the elements of a
-    for (int i = 1; i <= a_len; i++) {
+    for (size_t i = 1; i <= a_len; i++) {
         // iterate through the elements of b
-        for (int j = 1; j <= b_len; j++) {
+        for (size_t j = 1; j <= b_len; j++) {
             // if elements at the current positions match
             if (a[i - 1] == b[j - 1]) {
                 // if it's the first element of either sequences, set LCS length to 1

From 9830b6923b61f1e652a35afeac77aa5f886dad09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6hnenkamp?= <cvk5@me.com>
Date: Sat, 2 Nov 2024 23:35:31 +0100
Subject: [PATCH 148/396] Add apple arm to presets (#10134)

* Add apple arm to presets

* Add final new line
---
 CMakePresets.json             | 13 +++++++++++++
 cmake/arm64-apple-clang.cmake | 16 ++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 cmake/arm64-apple-clang.cmake

diff --git a/CMakePresets.json b/CMakePresets.json
index d22ffa490..ae45d60af 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -48,10 +48,23 @@
         }
     },
 
+    {
+        "name": "arm64-apple-clang", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
+        }
+    },
+
     { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
     { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
     { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
 
+    { "name": "arm64-apple-clang-debug"  , "inherits": [ "base", "arm64-apple-clang",  "debug"   ] },
+    { "name": "arm64-apple-clang-release"  , "inherits": [ "base", "arm64-apple-clang",  "reldbg"   ] },
+    { "name": "arm64-apple-clang+static-release"  , "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
+
     { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
     { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
     { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
diff --git a/cmake/arm64-apple-clang.cmake b/cmake/arm64-apple-clang.cmake
new file mode 100644
index 000000000..5fcd2882a
--- /dev/null
+++ b/cmake/arm64-apple-clang.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Darwin )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-apple-darwin-macho )
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

From 1839f69130151ceeac4d01c0ef8964e1fb43bba6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 3 Nov 2024 15:14:15 +0200
Subject: [PATCH 149/396] flake.lock: Update (#10146)

---
 flake.lock | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/flake.lock b/flake.lock
index 732c7539c..c170c4952 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1727826117,
-        "narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=",
+        "lastModified": 1730504689,
+        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1",
+        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1729665710,
-        "narHash": "sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4=",
+        "lastModified": 1730200266,
+        "narHash": "sha256-l253w0XMT8nWHGXuXqyiIC/bMvh1VRszGXgdpQlfhvU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "2768c7d042a37de65bb1b5b3268fc987e534c49d",
+        "rev": "807e9154dcb16384b1b765ebe9cd2bba2ac287fd",
         "type": "github"
       },
       "original": {
@@ -36,14 +36,14 @@
     },
     "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1727825735,
-        "narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=",
+        "lastModified": 1730504152,
+        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
       },
       "original": {
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
       }
     },
     "root": {

From 08828a6d7d0006a487c9655ba8ace0ebe35ecad1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 3 Nov 2024 15:18:40 +0200
Subject: [PATCH 150/396] metal : minor fixup in FA kernel (#10143)

* metal : minor fixup in FA kernel

ggml-ci

* metal : use the unrolled loop variable

* metal : remove unused var
---
 ggml/src/ggml-metal.metal | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index defde6246..57eb34f13 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2776,11 +2776,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
         const short iv3 = iq3 / rv3;
 
         // load the queries from shared memory into local memory
-        float4 mq[D4];
+        float4 mq[D4/NW];
 
         for (short ii = 0; ii < D4; ii += NW) {
             short i = ii + tiisg;
-            mq[i] = (float4) sq4[i];
+            mq[ii/NW] = (float4) sq4[i];
         }
 
         // pointer to the mask
@@ -2812,7 +2812,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
                         mk[2] = (float4) pk4[i + 2*(nb11/8)];
                         mk[3] = (float4) pk4[i + 3*(nb11/8)];
 
-                        mqk += (float4) (mq[i] * mk);
+                        mqk += (float4) (mq[ii/NW] * mk);
                     }
 
                     // reduce the results from the threads in the simdgroup
@@ -2857,8 +2857,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
                 // O = diag(ms)*O
 #pragma unroll
                 for (short ii = 0; ii < D4; ii += NW) {
-                    const short i = ii + tiisg;
-                    lo[i/NW] *= ms;
+                    lo[ii/NW] *= ms;
                 }
             }
 
@@ -2872,10 +2871,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
                     for (short ii = 0; ii < D4; ii += NW) {
                         const short i = ii + tiisg;
 
-                        lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
-                        lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
-                        lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
-                        lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
+                        lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
+                        lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
+                        lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
+                        lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
                     }
                 }
             }

From 9f409893519b4a6def46ef80cd6f5d05ac0fb157 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sun, 3 Nov 2024 19:34:08 +0100
Subject: [PATCH 151/396] ggml : move CPU backend to a separate file (#10144)

---
 Makefile                           |    21 +-
 Package.swift                      |     1 +
 common/CMakeLists.txt              |     2 -
 common/common.cpp                  |     2 +
 common/train.cpp                   |  1515 ---
 common/train.h                     |   233 -
 examples/CMakeLists.txt            |     1 -
 examples/baby-llama/CMakeLists.txt |     5 -
 examples/baby-llama/baby-llama.cpp |  1639 ---
 examples/llava/clip.cpp            |     1 +
 examples/rpc/rpc-server.cpp        |     2 +
 ggml/include/ggml-backend.h        |    19 +-
 ggml/include/ggml-cpu.h            |   150 +
 ggml/include/ggml.h                |   149 +-
 ggml/src/CMakeLists.txt            |     2 +
 ggml/src/ggml-aarch64.c            |     1 +
 ggml/src/ggml-backend.cpp          |  1237 +--
 ggml/src/ggml-cpu.c                | 13715 ++++++++++++++++++++++++
 ggml/src/ggml-impl.h               |    87 +
 ggml/src/ggml-rpc.cpp              |     9 +-
 ggml/src/ggml.c                    | 15264 +--------------------------
 include/llama.h                    |     1 +
 pocs/vdot/q8dot.cpp                |     3 +-
 pocs/vdot/vdot.cpp                 |    10 +-
 spm-headers/ggml-cpu.h             |     1 +
 src/llama.cpp                      |     2 +
 tests/test-backend-ops.cpp         |     1 +
 tests/test-barrier.cpp             |     1 +
 tests/test-grad0.cpp               |     1 +
 tests/test-quantize-fns.cpp        |    10 +-
 tests/test-quantize-perf.cpp       |     6 +-
 tests/test-rope.cpp                |     1 +
 32 files changed, 14747 insertions(+), 19345 deletions(-)
 delete mode 100644 common/train.cpp
 delete mode 100644 common/train.h
 delete mode 100644 examples/baby-llama/CMakeLists.txt
 delete mode 100644 examples/baby-llama/baby-llama.cpp
 create mode 100644 ggml/include/ggml-cpu.h
 create mode 100644 ggml/src/ggml-cpu.c
 create mode 120000 spm-headers/ggml-cpu.h

diff --git a/Makefile b/Makefile
index 051436344..eb1da90f1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,6 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
-	llama-baby-llama \
 	llama-batched \
 	llama-batched-bench \
 	llama-bench \
@@ -56,7 +55,6 @@ TEST_TARGETS = \
 	tests/test-llama-grammar \
 	tests/test-log \
 	tests/test-model-load-cancel \
-	tests/test-opt \
 	tests/test-quantize-fns \
 	tests/test-quantize-perf \
 	tests/test-rope \
@@ -64,6 +62,7 @@ TEST_TARGETS = \
 	tests/test-tokenizer-0 \
 	tests/test-tokenizer-1-bpe \
 	tests/test-tokenizer-1-spm
+#	tests/test-opt \
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
@@ -916,6 +915,7 @@ endif # GGML_METAL
 
 OBJ_GGML += \
 	ggml/src/ggml.o \
+	ggml/src/ggml-cpu.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
 	ggml/src/ggml-quants.o \
@@ -936,7 +936,6 @@ OBJ_COMMON = \
 	common/console.o \
 	common/ngram-cache.o \
 	common/sampling.o \
-	common/train.o \
 	common/build-info.o \
 	common/json-schema-to-grammar.o
 
@@ -1048,6 +1047,12 @@ ggml/src/ggml.o: \
 	ggml/include/ggml.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
+ggml/src/ggml-cpu.o: \
+	ggml/src/ggml-cpu.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
+
 ggml/src/ggml-alloc.o: \
 	ggml/src/ggml-alloc.c \
 	ggml/include/ggml.h \
@@ -1213,11 +1218,6 @@ common/json-schema-to-grammar.o: \
 	common/json-schema-to-grammar.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common/train.o: \
-	common/train.cpp \
-	common/train.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 common/ngram-cache.o: \
 	common/ngram-cache.cpp \
 	common/ngram-cache.h
@@ -1390,11 +1390,6 @@ llama-bench: examples/llama-bench/llama-bench.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-baby-llama: examples/baby-llama/baby-llama.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
diff --git a/Package.swift b/Package.swift
index 3a17e6c34..d3661d13c 100644
--- a/Package.swift
+++ b/Package.swift
@@ -10,6 +10,7 @@ var sources = [
     "src/unicode.cpp",
     "src/unicode-data.cpp",
     "ggml/src/ggml.c",
+    "ggml/src/ggml-cpu.c",
     "ggml/src/ggml-alloc.c",
     "ggml/src/ggml-backend.cpp",
     "ggml/src/ggml-quants.c",
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 042e895ad..5ab1ffa19 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -66,8 +66,6 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     sampling.cpp
     sampling.h
-    train.cpp
-    train.h
     )
 
 if (BUILD_SHARED_LIBS)
diff --git a/common/common.cpp b/common/common.cpp
index 7656843b1..c8cbaae11 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1951,6 +1951,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 
 void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    ggml_cpu_init(); // some ARM features are detected at runtime
+
     const auto & sparams = params.sparams;
 
     fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
diff --git a/common/train.cpp b/common/train.cpp
deleted file mode 100644
index 661ad8382..000000000
--- a/common/train.cpp
+++ /dev/null
@@ -1,1515 +0,0 @@
-#include "train.h"
-#include "common.h"
-
-#include <algorithm>
-#include <random>
-#include <sstream>
-#include <functional>
-#include <cstring>
-
-struct random_normal_distribution {
-    std::mt19937 gen;
-    std::normal_distribution<float> rd;
-    float min;
-    float max;
-};
-
-struct random_uniform_distribution {
-    std::mt19937 gen;
-    std::uniform_real_distribution<float> rd;
-};
-
-struct train_state  * init_train_state() {
-    struct train_state * state = new struct train_state;
-    state->train_its     = 0;
-    state->train_samples = 0;
-    state->train_tokens  = 0;
-    state->train_epochs  = 0;
-    state->shuffle_samples_hash  = 0;
-    state->shuffle_sample_count  = 0;
-    state->shuffle_next_sample   = 0;
-    state->shuffle_rng_state_current = "";
-    state->shuffle_rng_state_next    = "";
-
-    state->opt = new struct ggml_opt_context;
-    state->opt->ctx = NULL;
-    state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
-    state->opt->loss_after = 0.0f;
-
-    return state;
-}
-
-void free_train_state(struct train_state  * state) {
-    delete state->opt;
-    delete state;
-}
-
-struct random_normal_distribution * init_random_normal_distribution(
-    int seed, float mean, float std, float min, float max
-) {
-    struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
-    rnd->gen = std::mt19937(seed);
-    rnd->rd = std::normal_distribution<float>{mean, std};
-    rnd->min = min;
-    rnd->max = max;
-    return rnd;
-}
-
-struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
-    struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution));
-    rnd->gen = std::mt19937(seed);
-    rnd->rd = std::uniform_real_distribution<float>{min, max};
-    return rnd;
-}
-
-void free_random_normal_distribution (struct random_normal_distribution  * rnd) {
-    free(rnd);
-}
-
-void free_random_uniform_distribution(struct random_uniform_distribution * rnd) {
-    free(rnd);
-}
-
-struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
-    float scale = 1.0f; // xavier
-    switch (ggml_n_dims(tensor)) {
-        case 1:
-            scale /= sqrtf((float) tensor->ne[0]);
-            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
-                *dst = scale * frand_normal(rnd);
-            }
-            break;
-        case 2:
-            scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-                    *dst = scale * frand_normal(rnd);
-                }
-            }
-            break;
-        case 3:
-            scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-                        *dst = scale * frand_normal(rnd);
-                    }
-                }
-            }
-            break;
-        case 4:
-            scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
-            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
-                            *dst = scale * frand_normal(rnd);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            die("Unsupported tensor->n_dims");
-    };
-    return tensor;
-}
-
-struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (ggml_n_dims(tensor)) {
-        case 1:
-            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
-                *dst = frand_uniform(rnd);
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-                    *dst = frand_uniform(rnd);
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-                        *dst = frand_uniform(rnd);
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
-                            *dst = frand_uniform(rnd);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            die("Unsupported tensor->n_dims");
-    };
-    return tensor;
-}
-
-float frand() {
-    return (float)rand()/((float)(RAND_MAX) + 1.0f);
-}
-
-float frand_normal(struct random_normal_distribution * rnd) {
-    return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
-}
-
-float frand_uniform(struct random_uniform_distribution * rnd) {
-    return rnd->rd(rnd->gen);
-}
-
-int clamp(const int v, const int min, const int max) {
-    return ((v < min) ? (min) : (v > max) ? (max) : v);
-}
-
-float fclamp(const float v, const float min, const float max) {
-    return ((v < min) ? (min) : (v > max) ? (max) : v);
-}
-
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == 1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
-}
-
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
-}
-
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == 1);
-}
-
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
-int64_t get_example_targets_batch(
-    struct llama_context * lctx,
-    struct ggml_tensor   * tokens_input,
-    struct ggml_tensor   * target_probs,
-    int64_t                example_id,
-    const size_t         * samples_offs,
-    const size_t         * samples_begin,
-    const size_t         * samples_size,
-          size_t           samples_count,
-    const llama_token    * train_data,
-    size_t                 n_train_data,
-    bool                   separate_with_eos,
-    bool                   separate_with_bos,
-    bool                   fill_with_next_samples,
-    bool                   sample_random_offsets
-) {
-    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(target_probs));
-    int64_t n_vocab  = target_probs->ne[0];
-    int64_t n_tokens = tokens_input->ne[0];
-    int64_t n_batch  = tokens_input->ne[1];
-    GGML_ASSERT(n_vocab  == target_probs->ne[0]);
-    GGML_ASSERT(n_tokens == target_probs->ne[1]);
-    GGML_ASSERT(n_batch  == target_probs->ne[2]);
-
-    int64_t used_samples = 0;
-
-    ggml_set_f32(target_probs, 0.0f);
-    llama_token bos = llama_token_bos(llama_get_model(lctx));
-    llama_token eos = llama_token_eos(llama_get_model(lctx));
-    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
-    for (int k=0; k<n_batch; ++k) {
-        // printf("%s: batch %d\n", __func__, k);
-        size_t sample_idx   = (example_id + used_samples) % samples_count;
-        size_t sample_offs  = sample_random_offsets ? samples_offs[sample_idx] : 0;
-        size_t sample_begin = samples_begin[sample_idx];
-        size_t sample_size  = samples_size[sample_idx];
-        ++used_samples;
-
-        // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample);
-        GGML_ASSERT(sample_begin+sample_size-1 < n_train_data);
-
-        ggml_set_i32_nd(tokens_input, 0, k, 0, 0, bos);
-        bool sample_separation_eos = !separate_with_eos;
-        bool sample_separation_bos = !separate_with_bos;
-        for (int64_t i=0; i<n_tokens; ++i) {
-            llama_token token = eos;
-            if (sample_offs >= sample_size && fill_with_next_samples) {
-                if (!sample_separation_eos) {
-                    // insert eos token to separate samples
-                    sample_separation_eos = true;
-                } else if (!sample_separation_bos) {
-                    // insert bos token to separate samples
-                    sample_separation_bos = true;
-                    token = bos;
-                } else {
-                    // sample separation is done, continue with next sample
-                    sample_separation_eos = !separate_with_eos;
-                    sample_separation_bos = !separate_with_bos;
-                    sample_offs  = 0;
-                    sample_idx   = (example_id + used_samples) % samples_count;
-                    sample_begin = samples_begin[sample_idx];
-                    sample_size  = samples_size[sample_idx];
-                    ++used_samples;
-                }
-            }
-            // note: no else-if here
-            if (sample_offs < sample_size) {
-                token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
-                ++sample_offs;
-            }
-            ggml_set_f32_nd(target_probs,  token, (int) i, (int) k, 0, +1.0f);
-            if (i+1<n_tokens) {
-                ggml_set_i32_nd(tokens_input, (int) (i + 1), (int) k, 0, 0, token);
-            }
-        }
-    }
-
-    return used_samples;
-}
-
-void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) {
-    std::stringstream s_rng_state;
-    s_rng_state.imbue(std::locale::classic());
-    s_rng_state.exceptions(std::stringstream::failbit);
-    s_rng_state.str(rng_state);
-    s_rng_state >> rng;
-}
-
-std::string mt19937_get_state(const std::mt19937& rng) {
-    std::stringstream s_rng_state;
-    s_rng_state.imbue(std::locale::classic());
-    s_rng_state << rng;
-    return s_rng_state.str();
-}
-
-std::string mt19937_seed_to_state(unsigned seed) {
-    std::mt19937 rng(seed);
-    return mt19937_get_state(rng);
-}
-
-std::string shuffle_samples(
-        const std::string & rng_state,
-        size_t            * shuffled_offs,
-        size_t            * shuffled_begins,
-        size_t            * shuffled_sizes,
-        const size_t      * begins,
-        const size_t      * sizes,
-        size_t              count) {
-    if (count == 0) return rng_state;
-
-    std::mt19937 rng;
-    mt19937_set_state(rng, rng_state);
-
-    // sort indices by random value for each index
-    std::vector<size_t> idcs;
-    {
-        std::vector<unsigned> rnd;
-        idcs.resize(count);
-        rnd.resize(count);
-        for (unsigned i=0; i<count; ++i) {
-            idcs[i] = i;
-            rnd[i]  = rng();
-        }
-
-        std::sort(idcs.begin(), idcs.end(), [&rnd](size_t a, size_t b){
-            // stable sort for reproducibility
-            return (rnd[a] == rnd[b]) ? (a < b) : (rnd[a] < rnd[b]);
-        });
-    }
-
-    // create random offsets
-    for (unsigned i=0; i<count; ++i) {
-        shuffled_offs[i] = (size_t) ((sizes[idcs[i]] - 1) * ((double) rng() / (double) (rng.max()-1)));
-    }
-
-    // reorder begins and sizes by sorted indices
-    for (unsigned i=0; i<count; ++i) {
-        shuffled_begins[i] = begins[idcs[i]];
-    }
-
-    for (unsigned i=0; i<count; ++i) {
-        shuffled_sizes[i] = sizes[idcs[i]];
-    }
-
-    return mt19937_get_state(rng);
-}
-
-size_t hash_combine(size_t h1, size_t h2) {
-    return h1 ^ (h2 << 1);
-}
-
-size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
-    std::hash<std::string> h_string;
-    std::hash<unsigned long long> h_ull;
-    size_t h = h_string(std::string(fn));
-    h = hash_combine(h, h_ull((unsigned long long) sample_count));
-    for (size_t i=0; i< sample_count; ++i) {
-        h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
-        h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
-    }
-    return h;
-}
-
-std::string replace_str(const char * s, const char * needle, const char * replacement) {
-    std::string str = s;
-    size_t pos = str.find(needle);
-    if (pos != std::string::npos) {
-        str.replace(pos, strlen(needle), replacement);
-    }
-    return str;
-}
-
-void print_duration(double fmillis) {
-    if (fmillis < 1000.0f) {
-        printf("%.1fms", (float) fmillis);
-        return;
-    }
-    const int64_t one_sec  = 1000;
-    const int64_t one_min  = one_sec  * 60;
-    const int64_t one_hour = one_min  * 60;
-    const int64_t one_day  = one_hour * 24;
-
-    int64_t millis  = (int64_t) fmillis;
-    int64_t days    = millis/one_day;
-    int64_t hours   = (millis - days*one_day)/one_hour;
-    int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
-    int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
-
-    // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
-    if (days > 0) {
-        printf("%lldd ", (long long int) days);
-    }
-    printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
-}
-
-float cosine_decay(int64_t step, int64_t decay_steps, float minimum) {
-    if (step > decay_steps) {
-        step = decay_steps;
-    }
-    const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
-    const float decay = (1 - minimum)*cosine_decay + minimum;
-    return decay;
-}
-
-float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) {
-    while (step > decay_steps) {
-        step -= decay_steps;
-        decay_steps = (int64_t) (restart_step_mult * decay_steps);
-    }
-    return cosine_decay(step, decay_steps, minimum);
-}
-
-float learning_schedule(
-    int64_t step,
-    int64_t warmup_steps,
-    int64_t cos_decay_steps,
-    float   learning_rate,
-    float   overall_minimum,
-    float   cos_decay_minimum,
-    float   cos_decay_restart_step_mult,
-    bool    enable_restart) {
-
-    float result =
-        (step < warmup_steps)
-            ? (float) step / (float) warmup_steps
-            : enable_restart
-                ? cosine_decay_restart(
-                    step - warmup_steps,
-                    cos_decay_steps,
-                    cos_decay_minimum,
-                    cos_decay_restart_step_mult)
-                : cosine_decay(
-                    step,
-                    cos_decay_steps,
-                    cos_decay_minimum);
-
-    float min = overall_minimum / learning_rate;
-    result = min + result * (1.0f - min);
-    return result;
-}
-
-static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
-    GGML_ASSERT(a != NULL);
-    GGML_ASSERT(b != NULL);
-    GGML_ASSERT(a->type == b->type);
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b));
-
-    return true;
-}
-
-void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
-    if (dst == NULL) {
-        return;
-    }
-    struct ggml_tensor * t  = ggml_get_tensor(ctx, name);
-    GGML_ASSERT(are_same_layout(dst, t));
-    memcpy(dst->data, t->data, ggml_nbytes(t));
-
-    if (strlen(ggml_get_name(dst)) == 0) {
-        ggml_set_name(dst, name);
-    }
-}
-
-// gguf constants
-static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type";
-static const char * LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam";
-static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs";
-static const char * LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version";
-static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count";
-static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count";
-static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count";
-static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized";
-static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss";
-static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss";
-static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count";
-static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count";
-static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end";
-static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count";
-
-static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments";
-static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments";
-static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values";
-
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y";
-
-static const char * LLM_KV_TRAINING_FILE_VERSION         = "training.file_version";
-static const char * LLM_KV_TRAINING_ITERATION_COUNT      = "training.iteration_count";
-static const char * LLM_KV_TRAINING_SAMPLE_COUNT         = "training.sample_count";
-static const char * LLM_KV_TRAINING_TOKEN_COUNT          = "training.token_count";
-static const char * LLM_KV_TRAINING_EPOCH_COUNT          = "training.epoch_count";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
-static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE    = "training.shuffle.rng_state";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
-static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE  = "training.shuffle.next_sample";
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-}
-
-void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-
-    uint32_t file_version;
-    GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION);
-    GGML_ASSERT(file_version == 0);
-
-    GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT);
-    GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT);
-    GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED);
-
-    uint64_t nx;
-    GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT);
-    opt->nx = (size_t) nx;
-
-    // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know
-
-    std::string opt_type;
-    GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
-    if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-        opt->params.type = GGML_OPT_TYPE_ADAM;
-
-        GGUF_GET_KEY(fctx, opt->adam.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
-        GGUF_GET_KEY(fctx, opt->adam.fx_prev,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
-        GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT);
-
-        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
-
-        copy_tensor_by_name(opt->adam.m,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
-        copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
-        copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
-    } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-        opt->params.type = GGML_OPT_TYPE_LBFGS;
-
-        GGUF_GET_KEY(fctx, opt->params.lbfgs.m,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
-        GGUF_GET_KEY(fctx, opt->lbfgs.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
-        GGUF_GET_KEY(fctx, opt->lbfgs.step,             gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP);
-        GGUF_GET_KEY(fctx, opt->lbfgs.j,                gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J);
-        GGUF_GET_KEY(fctx, opt->lbfgs.k,                gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K);
-        GGUF_GET_KEY(fctx, opt->lbfgs.end,              gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END);
-        GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT);
-
-        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
-
-        copy_tensor_by_name(opt->lbfgs.x,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
-        copy_tensor_by_name(opt->lbfgs.xp,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
-        copy_tensor_by_name(opt->lbfgs.g,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
-        copy_tensor_by_name(opt->lbfgs.gp,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
-        copy_tensor_by_name(opt->lbfgs.d,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
-        copy_tensor_by_name(opt->lbfgs.pf,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
-        copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
-        copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
-        copy_tensor_by_name(opt->lbfgs.lms,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
-        copy_tensor_by_name(opt->lbfgs.lmy,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
-    } else {
-        die("unknown optimizer type\n");
-    }
-}
-
-void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) {
-    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0);
-    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past);
-    gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx);
-    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter);
-    gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
-
-    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS,            opt->adam.fx_best);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS,        opt->adam.fx_prev);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement);
-
-                ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
-                ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
-                if (opt->adam.pf) {
-                    ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
-                }
-
-                gguf_add_tensor(fctx, opt->adam.m);
-                gguf_add_tensor(fctx, opt->adam.v);
-                if (opt->adam.pf) {
-                    gguf_add_tensor(fctx, opt->adam.pf);
-                }
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS,            opt->lbfgs.fx_best);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP,     opt->lbfgs.step);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J,        opt->lbfgs.j);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K,        opt->lbfgs.k);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END,      opt->lbfgs.end);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);
-
-                ggml_set_name(opt->lbfgs.x,    LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
-                ggml_set_name(opt->lbfgs.xp,   LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
-                ggml_set_name(opt->lbfgs.g,    LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
-                ggml_set_name(opt->lbfgs.gp,   LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
-                ggml_set_name(opt->lbfgs.d,    LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
-                if (opt->lbfgs.pf) {
-                    ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
-                }
-                ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
-                ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
-                ggml_set_name(opt->lbfgs.lms,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
-                ggml_set_name(opt->lbfgs.lmy,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
-
-                gguf_add_tensor(fctx, opt->lbfgs.x);
-                gguf_add_tensor(fctx, opt->lbfgs.xp);
-                gguf_add_tensor(fctx, opt->lbfgs.g);
-                gguf_add_tensor(fctx, opt->lbfgs.gp);
-                gguf_add_tensor(fctx, opt->lbfgs.d);
-                if (opt->lbfgs.pf) {
-                    gguf_add_tensor(fctx, opt->lbfgs.pf);
-                }
-                gguf_add_tensor(fctx, opt->lbfgs.lmal);
-                gguf_add_tensor(fctx, opt->lbfgs.lmys);
-                gguf_add_tensor(fctx, opt->lbfgs.lms);
-                gguf_add_tensor(fctx, opt->lbfgs.lmy);
-            } break;
-    }
-}
-
-bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) {
-    if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) {
-        return false;
-    }
-
-    uint32_t file_version;
-    GGUF_GET_KEY(fctx, file_version,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
-    GGML_ASSERT(file_version <= 1);
-
-    if (file_version == 0) {
-
-        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
-        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);
-
-    } else if (file_version == 1) {
-
-        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
-        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
-        GGUF_GET_KEY(fctx, train->train_epochs,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT);
-
-        GGUF_GET_KEY(fctx, train->shuffle_samples_hash,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH);
-        GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE);
-        GGUF_GET_KEY(fctx, train->shuffle_sample_count,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, train->shuffle_next_sample,       gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE);
-    }
-
-    load_opt_context_gguf(fctx, f_ggml_ctx, train->opt);
-    return true;
-}
-
-void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) {
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION,    1);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT,    train->train_samples);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT,     train->train_tokens);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT,     train->train_epochs);
-
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash);
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE,    train->shuffle_rng_state_current.c_str());
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE,  (uint64_t) train->shuffle_next_sample);
-
-    save_opt_context_gguf(fctx, train->opt);
-}
-
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-static size_t utf8_len(char src) {
-    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-    return lookup[highbits];
-}
-
-// mark each byte with its utf8 unit number.
-// returns the number of utf8 characters.
-// e.g. when bytes == '\x61\xD0\xB0\x62',
-// then utf8_units will become [0,0,1,0]
-// utf8_nunits will become [1,2,2,1] and 3 is returned.
-// bytes where utf8_units is zero, are the begin of an utf8 character.
-static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
-    size_t offs = 0;
-    size_t count_utf8 = 0;
-    while(offs < count) {
-        int len = (int) utf8_len(bytes[offs]);
-        for (int i=0; i<len; ++i) {
-            utf8_units[offs+i]  = i;
-            utf8_nunits[offs+i] = len;
-        }
-        offs += len;
-        ++count_utf8;
-    }
-    return count_utf8;
-}
-
-size_t tokenize_file(
-        struct llama_context     * lctx,
-        const char               * filename,
-        const std::string        & sample_start,
-        bool                       include_sample_start,
-        bool                       overlapping_samples,
-        unsigned                   context_length,
-        std::vector<llama_token> & out_tokens,
-        std::vector<size_t>      & out_samples_begin,
-        std::vector<size_t>      & out_samples_size) {
-    struct llama_file f(filename, "rb");
-
-    if (f.size == 0) {
-        out_tokens.clear();
-        out_samples_begin.clear();
-        out_samples_size.clear();
-        printf("%s: warning: empty or not existing training data file '%s'\n",
-            __func__, filename);
-        return out_tokens.size();
-    }
-
-    // account for possible leading whitespace that will be added by tokenizer
-    // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
-    const int n_max_tokens_overhead = 1;
-
-    std::vector<char> buf;
-    buf.resize(f.size);
-
-    f.read_raw(buf.data(), f.size);
-
-    std::vector<int> utf8_units;
-    std::vector<int> utf8_nunits;
-    utf8_units.resize(buf.size());
-    utf8_nunits.resize(buf.size());
-    mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
-
-    if (sample_start.size() == 0) {
-        // tokenize all data at once
-        out_tokens.resize(buf.size() + n_max_tokens_overhead);
-
-        int n_tokens = llama_tokenize(
-            llama_get_model(lctx),
-            buf.data(),
-            (int) buf.size(),
-            out_tokens.data(),
-            (int) out_tokens.size(),
-            false, false);
-        if (n_tokens < 0) {
-            out_tokens.resize(-n_tokens);
-            n_tokens = llama_tokenize(
-                llama_get_model(lctx),
-                buf.data(),
-                (int) buf.size(),
-                out_tokens.data(),
-                (int) out_tokens.size(),
-                false, false);
-        }
-        if (n_tokens >= 0) {
-            out_tokens.resize(n_tokens);
-        }
-
-        // generate sample starts at all token positions
-        out_samples_begin.clear();
-        out_samples_begin.push_back(0);
-        out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
-        size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
-        for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
-            out_samples_begin.push_back(sample_begin);
-            out_samples_size.push_back(context_length);
-        }
-    } else {
-        // split data into samples and tokenize each sample
-        std::string data_str(buf.data(), buf.size());
-        out_samples_begin.clear();
-        out_samples_size.clear();
-        out_tokens.clear();
-
-        // find all positions of pattern sample_start
-        size_t sample_begin = data_str.find(sample_start, 0);
-        while (sample_begin != std::string::npos) {
-            out_samples_begin.push_back(sample_begin);
-            const size_t search_start = sample_begin + sample_start.size();
-            sample_begin = data_str.find(sample_start, search_start);
-        }
-        if (out_samples_begin.size() == 0) {
-            printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n",
-                __func__, sample_start.c_str());
-            out_samples_begin.push_back(0);
-        }
-
-        out_samples_size.resize(out_samples_begin.size(), 0);
-
-        std::vector<char>        buf_sample;
-        std::vector<llama_token> tok_sample;
-
-        const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
-        size_t found_too_big_sample   = 0;
-        size_t found_too_small_sample = 0;
-        size_t found_empty_sample     = 0;
-        size_t found_min_sample_size  = SIZE_MAX;
-        size_t found_max_sample_size  = 0;
-
-        size_t max_token_text_size = 0;
-        int n_vocab = llama_n_vocab(llama_get_model(lctx));
-        for (llama_token token=0; token < n_vocab; ++token) {
-            max_token_text_size = std::max(
-                max_token_text_size,
-                strlen(llama_token_get_text(llama_get_model(lctx), token)));
-        }
-
-        // upper bound of context byte length.
-        // strings with this byte length should always tokenize to at least context_length tokens.
-        size_t context_byte_len = max_token_text_size*context_length;
-
-        for (unsigned i=0; i<out_samples_begin.size(); ++i) {
-            // determine sample begin and end from pattern positions
-            size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
-            size_t sample_end   = overlapping_samples
-                                    ? std::min(
-                                        data_str.size(),
-                                        sample_begin + context_byte_len)
-                                    : (i+1 < out_samples_begin.size()
-                                        ? out_samples_begin[i+1]
-                                        : data_str.size());
-            if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
-                // sample end is in the middle of an utf8 character.
-                // advance sample_end to the begin of the next utf8 character.
-                sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
-            }
-            size_t sample_size = sample_end - sample_begin;
-            if (sample_size == 0) {
-                ++found_empty_sample;
-            }
-
-            if (sample_size > 0) {
-                // llama_tokenize expects zero terminated string,
-                // copy sample into buffer and zero terminate it.
-                buf_sample.resize(sample_size);
-                memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
-
-                // printf("sample: '%s'\n", buf_sample.data());
-
-                // tokenize the sample
-                tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
-                int n_tokens = llama_tokenize(llama_get_model(lctx),
-                    buf_sample.data(),
-                    (int) buf_sample.size(),
-                    tok_sample.data(),
-                    (int) tok_sample.size(),
-                    false, false);
-                if (n_tokens < 0) {
-                    tok_sample.resize(-n_tokens);
-                    n_tokens = llama_tokenize(llama_get_model(lctx),
-                        buf_sample.data(),
-                        (int) buf_sample.size(),
-                        tok_sample.data(),
-                        (int) tok_sample.size(),
-                        false, false);
-                    GGML_ASSERT(n_tokens >= 0);
-                }
-                GGML_ASSERT(n_tokens <= (int) tok_sample.size());
-
-                if ((size_t) n_tokens > context_length) {
-                    ++found_too_big_sample;
-                } else if ((size_t) n_tokens < context_length) {
-                    ++found_too_small_sample;
-                }
-                found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
-                found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
-
-                // write out tokens, start and size of sample
-                // overwrite the string start position with the token start position
-                out_samples_begin[i] = out_tokens.size();
-                out_samples_size[i] = (size_t) n_tokens;
-                out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
-            } else {
-                out_samples_begin[i] = out_tokens.size();
-                out_samples_size[i] = 0;
-            }
-
-        }
-        if (found_too_big_sample > 0) {
-            printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n",
-                __func__, found_too_big_sample, found_max_sample_size, context_length);
-        }
-
-        if (found_too_small_sample > 0) {
-            printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
-                __func__, found_too_small_sample, found_min_sample_size, context_length);
-        }
-
-        if (found_empty_sample) {
-            printf("%s: warning: found %zu empty samples.\n",
-                __func__, found_empty_sample);
-        }
-    }
-    printf("%s: total number of samples: %zu\n",
-        __func__, out_samples_begin.size());
-
-    GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
-
-    return out_tokens.size();
-}
-
-std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) {
-    std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
-    return replace_str(filename, pattern_it, sit.c_str());
-}
-
-struct train_params_common get_default_train_params_common() {
-    struct train_params_common params;
-    params.fn_train_data     = "shakespeare.txt";
-    params.fn_checkpoint_in  = "checkpoint.gguf";
-    params.fn_checkpoint_out = "checkpoint-ITERATION.gguf";
-    params.pattern_fn_it     = "ITERATION";
-    params.fn_latest         = "LATEST";
-
-    params.print_usage = false;
-
-    params.save_every = 10;
-
-    params.seed       =   -1;
-
-    params.n_ctx      =  128;
-    params.n_threads  =    6;
-    params.n_batch    =    8;
-    params.n_gradient_accumulation = 1;
-    params.n_epochs   = -1;
-    params.n_gpu_layers = 0;
-
-    params.custom_n_ctx = false;
-
-    params.use_flash              = false;
-    params.use_checkpointing      = true;
-
-    params.sample_start           = "";
-    params.include_sample_start   = false;
-    params.escape                 = false;
-    params.overlapping_samples    = false;
-    params.fill_with_next_samples = false;
-    params.separate_with_eos      = false;
-    params.separate_with_bos      = true;
-    params.sample_random_offsets  = false;
-    params.force_reshuffle        = false;
-
-    params.opt_past               = 0;
-    params.opt_delta              = 1e-5f;
-    params.opt_max_no_improvement = 0;
-
-    params.warmup            =  100;
-    params.cos_decay_steps   = 1000;
-    params.cos_decay_restart = 1.1f;
-    params.cos_decay_min     = 0.1f;
-    params.enable_restart    = false;
-
-    params.adam_n_iter         = 256;
-    params.adam_alpha          = 1e-3f;
-    params.adam_min_alpha      = 0;
-    params.adam_decay          = 1e-1f;
-    params.adam_decay_min_ndim = 2;
-    params.adam_beta1          = 0.9f;
-    params.adam_beta2          = 0.999f;
-    params.adam_gclip          = 1.0f;
-    params.adam_eps_f          = 0.0f;
-
-    return params;
-}
-
-void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) {
-    // fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    // fprintf(stderr, "\n");
-    // fprintf(stderr, "options:\n");
-    // fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-    fprintf(stderr, "  --train-data FNAME         path from which to load training data (default '%s')\n", params->fn_train_data);
-    fprintf(stderr, "  --checkpoint-in FNAME      path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
-    fprintf(stderr, "  --checkpoint-out FNAME     path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
-    fprintf(stderr, "  --pattern-fn-it STR        pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it);
-    fprintf(stderr, "  --fn-latest STR            string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest);
-    fprintf(stderr, "  --save-every N             save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every);
-    fprintf(stderr, "  -s SEED, --seed SEED       RNG seed (default: -1, use random seed for -1)\n");
-    fprintf(stderr, "  -c N, --ctx N              Context size used during training (default %d)\n", params->n_ctx);
-    fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
-    fprintf(stderr, "  -b N, --batch N            Parallel batch size (default %d)\n", params->n_batch);
-    fprintf(stderr, "  --grad-acc N               Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation);
-    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
-    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
-    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
-    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
-    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
-    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
-    fprintf(stderr, "  --no-separate-with-eos     When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : "");
-    fprintf(stderr, "  --no-separate-with-bos     When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : "");
-    fprintf(stderr, "  --sample-random-offsets    Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : "");
-    fprintf(stderr, "  --force-reshuffle          Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n");
-    fprintf(stderr, "  --no-flash                 Don't use flash attention \n");
-    fprintf(stderr, "  --use-flash                Use flash attention (default)\n");
-    fprintf(stderr, "  --no-checkpointing         Don't use gradient checkpointing\n");
-    fprintf(stderr, "  --use-checkpointing        Use gradient checkpointing (default)\n");
-    fprintf(stderr, "  --warmup N                 Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
-    fprintf(stderr, "  --cos-decay-steps N        Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
-    fprintf(stderr, "  --cos-decay-restart N      Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
-    fprintf(stderr, "  --cos-decay-min N          Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
-    fprintf(stderr, "  --enable-restart N         Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
-    fprintf(stderr, "  --disable-restart N        Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
-    fprintf(stderr, "  --opt-past N               Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
-    fprintf(stderr, "  --opt-delta N              Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
-    fprintf(stderr, "  --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
-    fprintf(stderr, "  --epochs N                 Maximum number epochs to process. (default %d)\n", params->n_epochs);
-    fprintf(stderr, "  --adam-iter N              Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
-    fprintf(stderr, "  --adam-alpha N             Adam learning rate alpha (default %f)\n", params->adam_alpha);
-    fprintf(stderr, "  --adam-min-alpha N         Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
-    fprintf(stderr, "  --adam-decay N             AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay);
-    fprintf(stderr, "  --adam-decay-min-ndim N    Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim);
-    fprintf(stderr, "  --adam-beta1 N             AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
-    fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
-    fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
-    fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N   Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
-    fprintf(stderr, "\n");
-}
-
-bool consume_common_train_arg(
-    int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param
-) {
-    int& i = *idx;
-    std::string arg = argv[i];
-    const std::string arg_prefix = "--";
-    if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-        std::replace(arg.begin(), arg.end(), '_', '-');
-    }
-    if (arg == "--train-data") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_train_data = argv[i];
-    } else if (arg == "--checkpoint-in") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_checkpoint_in = argv[i];
-    } else if (arg == "--checkpoint-out") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_checkpoint_out = argv[i];
-    } else if (arg == "--pattern-fn-it") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->pattern_fn_it = argv[i];
-    } else if (arg == "--fn-latest") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_latest = argv[i];
-    } else if (arg == "--save-every") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->save_every = std::stoi(argv[i]);
-    } else if (arg == "-s" || arg == "--seed") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->seed = std::stoi(argv[i]);
-    } else if (arg == "-c" || arg == "--ctx") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_ctx = std::stoi(argv[i]);
-        params->custom_n_ctx = true;
-    } else if (arg == "-t" || arg == "--threads") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_threads = std::stoi(argv[i]);
-    } else if (arg == "-b" || arg == "--batch") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_batch = std::stoi(argv[i]);
-    } else if (arg == "--grad-acc") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_gradient_accumulation = std::max(1, std::stoi(argv[i]));
-    } else if (arg == "--sample-start") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->sample_start = std::string(argv[i]);
-    } else if (arg == "--escape") {
-        params->escape = true;
-    } else if (arg == "--include-sample-start") {
-        params->include_sample_start = true;
-    } else if (arg == "--overlapping-samples") {
-        params->overlapping_samples = true;
-    } else if (arg == "--fill-with-next-samples") {
-        params->fill_with_next_samples = true;
-    } else if (arg == "--separate-with-eos") {
-        params->separate_with_eos = true;
-    } else if (arg == "--separate-with-bos") {
-        params->separate_with_bos = true;
-    } else if (arg == "--no-separate-with-eos") {
-        params->separate_with_eos = false;
-    } else if (arg == "--no-separate-with-bos") {
-        params->separate_with_bos = false;
-    } else if (arg == "--sample-random-offsets") {
-        params->sample_random_offsets = true;
-    } else if (arg == "--force-reshuffle") {
-        params->force_reshuffle = true;
-    } else if (arg == "--no-flash") {
-        params->use_flash = false;
-    } else if (arg == "--use-flash") {
-        params->use_flash = true;
-    } else if (arg == "--no-checkpointing") {
-        params->use_checkpointing = false;
-    } else if (arg == "--use-checkpointing") {
-        params->use_checkpointing = true;
-    } else if (arg == "--warmup") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->warmup = std::stoi(argv[i]);
-    } else if (arg == "--cos-decay-steps") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->cos_decay_steps = std::stoi(argv[i]);
-    } else if (arg == "--cos-decay-restart") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->cos_decay_restart = std::stof(argv[i]);
-    } else if (arg == "--cos-decay-min") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->cos_decay_min = std::stof(argv[i]);
-    } else if (arg == "--enable-restart") {
-        params->enable_restart = true;
-    } else if (arg == "--disable-restart") {
-        params->enable_restart = false;
-    } else if (arg == "--opt-past") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->opt_past = std::stoi(argv[i]);
-    } else if (arg == "--opt-delta") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->opt_delta = std::stof(argv[i]);
-    } else if (arg == "--opt-max-no-improvement") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->opt_max_no_improvement = std::stoi(argv[i]);
-    } else if (arg == "--adam-epsf") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_eps_f = std::stof(argv[i]);
-    } else if (arg == "--epochs") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_epochs = std::stoi(argv[i]);
-    } else if (arg == "--adam-iter") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_n_iter = std::stoi(argv[i]);
-    } else if (arg == "--adam-alpha") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_alpha = std::stof(argv[i]);
-    } else if (arg == "--adam-min-alpha") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_min_alpha = std::stof(argv[i]);
-    } else if (arg == "--adam-decay") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_decay = std::stof(argv[i]);
-    } else if (arg == "--adam-decay-min-ndim") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_decay_min_ndim = std::stoi(argv[i]);
-    } else if (arg == "--adam-beta1") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_beta1 = std::stof(argv[i]);
-    } else if (arg == "--adam-beta2") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_beta2 = std::stof(argv[i]);
-    } else if (arg == "--adam-gclip") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_gclip = std::stof(argv[i]);
-    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                *invalid_param = true;
-                return true;
-            }
-            if (llama_supports_gpu_offload()) {
-                params->n_gpu_layers = std::stoi(argv[i]);
-            } else {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-    } else if (arg == "-h" || arg == "--help") {
-        params->print_usage = true;
-        return true;
-    } else {
-        return false;
-    }
-    return true;
-}
-
-void finish_processing_train_args(struct train_params_common * params) {
-    if (params->escape) {
-        string_process_escapes(params->sample_start);
-    }
-}
-
-void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
-    struct train_opt_callback_data * data   = (struct train_opt_callback_data *) vdata;
-    struct train_params_common     * params = data->params;
-    struct train_state             * train  = data->train;
-    struct ggml_opt_context        * opt    = train->opt;
-    int n_batch = params->n_batch;
-    int n_ctx = params->n_ctx;
-
-    if (accum_step == 0) {
-        // time measurement
-        int64_t now = ggml_time_ms();
-        if (now > data->last_time && opt->iter > data->first_iter) {
-            double dt = (double) (now - data->last_time);
-            if (data->millis_per_iter == 0.0) {
-                data->millis_per_iter = dt;
-            } else {
-                const double gain = 0.7;
-                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
-            }
-        }
-
-        double remaining_millis = 0.0;
-        if (data->millis_per_iter > 0.0) {
-            const int n_iter = params->adam_n_iter;
-            const int done_iter = opt->iter - data->first_iter;
-            const int remaining_iter = n_iter - done_iter;
-            remaining_millis = remaining_iter * data->millis_per_iter;
-        }
-
-        // file saving
-        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
-        if (save_now) {
-            int new_iters = opt->iter - data->last_save_iter;
-            train->train_its    += new_iters;
-            train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
-
-            if (data->save_cb) {
-                data->save_cb(data->save_data, train);
-            }
-
-            data->last_save_iter = opt->iter;
-        }
-
-        // exclude file saving from time measurement, by measuring last_time after saving
-        data->last_time = ggml_time_ms();
-
-        *sched = learning_schedule(
-            opt->iter,
-            params->warmup,
-            params->cos_decay_steps,
-            params->adam_alpha,
-            params->adam_min_alpha,
-            params->cos_decay_min,
-            params->cos_decay_restart,
-            params->enable_restart);
-
-        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
-        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
-        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
-            *sched, opt->loss_after);
-
-
-        if (data->millis_per_iter > 0) {
-            printf(" dt=");
-            print_duration(data->millis_per_iter);
-            printf(" eta=");
-            print_duration(remaining_millis);
-        }
-
-        float improvement = opt->loss_before - opt->loss_after;
-        const float plot_scale = 10.0f;
-        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
-        printf(" |");
-        for (int i=0; i<bar_len; ++i) {
-            printf("-");
-        }
-        printf(">");
-        printf("\n");
-    }
-
-    int64_t used_samples = get_example_targets_batch(
-        data->lctx,
-        data->tokens_input,
-        data->target_probs,
-        train->shuffle_next_sample,
-        data->shuffled_samples_offs,
-        data->shuffled_samples_begin,
-        data->shuffled_samples_size,
-        data->samples_count,
-        data->tokens_data,
-        data->tokens_size,
-        params->separate_with_eos,
-        params->separate_with_bos,
-        params->fill_with_next_samples,
-        params->sample_random_offsets);
-
-    train->train_samples += used_samples;
-    train->shuffle_next_sample += used_samples;
-
-    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
-        ++train->train_epochs;
-        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
-        // note: we may have used some samples from the current shuffling more than once
-        train->shuffle_rng_state_current = train->shuffle_rng_state_next;
-        train->shuffle_rng_state_next = shuffle_samples(
-            train->shuffle_rng_state_current,
-            data->shuffled_samples_offs,
-            data->shuffled_samples_begin,
-            data->shuffled_samples_size,
-            data->samples_begin,
-            data->samples_size,
-            data->samples_count);
-        train->shuffle_next_sample = 0;
-    }
-
-    const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs);
-    if (last_epoch_reached) {
-        // allow optimization iteration at last epoch to be completed before canceling
-        if (data->iter_at_last_epoch < 0) {
-            data->iter_at_last_epoch = opt->iter;
-        } else if (opt->iter > data->iter_at_last_epoch) {
-            *cancel = true;
-        }
-    }
-}
diff --git a/common/train.h b/common/train.h
deleted file mode 100644
index 263d940c0..000000000
--- a/common/train.h
+++ /dev/null
@@ -1,233 +0,0 @@
-// Various helper functions and utilities for training
-
-#pragma once
-
-#include <string>
-#include <random>
-#include <vector>
-
-#include "ggml.h"
-#include "llama.h"
-
-#define LLAMA_TRAIN_MAX_NODES 16384
-
-typedef std::string mt19937_state;
-
-struct train_state {
-    struct ggml_opt_context * opt;
-
-    uint64_t train_its;
-    uint64_t train_samples;
-    uint64_t train_tokens;
-    uint64_t train_epochs;
-
-    size_t        shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
-    mt19937_state shuffle_rng_state_current;
-    mt19937_state shuffle_rng_state_next;
-    size_t        shuffle_sample_count;
-    size_t        shuffle_next_sample;
-};
-
-struct train_params_common {
-    const char * fn_train_data;
-    const char * fn_checkpoint_in;
-    const char * fn_checkpoint_out;
-    const char * pattern_fn_it;
-    const char * fn_latest;
-
-    bool print_usage;
-
-    int save_every;
-
-    uint32_t seed;
-
-    int n_ctx;
-    int n_threads;
-    int n_batch;
-    int n_gradient_accumulation;
-    int n_epochs;
-    int n_gpu_layers;
-
-    bool custom_n_ctx;
-
-    bool use_flash;
-    bool use_checkpointing;
-
-    std::string sample_start;
-    bool include_sample_start;
-    bool escape;
-    bool overlapping_samples;
-    bool fill_with_next_samples;
-    bool separate_with_eos;
-    bool separate_with_bos;
-    bool sample_random_offsets;
-
-    bool force_reshuffle;
-
-    int   warmup;
-    int   cos_decay_steps;
-    float cos_decay_restart;
-    float cos_decay_min;
-    bool  enable_restart;
-
-    int   opt_past;
-    float opt_delta;
-    int   opt_max_no_improvement;
-
-    int   adam_n_iter;
-    float adam_alpha;
-    float adam_min_alpha;
-    float adam_decay;
-    int   adam_decay_min_ndim;
-    float adam_beta1;
-    float adam_beta2;
-    float adam_gclip;
-    float adam_eps_f;
-};
-
-typedef void (*save_train_files_callback)(void * data, struct train_state * train);
-
-struct train_opt_callback_data {
-    struct train_params_common * params;
-    struct train_state         * train;
-    save_train_files_callback    save_cb;
-    void                       * save_data;
-    struct llama_context       * lctx;
-    int                          last_save_iter;
-    llama_token                * tokens_data;
-    size_t                       tokens_size;
-    size_t                     * samples_begin;
-    size_t                     * samples_size;
-    size_t                     * shuffled_samples_offs;
-    size_t                     * shuffled_samples_begin;
-    size_t                     * shuffled_samples_size;
-    size_t                       samples_count;
-    struct ggml_tensor         * tokens_input;
-    struct ggml_tensor         * target_probs;
-    int                          first_iter;
-    int                          first_epoch;
-    int                          iter_at_last_epoch;
-    int64_t                      last_time;
-    double                       millis_per_iter;
-};
-
-struct train_state * init_train_state();
-void free_train_state(struct train_state  * state);
-
-struct train_params_common get_default_train_params_common();
-void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
-
-bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
-void finish_processing_train_args(struct train_params_common * params);
-
-struct random_normal_distribution;
-struct random_uniform_distribution;
-
-struct random_normal_distribution  * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
-struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
-
-void free_random_normal_distribution (struct random_normal_distribution  * rnd);
-void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
-
-struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
-struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
-
-// generate random float in interval [0,1)
-float frand();
-float frand_normal (struct random_normal_distribution * rnd);
-float frand_uniform(struct random_uniform_distribution * rnd);
-
-int   clamp (const int v, const int min, const int max);
-float fclamp(const float v, const float min, const float max);
-
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
-
-size_t tokenize_file(
-        struct llama_context     * lctx,
-        const char               * filename,
-        const std::string        & sample_start,
-        bool                       include_sample_start,
-        bool                       overlapping_samples,
-        unsigned                   context_length,
-        std::vector<llama_token> & out_tokens,
-        std::vector<size_t>      & out_samples_begin,
-        std::vector<size_t>      & out_samples_size);
-
-int64_t get_example_targets_batch(
-        struct llama_context * lctx,
-        struct ggml_tensor   * tokens_input,
-        struct ggml_tensor   * target_probs,
-        int64_t                example_id,
-        const size_t         * samples_offs,
-        const size_t         * samples_begin,
-        const size_t         * samples_size,
-              size_t           samples_count,
-        const llama_token    * train_data,
-        size_t                 n_train_data,
-        bool                   separate_with_eos,
-        bool                   separate_with_bos,
-        bool                   fill_with_next_samples,
-        bool                   sample_random_offsets);
-
-
-void          mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
-mt19937_state mt19937_get_state(const std::mt19937& rng);
-mt19937_state mt19937_seed_to_state(unsigned seed);
-
-mt19937_state shuffle_samples(
-        const mt19937_state & rng_state,
-        size_t              * shuffled_offs,
-        size_t              * shuffled_begins,
-        size_t              * shuffled_sizes,
-        const size_t        * begins,
-        const size_t        * sizes,
-        size_t                count);
-
-size_t hash_combine(size_t h1, size_t h2);
-
-size_t compute_samples_hash(
-    const char* fn,
-    const size_t* samples_begin,
-    const size_t* samples_size,
-    size_t sample_count);
-
-
-std::string replace_str(const char * s, const char * needle, const char * replacement);
-
-void print_duration(double milliseconds);
-
-float cosine_decay(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum);
-
-float cosine_decay_restart(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum,
-    float   restart_step_mult);
-
-float learning_schedule(
-    int64_t step,
-    int64_t warmup_steps,
-    int64_t decay_steps,
-    float   learning_rate,
-    float   overall_minimum,
-    float   cos_decay_minimum,
-    float   cos_decay_restart_step_mult,
-    bool    enable_restart);
-
-void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
-
-void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
-void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
-
-bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
-void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
-
-std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
-
-void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 6df318c19..d63a96c1c 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(cvector-generator)
-    add_subdirectory(baby-llama)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
     add_subdirectory(convert-llama2c-to-ggml)
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
deleted file mode 100644
index 71b82105c..000000000
--- a/examples/baby-llama/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-baby-llama)
-add_executable(${TARGET} baby-llama.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
deleted file mode 100644
index 3ce91070b..000000000
--- a/examples/baby-llama/baby-llama.cpp
+++ /dev/null
@@ -1,1639 +0,0 @@
-#include "ggml.h"
-#include "train.h"
-
-#include <cassert>
-#include <cstdlib>
-#include <cstring>
-#include <random>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#ifdef LLAMA_DEFAULT_RMS_EPS
-constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-#else
-constexpr float rms_norm_eps = 5e-6f;
-#endif
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static struct ggml_tensor * randomize_tensor(
-    struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
-) {
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return tensor;
-}
-
-struct llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
-    uint32_t n_embd  = 4096;
-    uint32_t n_mult  = 4;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-
-    bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
-    }
-};
-
-static uint32_t get_n_ff(const struct llama_hparams* hparams) {
-    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
-    return n_ff;
-}
-
-struct llama_hparams_lora {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
-    uint32_t n_embd  = 4096;
-    uint32_t n_mult  = 4;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-    uint32_t n_lora  = 64;
-
-    bool operator!=(const llama_hparams_lora & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
-    }
-};
-
-struct llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
-};
-
-struct llama_layer_lora {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wqa;
-    struct ggml_tensor * wqb;
-    struct ggml_tensor * wka;
-    struct ggml_tensor * wkb;
-    struct ggml_tensor * wva;
-    struct ggml_tensor * wvb;
-    struct ggml_tensor * woa;
-    struct ggml_tensor * wob;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
-};
-
-
-struct llama_kv_cache {
-    struct ggml_context * ctx = NULL;
-
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    // llama_ctx_buffer buf;
-
-    int n; // number of tokens currently in the cache
-};
-
-struct llama_model {
-    struct ggml_context * ctx = NULL;
-
-    llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<llama_layer> layers;
-};
-
-struct llama_model_lora {
-    struct ggml_context * ctx = NULL;
-
-    llama_hparams_lora hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * outputa;
-    struct ggml_tensor * outputb;
-
-    std::vector<llama_layer_lora> layers;
-};
-
-static void init_model(struct llama_model * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-
-    const uint32_t n_ff = get_n_ff(&hparams);
-
-    struct ggml_context * ctx = model->ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
-    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight",         {n_embd, n_vocab});
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        // std::string layers_i = "layers." + std::to_string(i);
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
-
-        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
-        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);       // (layers_i + ".ffn_norm.weight", {n_embd});
-
-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);     // (layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
-    }
-}
-
-
-static void init_model_lora(struct llama_model_lora * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_mult  = hparams.n_mult;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_lora  = hparams.n_lora;
-
-    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
-
-    struct ggml_context * ctx = model->ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
-    model->outputa        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight",         {n_embd, n_vocab});
-    model->outputb        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,  n_lora); // ("output.weight",         {n_embd, n_vocab});
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        // std::string layers_i = "layers." + std::to_string(i);
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
-
-        layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
-        layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
-        layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
-        layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
-        layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
-        layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
-        layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
-        layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);       // (layers_i + ".ffn_norm.weight", {n_embd});
-
-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);     // (layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
-    }
-}
-
-static void set_param_model(struct llama_model * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
-}
-
-static void set_param_model_lora(struct llama_model_lora * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->outputa);
-    ggml_set_param(ctx, model->outputb);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wqa);
-        ggml_set_param(ctx, layer.wqb);
-        ggml_set_param(ctx, layer.wka);
-        ggml_set_param(ctx, layer.wkb);
-        ggml_set_param(ctx, layer.wva);
-        ggml_set_param(ctx, layer.wvb);
-        ggml_set_param(ctx, layer.woa);
-        ggml_set_param(ctx, layer.wob);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
-}
-
-static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings , rnd);
-    randomize_tensor_normal(model->norm           , rnd);
-    randomize_tensor_normal(model->output         , rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wq, rnd);
-        randomize_tensor_normal(layer.wk, rnd);
-        randomize_tensor_normal(layer.wv, rnd);
-        randomize_tensor_normal(layer.wo, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.w1, rnd);
-        randomize_tensor_normal(layer.w2, rnd);
-        randomize_tensor_normal(layer.w3, rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-
-static void randomize_model_lora(
-    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
-) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings, rnd);
-    randomize_tensor_normal(model->norm          , rnd);
-    randomize_tensor_normal(model->outputa       , rnd);
-    randomize_tensor_normal(model->outputb       , rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wqa, rnd);
-        randomize_tensor_normal(layer.wqb, rnd);
-        randomize_tensor_normal(layer.wka, rnd);
-        randomize_tensor_normal(layer.wkb, rnd);
-        randomize_tensor_normal(layer.wva, rnd);
-        randomize_tensor_normal(layer.wvb, rnd);
-        randomize_tensor_normal(layer.woa, rnd);
-        randomize_tensor_normal(layer.wob, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.w1, rnd);
-        randomize_tensor_normal(layer.w2, rnd);
-        randomize_tensor_normal(layer.w3, rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_ctx   = hparams.n_ctx;
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = n_layer*n_ctx*n_batch;
-    const int64_t n_elements = n_embd*n_mem;
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-
-    // struct ggml_init_params params;
-    // params.mem_size   = cache.buf.size;
-    // params.mem_buffer = cache.buf.addr;
-    // params.no_alloc   = false;
-    if (!cache->ctx) {
-        struct ggml_init_params params;
-        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
-        params.mem_buffer = NULL;
-        params.no_alloc   = false;
-
-        cache->ctx = ggml_init(params);
-
-        if (!cache->ctx) {
-            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            exit(1);
-        }
-    }
-
-    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-}
-
-static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_ctx   = hparams.n_ctx;
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = n_layer*n_ctx*n_batch;
-    const int64_t n_elements = n_embd*n_mem;
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-
-    // struct ggml_init_params params;
-    // params.mem_size   = cache.buf.size;
-    // params.mem_buffer = cache.buf.addr;
-    // params.no_alloc   = false;
-    if (!cache->ctx) {
-        struct ggml_init_params params;
-        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
-        params.mem_buffer = NULL;
-        params.no_alloc   = false;
-
-        cache->ctx = ggml_init(params);
-
-        if (!cache->ctx) {
-            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            return false;
-        }
-    }
-
-    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-
-    return true;
-}
-
-static struct ggml_tensor * forward(
-    struct llama_model    * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context   * ctx0,
-    struct ggml_cgraph    * gf,
-    struct ggml_tensor    * tokens_input,
-    const  int              n_tokens,
-    const  int              n_past
-) {
-    const int N = n_tokens;
-
-    struct llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
-
-    // inpL shape [n_embd,N,1,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // lctx.use_buf(ctx0, 0);
-
-        // norm
-        {
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [n_embd, N, 1, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
-
-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
-                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
-                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Q shape    [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-            // K shape [n_embd/n_head, n_past + N, n_head, 1]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // split cached V into n_head heads
-            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
-            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(vc),
-                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(vc)*n_embd);
-
-            // KQV shape [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
-                    cur);
-        }
-
-        // lctx.use_buf(ctx0, 1);
-
-        // inpFF shape [n_embd,N,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-            }
-
-            // tmp shape [n_ff,N,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-
-            // SILU activation
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_silu(ctx0, cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-        }
-
-        // cur shape [n_embd,N,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        // inpL shape [n_embd,N,1,1]
-        inpL = cur;
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        //embeddings = inpL;
-    }
-
-    // lm_head
-    // inpL shape [n_vocab,N,1,1]
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-static struct ggml_tensor * forward_batch(
-    struct llama_model    * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context   * ctx0,
-    struct ggml_cgraph    * gf,
-    struct ggml_tensor    * tokens_input,
-    const  int              n_tokens,
-    const  int              n_past,
-    const  int              n_batch
-) {
-    const int N = n_tokens;
-
-    struct llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_vocab = hparams.n_vocab;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-    const int n_ff    = get_n_ff(&hparams);
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
-    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
-
-    // inpL shape [n_embd,N*n_batch,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    assert_shape_2d(inpL, n_embd, N*n_batch);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // lctx.use_buf(ctx0, 0);
-
-        // norm
-        {
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
-            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
-            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
-            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [N, n_embd, n_batch, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_mul_mat(ctx0,
-                                model->layers[il].wv,
-                                cur),
-                        n_embd, N, n_batch),
-                        1, 0, 2, 3));
-
-                assert_shape_3d(Vcur, N, n_embd, n_batch);
-
-                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
-                // k         shape [n_embd * N, n_batch]   == kv_self.k[:,n_past:n_past+N,:,il]
-                // v         shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_2d(ctx0, kc,
-                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
-                        ggml_element_size(kc)*n_embd*n_ctx,
-                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
-                vc = ggml_set_2d(ctx0, vc,
-                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
-                        ggml_element_size(vc)*n_ctx*n_embd,
-                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));
-
-                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
-                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
-            // Q shape    [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
-
-            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_4d(ctx0,
-                            ggml_view_3d(ctx0,
-                                kc,
-                                n_embd,
-                                (n_past + N),
-                                n_batch,
-                                n_embd*ggml_element_size(kc),
-                                n_ctx*n_embd*ggml_element_size(kc),
-                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
-                            n_embd/n_head, n_head, n_past + N, n_batch),
-                        0, 2, 1, 3);
-            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
-            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
-
-            // split cached V into n_head heads
-            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
-            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
-            struct ggml_tensor * V =
-                ggml_view_4d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head, n_batch,
-                        ggml_element_size(vc)*n_ctx,
-                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
-                        ggml_element_size(vc)*n_ctx*n_embd,
-                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
-            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);
-
-            // KQV shape [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // lctx.use_buf(ctx0, 1);
-
-        // inpFF shape [n_embd,N*n_batch,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-        assert_shape_2d(inpFF, n_embd, N*n_batch);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-            }
-
-            // tmp shape [n_ff,N*n_batch,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-            assert_shape_2d(tmp, n_ff, N*n_batch);
-
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // SILU activation
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_silu(ctx0, cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // cur shape [n_embd,N*n_batch,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-        assert_shape_2d(cur, n_embd, N*n_batch);
-
-        // input for next layer
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = cur;
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        //embeddings = inpL;
-    }
-
-    // lm_head
-    // inpL shape [n_vocab,N*n_batch,1,1]
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
-    assert_shape_2d(inpL, n_vocab, N*n_batch);
-
-    {
-        // inpL shape [n_vocab,N,n_batch,1]
-        inpL = ggml_reshape_3d(ctx0,
-                        inpL,
-                        n_vocab, N, n_batch);
-        assert_shape_3d(inpL, n_vocab, N, n_batch);
-    }
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-static struct ggml_tensor * forward_lora(
-    struct llama_model_lora * model,
-    struct llama_kv_cache   * cache,
-    struct ggml_context     * ctx0,
-    struct ggml_cgraph      * gf,
-    struct ggml_tensor      * tokens_input,
-    const  int                n_tokens,
-    const  int                n_past
-) {
-    const int N = n_tokens;
-
-    struct llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-
-    const int n_ctx   = hparams.n_ctx;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
-
-    // inpL shape [n_embd,N,1,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0,
-                                            ggml_reshape_3d(ctx0,
-                                                ggml_mul_mat(ctx0,
-                                                    model->layers[il].wqa,
-                                                    ggml_mul_mat(ctx0,
-                                                        model->layers[il].wqb,
-                                                        cur)),
-                                                n_embd/n_head, n_head, N),
-                                            KQ_pos, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0,
-                                            ggml_reshape_3d(ctx0,
-                                                ggml_mul_mat(ctx0,
-                                                    model->layers[il].wka,
-                                                    ggml_mul_mat(ctx0,
-                                                        model->layers[il].wkb,
-                                                        cur)),
-                                                n_embd/n_head, n_head, N),
-                                            KQ_pos, n_rot, 0);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [n_embd, N, 1, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0,
-                                                ggml_transpose(ctx0,
-                                                    ggml_reshape_2d(ctx0,
-                                                        ggml_mul_mat(ctx0,
-                                                            model->layers[il].wva,
-                                                            ggml_mul_mat(ctx0,
-                                                                model->layers[il].wvb,
-                                                                cur)),
-                                                        n_embd, N)));
-
-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
-                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
-                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Q shape    [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-            // K shape [n_embd/n_head, n_past + N, n_head, 1]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // split cached V into n_head heads
-            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
-            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(vc),
-                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(vc)*n_embd);
-
-            // KQV shape [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].woa,
-                    ggml_mul_mat(ctx0,
-                        model->layers[il].wob,
-                        cur));
-        }
-
-        // inpFF shape [n_embd,N,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-            }
-
-            // tmp shape [n_ff,N,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-
-            // SILU activation
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_silu(ctx0, cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-        }
-
-        // cur shape [n_embd,N,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        // inpL shape [n_embd,N,1,1]
-        inpL = cur;
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        //embeddings = inpL;
-    }
-
-
-    // lm_head
-    // inpL shape [n_vocab,N,1,1]
-    inpL = ggml_mul_mat(ctx0,
-                model->outputa,
-                    ggml_mul_mat(ctx0,
-                        model->outputb,
-                        inpL));
-
-    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(ggml_is_matrix(logits));
-    assert(ggml_is_matrix(probs));
-    assert(ggml_is_vector(best_samples));
-    assert(logits->ne[1] == best_samples->ne[0]);
-    assert(logits->ne[0] == probs->ne[0]);
-    assert(logits->ne[1] == probs->ne[1]);
-    for (int i = 0; i < logits->ne[1]; ++i) {
-        float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
-        ggml_set_i32_1d(best_samples, i, 0);
-        for (int k = 0; k < logits->ne[0]; ++k) {
-            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
-            if (logit > max_logit) {
-                max_logit = logit;
-                ggml_set_i32_1d(best_samples, i, k);
-            }
-        }
-        float psum = 0;
-        for (int k = 0; k < logits->ne[0]; ++k) {
-            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
-            float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
-            psum += p;
-            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
-        }
-        for (int k = 0; k < logits->ne[0]; ++k) {
-            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
-            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
-        }
-    }
-}
-
-static void sample_softmax_batch(
-    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
-    struct ggml_tensor * best_samples
-) {
-    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(ggml_is_3d(probs));
-    int n_tokens = best_samples->ne[0];
-    int n_batch  = best_samples->ne[1];
-    int n_vocab  = logits->ne[0];
-    GGML_ASSERT(n_tokens == logits->ne[1]);
-    GGML_ASSERT(n_batch  == logits->ne[2]);
-    GGML_ASSERT(n_vocab  == probs->ne[0]);
-    GGML_ASSERT(n_tokens == probs->ne[1]);
-    GGML_ASSERT(n_batch  == probs->ne[2]);
-
-    for (int k = 0; k < n_batch; ++k) {
-        struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
-                                                best_samples,
-                                                best_samples->ne[0],
-                                                k*best_samples->nb[1]);
-        struct ggml_tensor * logits_k       = ggml_view_2d(ctx,
-                                                logits,
-                                                logits->ne[0],
-                                                logits->ne[1],
-                                                logits->nb[1],
-                                                k*logits->nb[2]);
-        struct ggml_tensor * probs_k        = ggml_view_2d(ctx,
-                                                probs,
-                                                probs->ne[0],
-                                                probs->ne[1],
-                                                probs->nb[1],
-                                                k*probs->nb[2]);
-        sample_softmax(logits_k, probs_k, best_samples_k);
-    }
-}
-
-static void print_row(struct ggml_tensor * probs, int i) {
-    for (int k = 0; k < probs->ne[0]; ++k) {
-        float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
-        printf(" %.2f", p);
-    }
-    printf("\n");
-}
-
-static void print_matrix(struct ggml_tensor * probs) {
-    assert(ggml_is_matrix(probs));
-    for (int i = 0; i < probs->ne[1]; ++i) {
-        for (int k = 0; k < probs->ne[0]; ++k) {
-            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
-            printf(" %.2f", p);
-        }
-        printf("\n");
-    }
-}
-
-static void print_token(int token, int n_vocab) {
-    for (int k = 0; k < token; ++k) {
-        printf(" ");
-    }
-    printf("X");
-    for (int k = token+1; k < n_vocab; ++k) {
-        printf(" ");
-    }
-    printf("\n");
-}
-
-static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
-    for (int i=0; i<tokens->ne[0]; ++i) {
-        int token = ggml_get_i32_1d(tokens, i);
-        print_token(token, n_vocab);
-    }
-}
-
-static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
-    int n_tokens = tokens_input->ne[0];
-    int n_vocab = targets->ne[0];
-    float randomness = 0.0f;
-    // ggml_set_zero(targets);
-    ggml_set_f32(targets, -1.0f);
-    ggml_set_i32_1d(tokens_input, 0, 0);
-    for (int i=1; i<n_tokens+1; ++i) {
-        float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
-        float y = sinf(x);//*cosf(x*1.1f+1.0f);
-        float z = (y+1.0f)*0.5f; // scale to [0..1]
-        z += (frand()-0.5f)*(randomness/n_vocab);
-        z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
-        int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
-        ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
-        if (i<n_tokens) {
-            ggml_set_i32_1d(tokens_input, i, token);
-        }
-    }
-}
-
-static void get_example_targets_batch(
-    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
-) {
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(targets));
-    int n_tokens = tokens_input->ne[0];
-    int n_batch  = tokens_input->ne[1];
-    GGML_ASSERT(n_tokens == targets->ne[1]);
-    GGML_ASSERT(n_batch  == targets->ne[2]);
-
-    for (int k=0; k<n_batch; ++k) {
-        struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
-                                                tokens_input,
-                                                tokens_input->ne[0],
-                                                k*tokens_input->nb[1]);
-        struct ggml_tensor * targets_k    = ggml_view_2d(ctx,
-                                                targets,
-                                                targets->ne[0],
-                                                targets->ne[1],
-                                                targets->nb[1],
-                                                k*targets->nb[2]);
-        get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
-    }
-}
-
-static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
-    int n_tokens = tokens_input->ne[0];
-    int n_vocab = targets->ne[0];
-    for (int i=0; i<n_tokens-n_shift; ++i) {
-        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
-        for (int k=0; k<n_vocab; ++k) {
-            ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
-        }
-    }
-}
-
-static struct ggml_tensor * square_error_loss(
-    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
-) {
-    // todo: instead of a-b: a[1:]-b[:-1]
-    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
-}
-
-static struct ggml_tensor * cross_entropy_loss(
-    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
-) {
-    const float eps = 1e-3f;
-    return
-        ggml_sum(ctx,
-            ggml_neg(ctx,
-                ggml_sum_rows(ctx,
-                    ggml_mul(ctx,
-                        ggml_soft_max(ctx, a),
-                        ggml_log(ctx,
-                            ggml_add1(ctx,
-                                ggml_soft_max(ctx, b),
-                                ggml_new_f32(ctx, eps)))))));
-}
-
-int main(int argc, char ** argv) {
-    if (argc < 1) {
-        fprintf(stderr, "usage: %s\n", argv[0]);
-
-        return 1;
-    }
-
-    struct ggml_init_params lcparams;
-    lcparams.mem_size   = 1024ll*1024ll*1024ll;
-    lcparams.mem_buffer = NULL;
-    lcparams.no_alloc   = false;
-
-    struct llama_model model;
-    model.hparams.n_vocab = 8;
-    model.hparams.n_ctx   = 8;
-    model.hparams.n_embd  = 32;
-    model.hparams.n_mult  = 2;
-    model.hparams.n_head  = 8;
-    model.hparams.n_layer = 1;
-    model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
-
-    // model.hparams.n_embd  = 32;
-    // model.hparams.n_mult  = 2;
-    // model.hparams.n_head  = 4;
-    // model.hparams.n_layer = 8;
-    // model.hparams.n_rot   = 8;
-
-    model.ctx = ggml_init(lcparams);
-    printf("init model\n");
-    init_model(&model);
-    set_param_model(&model);
-
-    randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
-
-/*
-    struct llama_model_lora model_lora;
-    // model.hparams.n_vocab = 6;
-    // model.hparams.n_ctx   = 64;
-    // model.hparams.n_embd  = 128;
-    // model.hparams.n_mult  = 2;
-    // model.hparams.n_head  = 8;
-    // model.hparams.n_layer = 6;
-    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
-
-    model_lora.hparams.n_vocab = 16;
-    model_lora.hparams.n_ctx   = 32;
-    model_lora.hparams.n_embd  = 256;
-    model_lora.hparams.n_mult  = 2;
-    model_lora.hparams.n_head  = 16;
-    model_lora.hparams.n_layer = 1;
-    model_lora.hparams.n_lora  = 64;
-    model_lora.hparams.n_rot   = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
-    // model.hparams.n_rot   = (model.hparams.n_embd / model.hparams.n_head) / 2;
-
-    // model.hparams.n_embd  = 32;
-    // model.hparams.n_mult  = 2;
-    // model.hparams.n_head  = 4;
-    // model.hparams.n_layer = 8;
-    // model.hparams.n_rot   = 8;
-
-    model_lora.ctx = ggml_init(lcparams);
-    printf("init model_lora\n");
-    init_model_lora(&model_lora);
-    set_param_model_lora(&model_lora);
-
-    randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
-*/
-    int n_batch = 8;
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
-    printf("init_kv_cache\n");
-    kv_self.ctx = model.ctx;
-    init_kv_cache(&kv_self, &model, n_batch);
-    //init_kv_cache_lora(&kv_self, &model_lora);
-
-    size_t    compute_size = 1024ll*1024ll*1024ll;
-    uint8_t * compute_addr = new uint8_t[compute_size];
-
-    int n_examples = 256;
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-
-    std::vector<uint8_t> work_buffer;
-
-    for (int ex=0; ex<n_examples; ++ex) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ compute_size,
-            /*.mem_buffer =*/ compute_addr,
-            /*.no_alloc   =*/ false,
-        };
-
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        struct ggml_tensor * after_opt_best_samples  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        struct ggml_tensor * after_opt_probs         = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-        struct ggml_tensor * tokens_input            = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        struct ggml_tensor * targets                 = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-
-        int n_past = 0;
-
-        struct ggml_cgraph * gf = NULL;
-        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
-
-        get_example_targets_batch(ctx0, 64*ex+0,  tokens_input, targets);
-
-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
-        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
-        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
-
-        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
-
-        float error_before_opt = ggml_get_f32_1d(e, 0);
-
-        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
-        opt_params_lbfgs.print_forward_graph = false;
-        opt_params_lbfgs.print_backward_graph = false;
-        opt_params_lbfgs.lbfgs.n_iter = 16;
-        ggml_opt(ctx0, opt_params_lbfgs, e);
-        //
-        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
-
-        float error_after_opt = ggml_get_f32_1d(e, 0);
-
-        if (ex % 8 == 0) {
-            printf("Example %d\n", (ex+1));
-            printf("error_before_opt: %.2f\n", error_before_opt);
-            printf("error_after_opt:  %.2f\n", error_after_opt);
-        }
-
-        if (ex % 64 == 0) {
-            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
-            // printf("probabilities after optimization:\n");
-            // print_matrix(after_opt_probs);
-            printf("best samples after optimization:\n");
-            print_tokens(after_opt_best_samples, n_vocab);
-        }
-
-        ggml_free(ctx0);
-    }
-
-    {
-        int n_gen = 128;
-        int sample_ctx = n_tokens-n_tokens/8;
-
-        printf("Generating %d tokens.\n", n_gen);
-
-        struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
-        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
-
-        get_example_targets(137, tokens_input, targets);
-        for (int i=sample_ctx; i<n_tokens; ++i) {
-            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
-        }
-
-        for (int i=0; i<sample_ctx-1; ++i) {
-            print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
-        }
-        printf("---\n");
-        for (int i=0; i<n_gen; ++i) {
-            struct ggml_init_params params = {
-                /*.mem_size   =*/ compute_size,
-                /*.mem_buffer =*/ compute_addr,
-                /*.no_alloc   =*/ false,
-            };
-            struct ggml_context * ctx0 = ggml_init(params);
-
-            struct ggml_cgraph * gf = NULL;
-            gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
-
-            int n_past = 0;
-            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
-
-            ggml_build_forward_expand(gf, logits);
-            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
-
-            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
-            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
-
-            sample_softmax(logits, probs, best_samples);
-
-            // int sample_at = n_tokens-1;
-            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
-
-            // print_row(probs, sample_at);
-            print_token(token, n_vocab);
-
-            lshift_examples(tokens_input, targets, 1);
-            ggml_set_i32_1d(tokens_input, 0, 0);
-            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
-
-            ggml_free(ctx0);
-        }
-    }
-
-    print_matrix(model.tok_embeddings);
-    printf("done\n");
-
-    // ggml_free(kv_self.ctx);
-    // ggml_free(model_lora.ctx);
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 14e02c8dd..aae49c965 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -4,6 +4,7 @@
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 8354e37e5..5fe70dac7 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -1,3 +1,5 @@
+#include "ggml-cpu.h"
+
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index c11eb4183..125413d1b 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -305,27 +305,10 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    // Create a backend buffer from an existing pointer
+    // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
-    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
new file mode 100644
index 000000000..7f1ee7573
--- /dev/null
+++ b/ggml/include/ggml-cpu.h
@@ -0,0 +1,150 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    // Scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // Threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+        struct ggml_threadpool * threadpool;
+
+        // abort ggml_graph_compute when true
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
+    };
+
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
+    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
+    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
+    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                    struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
+    // TODO: move to backend interface
+    GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_sve        (void);
+    GGML_API int ggml_cpu_has_matmul_int8(void);
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
+
+    // Internal types and functions exposed for tests and benchmarks
+
+    typedef void (*ggml_from_float_to_mat_t)
+                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
+    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                       const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_gemv_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                       const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                       const void * GGML_RESTRICT y, int nr, int nc);
+
+    struct ggml_type_traits_cpu {
+        ggml_from_float_to_mat_t from_float_to_mat;
+        ggml_vec_dot_t           vec_dot;
+        enum ggml_type           vec_dot_type;
+        int64_t                  nrows; // number of rows to process simultaneously
+        int64_t                  ncols; // number of columns to process simultaneously
+        ggml_gemv_t              gemv;
+        ggml_gemm_t              gemm;
+    };
+
+    GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+
+    GGML_API void ggml_cpu_init(void);
+
+    //
+    // CPU backend
+    //
+
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 2d93f31fa..8a0bcbff8 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -573,6 +573,13 @@ extern "C" {
         GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
+    struct ggml_init_params {
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
+    };
+
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
@@ -618,59 +625,6 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);
 
-    // Scheduling priorities
-    enum ggml_sched_priority {
-        GGML_SCHED_PRIO_NORMAL,
-        GGML_SCHED_PRIO_MEDIUM,
-        GGML_SCHED_PRIO_HIGH,
-        GGML_SCHED_PRIO_REALTIME
-    };
-
-    // Threadpool params
-    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-    struct ggml_threadpool_params {
-        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-        int                 n_threads;                   // number of threads
-        enum ggml_sched_priority prio;                   // thread priority
-        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
-        bool                strict_cpu;                  // strict cpu placement
-        bool                paused;                      // start in paused state
-    };
-
-    struct ggml_threadpool;     // forward declaration, see ggml.c
-
-    typedef struct ggml_threadpool * ggml_threadpool_t;
-
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
-    struct ggml_init_params {
-        // memory pool
-        size_t mem_size;   // bytes
-        void * mem_buffer; // if NULL, memory will be allocated internally
-        bool   no_alloc;   // don't allocate memory for the tensor data
-    };
-
-    // numa strategies
-    enum ggml_numa_strategy {
-        GGML_NUMA_STRATEGY_DISABLED   = 0,
-        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
-        GGML_NUMA_STRATEGY_ISOLATE    = 2,
-        GGML_NUMA_STRATEGY_NUMACTL    = 3,
-        GGML_NUMA_STRATEGY_MIRROR     = 4,
-        GGML_NUMA_STRATEGY_COUNT
-    };
 
     //
     // GUID
@@ -693,9 +647,6 @@ extern "C" {
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
 
-    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
@@ -797,8 +748,7 @@ extern "C" {
             int64_t ne2,
             int64_t ne3);
 
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+    GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
 
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@@ -808,35 +758,25 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
-    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
     // Converts a flat index into coordinates
-    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+    GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
 
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
     GGML_ATTRIBUTE_FORMAT(2, 3)
     GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
 
+    // Tensor flags
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
+
     //
     // operations on tensors with backpropagation
     //
@@ -2052,9 +1992,6 @@ extern "C" {
     // automatic differentiation
     //
 
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
-
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
 
@@ -2086,27 +2023,6 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
-                    struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
     GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
@@ -2277,6 +2193,8 @@ extern "C" {
         } lbfgs;
     };
 
+    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+
     GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 
     // optimize the function defined by the tensor f
@@ -2308,12 +2226,6 @@ extern "C" {
             ggml_opt_callback callback,
             void * callback_data);
 
-    //
-    // tensor flags
-    //
-    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-
     //
     // quantization
     //
@@ -2482,8 +2394,6 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_amx_int8   (void);
     GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_sve        (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
     GGML_API int ggml_cpu_has_metal      (void);
     GGML_API int ggml_cpu_has_f16c       (void);
@@ -2500,17 +2410,9 @@ extern "C" {
     GGML_API int ggml_cpu_has_sycl       (void);
     GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
-    GGML_API int ggml_cpu_has_matmul_int8(void);
     GGML_API int ggml_cpu_has_cann       (void);
     GGML_API int ggml_cpu_has_llamafile  (void);
 
-    // get the sve vector length in bytes
-    GGML_API int ggml_cpu_get_sve_cnt(void);
-
-    //
-    // Internal types and functions exposed for tests and benchmarks
-    //
-
 #ifdef  __cplusplus
 // restrict not standard in C++
 #define GGML_RESTRICT
@@ -2519,14 +2421,6 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-    typedef void (*ggml_from_float_to_mat_t)
-                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
-    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                       const void * GGML_RESTRICT y, size_t by, int nrc);
-    typedef void (*ggml_gemv_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                       const void * GGML_RESTRICT y, int nr, int nc);
-    typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                       const void * GGML_RESTRICT y, int nr, int nc);
 
     struct ggml_type_traits {
         const char             * type_name;
@@ -2537,13 +2431,6 @@ extern "C" {
         ggml_to_float_t          to_float;
         ggml_from_float_t        from_float;
         ggml_from_float_t        from_float_ref;
-        ggml_from_float_to_mat_t from_float_to_mat;
-        ggml_vec_dot_t           vec_dot;
-        enum ggml_type           vec_dot_type;
-        int64_t                  nrows; // number of rows to process simultaneously
-        int64_t                  ncols; // number of columns to process simultaneously
-        ggml_gemv_t              gemv;
-        ggml_gemm_t              gemm;
     };
 
     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0764a8d90..82b81cf12 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1366,10 +1366,12 @@ endif()
 
 add_library(ggml
             ../include/ggml.h
+            ../include/ggml-cpu.h
             ../include/ggml-alloc.h
             ../include/ggml-backend.h
             ../include/ggml-cpp.h
             ggml.c
+            ggml-cpu.c
             ggml-alloc.c
             ggml-backend.cpp
             ggml-quants.c
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index eb30f8944..81f62ff4f 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -7,6 +7,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
+#include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
 
 #include <math.h>
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c2afdf391..0b8ebac53 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -8,6 +8,7 @@
 #include <windows.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -566,6 +567,8 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-kompute.h"
 #endif
 
+#include "ggml-cpu.h"
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -713,616 +716,6 @@ ggml_backend_t ggml_backend_init_best(void) {
     return ggml_backend_dev_init(dev, NULL);
 }
 
-// CPU backend - buffer
-
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
-    uintptr_t data = (uintptr_t)buffer->context;
-
-    // align the buffer
-    if (data % TENSOR_ALIGNMENT != 0) {
-        data = GGML_PAD(data, TENSOR_ALIGNMENT);
-    }
-
-    return (void *)data;
-}
-
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_aligned_free(buffer->context, buffer->size);
-}
-
-static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    memset((char *)tensor->data + offset, value, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    memcpy((char *)tensor->data + offset, data, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        memcpy(dst->data, src->data, ggml_nbytes(src));
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
-    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cpu_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
-    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cpu_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// CPU backend - buffer type
-
-static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * data = ggml_aligned_malloc(size);
-
-    if (data == NULL) {
-        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
-}
-
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
-
-    GGML_UNUSED(buft);
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
-        /* .iface   = */ {
-            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type;
-}
-
-static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_Mapped";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
-        /* .iface   = */ {
-            /* .get_name         = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type;
-}
-
-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buft);
-}
-
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context  = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
-    static ggml_backend_buffer_type_t bufts[] = {
-#ifdef GGML_USE_CPU_HBM
-        ggml_backend_cpu_hbm_buffer_type(),
-#endif
-        NULL
-    };
-
-    return bufts;
-
-    GGML_UNUSED(device);
-}
-
-// CPU backend - backend (stream)
-
-struct ggml_backend_cpu_context {
-    int                 n_threads;
-    ggml_threadpool_t   threadpool;
-
-    uint8_t *           work_data;
-    size_t              work_size;
-
-    ggml_abort_callback abort_callback;
-    void *              abort_callback_data;
-};
-
-static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
-    return "CPU";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-    delete[] cpu_ctx->work_data;
-    delete cpu_ctx;
-    delete backend;
-}
-
-struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
-    struct ggml_cgraph cgraph;
-};
-
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
-
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
-        if (cpu_plan->cplan.work_data == NULL) {
-            delete cpu_plan;
-            return NULL;
-        }
-    }
-
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return cpu_plan;
-}
-
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    delete[] cpu_plan->cplan.work_data;
-    delete cpu_plan;
-
-    GGML_UNUSED(backend);
-}
-
-static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
-    GGML_UNUSED(backend);
-}
-
-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-
-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        cpu_ctx->work_data = new uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return ggml_graph_compute(cgraph, &cplan);
-}
-
-static const struct ggml_backend_i ggml_backend_cpu_i = {
-    /* .get_name                = */ ggml_backend_cpu_get_name,
-    /* .free                    = */ ggml_backend_cpu_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_cpu_guid(void) {
-    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
-    return &guid;
-}
-
-ggml_backend_t ggml_backend_cpu_init(void) {
-    struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
-    ctx->threadpool          = NULL;
-    ctx->work_data           = NULL;
-    ctx->work_size           = 0;
-    ctx->abort_callback      = NULL;
-    ctx->abort_callback_data = NULL;
-
-    ggml_backend_t cpu_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_cpu_guid(),
-        /* .interface = */ ggml_backend_cpu_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    if (cpu_backend == NULL) {
-        delete ctx;
-        return NULL;
-    }
-
-    return cpu_backend;
-}
-
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
-}
-
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->n_threads = n_threads;
-}
-
-void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-
-    if (ctx->threadpool && ctx->threadpool != threadpool) {
-        // already had a different threadpool, pause/suspend it before switching
-        ggml_threadpool_pause(ctx->threadpool);
-    }
-    ctx->threadpool = threadpool;
-}
-
-void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->abort_callback = abort_callback;
-    ctx->abort_callback_data = abort_callback_data;
-}
-
-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
-    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
-    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
-}
-
-// CPU backend - device
-
-struct ggml_backend_cpu_device_context {
-    std::string description = "CPU";
-
-    ggml_backend_cpu_device_context() {
-#ifdef __APPLE__
-        size_t len = 0;
-        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
-            description.resize(len);
-            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
-        }
-#elif defined(__linux__)
-        FILE * f = fopen("/proc/cpuinfo", "r");
-        if (f) {
-            char buf[1024];
-            while (fgets(buf, sizeof(buf), f)) {
-                if (strncmp(buf, "model name", 10) == 0) {
-                    char * p = strchr(buf, ':');
-                    if (p) {
-                        p++;
-                        while (std::isspace(*p)) {
-                            p++;
-                        }
-                        while (std::isspace(p[strlen(p) - 1])) {
-                            p[strlen(p) - 1] = '\0';
-                        }
-                        description = p;
-                        break;
-                    }
-                }
-            }
-            fclose(f);
-        }
-#elif defined(_WIN32)
-        HKEY hKey;
-        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                        0,
-                        KEY_READ,
-                        &hKey) == ERROR_SUCCESS) {
-            DWORD cpu_brand_size = 0;
-            if (RegQueryValueExA(hKey,
-                                TEXT("ProcessorNameString"),
-                                NULL,
-                                NULL,
-                                NULL,
-                                &cpu_brand_size) == ERROR_SUCCESS) {
-                description.resize(cpu_brand_size);
-                if (RegQueryValueExA(hKey,
-                                    TEXT("ProcessorNameString"),
-                                    NULL,
-                                    NULL,
-                                    (LPBYTE)&description[0], // NOLINT
-                                    &cpu_brand_size) == ERROR_SUCCESS) {
-                    if (description.find('\0') != std::string::npos) {
-                        description.resize(description.find('\0'));
-                    }
-                }
-            }
-            RegCloseKey(hKey);
-        }
-#endif
-    }
-};
-
-static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
-    return "CPU";
-
-    GGML_UNUSED(dev);
-}
-
-static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
-    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
-
-    return ctx->description.c_str();
-}
-
-static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    GGML_UNUSED(dev);
-}
-
-static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_cpu_device_get_name(dev);
-    props->description = ggml_backend_cpu_device_get_description(dev);
-    props->type        = ggml_backend_cpu_device_get_type(dev);
-    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
-    props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ true,
-        /* .events                = */ false,
-    };
-}
-
-static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
-    return ggml_backend_cpu_init();
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(params);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(dev);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(max_tensor_size);
-}
-
-static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_CPY:
-            return
-                op->type != GGML_TYPE_IQ2_XXS &&
-                op->type != GGML_TYPE_IQ2_XS  &&
-                op->type != GGML_TYPE_IQ1_S   &&
-                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
-        case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
-        case GGML_OP_ROPE_BACK:
-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
-        case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
-        case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
-        default:
-            return true;
-    }
-
-    GGML_UNUSED(dev);
-}
-
-static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(dev);
-}
-
-static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
-    /* .get_name             = */ ggml_backend_cpu_device_get_name,
-    /* .get_description      = */ ggml_backend_cpu_device_get_description,
-    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
-    /* .get_type             = */ ggml_backend_cpu_device_get_type,
-    /* .get_props            = */ ggml_backend_cpu_device_get_props,
-    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
-    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
-    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// CPU backend - backend (reg)
-
-static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
-    return "CPU";
-
-    GGML_UNUSED(reg);
-}
-
-static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
-}
-
-static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    static ggml_backend_cpu_device_context ctx;
-    static ggml_backend_device ggml_backend_cpu_device = {
-        /* .iface   = */ ggml_backend_cpu_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ &ctx,
-    };
-
-    return &ggml_backend_cpu_device;
-}
-
-static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
-        return (void *)ggml_backend_cpu_set_n_threads;
-    }
-    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
-        return (void *)ggml_backend_cpu_get_extra_bufts;
-    }
-
-    return NULL;
-
-    GGML_UNUSED(reg);
-}
-
-static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
-    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
-    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
-};
-
-ggml_backend_reg_t ggml_backend_cpu_reg(void) {
-    static struct ggml_backend_reg ggml_backend_cpu_reg = {
-        /* .iface   = */ ggml_backend_cpu_reg_i,
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_reg;
-}
-
 // multi-buffer buffer
 
 struct ggml_backend_multi_buffer_context {
@@ -2642,3 +2035,627 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
+
+
+
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include <cctype>
+#include <string>
+
+// ggml-backend interface
+
+// CPU backend - buffer
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// CPU backend - buffer type
+
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context  = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+    static ggml_backend_buffer_type_t bufts[] = {
+#ifdef GGML_USE_CPU_HBM
+        ggml_backend_cpu_hbm_buffer_type(),
+#endif
+        NULL
+    };
+
+    return bufts;
+
+    GGML_UNUSED(device);
+}
+
+// CPU backend - backend (stream)
+
+struct ggml_backend_cpu_context {
+    int                 n_threads;
+    ggml_threadpool_t   threadpool;
+
+    uint8_t *           work_data;
+    size_t              work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
+};
+
+static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
+    return "CPU";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    delete[] cpu_ctx->work_data;
+    delete cpu_ctx;
+    delete backend;
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
+        if (cpu_plan->cplan.work_data == NULL) {
+            delete cpu_plan;
+            return NULL;
+        }
+    }
+
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    delete[] cpu_plan->cplan.work_data;
+    delete cpu_plan;
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        delete[] cpu_ctx->work_data;
+        cpu_ctx->work_data = new uint8_t[cplan.work_size];
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        cpu_ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return ggml_graph_compute(cgraph, &cplan);
+}
+
+static const struct ggml_backend_i ggml_backend_cpu_i = {
+    /* .get_name                = */ ggml_backend_cpu_get_name,
+    /* .free                    = */ ggml_backend_cpu_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    // initialize CPU backend now to avoid slowing the first graph computation
+    ggml_cpu_init();
+
+    struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
+    if (ctx == NULL) {
+        return NULL;
+    }
+
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
+
+    ggml_backend_t cpu_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_cpu_guid(),
+        /* .interface = */ ggml_backend_cpu_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context   = */ ctx,
+    };
+
+    if (cpu_backend == NULL) {
+        delete ctx;
+        return NULL;
+    }
+
+    return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+}
+
+// CPU backend - device
+
+struct ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                TEXT("ProcessorNameString"),
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    TEXT("ProcessorNameString"),
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
+static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
+    return "CPU";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
+    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
+
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_cpu_device_get_name(dev);
+    props->description = ggml_backend_cpu_device_get_description(dev);
+    props->type        = ggml_backend_cpu_device_get_type(dev);
+    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_cpu_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ1_S   &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_OUT_PROD:
+            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
+    /* .get_name             = */ ggml_backend_cpu_device_get_name,
+    /* .get_description      = */ ggml_backend_cpu_device_get_description,
+    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
+    /* .get_type             = */ ggml_backend_cpu_device_get_type,
+    /* .get_props            = */ ggml_backend_cpu_device_get_props,
+    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// CPU backend - backend (reg)
+
+static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
+    return "CPU";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_cpu_device_context ctx;
+    static ggml_backend_device ggml_backend_cpu_device = {
+        /* .iface   = */ ggml_backend_cpu_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ &ctx,
+    };
+
+    return &ggml_backend_cpu_device;
+}
+
+static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_cpu_set_n_threads;
+    }
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+        return (void *)ggml_backend_cpu_get_extra_bufts;
+    }
+
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
+    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    static struct ggml_backend_reg ggml_backend_cpu_reg = {
+        /* .iface   = */ ggml_backend_cpu_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_reg;
+}
diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c
new file mode 100644
index 000000000..4b8ffb629
--- /dev/null
+++ b/ggml/src/ggml-cpu.c
@@ -0,0 +1,13715 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
+#define _USE_MATH_DEFINES // For M_PI on MSVC
+
+#include "ggml-aarch64.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include "ggml-quants.h"
+#include "ggml.h"
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif
+
+#include <assert.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <float.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <signal.h>
+#if defined(__gnu_linux__)
+#include <syscall.h>
+#endif
+
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
+#undef GGML_USE_LLAMAFILE
+#endif
+
+#ifdef GGML_USE_LLAMAFILE
+#include <llamafile/sgemm.h>
+#endif
+
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+
+// disable POSIX deprecation warnings
+// these functions are never going away, anyway
+#pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after GGML_ABORT
+#pragma warning(disable: 4702)
+#endif
+
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE  64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else  // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
+#define UNUSED GGML_UNUSED
+#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+#endif
+
+// floating point type used to accumulate sums
+typedef double ggml_float;
+
+#define GGML_GELU_FP16
+#define GGML_GELU_QUICK_FP16
+
+#define GGML_SOFT_MAX_UNROLL 4
+#define GGML_VEC_DOT_UNROLL  2
+#define GGML_VEC_MAD_UNROLL  32
+
+//
+// global data
+//
+
+// precomputed gelu table for f16 (128 KB)
+static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
+
+// precomputed quick gelu table for f16 (128 KB)
+static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
+
+// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
+float ggml_table_f32_f16[1 << 16];
+
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_arm_arch_features = {-1, -1, -1, 0};
+#endif
+
+
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+
+
+#if !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
+typedef volatile LONG atomic_int;
+typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
+
+static void atomic_store(atomic_int * ptr, LONG val) {
+    InterlockedExchange(ptr, val);
+}
+static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
+    // TODO: add support for explicit memory order
+    InterlockedExchange(ptr, val);
+}
+static LONG atomic_load(atomic_int * ptr) {
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
+    return InterlockedExchangeAdd(ptr, inc);
+}
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedExchangeAdd(ptr, inc);
+}
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
+static void atomic_thread_fence(memory_order mo) {
+    MemoryBarrier();
+}
+#else // clang
+#include <stdatomic.h>
+#endif
+
+typedef HANDLE pthread_t;
+
+typedef DWORD thread_ret_t;
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
+    (void) unused;
+    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
+    if (handle == NULL)
+    {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void * unused) {
+    (void) unused;
+    int ret = (int) WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+    return ret;
+}
+
+static int sched_yield (void) {
+    Sleep (0);
+    return 0;
+}
+#else
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <sched.h>
+#if defined(__FreeBSD__)
+#include <pthread_np.h>
+#endif
+
+typedef void * thread_ret_t;
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#endif
+
+typedef pthread_t ggml_thread_t;
+
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
+
+#if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
+#include <TargetConditionals.h>
+#endif
+
+//
+// cache line
+//
+
+#if defined(__cpp_lib_hardware_interference_size)
+#define CACHE_LINE_SIZE hardware_destructive_interference_size
+#else
+#if defined(__POWER9_VECTOR__)
+#define CACHE_LINE_SIZE 128
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+#endif
+
+static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
+
+
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+
+static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32] = {
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_F16] = {
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
+        .vec_dot_type             = GGML_TYPE_F16,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q4_0] = {
+        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
+        .nrows                    = 1,
+#endif
+    },
+    [GGML_TYPE_Q4_1] = {
+        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
+        .nrows                    = 1,
+#endif
+    },
+    [4] = { // GGML_TYPE_Q4_2
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_COUNT,
+        .nrows                    = 1,
+    },
+    [5] = { // GGML_TYPE_Q4_3
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_COUNT,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q5_0] = {
+        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q5_1] = {
+        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q8_0] = {
+        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
+        .nrows                    = 1,
+#endif
+    },
+    [GGML_TYPE_Q8_1] = {
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q2_K] = {
+        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ2_XXS] = {
+        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ3_XXS] = {
+        .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ3_S] = {
+        .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ2_S] = {
+        .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ1_S] = {
+        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ1_M] = {
+        .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ4_NL] = {
+        .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ4_XS] = {
+        .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_BF16] = {
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
+        .vec_dot_type             = GGML_TYPE_BF16,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q4_0_4_4] = {
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 4,
+        .gemv                     = ggml_gemv_q4_0_4x4_q8_0,
+        .gemm                     = ggml_gemm_q4_0_4x4_q8_0,
+    },
+    [GGML_TYPE_Q4_0_4_8] = {
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 4,
+        .gemv                     = ggml_gemv_q4_0_4x8_q8_0,
+        .gemm                     = ggml_gemm_q4_0_4x8_q8_0,
+    },
+    [GGML_TYPE_Q4_0_8_8] = {
+        .nrows                    = 1,
+        .ncols                    = 8,
+        .gemv                     = ggml_gemv_q4_0_8x8_q8_0,
+        .gemm                     = ggml_gemm_q4_0_8x8_q8_0,
+    },
+    [GGML_TYPE_TQ1_0] = {
+        .vec_dot                  = ggml_vec_dot_tq1_0_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TQ2_0] = {
+        .vec_dot                  = ggml_vec_dot_tq2_0_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+};
+
+const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
+    return &type_traits_cpu[type];
+}
+
+//
+// simd mappings
+//
+
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for new architectures requires to define the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+//   number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+//   number of elements to fit in a single register
+//
+
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 NEON
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              float32x4_t
+#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
+#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
+#define GGML_F32x4_LOAD         vld1q_f32
+#define GGML_F32x4_STORE        vst1q_f32
+#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32x4_ADD          vaddq_f32
+#define GGML_F32x4_MUL          vmulq_f32
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);         \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define GGML_F16_STEP 32
+    #define GGML_F16_EPR  8
+
+    #define GGML_F16x8              float16x8_t
+    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x))
+    #define GGML_F16x8_STORE        vst1q_f16
+    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define GGML_F16x8_ADD          vaddq_f16
+    #define GGML_F16x8_MUL          vmulq_f16
+    #define GGML_F16x8_REDUCE(res, x)                               \
+    do {                                                            \
+        int offset = GGML_F16_ARR >> 1;                             \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    } while (0)
+
+    #define GGML_F16_VEC                GGML_F16x8
+    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
+    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
+    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
+    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
+    #define GGML_F16_STEP 16
+    #define GGML_F16_EPR  4
+
+    #define GGML_F32Cx4              float32x4_t
+    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
+    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define GGML_F32Cx4_ADD          vaddq_f32
+    #define GGML_F32Cx4_MUL          vmulq_f32
+    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
+
+    #define GGML_F16_VEC                GGML_F32Cx4
+    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
+    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                                    \
+do {                                                                  \
+    int offset = GGML_F32_ARR >> 1;                                   \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    res = _mm512_reduce_add_ps(x[0]);                                 \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)                               \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    res = _mm512_reduce_add_ps(x[0]);                             \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
+#elif defined(__AVX__)
+
+#define GGML_SIMD
+
+// F32 AVX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    _mm256_setzero_ps()
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32x8_LOAD    _mm256_loadu_ps
+#define GGML_F32x8_STORE   _mm256_storeu_ps
+#if defined(__FMA__)
+    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
+#endif
+#define GGML_F32x8_ADD     _mm256_add_ps
+#define GGML_F32x8_MUL     _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
+                                 _mm256_extractf128_ps(x[0], 1)); \
+    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+
+#define GGML_F32Cx8             __m256
+#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
+
+#if defined(__F16C__)
+// the  _mm256_cvt intrinsics require F16C
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#else
+static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+    float arr[8];
+
+    _mm256_storeu_ps(arr, y);
+
+    for (int i = 0; i < 8; i++)
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+}
+#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
+#endif
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         _mm256_add_ps
+#define GGML_F32Cx8_MUL         _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_SIMD
+
+// F32 POWER9
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(res, x)              \
+{                                              \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    res = vec_extract(x[0], 0) +               \
+          vec_extract(x[0], 1) +               \
+          vec_extract(x[0], 2) +               \
+          vec_extract(x[0], 3);                \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 POWER9
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
+  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+  vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
+#define GGML_F16_VEC_STORE(p, r, i)                             \
+  if (i & 0x1)                                                  \
+    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
+                                   r[i - GGML_ENDIAN_BYTE(0)]), \
+            0, p - GGML_F16_EPR)
+
+#elif defined(__wasm_simd128__)
+
+#define GGML_SIMD
+
+// F32 WASM
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              v128_t
+#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
+#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
+#define GGML_F32x4_LOAD         wasm_v128_load
+#define GGML_F32x4_STORE        wasm_v128_store
+#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
+#define GGML_F32x4_ADD          wasm_f32x4_add
+#define GGML_F32x4_MUL          wasm_f32x4_mul
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    res = wasm_f32x4_extract_lane(x[0], 0) +       \
+          wasm_f32x4_extract_lane(x[0], 1) +       \
+          wasm_f32x4_extract_lane(x[0], 2) +       \
+          wasm_f32x4_extract_lane(x[0], 3);        \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 WASM
+
+#define GGML_F16_STEP 16
+#define GGML_F16_EPR  4
+
+inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(p[0]);
+    tmp[1] = GGML_FP16_TO_FP32(p[1]);
+    tmp[2] = GGML_FP16_TO_FP32(p[2]);
+    tmp[3] = GGML_FP16_TO_FP32(p[3]);
+
+    return wasm_v128_load(tmp);
+}
+
+inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
+    float tmp[4];
+
+    wasm_v128_store(tmp, x);
+
+    p[0] = GGML_FP32_TO_FP16(tmp[0]);
+    p[1] = GGML_FP32_TO_FP16(tmp[1]);
+    p[2] = GGML_FP32_TO_FP16(tmp[2]);
+    p[3] = GGML_FP32_TO_FP16(tmp[3]);
+}
+
+#define GGML_F16x4             v128_t
+#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
+#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
+#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
+#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
+#define GGML_F16x4_FMA         GGML_F32x4_FMA
+#define GGML_F16x4_ADD         wasm_f32x4_add
+#define GGML_F16x4_MUL         wasm_f32x4_mul
+#define GGML_F16x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F16_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    res = wasm_f32x4_extract_lane(x[0], 0) +       \
+          wasm_f32x4_extract_lane(x[0], 1) +       \
+          wasm_f32x4_extract_lane(x[0], 2) +       \
+          wasm_f32x4_extract_lane(x[0], 3);        \
+}
+
+#define GGML_F16_VEC                GGML_F16x4
+#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
+#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
+#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
+
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD     _mm_add_ps
+#define GGML_F32x4_MUL     _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                                 \
+{                                                                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
+#elif defined(__loongarch_asx)
+
+#define GGML_SIMD
+
+// F32 LASX
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
+#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
+#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
+#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
+#define GGML_F32x8_ADD     __lasx_xvfadd_s
+#define GGML_F32x8_MUL     __lasx_xvfmul_s
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    float *tmp_p = (float *)&x[0]; \
+    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 LASX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+
+#define GGML_F32Cx8          __m256
+#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return (__m256)__lasx_xvld(tmp, 0);
+}
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    float arr[8];
+
+    __lasx_xvst(y, arr, 0);
+
+    for (int i = 0; i < 8; i++) {
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
+}
+#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
+#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__loongarch_sx)
+
+#define GGML_SIMD
+
+// F32 LSX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    __lsx_vldi(0)
+#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
+#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
+#define GGML_F32x4_ADD     __lsx_vfadd_s
+#define GGML_F32x4_MUL     __lsx_vfmul_s
+#define GGML_F32x4_REDUCE(res, x)                                 \
+{                                                                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
+    }                                                             \
+    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
+    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
+    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
+    tmp = __lsx_vsrli_d((__m128i)t0, 32); \
+    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
+    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
+    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0);        \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 LSX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return __lsx_vld(tmp, 0);
+}
+
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
+    float arr[4];
+
+    __lsx_vst(y, arr, 0);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         __lsx_vfadd_s
+#define GGML_F32Cx4_MUL         __lsx_vfmul_s
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
+#endif
+
+// GGML_F32_ARR / GGML_F16_ARR
+//   number of registers to use per step
+#ifdef GGML_SIMD
+#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
+#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
+#endif
+
+//
+// Threading defs
+//
+
+typedef pthread_t          ggml_thread_t;
+
+#if defined(_WIN32)
+
+typedef CONDITION_VARIABLE ggml_cond_t;
+typedef SRWLOCK            ggml_mutex_t;
+
+#define ggml_mutex_init(m)   InitializeSRWLock(m)
+#define ggml_mutex_destroy(m)
+#define ggml_mutex_lock(m)   AcquireSRWLockExclusive(m)
+#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
+#define ggml_mutex_lock_shared(m)   AcquireSRWLockShared(m)
+#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
+
+#define ggml_cond_init(c)    InitializeConditionVariable(c)
+#define ggml_cond_destroy(c)
+#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
+#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#else
+
+typedef pthread_cond_t     ggml_cond_t;
+typedef pthread_mutex_t    ggml_mutex_t;
+
+#define ggml_mutex_init(m)          pthread_mutex_init(m, NULL)
+#define ggml_mutex_destroy(m)       pthread_mutex_destroy(m)
+#define ggml_mutex_lock(m)          pthread_mutex_lock(m)
+#define ggml_mutex_unlock(m)        pthread_mutex_unlock(m)
+#define ggml_mutex_lock_shared(m)   pthread_mutex_lock(m)
+#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
+
+#define ggml_lock_init(x)    UNUSED(x)
+#define ggml_lock_destroy(x) UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x)    _mm_pause()
+#else
+#define ggml_lock_lock(x)    UNUSED(x)
+#endif
+#define ggml_lock_unlock(x)  UNUSED(x)
+
+#define GGML_LOCK_INITIALIZER 0
+#define ggml_cond_init(c)      pthread_cond_init(c, NULL)
+#define ggml_cond_destroy(c)   pthread_cond_destroy(c)
+#define ggml_cond_wait(c, m)   pthread_cond_wait(c, m)
+#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#endif
+
+// Threadpool def
+struct ggml_threadpool {
+    ggml_mutex_t mutex;       // mutex for cond.var
+    ggml_cond_t  cond;        // cond.var for waiting for new work
+
+    struct ggml_cgraph * cgraph;
+    struct ggml_cplan  * cplan;
+
+    // synchronization primitives
+    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
+    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop;         // Used for stopping the threadpool altogether
+    atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_bool abort;        // Used for aborting processing of a graph
+
+    struct ggml_compute_state * workers;   // per thread state
+    int          n_threads_max; // number of threads in the pool
+    atomic_int   n_threads_cur; // number of threads used in the current graph
+
+    int32_t      prio;        // Scheduling priority
+    uint32_t     poll;        // Polling level (0 - no polling)
+
+    enum ggml_status ec;
+};
+
+// Per-thread state
+struct ggml_compute_state {
+#ifndef GGML_USE_OPENMP
+    ggml_thread_t thrd;
+    bool cpumask[GGML_MAX_N_THREADS];
+    int  last_graph;
+    bool pending;
+#endif
+    struct ggml_threadpool * threadpool;
+    int ith;
+};
+
+struct ggml_compute_params {
+    // ith = thread index, nth = number of threads
+    int ith, nth;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+//
+// fundamental operations
+//
+
+inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
+inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
+inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
+inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
+inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
+inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
+
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+   assert(nrc == 1);
+   UNUSED(nrc);
+   UNUSED(bx);
+   UNUSED(by);
+   UNUSED(bs);
+
+#if defined(GGML_SIMD)
+    float sumf = 0.0f;
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    GGML_F32_VEC_REDUCE(sumf, sum);
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        sumf += x[i]*y[i];
+    }
+#else
+    // scalar
+    ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(x[i]*y[i]);
+    }
+#endif
+
+    *s = sumf;
+}
+
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    int i = 0;
+    ggml_float sumf = 0;
+
+#if defined(__AVX512BF16__)
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 64 <= n; i += 64) {
+        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
+                             m512bh(_mm512_loadu_si512((y + i))));
+        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
+                             m512bh(_mm512_loadu_si512((y + i + 32))));
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#elif defined(__AVX512F__)
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#undef LOAD
+#elif defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+    __m256 c1 = _mm256_setzero_ps();
+    __m256 c2 = _mm256_setzero_ps();
+    __m256 c3 = _mm256_setzero_ps();
+    __m256 c4 = _mm256_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+    }
+    __m128 g;
+    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
+                       _mm256_add_ps(c2, c4));
+    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
+                   _mm256_castps256_ps128(c1));
+    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
+    g = _mm_add_ss(g, _mm_movehdup_ps(g));
+    sumf += (ggml_float)_mm_cvtss_f32(g);
+
+#undef LOAD
+#endif
+
+    for (; i < n; ++i) {
+        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
+                             GGML_BF16_TO_FP32(y[i]));
+    }
+    *s = sumf;
+}
+
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    ggml_float sumf = 0.0;
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    GGML_F16_VEC_REDUCE(sumf, sum);
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
+    }
+#endif
+
+    *s = sumf;
+}
+
+// compute GGML_VEC_DOT_UNROLL dot products at once
+// xs - x row stride in bytes
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
+
+    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+            }
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+        }
+    }
+#endif
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        s[i] = sumf[i];
+    }
+}
+
+inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] += x[i]*v;
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] += x[i]*v;
+    }
+#endif
+}
+
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
+// xs and vs are byte strides of x and v
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+
+    const float * restrict x[GGML_VEC_MAD_UNROLL];
+    const float * restrict v[GGML_VEC_MAD_UNROLL];
+
+    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
+        x[i] = (const float *) ((const char *) xv + i*xs);
+        v[i] = (const float *) ((const char *) vv + i*vs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
+
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+    }
+
+    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+            }
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        for (int i = np; i < n; ++i) {
+            y[i] += x[k][i]*v[k][0];
+        }
+    }
+#else
+    // scalar
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        for (int i = 0; i < n; ++i) {
+            y[i] += x[k][i]*v[k][0];
+        }
+    }
+#endif
+}
+
+//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
+inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &v, y, 1, n);
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] *= v;
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] *= v;
+    }
+#endif
+}
+
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
+inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
+inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
+inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
+inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
+inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
+inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+// TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+
+inline static float ggml_gelu_f32(float x) {
+    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_table_gelu_f16[i16[i]];
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        }
+    }
+}
+#else
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]);
+    }
+}
+#endif
+
+inline static float ggml_gelu_quick_f32(float x) {
+    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+}
+
+//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+//    const uint16_t * i16 = (const uint16_t *) x;
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
+//    }
+//}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+    }
+}
+#else
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]);
+    }
+}
+#endif
+
+// Sigmoid Linear Unit (SiLU) function
+inline static float ggml_silu_f32(float x) {
+    return x/(1.0f + expf(-x));
+}
+
+#if __FINITE_MATH_ONLY__
+#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#endif
+
+#if defined(__ARM_NEON) && defined(__aarch64__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                    vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+  const __m512 r = _mm512_set1_ps(0x1.8p23f);
+  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+  const __m512 n = _mm512_sub_ps(z, r);
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+  const __mmask16 d =
+      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+  const __m256 r = _mm256_set1_ps(0x1.8p23f);
+  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+  const __m256 n = _mm256_sub_ps(z, r);
+  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+  const __m256 k = _mm256_castsi256_ps(
+      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+  const __m256i c = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(126), _CMP_GT_OQ));
+  const __m256 u = _mm256_mul_ps(b, b);
+  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+    return _mm256_fmadd_ps(j, k, k);
+  const __m256i g = _mm256_and_si256(
+      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+      _mm256_set1_epi32(0x82000000u));
+  const __m256 s1 =
+      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+  const __m256i d = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(192), _CMP_GT_OQ));
+  return _mm256_or_ps(
+      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+      _mm256_andnot_ps(
+          _mm256_castsi256_ps(d),
+          _mm256_or_ps(
+              _mm256_and_ps(_mm256_castsi256_ps(c),
+                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
+#else
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]);
+    }
+}
+
+static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
+
+static ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
+    // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
+
+    int i = 0;
+    ggml_float sum = 0;
+    for (; i < n; ++i) {
+        float val = x[i] - max;
+        y[i] = val;
+        sum += (ggml_float)expf(val);
+    }
+    return sum = (ggml_float)logf(sum);
+}
+
+inline static float ggml_silu_backward_f32(float x, float dy) {
+    const float s = 1.0f/(1.0f + expf(-x));
+    return dy*s*(1.0f + x*(1.0f - s));
+}
+
+inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
+    }
+}
+
+inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = sum;
+#else
+    vDSP_sve(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_FP16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_BF16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    float max = -INFINITY;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+    }
+    *s = max;
+#else
+    vDSP_maxv(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
+    ggml_vec_norm_f32(n, s, x);
+    *s = 1.f/(*s);
+}
+
+inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
+    float max = -INFINITY;
+    int idx = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+        if (max == x[i]) { idx = i; }
+    }
+    *s = idx;
+}
+
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void ggml_thread_cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void ggml_thread_cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void ggml_thread_cpu_relax(void) {;}
+#endif
+
+//
+// NUMA support
+//
+
+#define GGML_NUMA_MAX_NODES 8
+#define GGML_NUMA_MAX_CPUS 512
+
+struct ggml_numa_node {
+    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
+    uint32_t n_cpus;
+};
+
+struct ggml_numa_nodes {
+    enum ggml_numa_strategy numa_strategy;
+    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
+    uint32_t n_nodes;
+    uint32_t total_cpus; // hardware threads on system
+    uint32_t current_node; // node on which main process is execting
+#if defined(__gnu_linux__)
+    cpu_set_t cpuset; // cpuset from numactl
+#else
+    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
+#endif
+};
+
+//
+// ggml state
+//
+
+struct ggml_state {
+    struct ggml_numa_nodes numa;
+};
+
+// global state
+static struct ggml_state g_state = {0};
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
+
+// TODO: move to threading file
+// critical section via spin lock
+void ggml_critical_section_start(void) {
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
+    }
+}
+
+void ggml_critical_section_end(void) {
+    atomic_flag_clear(&g_state_critical);
+}
+
+static void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    if (n_threads == 1) {
+        return;
+    }
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp barrier
+#else
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
+
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
+
+    if (n_barrier == (n_threads - 1)) {
+        // last thread
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+
+        // exit barrier (fill seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
+    }
+
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
+    }
+
+    // exit barrier (full seq-cst fence)
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+#endif
+}
+
+#if defined(__gnu_linux__)
+static cpu_set_t ggml_get_numa_affinity(void) {
+    cpu_set_t cpuset;
+    pthread_t thread;
+    thread = pthread_self();
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
+    return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) {
+    return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
+#if defined(__gnu_linux__)
+    struct stat st;
+    char path[256];
+    int rv;
+
+    // set numa scheme
+    g_state.numa.numa_strategy = numa_flag;
+
+    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
+
+    g_state.numa.cpuset = ggml_get_numa_affinity();
+
+    // enumerate nodes
+    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.n_nodes;
+    }
+
+    // enumerate CPUs
+    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.total_cpus;
+    }
+
+    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
+    // figure out which node we're on
+    uint current_cpu;
+    int getcpu_ret = 0;
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
+    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+#else
+    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
+#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
+#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
+#   endif
+    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
+#endif
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
+        g_state.numa.n_nodes = 0;
+        return;
+    }
+
+    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
+
+    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
+        struct ggml_numa_node * node = &g_state.numa.nodes[n];
+        GGML_PRINT_DEBUG("CPUs on node %u:", n);
+        node->n_cpus = 0;
+        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
+            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
+            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+            if (stat(path, &st) == 0) {
+                node->cpus[node->n_cpus++] = c;
+                GGML_PRINT_DEBUG(" %u", c);
+            }
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+
+    if (ggml_is_numa()) {
+        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
+        if (fptr != NULL) {
+            char buf[42];
+            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
+                GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    UNUSED(numa_flag);
+    // TODO
+#endif
+}
+
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
+
+#if defined(__ARM_ARCH)
+
+#if defined(__linux__) && defined(__aarch64__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM 0
+#endif
+
+static void ggml_init_arm_arch_features(void) {
+#if defined(__linux__) && defined(__aarch64__)
+    uint32_t hwcap = getauxval(AT_HWCAP);
+    uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+    int oldp = 0;
+    size_t size = sizeof(oldp);
+    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_neon = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_i8mm = oldp;
+
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+    ggml_arm_arch_features.has_neon = 1;
+#else
+    ggml_arm_arch_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_arm_arch_features.has_i8mm = 1;
+#else
+    ggml_arm_arch_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.has_sve = 1;
+    ggml_arm_arch_features.sve_cnt = 16;
+#else
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
+struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
+    GGML_ASSERT(!ggml_get_no_alloc(ctx));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+
+    ggml_set_i32(result, value);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
+    GGML_ASSERT(!ggml_get_no_alloc(ctx));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+
+    ggml_set_f32(result, value);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
+    const int n     = ggml_nrows(tensor);
+    const int nc    = tensor->ne[0];
+    const size_t n1 = tensor->nb[1];
+
+    char * const data = tensor->data;
+
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                assert(tensor->nb[0] == sizeof(int8_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I16:
+            {
+                assert(tensor->nb[0] == sizeof(int16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I32:
+            {
+                assert(tensor->nb[0] == sizeof(int32_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                assert(tensor->nb[0] == sizeof(float));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
+                }
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    return tensor;
+}
+
+struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
+    const int n     = ggml_nrows(tensor);
+    const int nc    = tensor->ne[0];
+    const size_t n1 = tensor->nb[1];
+
+    char * const data = tensor->data;
+
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                assert(tensor->nb[0] == sizeof(int8_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I16:
+            {
+                assert(tensor->nb[0] == sizeof(int16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I32:
+            {
+                assert(tensor->nb[0] == sizeof(int32_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_bf16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                assert(tensor->nb[0] == sizeof(float));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
+                }
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    return tensor;
+}
+
+int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
+                return ((int8_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
+                return ((int16_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
+                return ((int32_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_F16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
+                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_F32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(float));
+                return ((float *)(tensor->data))[i];
+            }
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
+        return;
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
+                ((int8_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
+                ((int16_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
+                ((int32_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(float));
+                ((float *)(tensor->data))[i] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            return ((int8_t *) data)[0];
+        case GGML_TYPE_I16:
+            return ((int16_t *) data)[0];
+        case GGML_TYPE_I32:
+            return ((int32_t *) data)[0];
+        case GGML_TYPE_F16:
+            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+        case GGML_TYPE_BF16:
+            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
+        case GGML_TYPE_F32:
+            return ((float *) data)[0];
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                ((int8_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                ((int16_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ((int32_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ((float *)(data))[0] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                return ((int8_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I16:
+            {
+                return ((int16_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I32:
+            {
+                return ((int32_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_F16:
+            {
+                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_BF16:
+            {
+                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_F32:
+            {
+                return ((float *)(tensor->data))[i];
+            }
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
+        return;
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                ((int8_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                ((int16_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ((int32_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ((float *)(tensor->data))[i] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            return ((int8_t *) data)[0];
+        case GGML_TYPE_I16:
+            return ((int16_t *) data)[0];
+        case GGML_TYPE_I32:
+            return ((int32_t *) data)[0];
+        case GGML_TYPE_F16:
+            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+        case GGML_TYPE_BF16:
+            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
+        case GGML_TYPE_F32:
+            return ((float *) data)[0];
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                ((int8_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                ((int16_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ((int32_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ((float *)(data))[0] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// ggml_compute_forward_dup
+
+static void ggml_compute_forward_dup_same_cont(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+    GGML_ASSERT(src0->type == dst->type);
+
+    const size_t nb0 = ggml_type_size(src0->type);
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by elements
+    const int ne = ggml_nelements(dst);
+    const int dr = (ne + nth - 1) / nth;
+    const int ie0 = dr * ith;
+    const int ie1 = MIN(ie0 + dr, ne);
+
+    if (ie0 < ie1) {
+        memcpy(
+            ((char *)  dst->data + ie0*nb0),
+            ((char *) src0->data + ie0*nb0),
+            (ie1 - ie0) * nb0);
+    }
+}
+
+static void ggml_compute_forward_dup_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    if (ggml_is_contiguous(dst)) {
+        if (nb00 == sizeof(ggml_fp16_t)) {
+            if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (ggml_get_type_traits(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
+                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                            }
+
+                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        }
+        return;
+    }
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    if (ggml_is_contiguous(dst)) {
+        if (nb00 == sizeof(ggml_bf16_t)) {
+            if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (ggml_get_type_traits(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
+                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+                            }
+
+                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        }
+        return;
+    }
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_BF16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        // TODO: simplify
+        if (nb00 == sizeof(float)) {
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (ggml_get_type_traits(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        }
+
+        return;
+    }
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_BF16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
+static void ggml_compute_forward_dup_bytes(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    GGML_ASSERT(src0->type == dst->type);
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+        ggml_compute_forward_dup_same_cont(params, dst);
+        return;
+    }
+
+    const size_t type_size = ggml_type_size(src0->type);
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == type_size && nb0 == type_size) {
+        // copy by rows
+        const size_t rs = ne00 * type_size;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        size_t id = 0;
+        char * dst_ptr = (char *) dst->data;
+        const size_t rs = ne00 * type_size;
+
+        if (nb00 == type_size) {
+            // src0 is contigous on first dimension, copy by rows
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    id += rs * ir0;
+                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                        memcpy(dst_ptr + id, src0_ptr, rs);
+                        id += rs;
+                    }
+                    id += rs * (ne01 - ir1);
+                }
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    id += rs * ir0;
+                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, type_size);
+
+                            id += type_size;
+                        }
+                    }
+                    id += rs * (ne01 - ir1);
+                }
+            }
+        }
+
+        return;
+    }
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            i10 += ne00 * ir0;
+            while (i10 >= ne0) {
+                i10 -= ne0;
+                if (++i11 == ne1) {
+                    i11 = 0;
+                    if (++i12 == ne2) {
+                        i12 = 0;
+                        if (++i13 == ne3) {
+                            i13 = 0;
+                        }
+                    }
+                }
+            }
+            for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                    memcpy(dst_ptr, src0_ptr, type_size);
+
+                    if (++i10 == ne0) {
+                        i10 = 0;
+                        if (++i11 == ne1) {
+                            i11 = 0;
+                            if (++i12 == ne2) {
+                                i12 = 0;
+                                if (++i13 == ne3) {
+                                    i13 = 0;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            i10 += ne00 * (ne01 - ir1);
+            while (i10 >= ne0) {
+                i10 -= ne0;
+                if (++i11 == ne1) {
+                    i11 = 0;
+                    if (++i12 == ne2) {
+                        i12 = 0;
+                        if (++i13 == ne3) {
+                            i13 = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_dup(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (src0->type == dst->type) {
+        ggml_compute_forward_dup_bytes(params, dst);
+        return;
+    }
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_dup_f16(params, dst);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_dup_bf16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_dup_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_add
+
+static void ggml_compute_forward_add_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_f16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    }
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        if (dst->type == GGML_TYPE_F16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_bf16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+        GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    }
+
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        if (dst->type == GGML_TYPE_BF16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_f16_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_bf16_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_bf16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+            ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+            ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i]));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_q_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+    const enum ggml_type dtype = dst->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+    ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dtype)->from_float;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ggml_is_quantized(src0->type));
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i03 = ir/(ne02*ne01);
+        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 and dst are same shape as src0 => same indices
+        const int i13 = i03;
+        const int i12 = i02;
+        const int i11 = i01;
+
+        const int i3 = i03;
+        const int i2 = i02;
+        const int i1 = i01;
+
+        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3));
+
+        assert(ne00 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne00);
+        // add src1
+        ggml_vec_acc_f32(ne00, wdata, src1_row);
+        // quantize row to dst
+        if (quantize_row_q != NULL) {
+            quantize_row_q(wdata, dst_row, ne00);
+        } else {
+            memcpy(dst_row, wdata, ne0*nb0);
+        }
+    }
+}
+
+static void ggml_compute_forward_add(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add_f16_f16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                if (src1->type == GGML_TYPE_BF16) {
+                    ggml_compute_forward_add_bf16_bf16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_bf16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+            {
+                ggml_compute_forward_add_q_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_add1
+
+static void ggml_compute_forward_add1_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+#ifdef GGML_USE_ACCELERATE
+        UNUSED(ggml_vec_add1_f32);
+
+        vDSP_vadd(
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+                (float *) ((char *) src1->data), 0,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
+                ne0);
+#else
+        ggml_vec_add1_f32(ne0,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+               *(float *) src1->data);
+#endif
+    }
+}
+
+static void ggml_compute_forward_add1_f16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_f16_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_q_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+    ggml_from_float_t const quantize_row_q = ggml_get_type_traits(type)->from_float;
+
+    // we don't support permuted src0
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ggml_is_quantized(src0->type));
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
+        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
+
+        assert(ne0 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne0);
+        // add src1
+        ggml_vec_acc1_f32(ne0, wdata, v);
+        // quantize row to dst
+        quantize_row_q(wdata, dst_row, ne0);
+    }
+}
+
+static void ggml_compute_forward_add1_bf16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_bf16_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add1_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add1_f16_f16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add1_f16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                if (src1->type == GGML_TYPE_BF16) {
+                    ggml_compute_forward_add1_bf16_bf16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add1_bf16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+            {
+                ggml_compute_forward_add1_q_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_acc
+
+static void ggml_compute_forward_acc_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset inbytes during acc
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        if (params->ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during acc
+    const size_t nb0 = ggml_element_size(src0);
+
+    const size_t nb00 = nb0;
+    const size_t nb01 = nb1;
+    const size_t nb02 = nb2;
+    const size_t nb03 = nb3;
+
+    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0  + (ne11 == 0 ? 0 : ne11-1)*nb1  + (ne12 == 0 ? 0 : ne12-1)*nb2  + (ne13 == 0 ? 0 : ne13-1)*nb3  < ggml_nbytes(dst));
+    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+#ifdef GGML_USE_ACCELERATE
+        vDSP_vadd(
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1  + offset), 1, nc);
+#else
+        ggml_vec_add_f32(nc,
+                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+#endif
+    }
+}
+
+static void ggml_compute_forward_acc(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_acc_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sub
+
+static void ggml_compute_forward_sub_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_sub(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sub_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_mul
+
+static void ggml_compute_forward_mul_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (nb10 == sizeof(float)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0 ; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                UNUSED(ggml_vec_mul_f32);
+
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mul(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_mul_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_div
+
+static void ggml_compute_forward_div_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (nb10 == sizeof(float)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                UNUSED(ggml_vec_div_f32);
+
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_div(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_div_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sqr
+
+static void ggml_compute_forward_sqr_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n     = ggml_nrows(src0);
+    const int nc    = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqr_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sqr(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sqr_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sqrt
+
+static void ggml_compute_forward_sqrt_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqrt_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sqrt(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sqrt_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_log
+
+static void ggml_compute_forward_log_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_log_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_log(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_log_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sin
+
+static void ggml_compute_forward_sin_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sin_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sin(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sin_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_cos
+
+static void ggml_compute_forward_cos_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_cos_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_cos(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cos_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sum
+
+static void ggml_compute_forward_sum_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_scalar(dst));
+    assert(src0->nb[0] == sizeof(float));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    ggml_float sum     = 0;
+    ggml_float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f32_ggf(ne00,
+                        &row_sum,
+                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((float *) dst->data)[0] = sum;
+}
+
+static void ggml_compute_forward_sum_f16(
+    const struct ggml_compute_params * params,
+          struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_scalar(dst));
+
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f16_ggf(ne00,
+                    &row_sum,
+                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
+}
+
+static void ggml_compute_forward_sum_bf16(
+    const struct ggml_compute_params * params,
+          struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_scalar(dst));
+
+    assert(src0->nb[0] == sizeof(ggml_bf16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_bf16_ggf(ne00,
+                    &row_sum,
+                    (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
+}
+
+static void ggml_compute_forward_sum(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sum_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sum_f16(params, dst);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_sum_bf16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sum_rows
+
+static void ggml_compute_forward_sum_rows_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(ne0 == 1);
+    GGML_ASSERT(ne1 == ne01);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    for (int64_t i3 = 0; i3 < ne03; i3++) {
+        for (int64_t i2 = 0; i2 < ne02; i2++) {
+            for (int64_t i1 = 0; i1 < ne01; i1++) {
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
+                float row_sum = 0;
+                ggml_vec_sum_f32(ne00, &row_sum, src_row);
+                dst_row[0] = row_sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_sum_rows(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sum_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_mean
+
+static void ggml_compute_forward_mean_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    assert(ne0 == 1);
+    assert(ne1 == ne01);
+    assert(ne2 == ne02);
+    assert(ne3 == ne03);
+
+    UNUSED(ne0);
+    UNUSED(ne1);
+    UNUSED(ne2);
+    UNUSED(ne3);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f32(ne00,
+                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+
+                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mean(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_mean_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_argmax
+
+static void ggml_compute_forward_argmax_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(float));
+    assert(dst->nb[0] == sizeof(float));
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+
+    const size_t nb01 = src0->nb[1];
+    const size_t nb0 = dst->nb[0];
+
+    for (int64_t i1 = 0; i1 < ne01; i1++) {
+        float * src = (float *) ((char *) src0->data + i1*nb01);
+        int32_t * dst_ = (int32_t *) ((char *)  dst->data + i1*nb0);
+        int v = 0;
+        ggml_vec_argmax_f32(ne00, &v, src);
+        dst_[0] = v;
+    }
+}
+
+static void ggml_compute_forward_argmax(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argmax_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_count_equal
+
+static void ggml_compute_forward_count_equal_i32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    GGML_ASSERT(src0->type == GGML_TYPE_I32);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(dst->type == GGML_TYPE_I64);
+
+    const int64_t nr = ggml_nrows(src0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    int64_t * sums = (int64_t *) params->wdata;
+    int64_t sum_thread = 0;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 =  ir                        / (ne02*ne01);
+        const int64_t i02 = (ir - i03*ne03)            /       ne01;
+        const int64_t i01 =  ir - i03*ne03 - i02*ne02;
+
+        const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
+        const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;
+
+        for (int64_t i00 = 0; i00 < ne00; ++i00) {
+            const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
+            const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));
+
+            sum_thread += val0 == val1;
+        }
+    }
+    if (ith != 0) {
+        sums[ith] = sum_thread;
+    }
+    ggml_barrier(params->threadpool);
+
+    if (ith != 0) {
+        return;
+    }
+
+    for (int ith_other = 1; ith_other < nth; ++ith_other) {
+        sum_thread += sums[ith_other];
+    }
+    *((int64_t *) dst->data) = sum_thread;
+}
+
+static void ggml_compute_forward_count_equal(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_count_equal_i32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_repeat
+
+static void ggml_compute_forward_repeat_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_can_repeat(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: maybe this is not optimal?
+    for                         (int i3 = 0; i3 < nr3;  i3++) {
+        for                     (int k3 = 0; k3 < ne03; k3++) {
+            for                 (int i2 = 0; i2 < nr2;  i2++) {
+                for             (int k2 = 0; k2 < ne02; k2++) {
+                    for         (int i1 = 0; i1 < nr1;  i1++) {
+                        for     (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0;  i0++) {
+                                ggml_vec_cpy_f32(ne00,
+                                        (float *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0),
+                                        (float *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_can_repeat(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // TODO: maybe this is not optimal?
+    for                         (int i3 = 0; i3 < nr3;  i3++) {
+        for                     (int k3 = 0; k3 < ne03; k3++) {
+            for                 (int i2 = 0; i2 < nr2;  i2++) {
+                for             (int k2 = 0; k2 < ne02; k2++) {
+                    for         (int i1 = 0; i1 < nr1;  i1++) {
+                        for     (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0;  i0++) {
+                                ggml_fp16_t * y = (ggml_fp16_t *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0);
+                                ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01);
+                                // ggml_vec_cpy_f16(ne00, y, x)
+                                for (int i = 0; i < ne00; ++i) {
+                                    y[i]  = x[i];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_I16:
+            {
+                ggml_compute_forward_repeat_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_repeat_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_repeat_back
+
+static void ggml_compute_forward_repeat_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_can_repeat(dst, src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int nr0 = (int)(ne00/ne0);
+    const int nr1 = (int)(ne01/ne1);
+    const int nr2 = (int)(ne02/ne2);
+    const int nr3 = (int)(ne03/ne3);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (ggml_is_contiguous(dst)) {
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+    } else {
+        for         (int k3 = 0; k3 < ne3; k3++) {
+            for     (int k2 = 0; k2 < ne2; k2++) {
+                for (int k1 = 0; k1 < ne1; k1++) {
+                    ggml_vec_set_f32(ne0,
+                        (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
+                        0);
+                }
+            }
+        }
+    }
+
+    // TODO: maybe this is not optimal?
+    for                         (int i3 = 0; i3 < nr3; i3++) {
+        for                     (int k3 = 0; k3 < ne3; k3++) {
+            for                 (int i2 = 0; i2 < nr2; i2++) {
+                for             (int k2 = 0; k2 < ne2; k2++) {
+                    for         (int i1 = 0; i1 < nr1; i1++) {
+                        for     (int k1 = 0; k1 < ne1; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                ggml_vec_acc_f32(ne0,
+                                        (float *) ((char *)  dst->data + (         k3)*nb3  + (         k2)*nb2  + (         k1)*nb1),
+                                        (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_repeat_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_concat_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_abs
+
+static void ggml_compute_forward_abs_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_abs_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_abs(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_abs_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sgn
+
+static void ggml_compute_forward_sgn_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sgn_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sgn(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sgn_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_neg
+
+static void ggml_compute_forward_neg_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_neg_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_neg(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_neg_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_step
+
+static void ggml_compute_forward_step_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_step_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_step(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_step_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_tanh
+
+static void ggml_compute_forward_tanh_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_tanh_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_tanh(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_tanh_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_elu
+
+static void ggml_compute_forward_elu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_elu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_elu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_elu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_relu
+
+static void ggml_compute_forward_relu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_relu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_relu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_relu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_gelu
+
+static void ggml_compute_forward_gelu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_gelu_quick
+
+static void ggml_compute_forward_gelu_quick_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_quick_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_quick(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_quick_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_silu
+
+static void ggml_compute_forward_silu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_silu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_silu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+// ggml_compute_forward_leaky_relu
+
+static void ggml_compute_forward_leaky_relu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_leaky_relu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
+    }
+}
+
+static void ggml_compute_forward_leaky_relu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_leaky_relu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_silu_back
+
+static void ggml_compute_forward_silu_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * grad = dst->src[1];
+
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+    assert(ggml_are_same_shape(src0, grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_backward_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])),
+                (float *) ((char *) grad->data + i1*(grad->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_silu_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_silu_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+static void ggml_compute_forward_hardswish_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardswish_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+static void ggml_compute_forward_hardswish(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardswish_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardsigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardsigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_exp_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_exp_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_exp(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_exp_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_norm
+
+static void ggml_compute_forward_norm_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    GGML_ASSERT(eps > 0.0f);
+
+    // TODO: optimize
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                ggml_float sum = 0.0;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sum += (ggml_float)x[i00];
+                }
+
+                float mean = sum/ne00;
+
+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                ggml_float sum2 = 0.0;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    float v = x[i00] - mean;
+                    y[i00] = v;
+                    sum2 += (ggml_float)(v*v);
+                }
+
+                float variance = sum2/ne00;
+                const float scale = 1.0f/sqrtf(variance + eps);
+
+                ggml_vec_scale_f32(ne00, y, scale);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_norm(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_norm_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_group_rms_norm
+
+static void ggml_compute_forward_rms_norm_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    GGML_ASSERT(eps > 0.0f);
+
+    // TODO: optimize
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                ggml_float sum = 0.0;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sum += (ggml_float)(x[i00] * x[i00]);
+                }
+
+                const float mean = sum/ne00;
+
+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                memcpy(y, x, ne00 * sizeof(float));
+                // for (int i00 = 0; i00 < ne00; i00++) {
+                //     y[i00] = x[i00];
+                // }
+
+                const float scale = 1.0f/sqrtf(mean + eps);
+
+                ggml_vec_scale_f32(ne00, y, scale);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rms_norm(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rms_norm_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_rms_norm_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    // TODO: optimize
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                // src1 is same shape as src0 => same indices
+                const int64_t i11 = i01;
+                const int64_t i12 = i02;
+                const int64_t i13 = i03;
+
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
+
+                ggml_float sum_xx  = 0.0;
+                ggml_float sum_xdz = 0.0;
+
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sum_xx  += (ggml_float)(x[i00] * x[i00]);
+                    sum_xdz += (ggml_float)(x[i00] * dz[i00]);
+                }
+
+                //const float mean     = (float)(sum_xx)/ne00;
+                const float mean_eps = (float)(sum_xx)/ne00 + eps;
+                const float sum_eps  = (float)(sum_xx) + eps*ne00;
+                //const float mean_xdz = (float)(sum_xdz)/ne00;
+                // we could cache rms from forward pass to improve performance.
+                // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
+                //const float rms      = sqrtf(mean_eps);
+                const float rrms     = 1.0f / sqrtf(mean_eps);
+                //const float scale    = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
+
+                {
+                    // z = rms_norm(x)
+                    //
+                    // rms_norm(src0) =
+                    //     scale(
+                    //         src0,
+                    //         div(
+                    //             1,
+                    //             sqrt(
+                    //                 add(
+                    //                     scale(
+                    //                         sum(
+                    //                             sqr(
+                    //                                 src0)),
+                    //                         (1.0/N)),
+                    //                     eps))));
+
+                    // postorder:
+                    // ## op    args         grad
+                    // 00 param src0         grad[#00]
+                    // 01 const 1
+                    // 02 sqr   (#00)        grad[#02]
+                    // 03 sum   (#02)        grad[#03]
+                    // 04 const 1/N
+                    // 05 scale (#03, #04)   grad[#05]
+                    // 06 const eps
+                    // 07 add   (#05, #06)   grad[#07]
+                    // 08 sqrt  (#07)        grad[#08]
+                    // 09 div   (#01,#08)    grad[#09]
+                    // 10 scale (#00,#09)    grad[#10]
+                    //
+                    // backward pass, given grad[#10]
+                    // #10: scale
+                    // grad[#00] += scale(grad[#10],#09)
+                    // grad[#09] += sum(mul(grad[#10],#00))
+                    // #09: div
+                    // grad[#08] += neg(mul(grad[#09], div(#09,#08)))
+                    // #08: sqrt
+                    // grad[#07] += mul(grad[#08], div(0.5, #08))
+                    // #07: add
+                    // grad[#05] += grad[#07]
+                    // #05: scale
+                    // grad[#03] += scale(grad[#05],#04)
+                    // #03: sum
+                    // grad[#02] += repeat(grad[#03], #02)
+                    // #02:
+                    // grad[#00] += scale(mul(#00, grad[#02]), 2.0)
+                    //
+                    // substitute and simplify:
+                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
+                    // grad[#02] = repeat(grad[#03], #02)
+                    // grad[#02] = repeat(scale(grad[#05],#04), #02)
+                    // grad[#02] = repeat(scale(grad[#07],#04), #02)
+                    // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
+                    // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
+                    // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
+                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
+                    // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
+                    // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
+                    // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
+                    // a = b*c + d*e
+                    // a = b*c*f/f + d*e*f/f
+                    // a = (b*c*f + d*e*f)*(1/f)
+                    // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
+                    // a = (b + d*e/c)*c
+                    // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
+                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
+                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
+                    // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
+                    // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
+                    // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
+                    // a = (dz + x*div(-mean_xdz,mean_eps))*rrms
+                    // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
+                    // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+                    // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+                }
+                // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+                // post-order:
+                // dx := x
+                // dx := scale(dx,-mean_xdz/mean_eps)
+                // dx := add(dx, dz)
+                // dx := scale(dx, rrms)
+                float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                ggml_vec_cpy_f32  (ne00, dx, x);
+                // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
+                ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
+                ggml_vec_acc_f32  (ne00, dx, dz);
+                ggml_vec_scale_f32(ne00, dx, rrms);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rms_norm_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rms_norm_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_group_norm
+
+static void ggml_compute_forward_group_norm_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // TODO: optimize
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+    int n_channels = src0->ne[2];
+    int n_groups = dst->op_params[0];
+    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i += nth) {
+        int start = i * n_channels_per_group;
+        int end = start + n_channels_per_group;
+        if (end > n_channels) {
+            end = n_channels;
+        }
+        int step = end - start;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            ggml_float sum = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    ggml_float sumr = 0.0;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sumr += (ggml_float)x[i00];
+                    }
+                    sum += sumr;
+                }
+            }
+            const float mean = sum / (ne00 * ne01 * step);
+
+            ggml_float sum2 = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+                    ggml_float sumr = 0.0;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        float v = x[i00] - mean;
+                        y[i00] = v;
+                        sumr += (ggml_float)(v * v);
+                    }
+                    sum2 += sumr;
+                }
+            }
+            const float variance = sum2 / (ne00 * ne01 * step);
+            const float scale = 1.0f / sqrtf(variance + eps);
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_group_norm(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_group_norm_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_mul_mat
+
+static void ggml_compute_forward_mul_mat_one_chunk(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const int64_t num_rows_per_vec_dot,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t const vec_dot      = type_traits_cpu[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int64_t i13 = (ir1 / (ne12 * ne1));
+                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type;
+    ggml_from_float_t        const from_float           = ggml_get_type_traits(vec_dot_type)->from_float;
+    ggml_from_float_to_mat_t const from_float_to_mat    = type_traits_cpu[vec_dot_type].from_float_to_mat;
+    int64_t                  const vec_dot_num_rows     = type_traits_cpu[type].nrows;
+    int64_t                  const matmul_num_cols      = type_traits_cpu[type].ncols;
+    int64_t                  const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave;
+    ggml_gemv_t              const gemv                 = type_traits_cpu[type].gemv;
+    ggml_gemm_t              const gemm                 = type_traits_cpu[type].gemm;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+#if GGML_USE_LLAMAFILE
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    if (src1_cont) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
+    if (src1->type != vec_dot_type) {
+        char * wdata = params->wdata;
+
+        const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+        const size_t nbw2 = nbw1*ne11;
+        const size_t nbw3 = nbw2*ne12;
+
+        assert(params->wsize >= ne13*nbw3);
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                int64_t i11_processed = 0;
+                if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
+                    for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+                        from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                                          (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                          4, ne10, blck_size_interleave);
+                    }
+                    i11_processed = ne11 - ne11 % 4;
+                }
+                for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                           (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                           ne10);
+                }
+            }
+        }
+    }
+
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+    }
+
+    ggml_barrier(params->threadpool);
+
+#if GGML_USE_LLAMAFILE
+    if (src1->type != vec_dot_type) {
+        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    const int64_t nr0 = ne0;
+
+    // This is the size of the rest of the dimensions of the result
+    const int64_t nr1 = ne1 * ne2 * ne3;
+
+    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
+    // TODO: currently the mmla kernels support only even numbered rows/cols.
+    // this check can be removed once they are extended to support odd numbered rows/cols too
+    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
+        num_rows_per_vec_dot = 1;
+    }
+
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;
+
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }
+
+    // distribute the work across the inner or outer loop based on which one is larger
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
+    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
+
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
+    //   Also, chunking by thread was measured to have perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+        // distribute the thread work across the inner or outer loop based on which one is larger
+        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+    }
+
+    // The number of elements in each chunk
+    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
+
+    if ((ggml_n_dims(src0) == 2) && gemv) {
+        const void * src1_wdata      = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
+        int64_t src0_start = (ith * ne01) / nth;
+        int64_t src0_end   = ((ith + 1) * ne01) / nth;
+        src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
+        src0_end   = (src0_end   % matmul_num_cols) ? src0_end   + matmul_num_cols - (src0_end   % matmul_num_cols): src0_end;
+        if (src0_start >= src0_end) return;
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (gemm && (ne11 > 3)) {
+            gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
+                 (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
+            gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+                 (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                 src0_end - src0_start);
+        }
+        return;
+    }
+
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
+
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int64_t ith0 = current_chunk % nchunk0;
+        const int64_t ith1 = current_chunk / nchunk0;
+
+        const int64_t ir0_start = dr0 * ith0;
+        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
+
+        const int64_t ir1_start = dr1 * ith1;
+        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+        if (nth >= nchunk0 * nchunk1) {
+            break;
+        }
+
+        current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
+    }
+}
+
+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * ids = dst->src[2];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t    const vec_dot         = type_traits_cpu[type].vec_dot;
+    enum ggml_type    const vec_dot_type    = type_traits_cpu[type].vec_dot_type;
+    ggml_from_float_t const from_float      = ggml_get_type_traits(vec_dot_type)->from_float;
+    int64_t           const matmul_num_cols = type_traits_cpu[type].ncols;
+    ggml_gemv_t       const gemv            = type_traits_cpu[type].gemv;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // row groups
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert
+
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
+
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
+
+    if (src1->type != vec_dot_type) {
+        char * wdata = params->wdata;
+
+        const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+        const size_t nbw2 = nbw1*ne11;
+        const size_t nbw3 = nbw2*ne12;
+
+        assert(params->wsize >= ne13*nbw3);
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                               ne10);
+                }
+            }
+        }
+    }
+
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
+    if (ith == 0) {
+        // initialize matrix_row_counts
+        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+        // group rows by src0 matrix
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);
+
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
+        }
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // compute each matrix multiplication in sequence
+    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
+
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
+
+        if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
+            int64_t src0_cur_start = (ith * ne01) / nth;
+            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
+            src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
+            src0_cur_end   = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
+            if (src0_cur_start >= src0_cur_end) return;
+
+            for (int ir1 = 0; ir1 < nr1; ir1++) {
+                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
+                const int id       = row_mapping.i1; // selected expert index
+
+                const int64_t  i11 = id % ne11;
+                const int64_t  i12 = row_mapping.i2; // row index in src1
+
+                const int64_t  i1 = id;  // selected expert index
+                const int64_t  i2 = i12; // row
+
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                    ? (i11        + i12 * ne11) * row_size
+                    : (i11 * nb11 + i12 * nb12));
+
+                gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
+                     (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
+            }
+            continue;
+        }
+
+        // distribute the thread work across the inner or outer loop based on which one is larger
+
+        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+        const int64_t ith0 = ith % nth0;
+        const int64_t ith1 = ith / nth0;
+
+        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+        const int64_t ir010 = dr0*ith0;
+        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+        const int64_t ir110 = dr1*ith1;
+        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+        // threads with no work simply yield (not sure if it helps)
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}
+
+        // block-tiling attempt
+        const int64_t blck_0 = 16;
+        const int64_t blck_1 = 16;
+
+        // attempt to reduce false-sharing (does not seem to make a difference)
+        float tmp[16];
+
+        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                    const int64_t _i12 = ir1; // logical row index for this expert
+
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index
+
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1
+
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row
+
+                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                    //       the original src1 data pointer, so we should index using the indices directly
+                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                    const char * src1_col = (const char *) wdata +
+                        (src1_cont || src1->type != vec_dot_type
+                        ? (i11      + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
+
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+
+                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                    //}
+
+                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
+                    }
+
+                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+                }
+            }
+        }
+    }
+
+#undef MMID_MATRIX_ROW
+}
+
+// ggml_compute_forward_out_prod
+
+static void ggml_compute_forward_out_prod_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_ASSERT(ne0  == ne00);
+    GGML_ASSERT(ne1  == ne10);
+    GGML_ASSERT(ne2  == ne02);
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne3  == ne13);
+    GGML_ASSERT(ne03 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    // GGML_ASSERT(nb0 <= nb1);
+    // GGML_ASSERT(nb1 <= nb2);
+    // GGML_ASSERT(nb2 <= nb3);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+    if (ith == 0) {
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+    }
+    ggml_barrier(params->threadpool);
+
+    // dst[:,:,:,:] = 0
+    // for i2,i3:
+    //   for i1:
+    //     for i01:
+    //       for i0:
+    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
+
+    // parallelize by last three dimensions
+
+    // total rows in dst
+    const int64_t nr = ne1*ne2*ne3;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    // block-tiling attempt
+    const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
+    const int64_t blck_1 = 16;
+
+    for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
+        const int64_t bir1 = MIN(bir + blck_1, ir1);
+        for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
+            const int64_t bne01 = MIN(bi01 + blck_0, ne01);
+            for (int64_t ir = bir; ir < bir1; ++ir) {
+                // dst indices
+                const int64_t i3 = ir/(ne2*ne1);
+                const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
+                const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                const int64_t i02 = i2;
+                const int64_t i03 = i3;
+
+                //const int64_t i10 = i1;
+                const int64_t i12 = i2;
+                const int64_t i13 = i3;
+
+#if GGML_VEC_MAD_UNROLL > 2
+                const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL);
+                for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) {
+                    const int64_t i11 = i01;
+
+                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
+
+                    ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
+                }
+                for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) {
+                    const int64_t i11 = i01;
+
+                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
+
+                    ggml_vec_mad_f32(ne0, d, s0, *s1);
+                }
+#else
+                for (int64_t i01 = bi01; i01 < bne01; ++i01) {
+                    const int64_t i11 = i01;
+
+                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
+
+                    ggml_vec_mad_f32(ne0, d, s0, *s1);
+                }
+#endif
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_out_prod_q_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2  == ne12);
+    GGML_ASSERT(ne3  == ne13);
+
+    // we don't support permuted src0 dim0
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+
+    // dst dim0 cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    // GGML_ASSERT(nb0 <= nb1);
+    // GGML_ASSERT(nb1 <= nb2);
+    // GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+    if (ith == 0) {
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+    }
+    ggml_barrier(params->threadpool);
+
+    // parallelize by last three dimensions
+
+    // total rows in dst
+    const int64_t nr = ne1*ne2*ne3;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    // dst[:,:,:,:] = 0
+    // for i2,i3:
+    //   for i1:
+    //     for i01:
+    //       for i0:
+    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
+
+    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        // dst indices
+        const int64_t i3 = ir/(ne2*ne1);
+        const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
+        const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        const int64_t i02 = i2;
+        const int64_t i03 = i3;
+
+        //const int64_t i10 = i1;
+        const int64_t i12 = i2;
+        const int64_t i13 = i3;
+
+        for (int64_t i01 = 0; i01 < ne01; ++i01) {
+            const int64_t i11 = i01;
+
+            float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+            float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+            float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
+
+            dequantize_row_q(s0, wdata, ne0);
+            ggml_vec_mad_f32(ne0, d, wdata, *s1);
+        }
+    }
+}
+
+static void ggml_compute_forward_out_prod(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+            {
+                ggml_compute_forward_out_prod_q_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                GGML_ABORT("fatal error"); // todo
+                // ggml_compute_forward_out_prod_f16_f32(params, dst);
+            }
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_out_prod_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_scale
+
+static void ggml_compute_forward_scale_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    // scale factor
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb1 = dst->nb[1];
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        if (dst->data != src0->data) {
+            // src0 is same shape as dst => same indices
+            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+        }
+        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
+    }
+}
+
+static void ggml_compute_forward_scale(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_scale_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_set
+
+static void ggml_compute_forward_set_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset inbytes during set
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        if (params->ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during set
+    const size_t nb0 = ggml_element_size(src0);
+
+    const int im0 = (ne10 == 0 ? 0 : ne10-1);
+    const int im1 = (ne11 == 0 ? 0 : ne11-1);
+    const int im2 = (ne12 == 0 ? 0 : ne12-1);
+    const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+    GGML_ASSERT(offset + im0*nb0  + im1*nb1  + im2*nb2  + im3*nb3  <= ggml_nbytes(dst));
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+    }
+}
+
+static void ggml_compute_forward_set(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_cpy
+
+static void ggml_compute_forward_cpy(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, dst);
+}
+
+// ggml_compute_forward_cont
+
+static void ggml_compute_forward_cont(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, dst);
+}
+
+// ggml_compute_forward_reshape
+
+static void ggml_compute_forward_reshape(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_view
+
+static void ggml_compute_forward_view(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_permute
+
+static void ggml_compute_forward_permute(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_transpose
+
+static void ggml_compute_forward_transpose(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_get_rows
+
+static void ggml_compute_forward_get_rows_q(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == ggml_type_size(type));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        dequantize_row_q(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
+static void ggml_compute_forward_get_rows_f16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_fp16_t));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_fp16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
+static void ggml_compute_forward_get_rows_bf16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_bf16_t));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
+static void ggml_compute_forward_get_rows_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(float));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+    }
+}
+
+static void ggml_compute_forward_get_rows(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+            {
+                ggml_compute_forward_get_rows_q(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_get_rows_f16(params, dst);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_get_rows_bf16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_get_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    //static bool first = true;
+    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
+    //if (first) {
+    //    first = false;
+    //} else {
+    //    for (int k = 0; k < dst->ne[1]; ++k) {
+    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
+    //            for (int i = 0; i < 16; ++i) {
+    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
+    //            }
+    //            printf("\n");
+    //        }
+    //        printf("\n");
+    //    }
+    //    printf("\n");
+    //    exit(0);
+    //}
+}
+
+// ggml_compute_forward_get_rows_back
+
+static void ggml_compute_forward_get_rows_back_f32_f16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
+
+    memset(dst->data, 0, ggml_nbytes(dst));
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nelements(src1);
+
+    GGML_ASSERT( dst->ne[0] == nc);
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < nr; ++i) {
+        const int r = ((int32_t *) src1->data)[i];
+
+        for (int j = 0; j < nc; ++j) {
+            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
+            ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
+        }
+    }
+}
+
+static void ggml_compute_forward_get_rows_back_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
+
+    memset(dst->data, 0, ggml_nbytes(dst));
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nelements(src1);
+
+    GGML_ASSERT( dst->ne[0] == nc);
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < nr; ++i) {
+        const int r = ((int32_t *) src1->data)[i];
+
+        ggml_vec_add_f32(nc,
+                (float *) ((char *)  dst->data + r*dst->nb[1]),
+                (float *) ((char *)  dst->data + r*dst->nb[1]),
+                (float *) ((char *) src0->data + i*src0->nb[1]));
+    }
+}
+
+static void ggml_compute_forward_get_rows_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_get_rows_back_f32_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_get_rows_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    //static bool first = true;
+    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
+    //if (first) {
+    //    first = false;
+    //} else {
+    //    for (int k = 0; k < dst->ne[1]; ++k) {
+    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
+    //            for (int i = 0; i < 16; ++i) {
+    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
+    //            }
+    //            printf("\n");
+    //        }
+    //        printf("\n");
+    //    }
+    //    printf("\n");
+    //    exit(0);
+    //}
+}
+
+// ggml_compute_forward_diag
+
+static void ggml_compute_forward_diag_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    // TODO: handle transposed/permuted matrices
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(ne00 == ne0);
+    GGML_ASSERT(ne00 == ne1);
+    GGML_ASSERT(ne01 == 1);
+    GGML_ASSERT(ne02 == ne2);
+    GGML_ASSERT(ne03 == ne3);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = 0; i2 < ne2; i2++) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                float * d = (float *)((char *)  dst->data + i3*nb3  + i2*nb2 + i1*nb1);
+                float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
+                for (int i0 = 0; i0 < i1; i0++) {
+                    d[i0] = 0;
+                }
+                d[i1] = s[i1];
+                for (int i0 = i1+1; i0 < ne0; i0++) {
+                    d[i0] = 0;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_diag(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_diag_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_diag_mask_inf
+
+static void ggml_compute_forward_diag_mask_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const float value) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int  n_past  = ((int32_t *) dst->op_params)[0];
+    const bool inplace = src0->data == dst->data;
+
+    GGML_ASSERT(n_past >= 0);
+
+    if (!inplace) {
+        if (ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+            GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    // TODO: handle transposed/permuted matrices
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+    const int nr = src0->ne[1];
+    const int nz = n/nr;
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int k = 0; k < nz; k++) {
+        for (int j = ith; j < nr; j += nth) {
+            for (int i = n_past; i < nc; i++) {
+                if (i > n_past + j) {
+                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_diag_mask_inf(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_diag_mask_zero(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_diag_mask_f32(params, dst, 0);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_soft_max
+
+static void ggml_compute_forward_soft_max_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    // TODO: handle transposed/permuted matrices
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
+    // TODO: is this supposed to be ceil instead of floor?
+    //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
+    const uint32_t n_head      = ne02;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        // ALiBi
+        const uint32_t h = (i1/ne01)%ne02; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp_f32) {
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*mp_f32[i];
+                }
+            }
+        }
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(wp[i]));
+        }
+#endif
+
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, wp);
+
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
+        assert(sum > 0.0);
+
+        sum = 1.0/sum;
+        ggml_vec_scale_f32(nc, dp, sum);
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            assert(!isnan(dp[i]));
+            assert(!isinf(dp[i]));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_soft_max(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_soft_max_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_soft_max_back
+
+static void ggml_compute_forward_soft_max_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_are_same_shape(src1, dst));
+
+    // TODO: handle transposed/permuted matrices
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float *dy = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float *y  = (float *)((char *) src1->data + i1*src1->nb[1]);
+        float *dx = (float *)((char *) dst->data  + i1*dst->nb[1]);
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(dy[i]));
+            assert(!isnan(y[i]));
+        }
+#endif
+        // Jii = yi - yi*yi
+        // Jij = -yi*yj
+        // J = diag(y)-y.T*y
+        // dx = J * dy
+        // dxk = sum_i(Jki * dyi)
+        // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+        // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
+        // dxk = sum_i(-yk*yi * dyi) + yk*dyk
+        // dxk = -yk * sum_i(yi * dyi) + yk*dyk
+        // dxk = -yk * dot(y, dy) + yk*dyk
+        // dxk = yk * (- dot(y, dy) + dyk)
+        // dxk = yk * (dyk - dot(y, dy))
+        //
+        // post-order:
+        // dot_y_dy := dot(y, dy)
+        // dx := dy
+        // dx := dx - dot_y_dy
+        // dx := dx * y
+
+        // linear runtime, no additional memory
+        float dot_y_dy = 0;
+        ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
+        ggml_vec_cpy_f32 (nc, dx, dy);
+        ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
+        ggml_vec_mul_f32 (nc, dx, dx, y);
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            assert(!isnan(dx[i]));
+            assert(!isinf(dx[i]));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_soft_max_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_soft_max_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_clamp
+
+static void ggml_compute_forward_clamp_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    float min;
+    float max;
+    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    for (int j = ith; j < n; j += nth) {
+        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
+        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
+
+        for (int i = 0; i < nc; i++) {
+            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
+        }
+    }
+}
+
+static void ggml_compute_forward_clamp(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_clamp_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q8_K:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_rope
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+    return 1 - MIN(1, MAX(0, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+}
+
+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
+void ggml_rope_yarn_corr_dims(
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
+    dims[0] = MAX(0, start);
+    dims[1] = MIN(n_dims - 1, end);
+}
+
+static void ggml_compute_forward_rope_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const bool forward) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
+
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+
+    //const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims     = ((int32_t *) dst->op_params)[1];
+    const int mode       = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+
+    GGML_ASSERT(n_dims <= ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+    const float * freq_factors = NULL;
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
+    }
+
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[1];
+
+                        dst_data[0] = x0*cos_theta - x1*sin_theta;
+                        dst_data[1] = x0*sin_theta + x1*cos_theta;
+                    }
+                } else {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const int64_t ic = i0/2;
+
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[n_dims/2];
+
+                        dst_data[0]        = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                    }
+                }
+
+                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
+                }
+            }
+        }
+    }
+}
+
+// TODO: deduplicate f16/f32 code
+static void ggml_compute_forward_rope_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const bool forward) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
+
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+
+    //const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims     = ((int32_t *) dst->op_params)[1];
+    const int mode       = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+
+    GGML_ASSERT(n_dims <= ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+    const float * freq_factors = NULL;
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
+    }
+
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[1]);
+
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                } else {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const int64_t ic = i0/2;
+
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+
+                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                }
+
+                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rope(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_rope_f16(params, dst, true);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rope_f32(params, dst, true);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_rope_back
+
+static void ggml_compute_forward_rope_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_rope_f16(params, dst, false);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rope_f32(params, dst, false);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_conv_transpose_1d
+
+static void ggml_compute_forward_conv_transpose_1d_f16_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (ith == 0) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (L x Cin) to (Cin x L)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            ggml_fp16_t * dst_data = wdata;
+
+            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                const float * const src = (float *)((char *) src1->data + i11*nb11);
+                for (int64_t i10 = 0; i10 < ne10; i10++) {
+                    dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
+                }
+            }
+        }
+
+        // need to zero dst since we are accumulating into it
+        memset(dst->data, 0, ggml_nbytes(dst));
+    }
+    ggml_barrier(params->threadpool);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    // total rows in dst
+    const int nr = ne1;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = wdata + nk;
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * dst_data = (float *)((char *) dst->data + i1*nb1);
+        ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
+        for (int i10 = 0; i10 < ne10; i10++) {
+            const int i1n = i10*ne11;
+            for (int i00 = 0; i00 < ne00; i00++) {
+                float v = 0;
+                ggml_vec_dot_f16(ne02, &v, 0,
+                        (ggml_fp16_t *)    wdata_src + i1n, 0,
+                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
+                dst_data[i10*s0 + i00] += v;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_transpose_1d_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02;
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (ith == 0) {
+        memset(params->wdata, 0, params->wsize);
+
+        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            float * const wdata = (float *) params->wdata + 0;
+
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    float * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
+
+        // prepare source data (src1)
+        {
+            float * const wdata = (float *) params->wdata + nk;
+            float * dst_data = wdata;
+
+            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                const float * const src = (float *)((char *) src1->data + i11*nb11);
+                for (int64_t i10 = 0; i10 < ne10; i10++) {
+                    dst_data[i10*ne11 + i11] = src[i10];
+                }
+            }
+        }
+
+        // need to zero dst since we are accumulating into it
+        memset(dst->data, 0, ggml_nbytes(dst));
+    }
+    ggml_barrier(params->threadpool);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    // total rows in dst
+    const int nr = ne1;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * const wdata     = (float *) params->wdata + 0;
+    float * const wdata_src = wdata + nk;
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * dst_data = (float *)((char *) dst->data + i1*nb1);
+        float * wdata_kernel = wdata + i1*ne02*ne00;
+        for (int i10 = 0; i10 < ne10; i10++) {
+            const int i1n = i10*ne11;
+            for (int i00 = 0; i00 < ne00; i00++) {
+                float v = 0;
+                ggml_vec_dot_f32(ne02, &v, 0,
+                        wdata_src + i1n, 0,
+                        wdata_kernel + i00*ne02, 0, 1);
+                dst_data[i10*s0 + i00] += v;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_transpose_1d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_transpose_1d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_im2col_f32
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst:  result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                        // micro kernel
+                        float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// ggml_compute_forward_im2col_f16
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst:  result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                        // micro kernel
+                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_im2col(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_im2col_back_f32
+
+static void ggml_compute_forward_im2col_back_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne3 : ne2;
+    const int64_t IC = is_2D ? ne2 : ne1;
+    const int64_t IH = is_2D ? ne1 : 1;
+    const int64_t IW = ne0;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne12 : 1;
+    const int64_t OW = ne11;
+
+    int ofs0 = is_2D ? nb3 : nb2;
+    int ofs1 = is_2D ? nb2 : nb1;
+
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iic = ith; iic < IC; iic += nth) {
+                for (int64_t iih = 0; iih < IH; iih++) {
+                    for (int64_t iiw = 0; iiw < IW; iiw++) {
+
+                        // micro kernel
+                        float grad = 0.0f;
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                // For s0 > 1 some values were skipped over in the forward pass.
+                                // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well.
+                                const int64_t tmpw = (iiw + p0 - ikw*d0);
+                                if (tmpw % s0 != 0) {
+                                    continue;
+                                }
+                                const int64_t iow = tmpw / s0;
+
+                                // Equivalent logic as above except for s1.
+                                int64_t ioh;
+                                if (is_2D) {
+                                    const int64_t tmph = iih + p1 - ikh*d1;
+
+                                    if (tmph % s1 != 0) {
+                                        continue;
+                                    }
+
+                                    ioh = tmph / s1;
+                                } else {
+                                    ioh = 0;
+                                }
+
+                                if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) {
+                                    continue;
+                                }
+
+                                const float * const src_data = (const float *) src1->data
+                                    + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                                grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
+                            }
+                        }
+                        float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
+                        dst_data[iih*IW + iiw] = grad;
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02*ne03;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (ith == 0) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+                        }
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            for (int i12 = 0; i12 < ne12; i12++) {
+                for (int i11 = 0; i11 < ne11; i11++) {
+                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    for (int i10 = 0; i10 < ne10; i10++) {
+                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+                    }
+                }
+            }
+        }
+
+        memset(dst->data, 0, ggml_nbytes(dst));
+    }
+    ggml_barrier(params->threadpool);
+
+    const int32_t stride = ggml_get_op_params_i32(dst, 0);
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = wdata + nk;
+
+    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        for (int i11 = 0; i11 < ne11; i11++) {
+            for (int i10 = 0; i10 < ne10; i10++) {
+                const int i1n = i11*ne10*ne12 + i10*ne12;
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        float v = 0;
+                        ggml_vec_dot_f16(ne03, &v, 0,
+                                wdata_src + i1n, 0,
+                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_pool_1d_sk_p0
+
+static void ggml_compute_forward_pool_1d_sk_p0(
+        const struct ggml_compute_params * params,
+        const enum ggml_op_pool op,
+        const int k,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src = dst->src[0];
+
+    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    const char * cdata = (const char *)src->data;
+    const char * const data_end = cdata + ggml_nbytes(src);
+    float * drow = (float *)dst->data;
+
+    const int64_t rs = dst->ne[0];
+
+    while (cdata < data_end) {
+        const void * srow = (const void *)cdata;
+        int j = 0;
+        for (int64_t i = 0; i < rs; ++i) {
+            switch (op) {
+                case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
+                case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
+                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+            }
+            for (int ki = 0; ki < k; ++ki) {
+                const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
+                switch (op) {
+                    case GGML_OP_POOL_AVG:                         drow[i] += srow_j; break;
+                    case GGML_OP_POOL_MAX:   if (srow_j > drow[i]) drow[i]  = srow_j; break;
+                    case GGML_OP_POOL_COUNT:                       GGML_ABORT("fatal error");
+                }
+                ++j;
+            }
+            switch (op) {
+                case GGML_OP_POOL_AVG:         drow[i] /= k; break;
+                case GGML_OP_POOL_MAX:                       break;
+                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+            }
+        }
+
+        cdata += src->nb[1];
+        drow  += rs;
+    }
+}
+
+// ggml_compute_forward_pool_1d
+
+static void ggml_compute_forward_pool_1d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int s0 = opts[2];
+    const int p0 = opts[3];
+    GGML_ASSERT(p0 == 0); // padding not supported
+    GGML_ASSERT(k0 == s0); // only s = k supported
+
+    ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
+}
+
+// ggml_compute_forward_pool_2d
+
+static void ggml_compute_forward_pool_2d(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src = dst->src[0];
+
+    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+    const char * cdata = (const char*)src->data;
+    const char * const data_end = cdata + ggml_nbytes(src);
+
+    const int64_t px = dst->ne[0];
+    const int64_t py = dst->ne[1];
+    const int64_t pa = px * py;
+
+    float * dplane = (float *)dst->data;
+
+    const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
+
+    while (cdata < data_end) {
+        for (int oy = 0; oy < py; ++oy) {
+            float * const drow = dplane + oy * px;
+            for (int ox = 0; ox < px; ++ox) {
+                float * const out =  drow + ox;
+                switch (op) {
+                    case GGML_OP_POOL_AVG:     *out = 0;        break;
+                    case GGML_OP_POOL_MAX:     *out = -FLT_MAX; break;
+                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+                }
+
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
+
+                for (int ky = 0; ky < k1; ++ky) {
+                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
+                    const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
+                    for (int kx = 0; kx < k0; ++kx) {
+                        int j = ix + kx;
+                        if (j < 0 || j >= src->ne[0]) continue;
+                        const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
+                        switch (op) {
+                            case GGML_OP_POOL_AVG:                     *out += srow_j; break;
+                            case GGML_OP_POOL_MAX: if (srow_j > *out)  *out  = srow_j; break;
+                            case GGML_OP_POOL_COUNT:               GGML_ABORT("fatal error");
+                        }
+                    }
+                }
+                switch (op) {
+                    case GGML_OP_POOL_AVG:           *out /= ka; break;
+                    case GGML_OP_POOL_MAX:                       break;
+                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+                }
+            }
+        }
+
+        cdata  += src->nb[2];
+        dplane += pa;
+    }
+}
+
+// ggml_compute_forward_pool_2d_back
+
+static void ggml_compute_forward_pool_2d_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src  = dst->src[0];
+    const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst
+
+    assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    char       * cdata  = (char       *) dst->data;
+    const char * cdataf = (const char *) dstf->data;
+    const char * const data_end = cdata + ggml_nbytes(dst);
+
+    GGML_ASSERT(params->ith == 0);
+    memset(cdata, 0, ggml_nbytes(dst));
+
+    const int64_t px = src->ne[0];
+    const int64_t py = src->ne[1];
+    const int64_t pa = px * py;
+
+    const float * splane = (const float *) src->data;
+
+    const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
+
+    while (cdata < data_end) {
+        for (int oy = 0; oy < py; ++oy) {
+            const float * const srow = splane + oy * px;
+            for (int ox = 0; ox < px; ++ox) {
+                const float grad0 = srow[ox];
+
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
+
+                if (op == GGML_OP_POOL_MAX) {
+                    float maxval = -FLT_MAX;
+                    int kxmax = -1;
+                    int kymax = -1;
+
+                    for (int ky = 0; ky < k1; ++ky) {
+                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
+                            continue;
+                        }
+                        const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky));
+                        for (int kx = 0; kx < k0; ++kx) {
+                            int j = ix + kx;
+                            if (j < 0 || j >= dst->ne[0]) {
+                                continue;
+                            }
+
+                            const float val = dst->type == GGML_TYPE_F32 ?
+                                ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
+                            if (val <= maxval) {
+                                continue;
+                            }
+
+                            maxval = val;
+                            kxmax = kx;
+                            kymax = ky;
+                        }
+                    }
+
+                    if (kxmax == -1 || kymax == -1) {
+                        continue;
+                    }
+
+                    void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax));
+                    const int j = ix + kxmax;
+                    if (dst->type == GGML_TYPE_F32) {
+                        ((float *) drow)[j] += grad0;
+                    } else {
+                        ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
+                    }
+                } else if (op == GGML_OP_POOL_AVG) {
+                    const float grad = grad0 / ka;
+
+                    for (int ky = 0; ky < k1; ++ky) {
+                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
+                            continue;
+                        }
+                        void * drow = (void *)(cdata + dst->nb[1] * (iy + ky));
+                        for (int kx = 0; kx < k0; ++kx) {
+                            int j = ix + kx;
+                            if (j < 0 || j >= dst->ne[0]) {
+                                continue;
+                            }
+
+                            if (dst->type == GGML_TYPE_F32) {
+                                ((float *) drow)[j] += grad;
+                            } else {
+                                ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
+                            }
+                        }
+                    }
+                } else {
+                    GGML_ASSERT(false);
+                }
+            }
+        }
+
+        cdata  += dst->nb[2];
+        cdataf += dst->nb[2];
+        splane += pa;
+    }
+}
+
+// ggml_compute_forward_upscale
+
+static void ggml_compute_forward_upscale_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];
+
+    // TODO: optimize
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        const int64_t i03 = i3 / sf3;
+        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+            const int64_t i02 = i2 / sf2;
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                const int64_t i01 = i1 / sf1;
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    const int64_t i00 = i0 / sf0;
+
+                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_upscale(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_upscale_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_pad
+
+static void ggml_compute_forward_pad_f32(
+    const struct ggml_compute_params * params,
+          struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    } else {
+                        dst_ptr[dst_idx] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_pad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_pad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_arange
+
+static void ggml_compute_forward_arange_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float start = ggml_get_op_params_f32(dst, 0);
+    const float stop  = ggml_get_op_params_f32(dst, 1);
+    const float step  = ggml_get_op_params_f32(dst, 2);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    for (int64_t i = ith; i < steps; i+= nth) {
+        float value = start + step * i;
+        ((float *)dst->data)[i] = value;
+    }
+}
+
+static void ggml_compute_forward_arange(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_arange_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_timestep_embedding_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int dim = ggml_get_op_params_i32(dst, 0);
+    const int max_period = ggml_get_op_params_i32(dst, 1);
+
+    int half = dim / 2;
+
+    for (int64_t i = 0; i < ne00; i++) {
+        float * embed_data = (float *)((char *)  dst->data +  i*nb1);
+        for (int64_t j = ith; j < half; j += nth) {
+            float timestep = ((float *)src0->data)[i];
+            float freq = (float)expf(-logf(max_period) * j / half);
+            float arg = timestep * freq;
+            embed_data[j] = cosf(arg);
+            embed_data[j + half] = sinf(arg);
+        }
+        if (dim % 2 != 0 && ith == 0) {
+            embed_data[dim] = 0.f;
+        }
+    }
+}
+
+static void ggml_compute_forward_timestep_embedding(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_timestep_embedding_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_argsort
+
+static void ggml_compute_forward_argsort_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_argsort(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argsort_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;
+
+    memcpy(&scale,         (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0) {
+        scale /= logit_softcap;
+    }
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    enum ggml_type    const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
+    ggml_from_float_t const q_to_vec_dot   = ggml_get_type_traits(k_vec_dot_type)->from_float;
+    ggml_vec_dot_t    const kq_vec_dot     = type_traits_cpu[k->type].vec_dot;
+    ggml_to_float_t   const v_to_float     = ggml_get_type_traits(v->type)->to_float;
+
+    GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+    GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        const uint32_t h = iq2; // head index
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float S = 0.0f;      // sum
+        float M = -INFINITY; // maximum KQ value
+
+        float       * VKQ32 = (float       *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator
+        float       * V32   =                 (VKQ32 + 1*D); // (temporary) FP32 V buffer
+        ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator
+        ggml_fp16_t * Q_q   = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16
+
+        if (v->type == GGML_TYPE_F16) {
+            memset(VKQ16, 0, D*sizeof(ggml_fp16_t));
+        } else {
+            memset(VKQ32, 0, D*sizeof(float));
+        }
+
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+        q_to_vec_dot(pq, Q_q, D);
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s; // KQ value
+
+            const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
+            kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);
+
+            s = s*scale; // scale KQ value
+
+            if (logit_softcap != 0.0f) {
+                s = logit_softcap*tanhf(s);
+            }
+
+            s += mv; // apply mask
+
+            const float Mold = M;
+
+            float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
+            float vs = 1.0f; // post-softmax KQ value, expf(s - M)
+
+            const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            if (v->type == GGML_TYPE_F16) {
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
+
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f16(D, VKQ16, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
+
+                // V += v*expf(s - M)
+                ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs);
+            } else {
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
+
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f32(D, VKQ32, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
+
+                v_to_float(v_data, V32, D);
+
+                // V += v*expf(s - M)
+                ggml_vec_mad_f32(D, VKQ32, V32, vs);
+            }
+
+            S = S*ms + vs; // scale and increment sum with partial sum
+        }
+
+        if (v->type == GGML_TYPE_F16) {
+            for (int64_t d = 0; d < D; ++d) {
+                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
+            }
+        }
+
+        // V /= S
+        const float S_inv = 1.0f/S;
+        ggml_vec_scale_f32(D, VKQ32, S_inv);
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (dst->op_params[3]) {
+        case GGML_PREC_DEFAULT:
+        case GGML_PREC_F32:
+            {
+                // uses F32 accumulators
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_flash_attn_back
+
+static void ggml_compute_forward_flash_attn_back_f32(
+        const struct ggml_compute_params * params,
+        const bool masked,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * q = dst->src[0];
+    const struct ggml_tensor * k = dst->src[1];
+    const struct ggml_tensor * v = dst->src[2];
+    const struct ggml_tensor * d = dst->src[3];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ned, d,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbd, d,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+    const int64_t P = nek1 - N;
+    const int64_t M = P + N;
+
+    const int Mup  = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+    const int mxDM = MAX(D, Mup);
+
+    // GGML_ASSERT(ne0 == D);
+    // GGML_ASSERT(ne1 == N);
+    GGML_ASSERT(P >= 0);
+
+    GGML_ASSERT(nbq0 == sizeof(float));
+    GGML_ASSERT(nbk0 == sizeof(float));
+    GGML_ASSERT(nbv0 == sizeof(float));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev1 == D);
+    GGML_ASSERT(ned0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nek1 == N + P);
+    GGML_ASSERT(nev1 == D);
+    GGML_ASSERT(ned1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    if (ith == 0) {
+        memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
+    }
+    ggml_barrier(params->threadpool);
+
+    const int64_t elem_q = ggml_nelements(q);
+    const int64_t elem_k = ggml_nelements(k);
+
+    enum ggml_type result_type = dst->type;
+    GGML_ASSERT(ggml_blck_size(result_type) == 1);
+    const size_t tsize = ggml_type_size(result_type);
+
+    const size_t offs_q = 0;
+    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
+    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
+
+    void * grad_q = (char *) dst->data;
+    void * grad_k = (char *) dst->data + offs_k;
+    void * grad_v = (char *) dst->data + offs_v;
+
+    const size_t nbgq1 = nb0*neq0;
+    const size_t nbgq2 = nb0*neq0*neq1;
+    const size_t nbgq3 = nb0*neq0*neq1*neq2;
+
+    const size_t nbgk1 = nb0*nek0;
+    const size_t nbgk2 = nb0*nek0*nek1;
+    const size_t nbgk3 = nb0*nek0*nek1*neq2;
+
+    const size_t nbgv1 = nb0*nev0;
+    const size_t nbgv2 = nb0*nev0*nev1;
+    const size_t nbgv3 = nb0*nev0*nev1*neq2;
+
+    // parallelize by k rows using ggml_vec_dot_f32
+
+    // total rows in k
+    const int nr = nek2*nek3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const float scale = 1.0f/sqrtf(D);
+
+    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
+
+    // how often k2 (and v2) is repeated in q2
+    int nrep = neq2/nek2;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int ik3 = ir/(nek2);
+        const int ik2 = ir - ik3*nek2;
+
+        const int iq3 = ik3;
+        const int id3 = ik3;
+        const int iv3 = ik3;
+        const int iv2 = ik2;
+
+        for (int irep = 0; irep < nrep; ++irep) {
+            const int iq2 = ik2 + irep*nek2;
+            const int id2 = iq2;
+
+            // (ik2 + irep*nek2) % nek2 == ik2
+            for (int iq1 = 0; iq1 < neq1; ++iq1) {
+                const int id1 = iq1;
+
+                // not sure about CACHE_LINE_SIZE_F32..
+                // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset?
+                float * S  = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32);
+                float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32);
+
+                for (int i = M; i < Mup; ++i) {
+                    S[i] = -INFINITY;
+                }
+
+                const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
+                for (int64_t ic = 0; ic < masked_begin; ++ic) {
+                    // k indices
+                    const int ik1 = ic;
+
+                    // S indices
+                    const int i1 = ik1;
+
+                    ggml_vec_dot_f32(neq0,
+                            S + i1, 0,
+                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
+                }
+
+                // scale
+                ggml_vec_scale_f32(masked_begin, S, scale);
+
+                for (int64_t i = masked_begin; i < M; i++) {
+                    S[i] = -INFINITY;
+                }
+
+                // softmax
+                // exclude known -INF S[..] values from max and loop
+                // dont forget to set their SM values to zero
+                {
+                    float max = -INFINITY;
+                    ggml_vec_max_f32(masked_begin, &max, S);
+
+                    ggml_float sum = 0.0;
+                    {
+#ifdef GGML_SOFT_MAX_ACCELERATE
+                        max = -max;
+                        vDSP_vsadd(SM, 1, &max, SM, 1, Mup);
+                        vvexpf(SM, SM, &Mup);
+                        ggml_vec_sum_f32(Mup, &sum, SM);
+#else
+                        sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
+#endif
+                    }
+
+                    assert(sum > 0.0);
+
+                    sum = 1.0/sum;
+                    ggml_vec_scale_f32(masked_begin, SM, sum);
+
+                }
+
+                // step-by-step explanation
+                {
+                    // forward-process                    shape      grads from backward process
+                    // parallel_for ik2,ik3:
+                    //  for irep:
+                    //   iq2 = ik2 + irep*nek2
+                    //   k[:D,:M,:,:]                     [D,M,:,:]  grad[k][:D,:M,ik2,ik3]  += grad[kcur]
+                    //   q[:D,:N,:,:]                     [D,N,:,:]  grad[q][:D,iq1,iq2,iq3] += grad[qcur]
+                    //   v[:M,:D,:,:]                     [M,D,:,:]  grad[v][:M,:D,iv2,iv3]  += grad[vcur]
+                    //   for iq1:
+                    //    kcur   = k[:D,:M,ik2,ik3]       [D,M,1,1]  grad[kcur] = grad[S1].T @ qcur
+                    //    qcur   = q[:D,iq1,iq2,iq3]      [D,1,1,1]  grad[qcur] = grad[S1]   @ kcur
+                    //    vcur   = v[:M,:D,iv2,iv3]       [M,D,1,1]  grad[vcur] = grad[S5].T @ S4
+                    //    S0     = -Inf                   [D,1,1,1]
+                    //   ~S1[i]  = dot(kcur[:D,i], qcur)
+                    //    S1     = qcur @ kcur.T          [M,1,1,1]  grad[S1]   = grad[S2] * scale
+                    //    S2     = S1 * scale             [M,1,1,1]  grad[S2]   = diag_mask_zero(grad[S3], P)
+                    //    S3     = diag_mask_inf(S2, P)   [M,1,1,1]  grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
+                    //    S4     = softmax(S3)            [M,1,1,1]  grad[S4]   = grad[S5] @ vcur
+                    //   ~S5[i]  = dot(vcur[:,i], S4)
+                    //    S5     = S4 @ vcur.T            [D,1,1,1]  grad[S5]   = d[:D,id1,id2,id3]
+                    //   ~dst[i,iq1,iq2,iq3]  = S5[i]              ^
+                    //    dst[:D,iq1,iq2,iq3] = S5                 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3]
+                    // dst                               backward-/ grad[dst]                 = d
+                    //
+                    // output gradients with their dependencies:
+                    //
+                    // grad[kcur] = grad[S1].T @ qcur
+                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
+                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
+                    // grad[S4]   = grad[S5] @ vcur
+                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
+                    // grad[qcur] = grad[S1]   @ kcur
+                    // grad[vcur] = grad[S5].T @ S4
+                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
+                    //
+                    // in post-order:
+                    //
+                    // S1         = qcur @ kcur.T
+                    // S2         = S1 * scale
+                    // S3         = diag_mask_inf(S2, P)
+                    // S4         = softmax(S3)
+                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
+                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
+                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
+                    // grad[qcur] = grad[S1]   @ kcur
+                    // grad[kcur] = grad[S1].T @ qcur
+                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
+                    //
+                    // using less variables (SM=S4):
+                    //
+                    // S             = diag_mask_inf(qcur @ kcur.T * scale, P)
+                    // SM            = softmax(S)
+                    // S             = d[:D,iq1,iq2,iq3] @ vcur
+                    // dot_SM_gradSM = dot(SM, S)
+                    // S             = SM * (S - dot(SM, S))
+                    // S             = diag_mask_zero(S, P) * scale
+                    //
+                    // grad[q][:D,iq1,iq2,iq3] += S   @ kcur
+                    // grad[k][:D,:M,ik2,ik3]  += S.T @ qcur
+                    // grad[v][:M,:D,iv2,iv3]  += d[:D,id1,id2,id3].T @ SM
+                }
+
+                // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
+                // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
+                // for ic:
+                //   S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3]
+                // exclude known future zero S[..] values from operation
+                ggml_vec_set_f32(masked_begin, S, 0);
+                for (int64_t ic = 0; ic < D; ++ic) {
+                    ggml_vec_mad_f32(masked_begin,
+                            S,
+                             (float *) ((char *) v->data + (          ic*nbv1  + iv2*nbv2 + iv3*nbv3)),
+                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
+                }
+
+                // S = SM * (S - dot(SM, S))
+                float dot_SM_gradSM = 0;
+                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
+                ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
+                ggml_vec_mul_f32 (masked_begin, S, S, SM);
+
+                // S = diag_mask_zero(S, P) * scale
+                // already done by above ggml_vec_set_f32
+
+                // exclude known zero S[..] values from operation
+                ggml_vec_scale_f32(masked_begin, S, scale);
+
+                // S    shape [M,1]
+                // SM   shape [M,1]
+                // kcur shape [D,M]
+                // qcur shape [D,1]
+                // vcur shape [M,D]
+
+                // grad[q][:D,iq1,iq2,iq3] += S @ kcur
+                // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M]
+                // for ic:
+                //  grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3]
+                // exclude known zero S[..] values from loop
+                for (int64_t ic = 0; ic < masked_begin; ++ic) {
+                    ggml_vec_mad_f32(D,
+                            (float *) ((char *) grad_q  + (iq1*nbgq1 + iq2*nbgq2  + iq3*nbgq3)),
+                            (float *) ((char *) k->data + (ic*nbk1   + ik2*nbk2   + ik3*nbk3)),
+                            S[ic]);
+                }
+
+                // grad[k][:D,:M,iq2,iq3] += S.T @ qcur
+                // for ic:
+                //  grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0]
+                //  grad[k][:D,ic,iq2,iq3] += S[ic]     * qcur[:D,0]
+                // exclude known zero S[..] values from loop
+                for (int64_t ic = 0; ic < masked_begin; ++ic) {
+                    ggml_vec_mad_f32(D,
+                            (float *) ((char *) grad_k  + (ic*nbgk1  + ik2*nbgk2  + ik3*nbgk3)),
+                            (float *) ((char *) q->data + (iq1*nbq1  + iq2*nbq2   + iq3*nbq3)),
+                            S[ic]);
+                }
+
+                // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T       @ SM
+                // for ic:
+                //  grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M]
+                //  grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3]         * SM[:M]
+                // exclude known zero SM[..] values from mad
+                for (int64_t ic = 0; ic < D; ++ic) {
+                    ggml_vec_mad_f32(masked_begin,
+                            (float *) ((char *) grad_v   + (          ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)),
+                            SM,
+                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2  + id3*nbd3)));
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_flash_attn_back(
+        const struct ggml_compute_params * params,
+        const bool masked,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * q = dst->src[0];
+
+    switch (q->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_ssm_conv
+
+static void ggml_compute_forward_ssm_conv_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // conv_x
+    const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc  = src1->ne[0]; // d_conv
+    const int ncs = src0->ne[0]; // d_conv - 1 + n_t
+    const int nr  = src0->ne[1]; // d_inner
+    const int n_t =  dst->ne[1]; // tokens per sequence
+    const int n_s =  dst->ne[2]; // number of sequences in the batch
+
+    GGML_ASSERT( dst->ne[0] == nr);
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+    const int ir  = ir1 - ir0;
+
+    for (int i3 = 0; i3 < n_s; ++i3) {
+        for (int i2 = 0; i2 < n_t; ++i2) {
+            // {d_conv - 1 + n_t, d_inner, n_seqs}
+            // sliding window
+            const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s}
+            const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner}
+            float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s}
+
+            // TODO: transpose the output for smaller strides for big batches?
+            // d_inner
+            for (int i1 = 0; i1 < ir; ++i1) {
+                // rowwise dot product
+                // NOTE: not using ggml_vec_dot_f32, because its sum is in double precision
+                float sumf = 0.0f;
+
+                // d_conv
+                for (int i0 = 0; i0 < nc; ++i0) {
+                    sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
+                }
+                x[i1] = sumf;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_ssm_conv(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    switch (dst->src[0]->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_ssm_conv_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_ssm_scan
+
+static void ggml_compute_forward_ssm_scan_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // s
+    const struct ggml_tensor * src1 = dst->src[1]; // x
+    const struct ggml_tensor * src2 = dst->src[2]; // dt
+    const struct ggml_tensor * src3 = dst->src[3]; // A
+    const struct ggml_tensor * src4 = dst->src[4]; // B
+    const struct ggml_tensor * src5 = dst->src[5]; // C
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nc  = src0->ne[0]; // d_state
+    const int64_t nr  = src0->ne[1]; // d_inner
+    const int64_t n_t = src1->ne[1]; // number of tokens per sequence
+    const int64_t n_s = src0->ne[2]; // number of sequences in the batch
+
+    GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src2->nb[0] == sizeof(float));
+    GGML_ASSERT(src3->nb[0] == sizeof(float));
+    GGML_ASSERT(src4->nb[0] == sizeof(float));
+    GGML_ASSERT(src5->nb[0] == sizeof(float));
+    // required for the dot product between s and C
+    GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
+    // required for per-sequence offsets for states
+    GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
+    // required to get correct offset for state destination (i.e. src1->nb[3])
+    GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+    const int ir  = ir1 - ir0;
+
+    for (int i3 = 0; i3 < n_s; ++i3) {
+        for (int i2 = 0; i2 < n_t; ++i2) {
+            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+            const float * B  = (const float *) ((const char *) src4->data +  i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+            const float * C  = (const float *) ((const char *) src5->data +  i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+                  float * y  = (      float *) ((      char *) dst->data  + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+                  float * s  = (      float *) ((      char *) dst->data  + ir0*(src0->nb[1]) + i3*(src0->nb[2]) +     src1->nb[3]);  // {d_state, d_inner, n_s}
+
+            // use the output as the source for the next token-wise iterations
+            if (i2 > 0) { s0 = s; }
+
+            // d_inner
+            for (int i1 = 0; i1 < ir; ++i1) {
+                // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
+                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+                float x_dt = x[i1] * dt_soft_plus;
+                float sumf = 0.0f;
+                // d_state
+                for (int i0 = 0; i0 < nc; ++i0) {
+                    int i = i0 + i1*nc;
+                    // state = prev_state * dA + dB * x
+                    float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+                    // y = rowwise_dotprod(state, C)
+                    sumf += state * C[i0];
+                    s[i] = state;
+                }
+                y[i1] = sumf;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_ssm_scan(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    switch (dst->src[0]->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_ssm_scan_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_win_part
+
+static void ggml_compute_forward_win_part_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    UNUSED(params);
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
+
+    const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t w    = ((const int32_t *)(dst->op_params))[2];
+
+    assert(ne00 == ne0);
+    assert(ne3  == nep0*nep1);
+
+    // TODO: optimize / multi-thread
+    for (int py = 0; py < nep1; ++py) {
+        for (int px = 0; px < nep0; ++px) {
+            const int64_t i3 = py*nep0 + px;
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                        const int64_t i02 = py*w + i2;
+                        const int64_t i01 = px*w + i1;
+                        const int64_t i00 = i0;
+
+                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0    + i1*ne0   + i0;
+                        const int64_t j =                  i02*ne01*ne00 + i01*ne00 + i00;
+
+                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                            ((float *) dst->data)[i] = 0.0f;
+                        } else {
+                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_part(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_part_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_win_unpart
+
+static void ggml_compute_forward_win_unpart_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    UNUSED(params);
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
+
+    const int32_t w = ((const int32_t *)(dst->op_params))[0];
+
+    // padding
+    const int px = (w - ne1%w)%w;
+    //const int py = (w - ne2%w)%w;
+
+    const int npx = (px + ne1)/w;
+    //const int npy = (py + ne2)/w;
+
+    assert(ne0 == ne00);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int ip2 = i2/w;
+                const int ip1 = i1/w;
+
+                const int64_t i02 = i2%w;
+                const int64_t i01 = i1%w;
+                const int64_t i00 = i0;
+
+                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
+                const int64_t j =                                  i2*ne1*ne0    + i1*ne0   + i0;
+
+                ((float *) dst->data)[j] = ((float *) src0->data)[i];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_unpart(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_unpart_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+//gmml_compute_forward_unary
+
+static void ggml_compute_forward_unary(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const enum ggml_unary_op op = ggml_get_unary_op(dst);
+
+    switch (op) {
+        case GGML_UNARY_OP_ABS:
+            {
+                ggml_compute_forward_abs(params, dst);
+            } break;
+        case GGML_UNARY_OP_SGN:
+            {
+                ggml_compute_forward_sgn(params, dst);
+            } break;
+        case GGML_UNARY_OP_NEG:
+            {
+                ggml_compute_forward_neg(params, dst);
+            } break;
+        case GGML_UNARY_OP_STEP:
+            {
+                ggml_compute_forward_step(params, dst);
+            } break;
+        case GGML_UNARY_OP_TANH:
+            {
+                ggml_compute_forward_tanh(params, dst);
+            } break;
+        case GGML_UNARY_OP_ELU:
+            {
+                ggml_compute_forward_elu(params, dst);
+            } break;
+        case GGML_UNARY_OP_RELU:
+            {
+                ggml_compute_forward_relu(params, dst);
+            } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
+        case GGML_UNARY_OP_GELU:
+            {
+                ggml_compute_forward_gelu(params, dst);
+            } break;
+        case GGML_UNARY_OP_GELU_QUICK:
+            {
+                ggml_compute_forward_gelu_quick(params, dst);
+            } break;
+        case GGML_UNARY_OP_SILU:
+            {
+                ggml_compute_forward_silu(params, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSWISH:
+            {
+                ggml_compute_forward_hardswish(params, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSIGMOID:
+            {
+                ggml_compute_forward_hardsigmoid(params, dst);
+            } break;
+        case GGML_UNARY_OP_EXP:
+            {
+                ggml_compute_forward_exp(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_get_rel_pos
+
+static void ggml_compute_forward_get_rel_pos_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    UNUSED(params);
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int64_t w = ne1;
+
+    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
+    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            const int64_t pos = (w - i1 - 1) + i2;
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_get_rel_pos(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_get_rel_pos_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_add_rel_pos
+
+static void ggml_compute_forward_add_rel_pos_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
+
+    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
+    if (!inplace) {
+        if (params->ith == 0) {
+            memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
+
+    float * src1_data = (float *) src1->data;
+    float * src2_data = (float *) src2->data;
+    float * dst_data  = (float *) dst->data;
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // total patches in dst
+    const int np = ne13;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
+        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
+                for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                    const int64_t jp0  = jp1 + i10;
+                    const float src1_e = src1_data[jp0];
+                    const float src2_e = src2_data[jp0];
+
+                    const int64_t jdh = jp0 * ne10;
+                    const int64_t jdw = jdh - (ne10 - 1) * i10;
+
+                    for (int64_t j = 0; j < ne10; ++j) {
+                        dst_data[jdh + j     ] += src2_e;
+                        dst_data[jdw + j*ne10] += src1_e;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_rel_pos(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add_rel_pos_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_rwkv_wkv
+
+static void ggml_compute_forward_rwkv_wkv_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const size_t T = dst->src[1]->ne[3];
+    const size_t C = dst->ne[0];
+    const size_t H = dst->src[1]->ne[2];
+    const size_t n_seqs = dst->src[5]->ne[1];
+
+    float * dst_data = (float *) dst->data;
+    float * state = ((float *) dst->data) + C * T;
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    memset(dst_data, 0, T * C * sizeof(float));
+
+    float * k =          (float *) dst->src[0]->data;
+    float * v =          (float *) dst->src[1]->data;
+    float * r =          (float *) dst->src[2]->data;
+    float * time_faaaa = (float *) dst->src[3]->data;
+    float * time_decay = (float *) dst->src[4]->data;
+
+    size_t t_stride = H * (C / H);
+
+    size_t h_stride = C / H;
+    size_t h_stride_2d = (C / H) * (C / H);
+
+    // basically fused operations:
+    // dst = r @ (time_faaaa * (k @ v) + state),
+    // state = time_decay * state + (k @ v),
+    // recursive through each token
+    for (size_t t = 0; t < T; t++) {
+        size_t t_offset = t * t_stride;
+        size_t state_offset = (C / H) * C * (t / (T / n_seqs));
+        float * state_cur = state + state_offset;
+        float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
+
+        for (size_t h = 0; h < H; h++) {
+            size_t h_offset = h * h_stride;
+            size_t t_h_offset = t_offset + h_offset;
+            size_t h_2d_offset = h * h_stride_2d;
+
+            for (size_t i = 0; i < C / H; i++) {
+                size_t t_h_i_offset = t_h_offset + i;
+                size_t h_i_offset = h_offset + i;
+                size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                float k_val = k[t_h_i_offset];
+                float r_val = r[t_h_i_offset];
+                float time_faaaa_val = time_faaaa[h_i_offset];
+                // RWKV v6: different time_decay for each token.
+                float time_decay_val = time_decay[t_h_i_offset];
+
+                for (size_t j = 0; j < C / H; j ++) {
+                    size_t t_h_j_offset = t_h_offset + j;
+                    size_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                    float v_val = v[t_h_j_offset];
+                    float kv_val = v_val * k_val;
+                    float prev_state_val = state_prev[h_2d_i_j_offset];
+                    float temp_val = kv_val * time_faaaa_val + prev_state_val;
+                    dst_data[t_h_j_offset] += temp_val * r_val;
+                    state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rwkv_wkv(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rwkv_wkv_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_map_binary(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_binary_f32(params, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+
+    const struct ggml_tensor * a = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+    const struct ggml_tensor * c = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * a = dst->src[0];
+
+    struct ggml_map_custom1_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
+
+    p.fun(dst, a, params->ith, params->nth, p.userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+
+    struct ggml_map_custom2_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
+
+    p.fun(dst, a, b, params->ith, params->nth, p.userdata);
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+    const struct ggml_tensor * c = dst->src[2];
+
+    struct ggml_map_custom3_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
+
+    p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
+}
+
+// ggml_compute_forward_cross_entropy_loss
+
+static void ggml_compute_forward_cross_entropy_loss_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    // TODO: handle transposed/permuted matrices
+    const int64_t nc = src0->ne[0];
+    const int64_t nr = ggml_nrows(src0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    float * sums =  (float *) params->wdata;
+    float * st   = ((float *) params->wdata) + nth + ith*nc;
+    float sum_thread = 0.0f;
+
+    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i1 = ir0; i1 < ir1; ++i1) {
+        const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]);
+        const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]);
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(s0[i]));
+            assert(!isnan(s1[i]));
+        }
+#endif
+
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max);
+        assert(sum_softmax >= 0.0);
+
+        ggml_vec_add1_f32(nc, st, st, -sum_softmax);
+        ggml_vec_mul_f32(nc, st, st, s1);
+
+        float sum_st = 0.0f;
+        ggml_vec_sum_f32(nc, &sum_st, st);
+        sum_thread += sum_st;
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            assert(!isnan(st[i]));
+            assert(!isinf(st[i]));
+        }
+#endif
+    }
+    sums[ith] = sum_thread;
+    ggml_barrier(params->threadpool);
+
+    if (ith == 0) {
+        float * dp = (float *) dst->data;
+        ggml_vec_sum_f32(nth, dp, sums);
+        dp[0] *= -1.0f / (float) nr;
+    }
+}
+
+static void ggml_compute_forward_cross_entropy_loss(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cross_entropy_loss_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_cross_entropy_loss_back
+
+static void ggml_compute_forward_cross_entropy_loss_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * opt0 = dst->src[2];
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(opt0));
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    // TODO: handle transposed/permuted matrices
+    const int64_t nc = src0->ne[0];
+    const int64_t nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr;
+
+    for (int64_t i1 = ir0; i1 < ir1; i1++) {
+        float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
+        float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(s0[i]));
+            assert(!isnan(s1[i]));
+        }
+#endif
+
+        // soft_max
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+        assert(sum > 0.0);
+        ggml_vec_scale_f32(nc, ds0, 1.0/sum);
+
+        // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
+        ggml_vec_sub_f32(nc, ds0, ds0, s1);
+        ggml_vec_scale_f32(nc, ds0, d_by_nr);
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            assert(!isnan(ds0[i]));
+            assert(!isinf(ds0[i]));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_cross_entropy_loss_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_opt_step_adamw_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0        = dst->src[0];
+    const struct ggml_tensor * src0_grad   = dst->src[1];
+    const struct ggml_tensor * src0_grad_m = dst->src[2];
+    const struct ggml_tensor * src0_grad_v = dst->src[3];
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    /* const float   gnorm = 1.0f; */
+    int64_t       iter;   memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
+    const float   alpha = ggml_get_op_params_f32(dst, 2);
+    const float   beta1 = ggml_get_op_params_f32(dst, 3);
+    const float   beta2 = ggml_get_op_params_f32(dst, 4);
+    const float   eps   = ggml_get_op_params_f32(dst, 5);
+    const float   wd    = ggml_get_op_params_f32(dst, 6);
+
+    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
+    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const size_t offset = i03*nb03 + i02*nb02 + i01*nb01;
+
+        float       * w = (float       *) ((char       *) src0->data        + offset); // weight
+        const float * g = (const float *) ((const char *) src0_grad->data   + offset); // grad
+        float       * m = (float       *) ((char       *) src0_grad_m->data + offset);
+        float       * v = (float       *) ((char       *) src0_grad_v->data + offset);
+
+        for (int i00 = 0; i00 < ne00; ++i00) {
+            m[i00] = m[i00]*beta1 +        g[i00]*(1.0f - beta1);
+            v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
+
+            const float mh =       m[i00]*beta1h;
+            const float vh = sqrtf(v[i00]*beta2h) + eps;
+
+            // The weight decay is applied independently of the Adam momenta m and v.
+            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
+            // See: https://arxiv.org/pdf/1711.05101v3.pdf
+            w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
+        }
+    }
+
+    ggml_barrier(params->threadpool);
+    if (ith != 0) {
+        return;
+    }
+
+    iter++;
+    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
+}
+
+static void ggml_compute_forward_opt_step_adamw(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_opt_step_adamw_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+/////////////////////////////////
+
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    GGML_ASSERT(params);
+
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
+        return;
+    }
+
+    switch (tensor->op) {
+        case GGML_OP_DUP:
+            {
+                ggml_compute_forward_dup(params, tensor);
+            } break;
+        case GGML_OP_ADD:
+            {
+                ggml_compute_forward_add(params, tensor);
+            } break;
+        case GGML_OP_ADD1:
+            {
+                ggml_compute_forward_add1(params, tensor);
+            } break;
+        case GGML_OP_ACC:
+            {
+                ggml_compute_forward_acc(params, tensor);
+            } break;
+        case GGML_OP_SUB:
+            {
+                ggml_compute_forward_sub(params, tensor);
+            } break;
+        case GGML_OP_MUL:
+            {
+                ggml_compute_forward_mul(params, tensor);
+            } break;
+        case GGML_OP_DIV:
+            {
+                ggml_compute_forward_div(params, tensor);
+            } break;
+        case GGML_OP_SQR:
+            {
+                ggml_compute_forward_sqr(params, tensor);
+            } break;
+        case GGML_OP_SQRT:
+            {
+                ggml_compute_forward_sqrt(params, tensor);
+            } break;
+        case GGML_OP_LOG:
+            {
+                ggml_compute_forward_log(params, tensor);
+            } break;
+        case GGML_OP_SIN:
+            {
+                ggml_compute_forward_sin(params, tensor);
+            } break;
+        case GGML_OP_COS:
+            {
+                ggml_compute_forward_cos(params, tensor);
+            } break;
+        case GGML_OP_SUM:
+            {
+                ggml_compute_forward_sum(params, tensor);
+            } break;
+        case GGML_OP_SUM_ROWS:
+            {
+                ggml_compute_forward_sum_rows(params, tensor);
+            } break;
+        case GGML_OP_MEAN:
+            {
+                ggml_compute_forward_mean(params, tensor);
+            } break;
+        case GGML_OP_ARGMAX:
+            {
+                ggml_compute_forward_argmax(params, tensor);
+            } break;
+        case GGML_OP_COUNT_EQUAL:
+            {
+                ggml_compute_forward_count_equal(params, tensor);
+            } break;
+        case GGML_OP_REPEAT:
+            {
+                ggml_compute_forward_repeat(params, tensor);
+            } break;
+        case GGML_OP_REPEAT_BACK:
+            {
+                ggml_compute_forward_repeat_back(params, tensor);
+            } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_compute_forward_concat(params, tensor);
+            } break;
+        case GGML_OP_SILU_BACK:
+            {
+                ggml_compute_forward_silu_back(params, tensor);
+            } break;
+        case GGML_OP_NORM:
+            {
+                ggml_compute_forward_norm(params, tensor);
+            } break;
+        case GGML_OP_RMS_NORM:
+            {
+                ggml_compute_forward_rms_norm(params, tensor);
+            } break;
+        case GGML_OP_RMS_NORM_BACK:
+            {
+                ggml_compute_forward_rms_norm_back(params, tensor);
+            } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                ggml_compute_forward_group_norm(params, tensor);
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                ggml_compute_forward_mul_mat(params, tensor);
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_compute_forward_mul_mat_id(params, tensor);
+            } break;
+        case GGML_OP_OUT_PROD:
+            {
+                ggml_compute_forward_out_prod(params, tensor);
+            } break;
+        case GGML_OP_SCALE:
+            {
+                ggml_compute_forward_scale(params, tensor);
+            } break;
+        case GGML_OP_SET:
+            {
+                ggml_compute_forward_set(params, tensor);
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_compute_forward_cpy(params, tensor);
+            } break;
+        case GGML_OP_CONT:
+            {
+                ggml_compute_forward_cont(params, tensor);
+            } break;
+        case GGML_OP_RESHAPE:
+            {
+                ggml_compute_forward_reshape(params, tensor);
+            } break;
+        case GGML_OP_VIEW:
+            {
+                ggml_compute_forward_view(params, tensor);
+            } break;
+        case GGML_OP_PERMUTE:
+            {
+                ggml_compute_forward_permute(params, tensor);
+            } break;
+        case GGML_OP_TRANSPOSE:
+            {
+                ggml_compute_forward_transpose(params, tensor);
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                ggml_compute_forward_get_rows(params, tensor);
+            } break;
+        case GGML_OP_GET_ROWS_BACK:
+            {
+                ggml_compute_forward_get_rows_back(params, tensor);
+            } break;
+        case GGML_OP_DIAG:
+            {
+                ggml_compute_forward_diag(params, tensor);
+            } break;
+        case GGML_OP_DIAG_MASK_INF:
+            {
+                ggml_compute_forward_diag_mask_inf(params, tensor);
+            } break;
+        case GGML_OP_DIAG_MASK_ZERO:
+            {
+                ggml_compute_forward_diag_mask_zero(params, tensor);
+            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                ggml_compute_forward_soft_max(params, tensor);
+            } break;
+        case GGML_OP_SOFT_MAX_BACK:
+            {
+                ggml_compute_forward_soft_max_back(params, tensor);
+            } break;
+        case GGML_OP_ROPE:
+            {
+                ggml_compute_forward_rope(params, tensor);
+            } break;
+        case GGML_OP_ROPE_BACK:
+            {
+                ggml_compute_forward_rope_back(params, tensor);
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                ggml_compute_forward_clamp(params, tensor);
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                ggml_compute_forward_conv_transpose_1d(params, tensor);
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                ggml_compute_forward_im2col(params, tensor);
+            } break;
+        case GGML_OP_IM2COL_BACK:
+            {
+                ggml_compute_forward_im2col_back_f32(params, tensor);
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                ggml_compute_forward_conv_transpose_2d(params, tensor);
+            } break;
+        case GGML_OP_POOL_1D:
+            {
+                ggml_compute_forward_pool_1d(params, tensor);
+            } break;
+        case GGML_OP_POOL_2D:
+            {
+                ggml_compute_forward_pool_2d(params, tensor);
+            } break;
+        case GGML_OP_POOL_2D_BACK:
+            {
+                ggml_compute_forward_pool_2d_back(params, tensor);
+            } break;
+        case GGML_OP_UPSCALE:
+            {
+                ggml_compute_forward_upscale(params, tensor);
+            } break;
+        case GGML_OP_PAD:
+            {
+                ggml_compute_forward_pad(params, tensor);
+            } break;
+        case GGML_OP_ARANGE:
+            {
+                ggml_compute_forward_arange(params, tensor);
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                ggml_compute_forward_timestep_embedding(params, tensor);
+            } break;
+        case GGML_OP_ARGSORT:
+            {
+                ggml_compute_forward_argsort(params, tensor);
+            } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                ggml_compute_forward_leaky_relu(params, tensor);
+            } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
+        case GGML_OP_FLASH_ATTN_BACK:
+            {
+                int32_t t = ggml_get_op_params_i32(tensor, 0);
+                GGML_ASSERT(t == 0 || t == 1);
+                bool masked = t != 0;
+                ggml_compute_forward_flash_attn_back(params, masked, tensor);
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                ggml_compute_forward_ssm_conv(params, tensor);
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                ggml_compute_forward_ssm_scan(params, tensor);
+            } break;
+        case GGML_OP_WIN_PART:
+            {
+                ggml_compute_forward_win_part(params, tensor);
+            } break;
+        case GGML_OP_WIN_UNPART:
+            {
+                ggml_compute_forward_win_unpart(params, tensor);
+            } break;
+        case GGML_OP_UNARY:
+            {
+                ggml_compute_forward_unary(params, tensor);
+            } break;
+        case GGML_OP_GET_REL_POS:
+            {
+                ggml_compute_forward_get_rel_pos(params, tensor);
+            } break;
+        case GGML_OP_ADD_REL_POS:
+            {
+                ggml_compute_forward_add_rel_pos(params, tensor);
+            } break;
+        case GGML_OP_RWKV_WKV:
+            {
+                ggml_compute_forward_rwkv_wkv(params, tensor);
+            } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                ggml_unary_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_unary(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                ggml_binary_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_binary(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1_F32:
+            {
+                ggml_custom1_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom1_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2_F32:
+            {
+                ggml_custom2_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom2_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                ggml_custom3_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom3_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor);
+            }
+            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            {
+                ggml_compute_forward_cross_entropy_loss(params, tensor);
+            }
+            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+            {
+                ggml_compute_forward_cross_entropy_loss_back(params, tensor);
+            }
+            break;
+        case GGML_OP_OPT_STEP_ADAMW:
+            {
+                ggml_compute_forward_opt_step_adamw(params, tensor);
+            }
+            break;
+        case GGML_OP_NONE:
+            {
+                // nop
+            } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// Android's libc implementation "bionic" does not support setting affinity
+#if defined(__gnu_linux__)
+static void set_numa_thread_affinity(int thread_n) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    int node_num;
+    int rv;
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    switch(g_state.numa.numa_strategy) {
+        case GGML_NUMA_STRATEGY_DISTRIBUTE:
+            // run thread on node_num thread_n / (threads per node)
+            node_num = thread_n % g_state.numa.n_nodes;
+            break;
+        case GGML_NUMA_STRATEGY_ISOLATE:
+            // run thread on current_node
+            node_num = g_state.numa.current_node;
+            break;
+        case GGML_NUMA_STRATEGY_NUMACTL:
+            // use the cpuset that numactl gave us
+            rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+            if (rv) {
+                fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
+            }
+            return;
+        default:
+            return;
+    }
+
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (size_t i = 0; i < node->n_cpus; ++i) {
+        CPU_SET_S(node->cpus[i], setsize, cpus);
+    }
+
+    rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+
+static void clear_numa_thread_affinity(void) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
+        CPU_SET_S(i, setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+#else
+// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test)
+static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n);  }
+static void clear_numa_thread_affinity(void) {}
+#endif
+
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+    int n_tasks = 0;
+
+    if (ggml_is_empty(node)) {
+        // no need to multi-thread a no-op
+        n_tasks = 1;
+        return n_tasks;
+    }
+
+    switch (node->op) {
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_ACC:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_SUB:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_SUM:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
+        case GGML_OP_ARGMAX:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_COUNT_EQUAL:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_REPEAT:
+        case GGML_OP_REPEAT_BACK:
+        case GGML_OP_LEAKY_RELU:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(node)) {
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_SGN:
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_EXP:
+                    {
+                        n_tasks = 1;
+                    } break;
+
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_SILU:
+                    {
+                        n_tasks = n_threads;
+                    } break;
+                default:
+                    GGML_ABORT("fatal error");
+            }
+            break;
+        case GGML_OP_SILU_BACK:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_RMS_NORM_BACK:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_CONCAT:
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_OUT_PROD:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                // FIXME: get_rows can use additional threads, but the cost of launching additional threads
+                // decreases performance with GPU offloading
+                //n_tasks = n_threads;
+                n_tasks = 1;
+            } break;
+        case GGML_OP_SCALE:
+        case GGML_OP_SET:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_GET_ROWS_BACK:
+        case GGML_OP_DIAG:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX_BACK:
+        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+        case GGML_OP_ADD_REL_POS:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                n_tasks = 1; //TODO
+            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
+            } break;
+        case GGML_OP_IM2COL:
+        case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_TRANSPOSE_1D:
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_POOL_1D:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_POOL_2D_BACK:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_FLASH_ATTN_EXT:
+        case GGML_OP_FLASH_ATTN_BACK:
+        case GGML_OP_SSM_CONV:
+        case GGML_OP_SSM_SCAN:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_RWKV_WKV:
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                struct ggml_map_custom1_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p.n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                struct ggml_map_custom2_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p.n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                struct ggml_map_custom3_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p.n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+        case GGML_OP_OPT_STEP_ADAMW:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_NONE:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ABORT("fatal error");
+            }
+        default:
+            {
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    assert(n_tasks > 0);
+
+    return n_tasks;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
+
+#if defined(_WIN32)
+#include "windows.h"
+
+// TODO: support > 64 CPUs
+bool ggml_thread_apply_affinity(bool * mask) {
+    HANDLE    h = GetCurrentThread();
+    uint64_t  bitmask = 0ULL;
+
+    assert(GGML_MAX_N_THREADS >= 64);
+
+    for (int32_t i = 0; i < 8; i++) {
+        int32_t idx = i * 8;
+        uint8_t val = 0;
+        val |= mask[idx + 0] << 0;
+        val |= mask[idx + 1] << 1;
+        val |= mask[idx + 2] << 2;
+        val |= mask[idx + 3] << 3;
+        val |= mask[idx + 4] << 4;
+        val |= mask[idx + 5] << 5;
+        val |= mask[idx + 6] << 6;
+        val |= mask[idx + 7] << 7;
+        bitmask |= (uint64_t)val << idx;
+    }
+
+    for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
+            break;
+        }
+    }
+
+    DWORD_PTR m = (DWORD_PTR)bitmask;
+
+    m = SetThreadAffinityMask(h, m);
+
+    return m != 0;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
+    // This is up to the applications.
+    DWORD p = THREAD_PRIORITY_NORMAL;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
+        case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
+        case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+    }
+
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        // Keep inherited policy/priority
+        return true;
+    }
+
+    if (!SetThreadPriority(GetCurrentThread(), p)) {
+        fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
+        return false;
+    }
+
+    return true;
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+    // Not supported on Apple platforms
+    UNUSED(mask);
+    return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        // Keep inherited policy/priority
+        return true;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+        return false;
+    }
+
+    return true;
+}
+
+#elif defined(__gnu_linux__)
+// TODO: this may not work on BSD, to be verified
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+    cpu_set_t cpuset;
+    int err;
+
+    CPU_ZERO(&cpuset);
+
+    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            CPU_SET(i, &cpuset);
+        }
+    }
+
+#ifdef __ANDROID__
+    err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+    if (err < 0) {
+        err = errno;
+    }
+#else
+    err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+    if (err != 0) {
+        fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        // Keep inherited policy/priority
+        return true;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+        return false;
+    }
+
+    return true;
+}
+
+#else // unsupported platforms
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+    UNUSED(mask);
+    return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    UNUSED(prio);
+    return true;
+}
+
+#endif
+
+static bool ggml_thread_cpumask_is_valid(const bool * mask) {
+    for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) { return true; }
+    }
+    return false;
+}
+
+static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
+    if (!strict) {
+        memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
+        return;
+    } else {
+        memset(local_mask, 0, GGML_MAX_N_THREADS);
+        int32_t base_idx = *iter;
+        for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+            int32_t idx = base_idx + i;
+            if (idx >= GGML_MAX_N_THREADS) {
+                // Just a cheaper modulo
+                idx -= GGML_MAX_N_THREADS;
+            }
+            if (global_mask[idx]) {
+                local_mask[idx] = 1;
+                *iter = idx + 1;
+                return;
+            }
+        }
+    }
+}
+
+void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
+    if (!threadpool) return;
+
+    const int n_threads = threadpool->n_threads_max;
+
+#ifndef GGML_USE_OPENMP
+    struct ggml_compute_state* workers = threadpool->workers;
+
+    ggml_mutex_lock(&threadpool->mutex);
+
+    threadpool->stop = true;
+    threadpool->pause = false;
+
+    ggml_cond_broadcast(&threadpool->cond);
+    ggml_mutex_unlock(&threadpool->mutex);
+
+    for (int j = 1; j < n_threads; j++) {
+        int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
+        UNUSED(rc);
+    }
+
+    ggml_mutex_destroy(&threadpool->mutex);
+    ggml_cond_destroy(&threadpool->cond);
+#endif // GGML_USE_OPENMP
+
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
+}
+
+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+    threadpool->pause = false;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
+void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    ggml_mutex_lock(&threadpool->mutex);
+    if (!threadpool->pause) {
+       ggml_threadpool_pause_locked(threadpool);
+    }
+    ggml_mutex_unlock(&threadpool->mutex);
+#else
+    UNUSED(threadpool);
+#endif
+}
+
+void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    ggml_mutex_lock(&threadpool->mutex);
+    if (threadpool->pause) {
+       ggml_threadpool_resume_locked(threadpool);
+    }
+    ggml_mutex_unlock(&threadpool->mutex);
+#else
+    UNUSED(threadpool);
+#endif
+}
+
+struct ggml_cplan ggml_graph_plan(
+          const struct ggml_cgraph * cgraph,
+                               int   n_threads,
+            struct ggml_threadpool * threadpool) {
+
+    if (threadpool == NULL) {
+        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+    }
+    if (n_threads <= 0) {
+        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+    }
+
+    size_t work_size = 0;
+
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    int max_tasks = 1;
+
+    // thread scheduling for the different operations + work buffer size estimation
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
+        max_tasks = MAX(max_tasks, n_tasks);
+
+        size_t cur = 0;
+
+        switch (node->op) {
+            case GGML_OP_CPY:
+            case GGML_OP_DUP:
+                {
+                    if (ggml_is_quantized(node->type) ||
+                        // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+                        (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+                        (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                    }
+                } break;
+            case GGML_OP_ADD:
+            case GGML_OP_ADD1:
+                {
+                    if (ggml_is_quantized(node->src[0]->type)) {
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+                    }
+                } break;
+            case GGML_OP_ACC:
+                {
+                    if (ggml_is_quantized(node->src[0]->type)) {
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
+                    }
+                } break;
+            case GGML_OP_COUNT_EQUAL:
+                {
+                    cur = ggml_type_size(node->type)*n_tasks;
+                } break;
+            case GGML_OP_MUL_MAT:
+                {
+                    const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
+
+                    if (node->src[1]->type != vec_dot_type) {
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                    }
+                } break;
+            case GGML_OP_MUL_MAT_ID:
+                {
+                    cur = 0;
+                    const struct ggml_tensor * src0 = node->src[0];
+                    const struct ggml_tensor * src1 = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+                    if (src1->type != vec_dot_type) {
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                    }
+                    const int n_as = src0->ne[2];
+                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
+                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                } break;
+            case GGML_OP_OUT_PROD:
+                {
+                    if (ggml_is_quantized(node->src[0]->type)) {
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+                    }
+                } break;
+            case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
+                {
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                } break;
+            case GGML_OP_CONV_TRANSPOSE_1D:
+                {
+                    GGML_ASSERT(node->src[0]->ne[3] == 1);
+                    GGML_ASSERT(node->src[1]->ne[2] == 1);
+                    GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+                    const int64_t ne00 = node->src[0]->ne[0];  // K
+                    const int64_t ne01 = node->src[0]->ne[1];  // Cout
+                    const int64_t ne02 = node->src[0]->ne[2];  // Cin
+
+                    const int64_t ne10 = node->src[1]->ne[0];  // L
+                    const int64_t ne11 = node->src[1]->ne[1];  // Cin
+
+                    if ((node->src[0]->type == GGML_TYPE_F16 ||
+                         node->src[0]->type == GGML_TYPE_BF16) &&
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
+                        cur += sizeof(ggml_fp16_t)*ne10*ne11;
+                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                               node->src[1]->type == GGML_TYPE_F32) {
+                        cur += sizeof(float)*ne00*ne01*ne02;
+                        cur += sizeof(float)*ne10*ne11;
+                    } else {
+                        GGML_ABORT("fatal error");
+                    }
+                } break;
+            case GGML_OP_CONV_TRANSPOSE_2D:
+                {
+                    const int64_t ne00 = node->src[0]->ne[0]; // W
+                    const int64_t ne01 = node->src[0]->ne[1]; // H
+                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+                    const int64_t ne10 = node->src[1]->ne[0]; // W
+                    const int64_t ne11 = node->src[1]->ne[1]; // H
+                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+                } break;
+            case GGML_OP_FLASH_ATTN_EXT:
+                {
+                    const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                    cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
+                } break;
+            case GGML_OP_FLASH_ATTN_BACK:
+                {
+                    const int64_t    D = node->src[0]->ne[0];
+                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
+                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    } else if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    }
+                } break;
+
+            case GGML_OP_CROSS_ENTROPY_LOSS:
+                {
+                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
+                } break;
+            case GGML_OP_COUNT:
+                {
+                    GGML_ABORT("fatal error");
+                }
+            default:
+                break;
+        }
+
+        work_size = MAX(work_size, cur);
+    }
+
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads);
+    }
+
+    cplan.threadpool = threadpool;
+    cplan.n_threads  = MIN(max_tasks, n_threads);
+    cplan.work_size  = work_size;
+    cplan.work_data  = NULL;
+
+    return cplan;
+}
+
+static thread_ret_t ggml_graph_compute_thread(void * data) {
+    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_threadpool    * tp    = state->threadpool;
+
+    const struct ggml_cgraph * cgraph = tp->cgraph;
+    const struct ggml_cplan  * cplan  = tp->cplan;
+
+    set_numa_thread_affinity(state->ith);
+
+    struct ggml_compute_params params = {
+        /*.ith       =*/ state->ith,
+        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.wsize     =*/ cplan->work_size,
+        /*.wdata     =*/ cplan->work_data,
+        /*.threadpool=*/ tp,
+    };
+
+    for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
+        struct ggml_tensor * node = cgraph->nodes[node_n];
+
+        ggml_compute_forward(&params, node);
+
+        if (state->ith == 0 && cplan->abort_callback &&
+                cplan->abort_callback(cplan->abort_callback_data)) {
+            tp->abort = true;
+            tp->ec    = GGML_STATUS_ABORTED;
+        }
+
+        ggml_barrier(state->threadpool);
+    }
+
+    return 0;
+}
+
+#ifndef GGML_USE_OPENMP
+
+// check if thread is active
+static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+    return (state->ith < n_threads);
+}
+
+// check if thread is ready to proceed (exit from polling or sleeping)
+static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    if (state->pending || threadpool->stop || threadpool->pause) { return true; }
+
+    // check for new graph/work
+    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    if (new_graph != state->last_graph) {
+        state->pending    = ggml_graph_compute_thread_active(state);
+        state->last_graph = new_graph;
+    }
+
+    return state->pending;
+}
+
+// sync thread state after polling
+static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+    UNUSED(state);
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    // Skip polling for unused threads
+    if (!ggml_graph_compute_thread_active(state)) {
+        return state->pending;
+    }
+
+    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
+    // Perhaps, we can adjust it dynamically based on load and things.
+    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+    for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
+        // No new work. Keep polling.
+        ggml_thread_cpu_relax();
+    }
+
+    return state->pending;
+}
+
+static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    if (ggml_graph_compute_poll_for_work(state)) {
+        ggml_graph_compute_thread_sync(state);
+        return state->pending;
+    }
+
+    ggml_mutex_lock_shared(&threadpool->mutex);
+    while (!ggml_graph_compute_thread_ready(state)) {
+        // No new work. Wait for the signal.
+        GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
+        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+    }
+    ggml_mutex_unlock_shared(&threadpool->mutex);
+
+    return state->pending;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
+    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    ggml_thread_apply_priority(threadpool->prio);
+    if (ggml_thread_cpumask_is_valid(state->cpumask)) {
+        ggml_thread_apply_affinity(state->cpumask);
+    }
+
+    while (true) {
+        // Check if we need to sleep
+        while (threadpool->pause) {
+            GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
+            ggml_mutex_lock_shared(&threadpool->mutex);
+            if (threadpool->pause) {
+                ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+            }
+            GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
+            ggml_mutex_unlock_shared(&threadpool->mutex);
+        }
+
+        // This needs to be checked for after the cond_wait
+        if (threadpool->stop) break;
+
+        // Check if there is new work
+        // The main thread is the only one that can dispatch new work
+
+        ggml_graph_compute_check_for_work(state);
+        if (state->pending) {
+            state->pending = false;
+
+            ggml_graph_compute_thread(state);
+        }
+    }
+
+    return (thread_ret_t) 0;
+}
+
+// Start processing new graph
+static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
+{
+    // Always take the mutex here because the worker threads are doing hybrid poll/wait
+
+    ggml_mutex_lock(&threadpool->mutex);
+
+    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+
+    // Update the number of active threads
+    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+
+    // Indicate the graph is ready to be processed
+    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
+    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+
+    if (threadpool->pause) {
+       // Update main thread prio and affinity to match the threadpool settings
+       ggml_thread_apply_priority(threadpool->prio);
+       if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+           ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+       }
+
+       // resume does cond broadcast
+       ggml_threadpool_resume_locked(threadpool);
+    } else {
+       ggml_cond_broadcast(&threadpool->cond);
+    }
+
+    ggml_mutex_unlock(&threadpool->mutex);
+}
+
+#endif // GGML_USE_OPENMP
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+    if (p0->n_threads      != p1->n_threads  )    return false;
+    if (p0->prio           != p1->prio       )    return false;
+    if (p0->poll           != p1->poll       )    return false;
+    if (p0->strict_cpu     != p1->strict_cpu )    return false;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
+
+static struct ggml_threadpool * ggml_threadpool_new_impl(
+    struct ggml_threadpool_params * tpp,
+               struct ggml_cgraph * cgraph,
+                struct ggml_cplan * cplan) {
+
+    struct ggml_threadpool * threadpool =
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
+    {
+        threadpool->cgraph           = cgraph;
+        threadpool->cplan            = cplan;
+        threadpool->n_graph          = 0;
+        threadpool->n_barrier        = 0;
+        threadpool->n_barrier_passed = 0;
+        threadpool->current_chunk    = 0;
+        threadpool->stop             = false;
+        threadpool->pause            = tpp->paused;
+        threadpool->abort            = false;
+        threadpool->workers          = NULL;
+        threadpool->n_threads_max    = tpp->n_threads;
+        threadpool->n_threads_cur    = tpp->n_threads;
+        threadpool->poll             = tpp->poll;
+        threadpool->prio             = tpp->prio;
+        threadpool->ec               = GGML_STATUS_SUCCESS;
+    }
+
+    // Allocate and init workers state
+    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
+
+    memset(workers, 0, workers_size);
+    for (int j = 0; j < tpp->n_threads; j++) {
+        workers[j].threadpool = threadpool;
+        workers[j].ith        = j;
+    }
+
+    threadpool->workers = workers;
+
+#ifndef GGML_USE_OPENMP
+    ggml_mutex_init(&threadpool->mutex);
+    ggml_cond_init(&threadpool->cond);
+
+    // Spin the threads for all workers, and update CPU placements.
+    // Place the main thread last (towards the higher numbered CPU cores).
+
+    int32_t cpumask_iter = 0;
+
+    for (int j = 1; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+        int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
+        GGML_ASSERT(rc == 0);
+    }
+
+    ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+    if (!threadpool->pause) {
+        // Update main thread prio and affinity at the start, otherwise we'll do it in resume
+        ggml_thread_apply_priority(threadpool->prio);
+        if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+            ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+        }
+    }
+#endif // GGML_USE_OPENMP
+
+    return threadpool;
+}
+
+struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
+    return ggml_threadpool_new_impl(tpp, NULL, NULL);
+}
+
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+    ggml_cpu_init();
+
+    GGML_ASSERT(cplan);
+    GGML_ASSERT(cplan->n_threads > 0);
+    GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
+
+    int n_threads                               = cplan->n_threads;
+    struct ggml_threadpool * threadpool = cplan->threadpool;
+
+    bool disposable_threadpool = false;
+
+    if (threadpool == NULL) {
+        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+        disposable_threadpool = true;
+
+        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
+        threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
+    } else {
+        // Reset some of the parameters that need resetting
+        // No worker threads should be accessing the parameters below at this stage
+        threadpool->cgraph           = cgraph;
+        threadpool->cplan            = cplan;
+        threadpool->current_chunk    = 0;
+        threadpool->abort            = false;
+        threadpool->ec               = GGML_STATUS_SUCCESS;
+    }
+
+#ifdef GGML_USE_OPENMP
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+            }
+
+            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+        }
+    } else {
+        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        ggml_graph_compute_thread(&threadpool->workers[0]);
+    }
+#else
+    if (n_threads > threadpool->n_threads_max) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
+        n_threads = threadpool->n_threads_max;
+    }
+
+    // Kick all threads to start the new graph
+    ggml_graph_compute_kickoff(threadpool, n_threads);
+
+    // This is a work thread too
+    ggml_graph_compute_thread(&threadpool->workers[0]);
+#endif
+
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    enum ggml_status ret = threadpool->ec;
+
+    if (disposable_threadpool) {
+        ggml_threadpool_free(threadpool);
+    }
+
+    return ret;
+}
+
+enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
+
+    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+
+    return ggml_graph_compute(cgraph, &cplan);
+}
+
+int ggml_cpu_has_neon(void) {
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_neon;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_sve;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_matmul_int8(void) {
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_i8mm;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
+
+void ggml_cpu_init(void) {
+    ggml_critical_section_start();
+
+    static bool is_first_call = true;
+
+    if (is_first_call) {
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
+            // FIXME: this may be called before ggml_init
+            //const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+            for (int i = 0; i < (1 << 16); ++i) {
+                union {
+                    uint16_t u16;
+                    ggml_fp16_t fp16;
+                } u = {i};
+                // FIXME: this table is used in conversion functions outside of compute
+                // current code depends on ggml_init initializing this table
+                float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+            }
+
+            //const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+            //GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+        }
+
+#if defined(__ARM_ARCH)
+        ggml_init_arm_arch_features();
+#endif
+
+        is_first_call = false;
+    }
+
+    ggml_critical_section_end();
+}
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 65c4f8119..af29a26f0 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -8,6 +8,7 @@
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 #include <stdbool.h>
 #include <stdint.h>
+#include <string.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -36,6 +37,20 @@ extern "C" {
 #endif
 #endif
 
+static inline int ggml_up32(int n) {
+    return (n + 31) & ~31;
+}
+
+//static inline int ggml_up64(int n) {
+//    return (n + 63) & ~63;
+//}
+
+static inline int ggml_up(int n, int m) {
+    // assert m is a power of 2
+    GGML_ASSERT((m & (m - 1)) == 0);
+    return (n + m - 1) & ~(m - 1);
+}
+
 //
 // logging
 //
@@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define GGML_LOG_CONT(...)  ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
 
+#define GGML_DEBUG 0
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+// tensor params
+
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    return ((const float *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
+}
+
+static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    ((float *)(tensor->op_params))[i] = value;
+}
+
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t  fun;
+    int                n_tasks;
+    void             * userdata;
+};
+
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t   fun;
+    int                 n_tasks;
+    void              * userdata;
+};
+
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
 // bitset
 
 typedef uint32_t ggml_bitset_t;
@@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 void * ggml_aligned_malloc(size_t size);
 void ggml_aligned_free(void * ptr, size_t size);
 
+// TODO: move to threading file
+void ggml_critical_section_start(void);
+void ggml_critical_section_end(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index 2778009e4..8a772f224 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
     UNUSED(dev);
 }
 
-static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
-
-    UNUSED(dev);
-    UNUSED(max_tensor_size);
-}
-
 static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     UNUSED(dev);
     UNUSED(op);
@@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
     /* .init_backend         = */ ggml_backend_rpc_device_init,
     /* .get_buffer_type      = */ ggml_backend_rpc_device_get_buffer_type,
     /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
+    /* .buffer_from_host_ptr = */ NULL,
     /* .supports_op          = */ ggml_backend_rpc_device_supports_op,
     /* .supports_buft        = */ ggml_backend_rpc_device_supports_buft,
     /* .offload_op           = */ NULL,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 84f2c766b..7dc3340a1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
 #include "ggml-backend.h"
@@ -31,168 +31,27 @@
 #include <syscall.h>
 #endif
 
-#ifdef GGML_USE_OPENMP
-#include <omp.h>
-#endif
-
-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-
-#ifdef GGML_USE_LLAMAFILE
-#include <llamafile/sgemm.h>
-#endif
-
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-
-// disable POSIX deprecation warnings
-// these functions are never going away, anyway
-#pragma warning(disable: 4996)
-
-// unreachable code because of multiple instances of code after GGML_ABORT
-#pragma warning(disable: 4702)
-#endif
-
-// Note: once we move threading into a separate C++ file
-// will use std::hardware_destructive_interference_size instead of hardcoding it here
-// and we'll use C++ attribute syntax.
-#define GGML_CACHE_LINE  64
-
-#if defined(__clang__) || defined(__GNUC__)
-#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
-#endif
-
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define GGML_TSAN_ENABLED 1
-#endif
-#else  // __has_feature
-#if defined(__SANITIZE_THREAD__)
-#define GGML_TSAN_ENABLED 1
-#endif
-#endif // __has_feature
-
-#if defined(_WIN32)
-
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-    #define NOMINMAX
-#endif
-#include <windows.h>
-
-#if !defined(__clang__)
-#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
-
-typedef volatile LONG atomic_int;
-typedef atomic_int atomic_bool;
-typedef atomic_int atomic_flag;
-
-#define ATOMIC_FLAG_INIT 0
-
-typedef enum {
-    memory_order_relaxed,
-    memory_order_consume,
-    memory_order_acquire,
-    memory_order_release,
-    memory_order_acq_rel,
-    memory_order_seq_cst
-} memory_order;
-
-static void atomic_store(atomic_int * ptr, LONG val) {
-    InterlockedExchange(ptr, val);
-}
-static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
-    // TODO: add support for explicit memory order
-    InterlockedExchange(ptr, val);
-}
-static LONG atomic_load(atomic_int * ptr) {
-    return InterlockedCompareExchange(ptr, 0, 0);
-}
-static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
-    // TODO: add support for explicit memory order
-    return InterlockedCompareExchange(ptr, 0, 0);
-}
-static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
-    return InterlockedExchangeAdd(ptr, inc);
-}
-static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
-    // TODO: add support for explicit memory order
-    return InterlockedExchangeAdd(ptr, inc);
-}
-static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
-    return InterlockedExchange(ptr, 1);
-}
-static void atomic_flag_clear(atomic_flag * ptr) {
-    InterlockedExchange(ptr, 0);
-}
-static void atomic_thread_fence(memory_order mo) {
-    MemoryBarrier();
-}
-#else // clang
-#include <stdatomic.h>
-#endif
-
-typedef HANDLE pthread_t;
-
-typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
-    (void) unused;
-    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
-    if (handle == NULL)
-    {
-        return EAGAIN;
-    }
-
-    *out = handle;
-    return 0;
-}
-
-static int pthread_join(pthread_t thread, void * unused) {
-    (void) unused;
-    int ret = (int) WaitForSingleObject(thread, INFINITE);
-    CloseHandle(thread);
-    return ret;
-}
-
-static int sched_yield (void) {
-    Sleep (0);
-    return 0;
-}
-#else
-
-#include <pthread.h>
-#include <stdatomic.h>
-#include <sched.h>
-#if defined(__FreeBSD__)
-#include <pthread_np.h>
-#endif
-
-typedef void * thread_ret_t;
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#endif
-
-typedef pthread_t ggml_thread_t;
-
-#ifdef GGML_USE_CPU_HBM
-#include <hbwmalloc.h>
-#endif
-
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define UNUSED GGML_UNUSED
+
 #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
     (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
-
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <sys/wait.h>
 
 #if defined(__ANDROID__)
@@ -305,15 +164,6 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
     abort();
 }
 
-#define GGML_DEBUG 0
-
-#define GGML_GELU_FP16
-#define GGML_GELU_QUICK_FP16
-
-#define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL  2
-#define GGML_VEC_MAD_UNROLL  32
-
 //
 // logging
 //
@@ -358,24 +208,6 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
     fflush(stderr);
 }
 
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
 //
 // end of logging block
 //
@@ -396,9 +228,9 @@ void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_CPU_HBM
+  #ifdef GGML_USE_CPU_HBM
     int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
-#elif TARGET_OS_OSX
+  #elif TARGET_OS_OSX
     kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
     int result = EFAULT;
     switch (alloc_status) {
@@ -415,12 +247,9 @@ void * ggml_aligned_malloc(size_t size) {
             result = EFAULT;
             break;
     }
-#elif GGML_USE_METAL
-    const long page_size = sysconf(_SC_PAGESIZE);
-    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
-#else
+  #else
     int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
-#endif
+  #endif
     if (result != 0) {
         // Handle allocation failure
         const char *error_desc = "unknown allocation error";
@@ -433,7 +262,6 @@ void * ggml_aligned_malloc(size_t size) {
                 break;
         }
         GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
-        GGML_ABORT("fatal error");
         return NULL;
     }
     return aligned_memory;
@@ -490,44 +318,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
 #define GGML_FREE(ptr) free(ptr)
 
-#define UNUSED GGML_UNUSED
-#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
-
-#if defined(GGML_USE_ACCELERATE)
-#include <Accelerate/Accelerate.h>
-#endif
-
-// floating point type used to accumulate sums
-typedef double ggml_float;
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-//
-// global data
-//
-
-// precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
-
-// precomputed quick gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
-
-#if defined(__ARM_ARCH)
-struct ggml_arm_arch_features_type {
-    int has_neon;
-    int has_i8mm;
-    int has_sve;
-    int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
-#endif
-
 const char * ggml_status_to_string(enum ggml_status status) {
     switch (status) {
         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
@@ -565,18 +355,22 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
+// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
+//        currently, the ggml_cpu_has_* functions are entirely compile-time
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
 #if defined(__F16C__)
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for(; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    if (ggml_cpu_has_f16c()) {
+        for (; i + 7 < n; i += 8) {
+            __m256 x_vec = _mm256_loadu_ps(x + i);
+            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+            _mm_storeu_si128((__m128i *)(y + i), y_vec);
+        }
+        for(; i + 3 < n; i += 4) {
+            __m128 x_vec = _mm_loadu_ps(x + i);
+            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+            _mm_storel_epi64((__m128i *)(y + i), y_vec);
+        }
     }
 #endif
     for (; i < n; i++) {
@@ -587,24 +381,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX512F__)
-    for (; i + 16 <= n; i += 16) {
-        _mm512_storeu_ps(y + i,
-                         _mm512_castsi512_ps(
-                             _mm512_slli_epi32(
-                                 _mm512_cvtepu16_epi32(
-                                     _mm256_loadu_si256(
-                                         (const __m256i *)(x + i))),
-                                 16)));
+    if (ggml_cpu_has_avx512()) {
+        for (; i + 16 <= n; i += 16) {
+            _mm512_storeu_ps(y + i,
+                            _mm512_castsi512_ps(
+                                _mm512_slli_epi32(
+                                    _mm512_cvtepu16_epi32(
+                                        _mm256_loadu_si256(
+                                            (const __m256i *)(x + i))),
+                                    16)));
+        }
     }
-#elif defined(__AVX2__)
-    for (; i + 8 <= n; i += 8) {
-        _mm256_storeu_ps(y + i,
-                         _mm256_castsi256_ps(
-                             _mm256_slli_epi32(
-                                 _mm256_cvtepu16_epi32(
-                                     _mm_loadu_si128(
-                                         (const __m128i *)(x + i))),
-                                 16)));
+    if (ggml_cpu_has_avx2()) {
+        for (; i + 8 <= n; i += 8) {
+            _mm256_storeu_ps(y + i,
+                            _mm256_castsi256_ps(
+                                _mm256_slli_epi32(
+                                    _mm256_cvtepu16_epi32(
+                                        _mm_loadu_si128(
+                                            (const __m128i *)(x + i))),
+                                    16)));
+        }
     }
 #endif
     for (; i < n; i++) {
@@ -737,24 +534,8 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 #else
     return fopen(fname, mode);
 #endif
+
 }
-
-//
-// cache line
-//
-
-#if defined(__cpp_lib_hardware_interference_size)
-#define CACHE_LINE_SIZE hardware_destructive_interference_size
-#else
-#if defined(__POWER9_VECTOR__)
-#define CACHE_LINE_SIZE 128
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-#endif
-
-static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-
 static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
 static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
@@ -789,16 +570,12 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .blck_size                = 1,
         .type_size                = sizeof(double),
         .is_quantized             = false,
-        .nrows                    = 1,
     },
     [GGML_TYPE_F32] = {
         .type_name                = "f32",
         .blck_size                = 1,
         .type_size                = sizeof(float),
         .is_quantized             = false,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
-        .vec_dot_type             = GGML_TYPE_F32,
-        .nrows                    = 1,
     },
     [GGML_TYPE_F16] = {
         .type_name                = "f16",
@@ -808,9 +585,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
-        .vec_dot_type             = GGML_TYPE_F16,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q4_0] = {
         .type_name                = "q4_0",
@@ -820,13 +594,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float               = quantize_row_q4_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_0_ref,
-        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#if defined (__ARM_FEATURE_MATMUL_INT8)
-        .nrows                    = 2,
-#else
-        .nrows                    = 1,
-#endif
     },
     [GGML_TYPE_Q4_1] = {
         .type_name                = "q4_1",
@@ -836,13 +603,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float               = quantize_row_q4_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
-        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-#if defined (__ARM_FEATURE_MATMUL_INT8)
-        .nrows                    = 2,
-#else
-        .nrows                    = 1,
-#endif
     },
     [4] = { // GGML_TYPE_Q4_2
         .type_name                = "DEPRECATED",
@@ -852,9 +612,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = NULL,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-        .nrows                    = 1,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name                = "DEPRECATED",
@@ -864,9 +621,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = NULL,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q5_0] = {
         .type_name                = "q5_0",
@@ -876,9 +630,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float               = quantize_row_q5_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q5_0_ref,
-        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q5_1] = {
         .type_name                = "q5_1",
@@ -888,9 +639,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float               = quantize_row_q5_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q5_1_ref,
-        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q8_0] = {
         .type_name                = "q8_0",
@@ -900,14 +648,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
         .from_float               = quantize_row_q8_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q8_0_ref,
-        .from_float_to_mat        = quantize_mat_q8_0,
-        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#if defined (__ARM_FEATURE_MATMUL_INT8)
-        .nrows                    = 2,
-#else
-        .nrows                    = 1,
-#endif
     },
     [GGML_TYPE_Q8_1] = {
         .type_name                = "q8_1",
@@ -916,8 +656,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized             = true,
         .from_float               = quantize_row_q8_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q8_1_ref,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q2_K] = {
         .type_name                = "q2_K",
@@ -927,9 +665,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float               = quantize_row_q2_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q2_K_ref,
-        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q3_K] = {
         .type_name                = "q3_K",
@@ -939,9 +674,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float               = quantize_row_q3_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q3_K_ref,
-        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q4_K] = {
         .type_name                = "q4_K",
@@ -951,9 +683,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float               = quantize_row_q4_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_K_ref,
-        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q5_K] = {
         .type_name                = "q5_K",
@@ -963,9 +692,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float               = quantize_row_q5_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q5_K_ref,
-        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q6_K] = {
         .type_name                = "q6_K",
@@ -975,9 +701,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float               = quantize_row_q6_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q6_K_ref,
-        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_XXS] = {
         .type_name                = "iq2_xxs",
@@ -987,9 +710,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_XS] = {
         .type_name                = "iq2_xs",
@@ -999,9 +719,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ3_XXS] = {
         .type_name                = "iq3_xxs",
@@ -1011,9 +728,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
         .from_float               = quantize_row_iq3_xxs,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
-        .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ3_S] = {
         .type_name                = "iq3_s",
@@ -1023,9 +737,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq3_s,
         .from_float               = quantize_row_iq3_s,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_s_ref,
-        .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_S] = {
         .type_name                = "iq2_s",
@@ -1035,9 +746,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_s,
         .from_float               = quantize_row_iq2_s,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_s_ref,
-        .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ1_S] = {
         .type_name                = "iq1_s",
@@ -1047,9 +755,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ1_M] = {
         .type_name                = "iq1_m",
@@ -1059,9 +764,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ4_NL] = {
         .type_name                = "iq4_nl",
@@ -1071,9 +773,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
         .from_float               = quantize_row_iq4_nl,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_nl_ref,
-        .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
     },
     [GGML_TYPE_IQ4_XS] = {
         .type_name                = "iq4_xs",
@@ -1083,9 +782,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
         .from_float               = quantize_row_iq4_xs,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_xs_ref,
-        .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q8_K] = {
         .type_name                = "q8_K",
@@ -1102,9 +798,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
         .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
-        .vec_dot_type             = GGML_TYPE_BF16,
-        .nrows                    = 1,
     },
     [GGML_TYPE_Q4_0_4_4] = {
         .type_name                = "q4_0_4x4",
@@ -1115,12 +808,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = NULL,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-        .ncols                    = 4,
-        .gemv                     = ggml_gemv_q4_0_4x4_q8_0,
-        .gemm                     = ggml_gemm_q4_0_4x4_q8_0,
     },
     [GGML_TYPE_Q4_0_4_8] = {
         .type_name                = "q4_0_4x8",
@@ -1131,12 +818,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = NULL,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-        .ncols                    = 4,
-        .gemv                     = ggml_gemv_q4_0_4x8_q8_0,
-        .gemm                     = ggml_gemm_q4_0_4x8_q8_0,
     },
     [GGML_TYPE_Q4_0_8_8] = {
         .type_name                = "q4_0_8x8",
@@ -1147,12 +828,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = NULL,
         .from_float               = NULL,
         .from_float_ref           = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-        .ncols                    = 8,
-        .gemv                     = ggml_gemv_q4_0_8x8_q8_0,
-        .gemm                     = ggml_gemm_q4_0_8x8_q8_0,
     },
     [GGML_TYPE_TQ1_0] = {
         .type_name                = "tq1_0",
@@ -1162,9 +837,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_tq1_0,
         .from_float               = quantize_row_tq1_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq1_0_ref,
-        .vec_dot                  = ggml_vec_dot_tq1_0_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
     [GGML_TYPE_TQ2_0] = {
         .type_name                = "tq2_0",
@@ -1174,824 +846,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float               = quantize_row_tq2_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
-        .vec_dot                  = ggml_vec_dot_tq2_0_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
     },
 };
 
-// For internal test use
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
     GGML_ASSERT(type < GGML_TYPE_COUNT);
     return &type_traits[type];
 }
 
-//
-// simd mappings
-//
-
-// we define a common set of C macros which map to specific intrinsics based on the current architecture
-// we then implement the fundamental computation operations below using only these macros
-// adding support for new architectures requires to define the corresponding SIMD macros
-//
-// GGML_F32_STEP / GGML_F16_STEP
-//   number of elements to process in a single step
-//
-// GGML_F32_EPR / GGML_F16_EPR
-//   number of elements to fit in a single register
-//
-
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
-
-#define GGML_SIMD
-
-// F32 NEON
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              float32x4_t
-#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
-#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
-#define GGML_F32x4_LOAD         vld1q_f32
-#define GGML_F32x4_STORE        vst1q_f32
-#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
-#define GGML_F32x4_ADD          vaddq_f32
-#define GGML_F32x4_MUL          vmulq_f32
-#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F32_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
-    }                                              \
-    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);         \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 NEON
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    #define GGML_F16_STEP 32
-    #define GGML_F16_EPR  8
-
-    #define GGML_F16x8              float16x8_t
-    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
-    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x))
-    #define GGML_F16x8_STORE        vst1q_f16
-    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
-    #define GGML_F16x8_ADD          vaddq_f16
-    #define GGML_F16x8_MUL          vmulq_f16
-    #define GGML_F16x8_REDUCE(res, x)                               \
-    do {                                                            \
-        int offset = GGML_F16_ARR >> 1;                             \
-        for (int i = 0; i < offset; ++i) {                          \
-            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
-        }                                                           \
-        offset >>= 1;                                               \
-        for (int i = 0; i < offset; ++i) {                          \
-            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
-        }                                                           \
-        offset >>= 1;                                               \
-        for (int i = 0; i < offset; ++i) {                          \
-            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
-        }                                                           \
-        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
-        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
-        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
-    } while (0)
-
-    #define GGML_F16_VEC                GGML_F16x8
-    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
-    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
-    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
-    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
-#else
-    // if FP16 vector arithmetic is not supported, we use FP32 instead
-    // and take advantage of the vcvt_ functions to convert to/from FP16
-
-    #define GGML_F16_STEP 16
-    #define GGML_F16_EPR  4
-
-    #define GGML_F32Cx4              float32x4_t
-    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
-    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
-    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
-    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
-    #define GGML_F32Cx4_ADD          vaddq_f32
-    #define GGML_F32Cx4_MUL          vmulq_f32
-    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
-
-    #define GGML_F16_VEC                GGML_F32Cx4
-    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
-    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
-    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
-    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
-#endif
-
-#elif defined(__AVX512F__)
-
-#define GGML_SIMD
-
-// F32 AVX512
-
-#define GGML_F32_STEP 64
-#define GGML_F32_EPR  16
-
-#define GGML_F32x16         __m512
-#define GGML_F32x16_ZERO    _mm512_setzero_ps()
-#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
-#define GGML_F32x16_LOAD    _mm512_loadu_ps
-#define GGML_F32x16_STORE   _mm512_storeu_ps
-// _mm512_fmadd_ps is defined in AVX512F so no guard is required
-#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
-#define GGML_F32x16_ADD     _mm512_add_ps
-#define GGML_F32x16_MUL     _mm512_mul_ps
-#define GGML_F32x16_REDUCE(res, x)                                    \
-do {                                                                  \
-    int offset = GGML_F32_ARR >> 1;                                   \
-    for (int i = 0; i < offset; ++i) {                                \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
-    }                                                                 \
-    offset >>= 1;                                                     \
-    for (int i = 0; i < offset; ++i) {                                \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
-    }                                                                 \
-    offset >>= 1;                                                     \
-    for (int i = 0; i < offset; ++i) {                                \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
-    }                                                                 \
-    res = _mm512_reduce_add_ps(x[0]);                                 \
-} while (0)
-
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x16
-#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
-
-// F16 AVX512
-
-// F16 AVX
-
-#define GGML_F16_STEP 64
-#define GGML_F16_EPR  16
-
-// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
-
-#define GGML_F32Cx16             __m512
-#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
-#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
-
-// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
-// so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
-#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
-
-#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
-#define GGML_F32Cx16_ADD         _mm512_add_ps
-#define GGML_F32Cx16_MUL         _mm512_mul_ps
-#define GGML_F32Cx16_REDUCE(res, x)                               \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    res = _mm512_reduce_add_ps(x[0]);                             \
-} while (0)
-
-#define GGML_F16_VEC                GGML_F32Cx16
-#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
-
-#elif defined(__AVX__)
-
-#define GGML_SIMD
-
-// F32 AVX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    _mm256_setzero_ps()
-#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
-#define GGML_F32x8_LOAD    _mm256_loadu_ps
-#define GGML_F32x8_STORE   _mm256_storeu_ps
-#if defined(__FMA__)
-    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
-#endif
-#define GGML_F32x8_ADD     _mm256_add_ps
-#define GGML_F32x8_MUL     _mm256_mul_ps
-#define GGML_F32x8_REDUCE(res, x)                                 \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
-                                 _mm256_extractf128_ps(x[0], 1)); \
-    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
-    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
-} while (0)
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 AVX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by AVX, so we use F32 instead
-
-#define GGML_F32Cx8             __m256
-#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
-#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
-
-#if defined(__F16C__)
-// the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
-#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
-#else
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return _mm256_loadu_ps(tmp);
-}
-static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
-    float arr[8];
-
-    _mm256_storeu_ps(arr, y);
-
-    for (int i = 0; i < 8; i++)
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-}
-#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
-#endif
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         _mm256_add_ps
-#define GGML_F32Cx8_MUL         _mm256_mul_ps
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_SIMD
-
-// F32 POWER9
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
-#define GGML_F32x4_SET1         vec_splats
-#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
-#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
-#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    res = vec_extract(x[0], 0) +               \
-          vec_extract(x[0], 1) +               \
-          vec_extract(x[0], 2) +               \
-          vec_extract(x[0], 3);                \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 POWER9
-#define GGML_F16_STEP       GGML_F32_STEP
-#define GGML_F16_EPR        GGML_F32_EPR
-#define GGML_F16_VEC        GGML_F32x4
-#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
-// Use vec_xl, not vec_ld, in case the load address is not aligned.
-#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
-  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
-  vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
-#define GGML_F16_VEC_STORE(p, r, i)                             \
-  if (i & 0x1)                                                  \
-    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
-                                   r[i - GGML_ENDIAN_BYTE(0)]), \
-            0, p - GGML_F16_EPR)
-
-#elif defined(__wasm_simd128__)
-
-#define GGML_SIMD
-
-// F32 WASM
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              v128_t
-#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
-#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
-#define GGML_F32x4_LOAD         wasm_v128_load
-#define GGML_F32x4_STORE        wasm_v128_store
-#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
-#define GGML_F32x4_ADD          wasm_f32x4_add
-#define GGML_F32x4_MUL          wasm_f32x4_mul
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F32_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 WASM
-
-#define GGML_F16_STEP 16
-#define GGML_F16_EPR  4
-
-inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(p[0]);
-    tmp[1] = GGML_FP16_TO_FP32(p[1]);
-    tmp[2] = GGML_FP16_TO_FP32(p[2]);
-    tmp[3] = GGML_FP16_TO_FP32(p[3]);
-
-    return wasm_v128_load(tmp);
-}
-
-inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
-    float tmp[4];
-
-    wasm_v128_store(tmp, x);
-
-    p[0] = GGML_FP32_TO_FP16(tmp[0]);
-    p[1] = GGML_FP32_TO_FP16(tmp[1]);
-    p[2] = GGML_FP32_TO_FP16(tmp[2]);
-    p[3] = GGML_FP32_TO_FP16(tmp[3]);
-}
-
-#define GGML_F16x4             v128_t
-#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
-#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
-#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
-#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
-#define GGML_F16x4_FMA         GGML_F32x4_FMA
-#define GGML_F16x4_ADD         wasm_f32x4_add
-#define GGML_F16x4_MUL         wasm_f32x4_mul
-#define GGML_F16x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F16_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F16_VEC                GGML_F16x4
-#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
-#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
-#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
-#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
-
-#elif defined(__SSE3__)
-
-#define GGML_SIMD
-
-// F32 SSE
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    _mm_setzero_ps()
-#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
-#define GGML_F32x4_LOAD    _mm_loadu_ps
-#define GGML_F32x4_STORE   _mm_storeu_ps
-#if defined(__FMA__)
-    // TODO: Does this work?
-    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
-#endif
-#define GGML_F32x4_ADD     _mm_add_ps
-#define GGML_F32x4_MUL     _mm_mul_ps
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
-    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
-}
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 SSE
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_FP16_TO_FP32(x[3]);
-
-    return _mm_loadu_ps(tmp);
-}
-
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
-    float arr[4];
-
-    _mm_storeu_ps(arr, y);
-
-    x[0] = GGML_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_FP32_TO_FP16(arr[3]);
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
-#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
-#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         _mm_add_ps
-#define GGML_F32Cx4_MUL         _mm_mul_ps
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#elif defined(__loongarch_asx)
-
-#define GGML_SIMD
-
-// F32 LASX
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
-#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
-#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
-#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
-#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
-#define GGML_F32x8_ADD     __lasx_xvfadd_s
-#define GGML_F32x8_MUL     __lasx_xvfmul_s
-#define GGML_F32x8_REDUCE(res, x)                                 \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
-    }                                                             \
-    float *tmp_p = (float *)&x[0]; \
-    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
-} while (0)
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 LASX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by AVX, so we use F32 instead
-
-#define GGML_F32Cx8          __m256
-#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
-
-static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return (__m256)__lasx_xvld(tmp, 0);
-}
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-
-    __lasx_xvst(y, arr, 0);
-
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
-}
-#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
-#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__loongarch_sx)
-
-#define GGML_SIMD
-
-// F32 LSX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    __lsx_vldi(0)
-#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
-#define GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
-#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
-#define GGML_F32x4_ADD     __lsx_vfadd_s
-#define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i)t0, 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0);        \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 LSX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_FP16_TO_FP32(x[3]);
-
-    return __lsx_vld(tmp, 0);
-}
-
-static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
-    float arr[4];
-
-    __lsx_vst(y, arr, 0);
-
-    x[0] = GGML_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_FP32_TO_FP16(arr[3]);
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         __lsx_vfadd_s
-#define GGML_F32Cx4_MUL         __lsx_vfmul_s
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#endif
-
-// GGML_F32_ARR / GGML_F16_ARR
-//   number of registers to use per step
-#ifdef GGML_SIMD
-#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
-#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
-#endif
-
 //
 // ggml object
 //
@@ -2031,972 +893,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
-//
-// Threading defs
-//
-
-typedef pthread_t          ggml_thread_t;
-
-#if defined(_WIN32)
-
-typedef CONDITION_VARIABLE ggml_cond_t;
-typedef SRWLOCK            ggml_mutex_t;
-
-#define ggml_mutex_init(m)   InitializeSRWLock(m)
-#define ggml_mutex_destroy(m)
-#define ggml_mutex_lock(m)   AcquireSRWLockExclusive(m)
-#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
-#define ggml_mutex_lock_shared(m)   AcquireSRWLockShared(m)
-#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
-
-#define ggml_cond_init(c)    InitializeConditionVariable(c)
-#define ggml_cond_destroy(c)
-#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
-#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#else
-
-typedef pthread_cond_t     ggml_cond_t;
-typedef pthread_mutex_t    ggml_mutex_t;
-
-#define ggml_mutex_init(m)          pthread_mutex_init(m, NULL)
-#define ggml_mutex_destroy(m)       pthread_mutex_destroy(m)
-#define ggml_mutex_lock(m)          pthread_mutex_lock(m)
-#define ggml_mutex_unlock(m)        pthread_mutex_unlock(m)
-#define ggml_mutex_lock_shared(m)   pthread_mutex_lock(m)
-#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x)    _mm_pause()
-#else
-#define ggml_lock_lock(x)    UNUSED(x)
-#endif
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-#define ggml_cond_init(c)      pthread_cond_init(c, NULL)
-#define ggml_cond_destroy(c)   pthread_cond_destroy(c)
-#define ggml_cond_wait(c, m)   pthread_cond_wait(c, m)
-#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#endif
-
-// Threadpool def
-struct ggml_threadpool {
-    ggml_mutex_t mutex;       // mutex for cond.var
-    ggml_cond_t  cond;        // cond.var for waiting for new work
-
-    struct ggml_cgraph * cgraph;
-    struct ggml_cplan  * cplan;
-
-    // synchronization primitives
-    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
-    atomic_int GGML_CACHE_ALIGN n_barrier;
-    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
-    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
-
-    // these are atomic as an annotation for thread-sanitizer
-    atomic_bool stop;         // Used for stopping the threadpool altogether
-    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_bool abort;        // Used for aborting processing of a graph
-
-    struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
-    atomic_int   n_threads_cur; // number of threads used in the current graph
-
-    int32_t      prio;        // Scheduling priority
-    uint32_t     poll;        // Polling level (0 - no polling)
-
-    enum ggml_status ec;
-};
-
-// Per-thread state
-struct ggml_compute_state {
-#ifndef GGML_USE_OPENMP
-    ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
-    int  last_graph;
-    bool pending;
-#endif
-    struct ggml_threadpool * threadpool;
-    int ith;
-};
-
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_threadpool * threadpool;
-};
-
-//
-// fundamental operations
-//
-
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
-
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
-   assert(nrc == 1);
-   UNUSED(nrc);
-   UNUSED(bx);
-   UNUSED(by);
-   UNUSED(bs);
-
-#if defined(GGML_SIMD)
-    float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#else
-    // scalar
-    ggml_float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(x[i]*y[i]);
-    }
-#endif
-
-    *s = sumf;
-}
-
-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    int i = 0;
-    ggml_float sumf = 0;
-
-#if defined(__AVX512BF16__)
-    __m512 c1 = _mm512_setzero_ps();
-    __m512 c2 = _mm512_setzero_ps();
-    for (; i + 64 <= n; i += 64) {
-        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
-                             m512bh(_mm512_loadu_si512((y + i))));
-        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
-                             m512bh(_mm512_loadu_si512((y + i + 32))));
-    }
-    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
-    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
-
-#elif defined(__AVX512F__)
-#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
-    __m512 c1 = _mm512_setzero_ps();
-    __m512 c2 = _mm512_setzero_ps();
-    for (; i + 32 <= n; i += 32) {
-        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
-        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
-    }
-    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
-    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
-
-#undef LOAD
-#elif defined(__AVX2__)
-#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
-    __m256 c1 = _mm256_setzero_ps();
-    __m256 c2 = _mm256_setzero_ps();
-    __m256 c3 = _mm256_setzero_ps();
-    __m256 c4 = _mm256_setzero_ps();
-    for (; i + 32 <= n; i += 32) {
-        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
-        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
-        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
-        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
-    }
-    __m128 g;
-    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
-                       _mm256_add_ps(c2, c4));
-    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
-                   _mm256_castps256_ps128(c1));
-    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
-    g = _mm_add_ss(g, _mm_movehdup_ps(g));
-    sumf += (ggml_float)_mm_cvtss_f32(g);
-
-#undef LOAD
-#endif
-
-    for (; i < n; ++i) {
-        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
-                             GGML_BF16_TO_FP32(y[i]));
-    }
-    *s = sumf;
-}
-
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    ggml_float sumf = 0.0;
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#endif
-
-    *s = sumf;
-}
-
-// compute GGML_VEC_DOT_UNROLL dot products at once
-// xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
-    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
-
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
-
-                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
-            }
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#endif
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        s[i] = sumf[i];
-    }
-}
-
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#endif
-}
-
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#endif
-}
-
-// xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
-
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
-        x[i] = (const float *) ((const char *) xv + i*xs);
-        v[i] = (const float *) ((const char *) vv + i*vs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
-
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
-    }
-
-    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
-            }
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = np; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#else
-    // scalar
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = 0; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#endif
-}
-
-//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
-inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
-#if defined(GGML_USE_ACCELERATE)
-    vDSP_vsmul(y, 1, &v, y, 1, n);
-#elif defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] *= v;
-    }
-#endif
-}
-
-inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
-
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
-    }
-#endif
-}
-
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
-inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
-inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
-inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
-inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
-inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
-inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
-inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
-inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
-inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
-inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
-inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
-inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
-// TODO: optimize performance
-inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
-inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
-inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
-
-static const float GELU_COEF_A     = 0.044715f;
-static const float GELU_QUICK_COEF = -1.702f;
-static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
-
-inline static float ggml_gelu_f32(float x) {
-    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
-    }
-}
-
-#ifdef GGML_GELU_FP16
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        if (x[i] <= -10.0f) {
-            y[i] = 0.0f;
-        } else if (x[i] >= 10.0f) {
-            y[i] = x[i];
-        } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-            memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
-        }
-    }
-}
-#else
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-}
-#endif
-
-inline static float ggml_gelu_quick_f32(float x) {
-    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
-}
-
-//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_GELU_QUICK_FP16
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_quick_f32(x[i]);
-    }
-}
-#endif
-
-// Sigmoid Linear Unit (SiLU) function
-inline static float ggml_silu_f32(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-#if __FINITE_MATH_ONLY__
-#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
-#endif
-
-#if defined(__ARM_NEON) && defined(__aarch64__)
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-inline static float32x4_t ggml_v_expf(float32x4_t x) {
-    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
-    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
-    const float32x4_t n = vsubq_f32(z, r);
-    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
-                                    vdupq_n_f32(0x1.7f7d1cp-20f));
-    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
-    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
-    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
-    const float32x4_t u = vmulq_f32(b, b);
-    const float32x4_t j = vfmaq_f32(
-        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
-        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
-                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
-    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
-        return vfmaq_f32(k, j, k);
-    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
-    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
-    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
-    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
-                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
-}
-
-// computes silu x/(1+exp(-x)) in single precision vector
-inline static float32x4_t ggml_v_silu(float32x4_t x) {
-    const float32x4_t one = vdupq_n_f32(1.0f);
-    const float32x4_t zero = vdupq_n_f32(0.0f);
-    const float32x4_t neg_x = vsubq_f32(zero, x);
-    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
-    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
-    return vdivq_f32(x, one_plus_exp_neg_x);
-}
-
-#elif defined(__AVX512F__) && defined(__AVX512DQ__)
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-inline static __m512 ggml_v_expf(__m512 x) {
-  const __m512 r = _mm512_set1_ps(0x1.8p23f);
-  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
-  const __m512 n = _mm512_sub_ps(z, r);
-  const __m512 b =
-      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
-                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
-  const __mmask16 d =
-      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  const __m512 j = _mm512_fmadd_ps(
-      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                      _mm512_set1_ps(0x1.573e2ep-5f)),
-                      u,
-                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
-      u,
-      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
-  const __m512 res = _mm512_scalef_ps(j, n);
-  if (_mm512_kortestz(d, d))
-    return res;
-  const __m512 zero = _mm512_setzero_ps();
-  const __m512 alt = _mm512_mask_blend_ps(
-      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
-  return _mm512_mask_blend_ps(d, res, alt);
-}
-
-// computes silu x/(1+exp(-x)) in single precision vector
-inline static __m512 ggml_v_silu(__m512 x) {
-    const __m512 one = _mm512_set1_ps(1);
-    const __m512 zero = _mm512_setzero_ps();
-    const __m512 neg_x = _mm512_sub_ps(zero, x);
-    const __m512 exp_neg_x = ggml_v_expf(neg_x);
-    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
-    return _mm512_div_ps(x, one_plus_exp_neg_x);
-}
-
-#elif defined(__AVX2__) && defined(__FMA__)
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-inline static __m256 ggml_v_expf(__m256 x) {
-  const __m256 r = _mm256_set1_ps(0x1.8p23f);
-  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
-  const __m256 n = _mm256_sub_ps(z, r);
-  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
-                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
-  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
-  const __m256 k = _mm256_castsi256_ps(
-      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
-  const __m256i c = _mm256_castps_si256(
-      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
-                    _mm256_set1_ps(126), _CMP_GT_OQ));
-  const __m256 u = _mm256_mul_ps(b, b);
-  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
-                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
-                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
-                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
-  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
-    return _mm256_fmadd_ps(j, k, k);
-  const __m256i g = _mm256_and_si256(
-      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
-      _mm256_set1_epi32(0x82000000u));
-  const __m256 s1 =
-      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
-  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
-  const __m256i d = _mm256_castps_si256(
-      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
-                    _mm256_set1_ps(192), _CMP_GT_OQ));
-  return _mm256_or_ps(
-      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
-      _mm256_andnot_ps(
-          _mm256_castsi256_ps(d),
-          _mm256_or_ps(
-              _mm256_and_ps(_mm256_castsi256_ps(c),
-                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
-              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
-}
-
-// computes silu x/(1+exp(-x)) in single precision vector
-inline static __m256 ggml_v_silu(__m256 x) {
-    const __m256 one = _mm256_set1_ps(1);
-    const __m256 zero = _mm256_setzero_ps();
-    const __m256 neg_x = _mm256_sub_ps(zero, x);
-    const __m256 exp_neg_x = ggml_v_expf(neg_x);
-    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
-    return _mm256_div_ps(x, one_plus_exp_neg_x);
-}
-
-#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
-
-#if defined(__FMA__)
-#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
-#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
-#else
-#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
-#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
-#endif
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-inline static __m128 ggml_v_expf(__m128 x) {
-    const __m128 r = _mm_set1_ps(0x1.8p23f);
-    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
-    const __m128 n = _mm_sub_ps(z, r);
-    const __m128 b =
-        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
-    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
-    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
-    const __m128i c =
-        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
-    const __m128 u = _mm_mul_ps(b, b);
-    const __m128 j =
-        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
-                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
-                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
-    if (!_mm_movemask_epi8(c))
-        return MADD128(j, k, k);
-    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
-                                    _mm_set1_epi32(0x82000000u));
-    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
-    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
-    const __m128i d =
-        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
-    return _mm_or_ps(
-        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
-        _mm_andnot_ps(_mm_castsi128_ps(d),
-                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
-                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
-}
-
-// computes silu x/(1+exp(-x)) in single precision vector
-inline static __m128 ggml_v_silu(__m128 x) {
-    const __m128 one = _mm_set1_ps(1);
-    const __m128 zero = _mm_setzero_ps();
-    const __m128 neg_x = _mm_sub_ps(zero, x);
-    const __m128 exp_neg_x = ggml_v_expf(neg_x);
-    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
-    return _mm_div_ps(x, one_plus_exp_neg_x);
-}
-
-#endif // __ARM_NEON / __AVX2__ / __SSE2__
-
-static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
-    }
-#endif
-    for (; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
-    }
-}
-
-static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
-    int i = 0;
-    ggml_float sum = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    for (; i + 15 < n; i += 16) {
-        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
-                                               _mm512_set1_ps(max)));
-        _mm512_storeu_ps(y + i, val);
-        sum += (ggml_float)_mm512_reduce_add_ps(val);
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
-                                               _mm256_set1_ps(max)));
-        _mm256_storeu_ps(y + i, val);
-        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
-                                 _mm256_castps256_ps128(val));
-        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
-        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
-        sum += (ggml_float)_mm_cvtss_f32(val2);
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
-                                            _mm_set1_ps(max)));
-        _mm_storeu_ps(y + i, val);
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
-        val = _mm_add_ss(val, _mm_movehdup_ps(val));
-#else
-        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
-        val = _mm_add_ps(val, tmp);
-        tmp = _mm_movehl_ps(tmp, val);
-        val = _mm_add_ss(val, tmp);
-#endif
-        sum += (ggml_float)_mm_cvtss_f32(val);
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    for (; i + 3 < n; i += 4) {
-        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
-                                                vdupq_n_f32(max)));
-        vst1q_f32(y + i, val);
-        sum += (ggml_float)vaddvq_f32(val);
-    }
-#endif
-    for (; i < n; ++i) {
-        float val = expf(x[i] - max);
-        sum += (ggml_float)val;
-        y[i] = val;
-    }
-    return sum;
-}
-
-static ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
-    // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
-
-    int i = 0;
-    ggml_float sum = 0;
-    for (; i < n; ++i) {
-        float val = x[i] - max;
-        y[i] = val;
-        sum += (ggml_float)expf(val);
-    }
-    return sum = (ggml_float)logf(sum);
-}
-
-inline static float ggml_silu_backward_f32(float x, float dy) {
-    const float s = 1.0f/(1.0f + expf(-x));
-    return dy*s*(1.0f + x*(1.0f - s));
-}
-
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
-    }
-}
-
-inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-#else
-    vDSP_sve(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
-    float sum = 0.0f;
-    for (int i = 0; i < n; ++i) {
-        sum += GGML_FP16_TO_FP32(x[i]);
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
-    float sum = 0.0f;
-    for (int i = 0; i < n; ++i) {
-        sum += GGML_BF16_TO_FP32(x[i]);
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    float max = -INFINITY;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-    }
-    *s = max;
-#else
-    vDSP_maxv(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
-    ggml_vec_norm_f32(n, s, x);
-    *s = 1.f/(*s);
-}
-
-inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
-    float max = -INFINITY;
-    int idx = 0;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-        if (max == x[i]) { idx = i; }
-    }
-    *s = idx;
-}
-
 //
 // data types
 //
@@ -3217,214 +1113,6 @@ static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
-// Helpers for polling loops
-#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
-static inline void ggml_thread_cpu_relax(void) {
-    __asm__ volatile("yield" ::: "memory");
-}
-#elif defined(__x86_64__)
-static inline void ggml_thread_cpu_relax(void) {
-    _mm_pause();
-}
-#else
-static inline void ggml_thread_cpu_relax(void) {;}
-#endif
-
-//
-// NUMA support
-//
-
-#define GGML_NUMA_MAX_NODES 8
-#define GGML_NUMA_MAX_CPUS 512
-
-struct ggml_numa_node {
-    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
-    uint32_t n_cpus;
-};
-
-struct ggml_numa_nodes {
-    enum ggml_numa_strategy numa_strategy;
-    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
-    uint32_t n_nodes;
-    uint32_t total_cpus; // hardware threads on system
-    uint32_t current_node; // node on which main process is execting
-#if defined(__gnu_linux__)
-    cpu_set_t cpuset; // cpuset from numactl
-#else
-    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
-#endif
-};
-
-//
-// ggml state
-//
-
-struct ggml_state {
-    struct ggml_numa_nodes numa;
-};
-
-// global state
-static struct ggml_state g_state;
-static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
-
-// critical section via spin lock
-inline static void ggml_critical_section_start(void) {
-    while (atomic_flag_test_and_set(&g_state_critical)) {
-        // spin
-        sched_yield();
-    }
-}
-
-static void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
-    if (n_threads == 1) {
-        return;
-    }
-
-#ifdef GGML_USE_OPENMP
-    #pragma omp barrier
-#else
-    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
-
-    // enter barrier (full seq-cst fence)
-    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
-
-    if (n_barrier == (n_threads - 1)) {
-        // last thread
-        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
-
-        // exit barrier (fill seq-cst fence)
-        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
-        return;
-    }
-
-    // wait for other threads
-    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
-        ggml_thread_cpu_relax();
-    }
-
-    // exit barrier (full seq-cst fence)
-    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
-    #ifdef GGML_TSAN_ENABLED
-    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
-    #else
-    atomic_thread_fence(memory_order_seq_cst);
-    #endif
-#endif
-}
-
-// TODO: make this somehow automatically executed
-//       some sort of "sentry" mechanism
-inline static void ggml_critical_section_end(void) {
-    atomic_flag_clear(&g_state_critical);
-}
-
-#if defined(__gnu_linux__)
-static cpu_set_t ggml_get_numa_affinity(void) {
-    cpu_set_t cpuset;
-    pthread_t thread;
-    thread = pthread_self();
-    CPU_ZERO(&cpuset);
-    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
-    return cpuset;
-}
-#else
-static uint32_t ggml_get_numa_affinity(void) {
-    return 0; // no NUMA support
-}
-#endif
-
-void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
-    if (g_state.numa.n_nodes > 0) {
-        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
-
-        return;
-    }
-
-#if defined(__gnu_linux__)
-    struct stat st;
-    char path[256];
-    int rv;
-
-    // set numa scheme
-    g_state.numa.numa_strategy = numa_flag;
-
-    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
-
-    g_state.numa.cpuset = ggml_get_numa_affinity();
-
-    // enumerate nodes
-    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++g_state.numa.n_nodes;
-    }
-
-    // enumerate CPUs
-    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++g_state.numa.total_cpus;
-    }
-
-    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
-
-    // figure out which node we're on
-    uint current_cpu;
-    int getcpu_ret = 0;
-#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
-    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
-#else
-    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
-#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
-#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
-#   endif
-    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
-#endif
-
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
-        g_state.numa.n_nodes = 0;
-        return;
-    }
-
-    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
-
-    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
-        struct ggml_numa_node * node = &g_state.numa.nodes[n];
-        GGML_PRINT_DEBUG("CPUs on node %u:", n);
-        node->n_cpus = 0;
-        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
-            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
-            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-            if (stat(path, &st) == 0) {
-                node->cpus[node->n_cpus++] = c;
-                GGML_PRINT_DEBUG(" %u", c);
-            }
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    if (ggml_is_numa()) {
-        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
-        if (fptr != NULL) {
-            char buf[42];
-            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
-                GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
-            }
-            fclose(fptr);
-        }
-    }
-#else
-    UNUSED(numa_flag);
-    // TODO
-#endif
-}
-
-bool ggml_is_numa(void) {
-    return g_state.numa.n_nodes > 1;
-}
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -3561,22 +1249,6 @@ int ggml_n_dims(const struct ggml_tensor * tensor) {
     return 1;
 }
 
-static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[0]           == t1->ne[0])  &&
-           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
-           (t1->ne[3]%t0->ne[3] == 0);
-}
-
-static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[1] == t1->ne[1])   &&
-           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
-           (t1->ne[3]%t0->ne[3] == 0);
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;
 
@@ -3723,140 +1395,29 @@ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const str
     return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
 }
 
-static inline int ggml_up32(int n) {
-    return (n + 31) & ~31;
-}
-
-//static inline int ggml_up64(int n) {
-//    return (n + 63) & ~63;
-//}
-
-static inline int ggml_up(int n, int m) {
-    // assert m is a power of 2
-    GGML_ASSERT((m & (m - 1)) == 0);
-    return (n + m - 1) & ~(m - 1);
-}
-
 // assert that pointer is aligned to GGML_MEM_ALIGN
 #define GGML_ASSERT_ALIGNED(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__ARM_ARCH)
-
-#if defined(__linux__) && defined(__aarch64__)
-#include <sys/auxv.h>
-#elif defined(__APPLE__)
-#include <sys/sysctl.h>
-#endif
-
-#if !defined(HWCAP2_I8MM)
-#define HWCAP2_I8MM 0
-#endif
-
-static void ggml_init_arm_arch_features(void) {
-#if defined(__linux__) && defined(__aarch64__)
-    uint32_t hwcap = getauxval(AT_HWCAP);
-    uint32_t hwcap2 = getauxval(AT_HWCAP2);
-
-    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
-    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
-    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
-
-#if defined(__ARM_FEATURE_SVE)
-    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-#endif
-#elif defined(__APPLE__)
-    int oldp = 0;
-    size_t size = sizeof(oldp);
-    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    ggml_arm_arch_features.has_neon = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    ggml_arm_arch_features.has_i8mm = oldp;
-
-    ggml_arm_arch_features.has_sve = 0;
-    ggml_arm_arch_features.sve_cnt = 0;
-#else
-// Run-time CPU feature detection not implemented for this platform, fallback to compile time
-#if defined(__ARM_NEON)
-    ggml_arm_arch_features.has_neon = 1;
-#else
-    ggml_arm_arch_features.has_neon = 0;
-#endif
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    ggml_arm_arch_features.has_i8mm = 1;
-#else
-    ggml_arm_arch_features.has_i8mm = 0;
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-    ggml_arm_arch_features.has_sve = 1;
-    ggml_arm_arch_features.sve_cnt = 16;
-#else
-    ggml_arm_arch_features.has_sve = 0;
-    ggml_arm_arch_features.sve_cnt = 0;
-#endif
-#endif
-}
-#endif
-
 struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
+    static bool is_first_call = false;
+
     ggml_critical_section_start();
 
-    static bool is_first_call = true;
-
-    if (is_first_call) {
+    if (!is_first_call) {
         // initialize time system (required on Windows)
         ggml_time_init();
 
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            for (int i = 0; i < (1 << 16); ++i) {
-                union {
-                    uint16_t u16;
-                    ggml_fp16_t fp16;
-                } u = {i};
-                float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
-                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        for (int i = 0; i < (1 << 16); ++i) {
+            union {
+                uint16_t u16;
+                ggml_fp16_t fp16;
+            } u = {i};
+            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
         }
-
-        // initialize g_state
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            g_state = (struct ggml_state) {
-                /*.numa =*/ {
-                    .n_nodes = 0,
-                    .total_cpus = 0,
-                },
-            };
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-#if defined(__ARM_ARCH)
-        ggml_init_arm_arch_features();
-#endif
-
-        is_first_call = false;
+        is_first_call = true;
     }
 
     ggml_critical_section_end();
@@ -4123,183 +1684,16 @@ struct ggml_tensor * ggml_new_tensor_4d(
     return ggml_new_tensor(ctx, type, 4, ne);
 }
 
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, nbytes);
 
-    ggml_set_i32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-
-    ggml_set_f32(result, value);
-
-    return result;
+    return (uint8_t *)ctx->mem_buffer + obj->offs;
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
     return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
-static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    return ((const int32_t *)(tensor->op_params))[i];
-}
-
-static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
-    return ((const float *)(tensor->op_params))[i];
-}
-
-static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    ((int32_t *)(tensor->op_params))[i] = value;
-}
-
-static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
-    ((float *)(tensor->op_params))[i] = value;
-}
-
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
-    if (ggml_is_empty(tensor)) {
-        return tensor;
-    }
-    if (tensor->buffer) {
-        ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
-    } else {
-        GGML_ASSERT(tensor->data);
-        memset(tensor->data, 0, ggml_nbytes(tensor));
-    }
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
-                }
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
-                }
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_bf16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-
-    return tensor;
-}
-
 void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
     const int64_t ne2 = tensor->ne[2];
     const int64_t ne1 = tensor->ne[1];
@@ -4324,280 +1718,6 @@ void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t *
     }
 }
 
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_BF16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
-                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            }
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
-        return;
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
-                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            return ((int8_t *) data)[0];
-        case GGML_TYPE_I16:
-            return ((int16_t *) data)[0];
-        case GGML_TYPE_I32:
-            return ((int32_t *) data)[0];
-        case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
-        case GGML_TYPE_BF16:
-            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
-        case GGML_TYPE_F32:
-            return ((float *) data)[0];
-        default:
-            GGML_ABORT("fatal error");
-    }
-}
-
-void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(data))[0] = value;
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                return ((int8_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I16:
-            {
-                return ((int16_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I32:
-            {
-                return ((int32_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_F16:
-            {
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_BF16:
-            {
-                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_F32:
-            {
-                return ((float *)(tensor->data))[i];
-            }
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
-        return;
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            return ((int8_t *) data)[0];
-        case GGML_TYPE_I16:
-            return ((int16_t *) data)[0];
-        case GGML_TYPE_I32:
-            return ((int32_t *) data)[0];
-        case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
-        case GGML_TYPE_BF16:
-            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
-        case GGML_TYPE_F32:
-            return ((float *) data)[0];
-        default:
-            GGML_ABORT("fatal error");
-    }
-}
-
-void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(data))[0] = value;
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
 void * ggml_get_data(const struct ggml_tensor * tensor) {
     return tensor->data;
 }
@@ -5572,6 +2692,14 @@ struct ggml_tensor * ggml_group_norm_inplace(
 
 // ggml_mul_mat
 
+static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[0]           == t1->ne[0])  &&
+           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
+}
+
 struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -5641,6 +2769,14 @@ struct ggml_tensor * ggml_mul_mat_id(
 
 // ggml_out_prod
 
+static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[1] == t1->ne[1])   &&
+           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
+}
+
 struct ggml_tensor * ggml_out_prod(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -7613,11 +4749,6 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
 }
 
 // ggml_map_custom1
-struct ggml_map_custom1_op_params {
-    ggml_custom1_op_t  fun;
-    int                n_tasks;
-    void             * userdata;
-};
 
 static struct ggml_tensor * ggml_map_custom1_impl(
         struct ggml_context      * ctx,
@@ -7663,12 +4794,6 @@ struct ggml_tensor * ggml_map_custom1_inplace(
 
 // ggml_map_custom2
 
-struct ggml_map_custom2_op_params {
-    ggml_custom2_op_t   fun;
-    int                 n_tasks;
-    void              * userdata;
-};
-
 static struct ggml_tensor * ggml_map_custom2_impl(
         struct ggml_context      * ctx,
         struct ggml_tensor       * a,
@@ -7717,12 +4842,6 @@ struct ggml_tensor * ggml_map_custom2_inplace(
 
 // ggml_map_custom3
 
-struct ggml_map_custom3_op_params {
-    ggml_custom3_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
 static struct ggml_tensor * ggml_map_custom3_impl(
         struct ggml_context      * ctx,
         struct ggml_tensor       * a,
@@ -7850,9675 +4969,6 @@ struct ggml_tensor * ggml_opt_step_adamw(
 
 ////////////////////////////////////////////////////////////////////////////////
 
-// ggml_compute_forward_dup
-
-static void ggml_compute_forward_dup_same_cont(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    const size_t nb0 = ggml_type_size(src0->type);
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
-
-    if (ie0 < ie1) {
-        memcpy(
-            ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb0),
-            (ie1 - ie0) * nb0);
-    }
-}
-
-static void ggml_compute_forward_dup_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
-
-    if (ggml_is_contiguous(dst)) {
-        if (nb00 == sizeof(ggml_fp16_t)) {
-            if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                            }
-
-                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ABORT("fatal error"); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ABORT("fatal error"); // TODO: implement
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
-
-                        if (++i10 == ne00) {
-                            i10 = 0;
-                            if (++i11 == ne01) {
-                                i11 = 0;
-                                if (++i12 == ne02) {
-                                    i12 = 0;
-                                    if (++i13 == ne03) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ABORT("fatal error"); // TODO: implement
-    }
-}
-
-static void ggml_compute_forward_dup_bf16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
-
-    if (ggml_is_contiguous(dst)) {
-        if (nb00 == sizeof(ggml_bf16_t)) {
-            if (dst->type == GGML_TYPE_BF16) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]);
-                            }
-
-                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ABORT("fatal error"); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_BF16) {
-                size_t id = 0;
-                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ABORT("fatal error"); // TODO: implement
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_BF16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
-
-                        if (++i10 == ne00) {
-                            i10 = 0;
-                            if (++i11 == ne01) {
-                                i11 = 0;
-                                if (++i12 == ne02) {
-                                    i12 = 0;
-                                    if (++i13 == ne03) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ABORT("fatal error"); // TODO: implement
-    }
-}
-
-static void ggml_compute_forward_dup_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        // TODO: simplify
-        if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ABORT("fatal error"); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_BF16) {
-                size_t id = 0;
-                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ABORT("fatal error"); // TODO: implement
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(float));
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_BF16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ABORT("fatal error"); // TODO: implement
-    }
-}
-
-// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
-static void ggml_compute_forward_dup_bytes(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
-    const size_t type_size = ggml_type_size(src0->type);
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == type_size && nb0 == type_size) {
-        // copy by rows
-        const size_t rs = ne00 * type_size;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        size_t id = 0;
-        char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
-
-        if (nb00 == type_size) {
-            // src0 is contigous on first dimension, copy by rows
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        memcpy(dst_ptr + id, src0_ptr, rs);
-                        id += rs;
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, type_size);
-
-                            id += type_size;
-                        }
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                    memcpy(dst_ptr, src0_ptr, type_size);
-
-                    if (++i10 == ne0) {
-                        i10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_dup(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (src0->type == dst->type) {
-        ggml_compute_forward_dup_bytes(params, dst);
-        return;
-    }
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_dup_f16(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                ggml_compute_forward_dup_bf16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_dup_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_add
-
-static void ggml_compute_forward_add_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_f16_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (dst->type == GGML_TYPE_F32) {
-        GGML_ASSERT( nb0 == sizeof(float));
-    }
-    else {
-        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    }
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F16) {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-                }
-            }
-        } else {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-                }
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ABORT("fatal error");
-    }
-}
-
-static void ggml_compute_forward_add_bf16_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (dst->type == GGML_TYPE_F32) {
-        GGML_ASSERT( nb0 == sizeof(float));
-    }
-    else {
-        GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
-        GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
-    }
-
-    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        if (dst->type == GGML_TYPE_BF16) {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-                }
-            }
-        } else {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-                }
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ABORT("fatal error");
-    }
-}
-
-static void ggml_compute_forward_add_f16_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(ggml_fp16_t)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-            for (int i = 0; i < ne0; i++) {
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ABORT("fatal error");
-    }
-}
-
-static void ggml_compute_forward_add_bf16_bf16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
-    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(ggml_bf16_t)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-            ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-            for (int i = 0; i < ne0; i++) {
-                dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i]));
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ABORT("fatal error");
-    }
-}
-
-static void ggml_compute_forward_add_q_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-    const enum ggml_type dtype = dst->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float;
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        // src1 and dst are same shape as src0 => same indices
-        const int i13 = i03;
-        const int i12 = i02;
-        const int i11 = i01;
-
-        const int i3 = i03;
-        const int i2 = i02;
-        const int i1 = i01;
-
-        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3));
-
-        assert(ne00 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne00);
-        // add src1
-        ggml_vec_acc_f32(ne00, wdata, src1_row);
-        // quantize row to dst
-        if (quantize_row_q != NULL) {
-            quantize_row_q(wdata, dst_row, ne00);
-        } else {
-            memcpy(dst_row, wdata, ne0*nb0);
-        }
-    }
-}
-
-static void ggml_compute_forward_add(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f32(params, dst);
-                }
-                else {
-                    GGML_ABORT("fatal error");
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add_f16_f16(params, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f16_f32(params, dst);
-                }
-                else {
-                    GGML_ABORT("fatal error");
-                }
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                if (src1->type == GGML_TYPE_BF16) {
-                    ggml_compute_forward_add_bf16_bf16(params, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_bf16_f32(params, dst);
-                }
-                else {
-                    GGML_ABORT("fatal error");
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                ggml_compute_forward_add_q_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_add1
-
-static void ggml_compute_forward_add1_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-#ifdef GGML_USE_ACCELERATE
-        UNUSED(ggml_vec_add1_f32);
-
-        vDSP_vadd(
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                (float *) ((char *) src1->data), 0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                ne0);
-#else
-        ggml_vec_add1_f32(ne0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-               *(float *) src1->data);
-#endif
-    }
-}
-
-static void ggml_compute_forward_add1_f16_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_f16_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    // scalar to add
-    const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_q_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
-
-    // we don't support permuted src0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(dst->type == src0->type);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
-        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
-
-        assert(ne0 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne0);
-        // add src1
-        ggml_vec_acc1_f32(ne0, wdata, v);
-        // quantize row to dst
-        quantize_row_q(wdata, dst_row, ne0);
-    }
-}
-
-static void ggml_compute_forward_add1_bf16_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_bf16_bf16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    // scalar to add
-    const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
-    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add1_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add1_f16_f16(params, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add1_f16_f32(params, dst);
-                }
-                else {
-                    GGML_ABORT("fatal error");
-                }
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                if (src1->type == GGML_TYPE_BF16) {
-                    ggml_compute_forward_add1_bf16_bf16(params, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add1_bf16_f32(params, dst);
-                }
-                else {
-                    GGML_ABORT("fatal error");
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                ggml_compute_forward_add1_q_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_acc
-
-static void ggml_compute_forward_acc_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-    // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is implicitly element_size because src0 and dst are contiguous
-    size_t nb1     = ((int32_t *) dst->op_params)[0];
-    size_t nb2     = ((int32_t *) dst->op_params)[1];
-    size_t nb3     = ((int32_t *) dst->op_params)[2];
-    size_t offset  = ((int32_t *) dst->op_params)[3];
-    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-    if (!inplace) {
-        if (params->ith == 0) {
-            // memcpy needs to be synchronized across threads to avoid race conditions.
-            // => do it in INIT phase
-            memcpy(
-                ((char *)  dst->data),
-                ((char *) src0->data),
-                ggml_nbytes(dst));
-        }
-        ggml_barrier(params->threadpool);
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src1);
-    const int nc = src1->ne[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-    // src0 and dst as viewed during acc
-    const size_t nb0 = ggml_element_size(src0);
-
-    const size_t nb00 = nb0;
-    const size_t nb01 = nb1;
-    const size_t nb02 = nb2;
-    const size_t nb03 = nb3;
-
-    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0  + (ne11 == 0 ? 0 : ne11-1)*nb1  + (ne12 == 0 ? 0 : ne12-1)*nb2  + (ne13 == 0 ? 0 : ne13-1)*nb3  < ggml_nbytes(dst));
-    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are viewed with shape of src1 and offset
-        // => same indices
-        const int i3 = ir/(ne12*ne11);
-        const int i2 = (ir - i3*ne12*ne11)/ne11;
-        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
-
-#ifdef GGML_USE_ACCELERATE
-        vDSP_vadd(
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1  + offset), 1, nc);
-#else
-        ggml_vec_add_f32(nc,
-                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-#endif
-    }
-}
-
-static void ggml_compute_forward_acc(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_acc_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sub
-
-static void ggml_compute_forward_sub_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_sub(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sub_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_mul
-
-static void ggml_compute_forward_mul_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0 ; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                UNUSED(ggml_vec_mul_f32);
-
-                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mul(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_div
-
-static void ggml_compute_forward_div_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                UNUSED(ggml_vec_div_f32);
-
-                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_div(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_div_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sqr
-
-static void ggml_compute_forward_sqr_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n     = ggml_nrows(src0);
-    const int nc    = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqr_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqr(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqr_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sqrt
-
-static void ggml_compute_forward_sqrt_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqrt_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqrt(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqrt_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_log
-
-static void ggml_compute_forward_log_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_log_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_log(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_log_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sin
-
-static void ggml_compute_forward_sin_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sin_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sin(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sin_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_cos
-
-static void ggml_compute_forward_cos_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_cos_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_cos(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cos_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sum
-
-static void ggml_compute_forward_sum_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_scalar(dst));
-    assert(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    ggml_float sum     = 0;
-    ggml_float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32_ggf(ne00,
-                        &row_sum,
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((float *) dst->data)[0] = sum;
-}
-
-static void ggml_compute_forward_sum_f16(
-    const struct ggml_compute_params * params,
-          struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_scalar(dst));
-
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    float sum = 0;
-    float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f16_ggf(ne00,
-                    &row_sum,
-                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
-}
-
-static void ggml_compute_forward_sum_bf16(
-    const struct ggml_compute_params * params,
-          struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_scalar(dst));
-
-    assert(src0->nb[0] == sizeof(ggml_bf16_t));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    float sum = 0;
-    float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_bf16_ggf(ne00,
-                    &row_sum,
-                    (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
-}
-
-static void ggml_compute_forward_sum(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_sum_f16(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                ggml_compute_forward_sum_bf16(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sum_rows
-
-static void ggml_compute_forward_sum_rows_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(ne0 == 1);
-    GGML_ASSERT(ne1 == ne01);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    for (int64_t i3 = 0; i3 < ne03; i3++) {
-        for (int64_t i2 = 0; i2 < ne02; i2++) {
-            for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
-                float row_sum = 0;
-                ggml_vec_sum_f32(ne00, &row_sum, src_row);
-                dst_row[0] = row_sum;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_sum_rows(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_rows_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_mean
-
-static void ggml_compute_forward_mean_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    assert(ne0 == 1);
-    assert(ne1 == ne01);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    UNUSED(ne0);
-    UNUSED(ne1);
-    UNUSED(ne2);
-    UNUSED(ne3);
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32(ne00,
-                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-
-                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mean(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mean_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_argmax
-
-static void ggml_compute_forward_argmax_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-    assert(dst->nb[0] == sizeof(float));
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb0 = dst->nb[0];
-
-    for (int64_t i1 = 0; i1 < ne01; i1++) {
-        float * src = (float *) ((char *) src0->data + i1*nb01);
-        int32_t * dst_ = (int32_t *) ((char *)  dst->data + i1*nb0);
-        int v = 0;
-        ggml_vec_argmax_f32(ne00, &v, src);
-        dst_[0] = v;
-    }
-}
-
-static void ggml_compute_forward_argmax(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argmax_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_count_equal
-
-static void ggml_compute_forward_count_equal_i32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    GGML_ASSERT(src0->type == GGML_TYPE_I32);
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_are_same_shape(src0, src1));
-    GGML_ASSERT(ggml_is_scalar(dst));
-    GGML_ASSERT(dst->type == GGML_TYPE_I64);
-
-    const int64_t nr = ggml_nrows(src0);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    int64_t * sums = (int64_t *) params->wdata;
-    int64_t sum_thread = 0;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    for (int64_t ir = ir0; ir < ir1; ++ir) {
-        const int64_t i03 =  ir                        / (ne02*ne01);
-        const int64_t i02 = (ir - i03*ne03)            /       ne01;
-        const int64_t i01 =  ir - i03*ne03 - i02*ne02;
-
-        const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
-        const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;
-
-        for (int64_t i00 = 0; i00 < ne00; ++i00) {
-            const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
-            const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));
-
-            sum_thread += val0 == val1;
-        }
-    }
-    if (ith != 0) {
-        sums[ith] = sum_thread;
-    }
-    ggml_barrier(params->threadpool);
-
-    if (ith != 0) {
-        return;
-    }
-
-    for (int ith_other = 1; ith_other < nth; ++ith_other) {
-        sum_thread += sums[ith_other];
-    }
-    *((int64_t *) dst->data) = sum_thread;
-}
-
-static void ggml_compute_forward_count_equal(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_count_equal_i32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_repeat
-
-static void ggml_compute_forward_repeat_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3;  i3++) {
-        for                     (int k3 = 0; k3 < ne03; k3++) {
-            for                 (int i2 = 0; i2 < nr2;  i2++) {
-                for             (int k2 = 0; k2 < ne02; k2++) {
-                    for         (int i1 = 0; i1 < nr1;  i1++) {
-                        for     (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0;  i0++) {
-                                ggml_vec_cpy_f32(ne00,
-                                        (float *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0),
-                                        (float *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3;  i3++) {
-        for                     (int k3 = 0; k3 < ne03; k3++) {
-            for                 (int i2 = 0; i2 < nr2;  i2++) {
-                for             (int k2 = 0; k2 < ne02; k2++) {
-                    for         (int i1 = 0; i1 < nr1;  i1++) {
-                        for     (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0;  i0++) {
-                                ggml_fp16_t * y = (ggml_fp16_t *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0);
-                                ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01);
-                                // ggml_vec_cpy_f16(ne00, y, x)
-                                for (int i = 0; i < ne00; ++i) {
-                                    y[i]  = x[i];
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_I16:
-            {
-                ggml_compute_forward_repeat_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_repeat_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_repeat_back
-
-static void ggml_compute_forward_repeat_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_can_repeat(dst, src0));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne00/ne0);
-    const int nr1 = (int)(ne01/ne1);
-    const int nr2 = (int)(ne02/ne2);
-    const int nr3 = (int)(ne03/ne3);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (ggml_is_contiguous(dst)) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-    } else {
-        for         (int k3 = 0; k3 < ne3; k3++) {
-            for     (int k2 = 0; k2 < ne2; k2++) {
-                for (int k1 = 0; k1 < ne1; k1++) {
-                    ggml_vec_set_f32(ne0,
-                        (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
-                        0);
-                }
-            }
-        }
-    }
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3; i3++) {
-        for                     (int k3 = 0; k3 < ne3; k3++) {
-            for                 (int i2 = 0; i2 < nr2; i2++) {
-                for             (int k2 = 0; k2 < ne2; k2++) {
-                    for         (int i1 = 0; i1 < nr1; i1++) {
-                        for     (int k1 = 0; k1 < ne1; k1++) {
-                            for (int i0 = 0; i0 < nr0; i0++) {
-                                ggml_vec_acc_f32(ne0,
-                                        (float *) ((char *)  dst->data + (         k3)*nb3  + (         k2)*nb2  + (         k1)*nb1),
-                                        (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_repeat_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_concat
-
-static void ggml_compute_forward_concat_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int32_t dim = ggml_get_op_params_i32(dst, 0);
-
-    GGML_ASSERT(dim >= 0 && dim < 4);
-
-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = src0->ne[dim];
-
-    const float * x;
-
-    // TODO: smarter multi-theading
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2 += nth) {
-            for (int i1 = 0; i1 < ne1; i1++) {
-                for (int i0 = 0; i0 < ne0; i0++) {
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
-                    } else {
-                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
-                    }
-
-                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-
-                    *y = *x;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_concat(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_concat_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_abs
-
-static void ggml_compute_forward_abs_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_abs_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_abs(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_abs_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sgn
-
-static void ggml_compute_forward_sgn_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sgn_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sgn(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sgn_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_neg
-
-static void ggml_compute_forward_neg_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_neg_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_neg(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_neg_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_step
-
-static void ggml_compute_forward_step_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_step_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_step(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_step_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_tanh
-
-static void ggml_compute_forward_tanh_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_tanh_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_tanh(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_tanh_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_elu
-
-static void ggml_compute_forward_elu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_elu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_elu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_elu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_relu
-
-static void ggml_compute_forward_relu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_relu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_relu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_sigmoid
-
-static void ggml_compute_forward_sigmoid_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sigmoid_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sigmoid(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sigmoid_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_gelu
-
-static void ggml_compute_forward_gelu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_gelu_quick
-
-static void ggml_compute_forward_gelu_quick_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_quick_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu_quick(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_quick_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_silu
-
-static void ggml_compute_forward_silu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-// ggml_compute_forward_leaky_relu
-
-static void ggml_compute_forward_leaky_relu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_leaky_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
-    }
-}
-
-static void ggml_compute_forward_leaky_relu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_leaky_relu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_silu_back
-
-static void ggml_compute_forward_silu_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * grad = dst->src[1];
-
-    assert(ggml_is_contiguous_1(grad));
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-    assert(ggml_are_same_shape(src0, grad));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_backward_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])),
-                (float *) ((char *) grad->data + i1*(grad->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-
-static void ggml_compute_forward_hardswish_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_hardswish_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-static void ggml_compute_forward_hardswish(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_hardswish_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-static void ggml_compute_forward_hardsigmoid_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_hardsigmoid_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_hardsigmoid(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_hardsigmoid_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-static void ggml_compute_forward_exp_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_exp_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_exp(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_exp_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-
-// ggml_compute_forward_norm
-
-static void ggml_compute_forward_norm_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    GGML_ASSERT(eps > 0.0f);
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00];
-                }
-
-                float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_float sum2 = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    float v = x[i00] - mean;
-                    y[i00] = v;
-                    sum2 += (ggml_float)(v*v);
-                }
-
-                float variance = sum2/ne00;
-                const float scale = 1.0f/sqrtf(variance + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_norm(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_norm_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_group_rms_norm
-
-static void ggml_compute_forward_rms_norm_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    GGML_ASSERT(eps > 0.0f);
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)(x[i00] * x[i00]);
-                }
-
-                const float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-                // for (int i00 = 0; i00 < ne00; i00++) {
-                //     y[i00] = x[i00];
-                // }
-
-                const float scale = 1.0f/sqrtf(mean + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-static void ggml_compute_forward_rms_norm_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                // src1 is same shape as src0 => same indices
-                const int64_t i11 = i01;
-                const int64_t i12 = i02;
-                const int64_t i13 = i03;
-
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
-
-                ggml_float sum_xx  = 0.0;
-                ggml_float sum_xdz = 0.0;
-
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum_xx  += (ggml_float)(x[i00] * x[i00]);
-                    sum_xdz += (ggml_float)(x[i00] * dz[i00]);
-                }
-
-                //const float mean     = (float)(sum_xx)/ne00;
-                const float mean_eps = (float)(sum_xx)/ne00 + eps;
-                const float sum_eps  = (float)(sum_xx) + eps*ne00;
-                //const float mean_xdz = (float)(sum_xdz)/ne00;
-                // we could cache rms from forward pass to improve performance.
-                // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
-                //const float rms      = sqrtf(mean_eps);
-                const float rrms     = 1.0f / sqrtf(mean_eps);
-                //const float scale    = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
-
-                {
-                    // z = rms_norm(x)
-                    //
-                    // rms_norm(src0) =
-                    //     scale(
-                    //         src0,
-                    //         div(
-                    //             1,
-                    //             sqrt(
-                    //                 add(
-                    //                     scale(
-                    //                         sum(
-                    //                             sqr(
-                    //                                 src0)),
-                    //                         (1.0/N)),
-                    //                     eps))));
-
-                    // postorder:
-                    // ## op    args         grad
-                    // 00 param src0         grad[#00]
-                    // 01 const 1
-                    // 02 sqr   (#00)        grad[#02]
-                    // 03 sum   (#02)        grad[#03]
-                    // 04 const 1/N
-                    // 05 scale (#03, #04)   grad[#05]
-                    // 06 const eps
-                    // 07 add   (#05, #06)   grad[#07]
-                    // 08 sqrt  (#07)        grad[#08]
-                    // 09 div   (#01,#08)    grad[#09]
-                    // 10 scale (#00,#09)    grad[#10]
-                    //
-                    // backward pass, given grad[#10]
-                    // #10: scale
-                    // grad[#00] += scale(grad[#10],#09)
-                    // grad[#09] += sum(mul(grad[#10],#00))
-                    // #09: div
-                    // grad[#08] += neg(mul(grad[#09], div(#09,#08)))
-                    // #08: sqrt
-                    // grad[#07] += mul(grad[#08], div(0.5, #08))
-                    // #07: add
-                    // grad[#05] += grad[#07]
-                    // #05: scale
-                    // grad[#03] += scale(grad[#05],#04)
-                    // #03: sum
-                    // grad[#02] += repeat(grad[#03], #02)
-                    // #02:
-                    // grad[#00] += scale(mul(#00, grad[#02]), 2.0)
-                    //
-                    // substitute and simplify:
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
-                    // grad[#02] = repeat(grad[#03], #02)
-                    // grad[#02] = repeat(scale(grad[#05],#04), #02)
-                    // grad[#02] = repeat(scale(grad[#07],#04), #02)
-                    // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
-                    // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
-                    // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
-                    // a = b*c + d*e
-                    // a = b*c*f/f + d*e*f/f
-                    // a = (b*c*f + d*e*f)*(1/f)
-                    // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
-                    // a = (b + d*e/c)*c
-                    // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
-                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
-                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
-                    // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
-                    // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
-                    // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
-                    // a = (dz + x*div(-mean_xdz,mean_eps))*rrms
-                    // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
-                    // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                    // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                }
-                // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                // post-order:
-                // dx := x
-                // dx := scale(dx,-mean_xdz/mean_eps)
-                // dx := add(dx, dz)
-                // dx := scale(dx, rrms)
-                float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_vec_cpy_f32  (ne00, dx, x);
-                // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
-                ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
-                ggml_vec_acc_f32  (ne00, dx, dz);
-                ggml_vec_scale_f32(ne00, dx, rrms);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_group_norm
-
-static void ggml_compute_forward_group_norm_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // TODO: optimize
-
-    float eps;
-    memcpy(&eps, dst->op_params + 1, sizeof(float));
-
-    int n_channels = src0->ne[2];
-    int n_groups = dst->op_params[0];
-    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
-    for (int i = ith; i < n_groups; i += nth) {
-        int start = i * n_channels_per_group;
-        int end = start + n_channels_per_group;
-        if (end > n_channels) {
-            end = n_channels;
-        }
-        int step = end - start;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            ggml_float sum = 0.0;
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    ggml_float sumr = 0.0;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        sumr += (ggml_float)x[i00];
-                    }
-                    sum += sumr;
-                }
-            }
-            const float mean = sum / (ne00 * ne01 * step);
-
-            ggml_float sum2 = 0.0;
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-
-                    ggml_float sumr = 0.0;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        float v = x[i00] - mean;
-                        y[i00] = v;
-                        sumr += (ggml_float)(v * v);
-                    }
-                    sum2 += sumr;
-                }
-            }
-            const float variance = sum2 / (ne00 * ne01 * step);
-            const float scale = 1.0f / sqrtf(variance + eps);
-
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-                    ggml_vec_scale_f32(ne00, y, scale);
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_group_norm(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_group_norm_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_mul_mat
-
-static void ggml_compute_forward_mul_mat_one_chunk(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst,
-    const int64_t num_rows_per_vec_dot,
-    const int64_t ir0_start,
-    const int64_t ir0_end,
-    const int64_t ir1_start,
-    const int64_t ir1_end) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const enum ggml_type type = src0->type;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t const vec_dot      = type_traits[type].vec_dot;
-    enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
-
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
-        return;
-    }
-
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-    assert(ne12 % ne02 == 0);
-    assert(ne13 % ne03 == 0);
-
-    // block-tiling attempt
-    const int64_t blck_0 = 16;
-    const int64_t blck_1 = 16;
-
-    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
-
-    // attempt to reduce false-sharing (does not seem to make a difference)
-    // 16 * 2, accounting for mmla kernels
-    float tmp[32];
-
-    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
-        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
-                const int64_t i13 = (ir1 / (ne12 * ne1));
-                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
-                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
-
-                // broadcast src0 into src1
-                const int64_t i03 = i13 / r3;
-                const int64_t i02 = i12 / r2;
-
-                const int64_t i1 = i11;
-                const int64_t i2 = i12;
-                const int64_t i3 = i13;
-
-                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
-
-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                //       the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char*)wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
-                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
-                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
-
-                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
-                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                //}
-
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
-                }
-
-                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
-                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-
-    enum ggml_type           const vec_dot_type         = type_traits[type].vec_dot_type;
-    ggml_from_float_t        const from_float           = type_traits[vec_dot_type].from_float;
-    ggml_from_float_to_mat_t const from_float_to_mat    = type_traits[vec_dot_type].from_float_to_mat;
-    int64_t                  const vec_dot_num_rows     = type_traits[type].nrows;
-    int64_t                  const matmul_num_cols      = type_traits[type].ncols;
-    int64_t                  const blck_size_interleave = type_traits[type].blck_size_interleave;
-    ggml_gemv_t              const gemv                 = type_traits[type].gemv;
-    ggml_gemm_t              const gemm                 = type_traits[type].gemm;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(src0->type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     src0->type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
-
-    if (src1->type != vec_dot_type) {
-        char * wdata = params->wdata;
-
-        const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
-        const size_t nbw2 = nbw1*ne11;
-        const size_t nbw3 = nbw2*ne12;
-
-        assert(params->wsize >= ne13*nbw3);
-        GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-        for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                int64_t i11_processed = 0;
-                if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
-                    for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-                        from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
-                                          (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
-                                          4, ne10, blck_size_interleave);
-                    }
-                    i11_processed = ne11 - ne11 % 4;
-                }
-                for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
-                           (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
-                           ne10);
-                }
-            }
-        }
-    }
-
-    if (ith == 0) {
-        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
-    }
-
-    ggml_barrier(params->threadpool);
-
-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(src0->type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     src0->type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
-
-    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
-    const int64_t nr0 = ne0;
-
-    // This is the size of the rest of the dimensions of the result
-    const int64_t nr1 = ne1 * ne2 * ne3;
-
-    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
-    // TODO: currently the mmla kernels support only even numbered rows/cols.
-    // this check can be removed once they are extended to support odd numbered rows/cols too
-    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        num_rows_per_vec_dot = 1;
-    }
-
-    // Now select a reasonable chunk size.
-    int chunk_size = 16;
-
-    // We need to step up the size if it's small
-    if (nr0 == 1 || nr1 == 1) {
-        chunk_size = 64;
-    }
-
-    // distribute the work across the inner or outer loop based on which one is larger
-    // The number of chunks in the 0/1 dim.
-    // CEIL(nr0/chunk_size)
-    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
-    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
-
-    // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
-    //   Also, chunking by thread was measured to have perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
-    //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
-    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
-        // distribute the thread work across the inner or outer loop based on which one is larger
-        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-    }
-
-    // The number of elements in each chunk
-    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
-    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
-
-    if ((ggml_n_dims(src0) == 2) && gemv) {
-        const void * src1_wdata      = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
-        src0_end   = (src0_end   % matmul_num_cols) ? src0_end   + matmul_num_cols - (src0_end   % matmul_num_cols): src0_end;
-        if (src0_start >= src0_end) return;
-
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (gemm && (ne11 > 3)) {
-            gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
-                 (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
-        }
-        for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
-            gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                 (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                 src0_end - src0_start);
-        }
-        return;
-    }
-
-    // The first chunk comes from our thread_id, the rest will get auto-assigned.
-    int current_chunk = ith;
-
-    while (current_chunk < nchunk0 * nchunk1) {
-        const int64_t ith0 = current_chunk % nchunk0;
-        const int64_t ith1 = current_chunk / nchunk0;
-
-        const int64_t ir0_start = dr0 * ith0;
-        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
-
-        const int64_t ir1_start = dr1 * ith1;
-        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
-
-        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
-
-        if (nth >= nchunk0 * nchunk1) {
-            break;
-        }
-
-        current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
-    }
-}
-
-// ggml_compute_forward_mul_mat_id
-
-static void ggml_compute_forward_mul_mat_id(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * ids = dst->src[2];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t    const vec_dot         = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type    = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float      = type_traits[vec_dot_type].from_float;
-    int64_t           const matmul_num_cols = type_traits[type].ncols;
-    ggml_gemv_t       const gemv            = type_traits[type].gemv;
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // row groups
-    const int n_ids = ids->ne[0]; // n_expert_used
-    const int n_as  = ne02;       // n_expert
-
-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
-
-    struct mmid_row_mapping {
-        int32_t i1;
-        int32_t i2;
-    };
-
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
-
-    if (src1->type != vec_dot_type) {
-        char * wdata = params->wdata;
-
-        const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
-        const size_t nbw2 = nbw1*ne11;
-        const size_t nbw3 = nbw2*ne12;
-
-        assert(params->wsize >= ne13*nbw3);
-        GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-        for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
-                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
-                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
-                               ne10);
-                }
-            }
-        }
-    }
-
-#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
-
-    if (ith == 0) {
-        // initialize matrix_row_counts
-        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
-
-        // group rows by src0 matrix
-        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
-            for (int id = 0; id < n_ids; ++id) {
-                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                assert(i02 >= 0 && i02 < n_as);
-
-                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
-                matrix_row_counts[i02] += 1;
-            }
-        }
-    }
-
-    ggml_barrier(params->threadpool);
-
-    // compute each matrix multiplication in sequence
-    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
-        const int64_t cne1 = matrix_row_counts[cur_a];
-
-        if (cne1 == 0) {
-            continue;
-        }
-
-        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
-
-        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        const int64_t nr0 = ne01; // src0 rows
-        const int64_t nr1 = cne1; // src1 rows
-
-        if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
-            int64_t src0_cur_start = (ith * ne01) / nth;
-            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
-            src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
-            src0_cur_end   = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
-            if (src0_cur_start >= src0_cur_end) return;
-
-            for (int ir1 = 0; ir1 < nr1; ir1++) {
-                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
-                const int id       = row_mapping.i1; // selected expert index
-
-                const int64_t  i11 = id % ne11;
-                const int64_t  i12 = row_mapping.i2; // row index in src1
-
-                const int64_t  i1 = id;  // selected expert index
-                const int64_t  i2 = i12; // row
-
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                    ? (i11        + i12 * ne11) * row_size
-                    : (i11 * nb11 + i12 * nb12));
-
-                gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
-                     (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
-            }
-            continue;
-        }
-
-        // distribute the thread work across the inner or outer loop based on which one is larger
-
-        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-        const int64_t ith0 = ith % nth0;
-        const int64_t ith1 = ith / nth0;
-
-        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-        const int64_t ir010 = dr0*ith0;
-        const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-        const int64_t ir110 = dr1*ith1;
-        const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-        // threads with no work simply yield (not sure if it helps)
-        //if (ir010 >= ir011 || ir110 >= ir111) {
-        //    sched_yield();
-        //    continue;
-        //}
-
-        // block-tiling attempt
-        const int64_t blck_0 = 16;
-        const int64_t blck_1 = 16;
-
-        // attempt to reduce false-sharing (does not seem to make a difference)
-        float tmp[16];
-
-        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t _i12 = ir1; // logical row index for this expert
-
-                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
-                    const int id       = row_mapping.i1; // selected expert index
-
-                    const int64_t  i11 = id % ne11;
-                    const int64_t  i12 = row_mapping.i2; // row index in src1
-
-                    const int64_t  i1 = id;  // selected expert index
-                    const int64_t  i2 = i12; // row
-
-                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                    //       the original src1 data pointer, so we should index using the indices directly
-                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                    const char * src1_col = (const char *) wdata +
-                        (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12));
-
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
-
-                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                    //}
-
-                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
-                    }
-
-                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
-        }
-    }
-
-#undef MMID_MATRIX_ROW
-}
-
-// ggml_compute_forward_out_prod
-
-static void ggml_compute_forward_out_prod_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne0  == ne00);
-    GGML_ASSERT(ne1  == ne10);
-    GGML_ASSERT(ne2  == ne02);
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne3  == ne13);
-    GGML_ASSERT(ne03 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-    if (ith == 0) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-    }
-    ggml_barrier(params->threadpool);
-
-    // dst[:,:,:,:] = 0
-    // for i2,i3:
-    //   for i1:
-    //     for i01:
-    //       for i0:
-    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
-
-    // parallelize by last three dimensions
-
-    // total rows in dst
-    const int64_t nr = ne1*ne2*ne3;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    // block-tiling attempt
-    const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
-    const int64_t blck_1 = 16;
-
-    for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
-        const int64_t bir1 = MIN(bir + blck_1, ir1);
-        for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
-            const int64_t bne01 = MIN(bi01 + blck_0, ne01);
-            for (int64_t ir = bir; ir < bir1; ++ir) {
-                // dst indices
-                const int64_t i3 = ir/(ne2*ne1);
-                const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
-                const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                const int64_t i02 = i2;
-                const int64_t i03 = i3;
-
-                //const int64_t i10 = i1;
-                const int64_t i12 = i2;
-                const int64_t i13 = i3;
-
-#if GGML_VEC_MAD_UNROLL > 2
-                const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL);
-                for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
-                }
-                for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32(ne0, d, s0, *s1);
-                }
-#else
-                for (int64_t i01 = bi01; i01 < bne01; ++i01) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32(ne0, d, s0, *s1);
-                }
-#endif
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_out_prod_q_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
-    // we don't support permuted src0 dim0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst dim0 cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-    if (ith == 0) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-    }
-    ggml_barrier(params->threadpool);
-
-    // parallelize by last three dimensions
-
-    // total rows in dst
-    const int64_t nr = ne1*ne2*ne3;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    // dst[:,:,:,:] = 0
-    // for i2,i3:
-    //   for i1:
-    //     for i01:
-    //       for i0:
-    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
-
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int64_t ir = ir0; ir < ir1; ++ir) {
-        // dst indices
-        const int64_t i3 = ir/(ne2*ne1);
-        const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
-        const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        const int64_t i02 = i2;
-        const int64_t i03 = i3;
-
-        //const int64_t i10 = i1;
-        const int64_t i12 = i2;
-        const int64_t i13 = i3;
-
-        for (int64_t i01 = 0; i01 < ne01; ++i01) {
-            const int64_t i11 = i01;
-
-            float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-            float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-            float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-            dequantize_row_q(s0, wdata, ne0);
-            ggml_vec_mad_f32(ne0, d, wdata, *s1);
-        }
-    }
-}
-
-static void ggml_compute_forward_out_prod(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                ggml_compute_forward_out_prod_q_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ABORT("fatal error"); // todo
-                // ggml_compute_forward_out_prod_f16_f32(params, dst);
-            }
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_out_prod_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_scale
-
-static void ggml_compute_forward_scale_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    // scale factor
-    float v;
-    memcpy(&v, dst->op_params, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb1 = dst->nb[1];
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
-        }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
-    }
-}
-
-static void ggml_compute_forward_scale(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_scale_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_set
-
-static void ggml_compute_forward_set_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-    // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is implicitly element_size because src0 and dst are contiguous
-    size_t nb1     = ((int32_t *) dst->op_params)[0];
-    size_t nb2     = ((int32_t *) dst->op_params)[1];
-    size_t nb3     = ((int32_t *) dst->op_params)[2];
-    size_t offset  = ((int32_t *) dst->op_params)[3];
-    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-    if (!inplace) {
-        if (params->ith == 0) {
-            // memcpy needs to be synchronized across threads to avoid race conditions.
-            // => do it in INIT phase
-            memcpy(
-                ((char *)  dst->data),
-                ((char *) src0->data),
-                ggml_nbytes(dst));
-        }
-        ggml_barrier(params->threadpool);
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src1);
-    const int nc = src1->ne[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-    // src0 and dst as viewed during set
-    const size_t nb0 = ggml_element_size(src0);
-
-    const int im0 = (ne10 == 0 ? 0 : ne10-1);
-    const int im1 = (ne11 == 0 ? 0 : ne11-1);
-    const int im2 = (ne12 == 0 ? 0 : ne12-1);
-    const int im3 = (ne13 == 0 ? 0 : ne13-1);
-
-    GGML_ASSERT(offset + im0*nb0  + im1*nb1  + im2*nb2  + im3*nb3  <= ggml_nbytes(dst));
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are viewed with shape of src1 and offset
-        // => same indices
-        const int i3 = ir/(ne12*ne11);
-        const int i2 = (ir - i3*ne12*ne11)/ne11;
-        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
-
-        ggml_vec_cpy_f32(nc,
-                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-    }
-}
-
-static void ggml_compute_forward_set(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_set_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_cpy
-
-static void ggml_compute_forward_cpy(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, dst);
-}
-
-// ggml_compute_forward_cont
-
-static void ggml_compute_forward_cont(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, dst);
-}
-
-// ggml_compute_forward_reshape
-
-static void ggml_compute_forward_reshape(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_view
-
-static void ggml_compute_forward_view(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_permute
-
-static void ggml_compute_forward_permute(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_transpose
-
-static void ggml_compute_forward_transpose(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_get_rows
-
-static void ggml_compute_forward_get_rows_q(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == ggml_type_size(type));
-    assert(ggml_nrows(dst) == nr);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int64_t i = ir0; i < ir1; ++i) {
-        const int64_t i12 = i/(ne11*ne10);
-        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
-        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
-        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-        GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-        dequantize_row_q(
-                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-    }
-}
-
-static void ggml_compute_forward_get_rows_f16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(ggml_fp16_t));
-    assert(ggml_nrows(dst) == nr);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int64_t i = ir0; i < ir1; ++i) {
-        const int64_t i12 = i/(ne11*ne10);
-        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
-        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
-        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-        GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-        ggml_fp16_to_fp32_row(
-                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-    }
-}
-
-static void ggml_compute_forward_get_rows_bf16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(ggml_bf16_t));
-    assert(ggml_nrows(dst) == nr);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int64_t i = ir0; i < ir1; ++i) {
-        const int64_t i12 = i/(ne11*ne10);
-        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
-        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
-        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-        GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-        ggml_bf16_to_fp32_row(
-                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-    }
-}
-
-static void ggml_compute_forward_get_rows_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(float));
-    assert(ggml_nrows(dst) == nr);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int64_t i = ir0; i < ir1; ++i) {
-        const int64_t i12 = i/(ne11*ne10);
-        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
-        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
-        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-        GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-        ggml_vec_cpy_f32(nc,
-                (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
-                (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
-    }
-}
-
-static void ggml_compute_forward_get_rows(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                ggml_compute_forward_get_rows_q(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_f16(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-            {
-                ggml_compute_forward_get_rows_bf16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_get_rows_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_get_rows_back
-
-static void ggml_compute_forward_get_rows_back_f32_f16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
-
-    memset(dst->data, 0, ggml_nbytes(dst));
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    GGML_ASSERT( dst->ne[0] == nc);
-    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        for (int j = 0; j < nc; ++j) {
-            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
-            ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_back_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
-
-    memset(dst->data, 0, ggml_nbytes(dst));
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    GGML_ASSERT( dst->ne[0] == nc);
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        ggml_vec_add_f32(nc,
-                (float *) ((char *)  dst->data + r*dst->nb[1]),
-                (float *) ((char *)  dst->data + r*dst->nb[1]),
-                (float *) ((char *) src0->data + i*src0->nb[1]));
-    }
-}
-
-static void ggml_compute_forward_get_rows_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_back_f32_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_get_rows_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_diag
-
-static void ggml_compute_forward_diag_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(ne00 == ne0);
-    GGML_ASSERT(ne00 == ne1);
-    GGML_ASSERT(ne01 == 1);
-    GGML_ASSERT(ne02 == ne2);
-    GGML_ASSERT(ne03 == ne3);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = 0; i2 < ne2; i2++) {
-            for (int i1 = 0; i1 < ne1; i1++) {
-                float * d = (float *)((char *)  dst->data + i3*nb3  + i2*nb2 + i1*nb1);
-                float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
-                for (int i0 = 0; i0 < i1; i0++) {
-                    d[i0] = 0;
-                }
-                d[i1] = s[i1];
-                for (int i0 = i1+1; i0 < ne0; i0++) {
-                    d[i0] = 0;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_diag_mask_inf
-
-static void ggml_compute_forward_diag_mask_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const float value) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int  n_past  = ((int32_t *) dst->op_params)[0];
-    const bool inplace = src0->data == dst->data;
-
-    GGML_ASSERT(n_past >= 0);
-
-    if (!inplace) {
-        if (ith == 0) {
-            // memcpy needs to be synchronized across threads to avoid race conditions.
-            // => do it in INIT phase
-            GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-            GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-            memcpy(
-                ((char *)  dst->data),
-                ((char *) src0->data),
-                ggml_nbytes(dst));
-        }
-        ggml_barrier(params->threadpool);
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-    const int nr = src0->ne[1];
-    const int nz = n/nr;
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int k = 0; k < nz; k++) {
-        for (int j = ith; j < nr; j += nth) {
-            for (int i = n_past; i < nc; i++) {
-                if (i > n_past + j) {
-                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag_mask_inf(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-static void ggml_compute_forward_diag_mask_zero(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_f32(params, dst, 0);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_soft_max
-
-static void ggml_compute_forward_soft_max_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    float scale    = 1.0f;
-    float max_bias = 0.0f;
-
-    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
-
-    // TODO: is this supposed to be ceil instead of floor?
-    //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head      = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        // ALiBi
-        const uint32_t h = (i1/ne01)%ne02; // head
-        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
-
-        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
-
-        // broadcast the mask across rows
-        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-
-        ggml_vec_cpy_f32  (nc, wp, sp);
-        ggml_vec_scale_f32(nc, wp, scale);
-        if (mp_f32) {
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*mp_f32[i];
-                }
-            }
-        }
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
-
-        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-
-// ggml_compute_forward_soft_max_back
-
-static void ggml_compute_forward_soft_max_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src1, dst));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float *dy = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *y  = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float *dx = (float *)((char *) dst->data  + i1*dst->nb[1]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(dy[i]));
-            assert(!isnan(y[i]));
-        }
-#endif
-        // Jii = yi - yi*yi
-        // Jij = -yi*yj
-        // J = diag(y)-y.T*y
-        // dx = J * dy
-        // dxk = sum_i(Jki * dyi)
-        // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
-        // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
-        // dxk = sum_i(-yk*yi * dyi) + yk*dyk
-        // dxk = -yk * sum_i(yi * dyi) + yk*dyk
-        // dxk = -yk * dot(y, dy) + yk*dyk
-        // dxk = yk * (- dot(y, dy) + dyk)
-        // dxk = yk * (dyk - dot(y, dy))
-        //
-        // post-order:
-        // dot_y_dy := dot(y, dy)
-        // dx := dy
-        // dx := dx - dot_y_dy
-        // dx := dx * y
-
-        // linear runtime, no additional memory
-        float dot_y_dy = 0;
-        ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
-        ggml_vec_cpy_f32 (nc, dx, dy);
-        ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
-        ggml_vec_mul_f32 (nc, dx, dx, y);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dx[i]));
-            assert(!isinf(dx[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_clamp
-
-static void ggml_compute_forward_clamp_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    float min;
-    float max;
-    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    for (int j = ith; j < n; j += nth) {
-        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
-        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
-
-        for (int i = 0; i < nc; i++) {
-            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
-        }
-    }
-}
-
-static void ggml_compute_forward_clamp(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_clamp_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_rope
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
-    return 1 - MIN(1, MAX(0, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
-}
-
-static void ggml_rope_cache_init(
-     float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
-     float * cache, float sin_sign, float theta_scale) {
-    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
-    float theta = theta_base;
-    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
-        rope_yarn(
-            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
-        );
-        cache[i0 + 1] *= sin_sign;
-
-        theta *= theta_scale;
-    }
-}
-
-void ggml_rope_yarn_corr_dims(
-    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = MAX(0, start);
-    dims[1] = MIN(n_dims - 1, end);
-}
-
-static void ggml_compute_forward_rope_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const bool forward) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
-
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-
-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    GGML_ASSERT(n_dims <= ne0);
-    GGML_ASSERT(n_dims % 2 == 0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
-    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-
-    const float * freq_factors = NULL;
-    if (src2 != NULL) {
-        GGML_ASSERT(src2->type == GGML_TYPE_F32);
-        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
-        freq_factors = (const float *) src2->data;
-    }
-
-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-
-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[1];
-
-                        dst_data[0] = x0*cos_theta - x1*sin_theta;
-                        dst_data[1] = x0*sin_theta + x1*cos_theta;
-                    }
-                } else {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                        const int64_t ic = i0/2;
-
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
-
-                        dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                    }
-                }
-
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
-                }
-            }
-        }
-    }
-}
-
-// TODO: deduplicate f16/f32 code
-static void ggml_compute_forward_rope_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const bool forward) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
-
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-
-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    GGML_ASSERT(n_dims <= ne0);
-    GGML_ASSERT(n_dims % 2 == 0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
-    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-
-    const float * freq_factors = NULL;
-    if (src2 != NULL) {
-        GGML_ASSERT(src2->type == GGML_TYPE_F32);
-        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
-        freq_factors = (const float *) src2->data;
-    }
-
-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-
-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[1]);
-
-                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                    }
-                } else {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                        const int64_t ic = i0/2;
-
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-
-                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                    }
-                }
-
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, dst, true);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, dst, true);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_rope_back
-
-static void ggml_compute_forward_rope_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, dst, false);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, dst, false);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_conv_transpose_1d
-
-static void ggml_compute_forward_conv_transpose_1d_f16_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (ith == 0) {
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (L x Cin) to (Cin x L)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            ggml_fp16_t * dst_data = wdata;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
-                }
-            }
-        }
-
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-    ggml_barrier(params->threadpool);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
-    // total rows in dst
-    const int nr = ne1;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
-        for (int i10 = 0; i10 < ne10; i10++) {
-            const int i1n = i10*ne11;
-            for (int i00 = 0; i00 < ne00; i00++) {
-                float v = 0;
-                ggml_vec_dot_f16(ne02, &v, 0,
-                        (ggml_fp16_t *)    wdata_src + i1n, 0,
-                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
-                dst_data[i10*s0 + i00] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_transpose_1d_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (ith == 0) {
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            float * const wdata = (float *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    float * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            float * const wdata = (float *) params->wdata + nk;
-            float * dst_data = wdata;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = src[i10];
-                }
-            }
-        }
-
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-    ggml_barrier(params->threadpool);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
-    // total rows in dst
-    const int nr = ne1;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * const wdata     = (float *) params->wdata + 0;
-    float * const wdata_src = wdata + nk;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        float * wdata_kernel = wdata + i1*ne02*ne00;
-        for (int i10 = 0; i10 < ne10; i10++) {
-            const int i1n = i10*ne11;
-            for (int i00 = 0; i00 < ne00; i00++) {
-                float v = 0;
-                ggml_vec_dot_f32(ne02, &v, 0,
-                        wdata_src + i1n, 0,
-                        wdata_kernel + i00*ne02, 0, 1);
-                dst_data[i10*s0 + i00] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_transpose_1d(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_transpose_1d_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_im2col_f32
-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_im2col_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t N  = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-    const int64_t IH = is_2D ? ne11 : 1;
-    const int64_t IW = ne10;
-
-    const int64_t KH = is_2D ? ne01 : 1;
-    const int64_t KW = ne00;
-
-    const int64_t OH = is_2D ? ne2 : 1;
-    const int64_t OW = ne1;
-
-    int ofs0 = is_2D ? nb13 : nb12;
-    int ofs1 = is_2D ? nb12 : nb11;
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        float * const wdata = (float *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic += nth) {
-
-                        // micro kernel
-                        float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
-
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-
-                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
-                                } else {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-
-// ggml_compute_forward_im2col_f16
-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_im2col_f16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t N  = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-    const int64_t IH = is_2D ? ne11 : 1;
-    const int64_t IW = ne10;
-
-    const int64_t KH = is_2D ? ne01 : 1;
-    const int64_t KW = ne00;
-
-    const int64_t OH = is_2D ? ne2 : 1;
-    const int64_t OW = ne1;
-
-    int ofs0 = is_2D ? nb13 : nb12;
-    int ofs1 = is_2D ? nb12 : nb11;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic += nth) {
-
-                        // micro kernel
-                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
-
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-
-                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
-                                } else {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_im2col(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-    switch (dst->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_im2col_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_im2col_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_im2col_back_f32
-
-static void ggml_compute_forward_im2col_back_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t N  = is_2D ? ne3 : ne2;
-    const int64_t IC = is_2D ? ne2 : ne1;
-    const int64_t IH = is_2D ? ne1 : 1;
-    const int64_t IW = ne0;
-
-    const int64_t KH = is_2D ? ne01 : 1;
-    const int64_t KW = ne00;
-
-    const int64_t OH = is_2D ? ne12 : 1;
-    const int64_t OW = ne11;
-
-    int ofs0 = is_2D ? nb3 : nb2;
-    int ofs1 = is_2D ? nb2 : nb1;
-
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        float * const wdata = (float *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t iic = ith; iic < IC; iic += nth) {
-                for (int64_t iih = 0; iih < IH; iih++) {
-                    for (int64_t iiw = 0; iiw < IW; iiw++) {
-
-                        // micro kernel
-                        float grad = 0.0f;
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                // For s0 > 1 some values were skipped over in the forward pass.
-                                // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well.
-                                const int64_t tmpw = (iiw + p0 - ikw*d0);
-                                if (tmpw % s0 != 0) {
-                                    continue;
-                                }
-                                const int64_t iow = tmpw / s0;
-
-                                // Equivalent logic as above except for s1.
-                                int64_t ioh;
-                                if (is_2D) {
-                                    const int64_t tmph = iih + p1 - ikh*d1;
-
-                                    if (tmph % s1 != 0) {
-                                        continue;
-                                    }
-
-                                    ioh = tmph / s1;
-                                } else {
-                                    ioh = 0;
-                                }
-
-                                if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) {
-                                    continue;
-                                }
-
-                                const float * const src_data = (const float *) src1->data
-                                    + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                                grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
-                            }
-                        }
-                        float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
-                        dst_data[iih*IW + iiw] = grad;
-                    }
-                }
-            }
-        }
-    }
-}
-
-// ggml_compute_forward_conv_transpose_2d
-
-static void ggml_compute_forward_conv_transpose_2d(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02*ne03;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (ith == 0) {
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
-                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
-                        }
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            for (int i12 = 0; i12 < ne12; i12++) {
-                for (int i11 = 0; i11 < ne11; i11++) {
-                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
-                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
-                    for (int i10 = 0; i10 < ne10; i10++) {
-                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
-                    }
-                }
-            }
-        }
-
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-    ggml_barrier(params->threadpool);
-
-    const int32_t stride = ggml_get_op_params_i32(dst, 0);
-
-    // total patches in dst
-    const int np = ne2;
-
-    // patches per thread
-    const int dp = (np + nth - 1)/nth;
-
-    // patch range for this thread
-    const int ip0 = dp*ith;
-    const int ip1 = MIN(ip0 + dp, np);
-
-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
-
-    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
-        float * dst_data = (float *)((char *) dst->data + i2*nb2);
-        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
-        for (int i11 = 0; i11 < ne11; i11++) {
-            for (int i10 = 0; i10 < ne10; i10++) {
-                const int i1n = i11*ne10*ne12 + i10*ne12;
-                for (int i01 = 0; i01 < ne01; i01++) {
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        float v = 0;
-                        ggml_vec_dot_f16(ne03, &v, 0,
-                                wdata_src + i1n, 0,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
-                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
-                    }
-                }
-            }
-        }
-    }
-}
-
-// ggml_compute_forward_pool_1d_sk_p0
-
-static void ggml_compute_forward_pool_1d_sk_p0(
-        const struct ggml_compute_params * params,
-        const enum ggml_op_pool op,
-        const int k,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src = dst->src[0];
-
-    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    const char * cdata = (const char *)src->data;
-    const char * const data_end = cdata + ggml_nbytes(src);
-    float * drow = (float *)dst->data;
-
-    const int64_t rs = dst->ne[0];
-
-    while (cdata < data_end) {
-        const void * srow = (const void *)cdata;
-        int j = 0;
-        for (int64_t i = 0; i < rs; ++i) {
-            switch (op) {
-                case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
-                case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
-                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
-            }
-            for (int ki = 0; ki < k; ++ki) {
-                const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
-                switch (op) {
-                    case GGML_OP_POOL_AVG:                         drow[i] += srow_j; break;
-                    case GGML_OP_POOL_MAX:   if (srow_j > drow[i]) drow[i]  = srow_j; break;
-                    case GGML_OP_POOL_COUNT:                       GGML_ABORT("fatal error");
-                }
-                ++j;
-            }
-            switch (op) {
-                case GGML_OP_POOL_AVG:         drow[i] /= k; break;
-                case GGML_OP_POOL_MAX:                       break;
-                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
-            }
-        }
-
-        cdata += src->nb[1];
-        drow  += rs;
-    }
-}
-
-// ggml_compute_forward_pool_1d
-
-static void ggml_compute_forward_pool_1d(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int s0 = opts[2];
-    const int p0 = opts[3];
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(k0 == s0); // only s = k supported
-
-    ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
-}
-
-// ggml_compute_forward_pool_2d
-
-static void ggml_compute_forward_pool_2d(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src = dst->src[0];
-
-    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-    const char * cdata = (const char*)src->data;
-    const char * const data_end = cdata + ggml_nbytes(src);
-
-    const int64_t px = dst->ne[0];
-    const int64_t py = dst->ne[1];
-    const int64_t pa = px * py;
-
-    float * dplane = (float *)dst->data;
-
-    const int ka = k0 * k1;
-    const int offset0 = -p0;
-    const int offset1 = -p1;
-
-    while (cdata < data_end) {
-        for (int oy = 0; oy < py; ++oy) {
-            float * const drow = dplane + oy * px;
-            for (int ox = 0; ox < px; ++ox) {
-                float * const out =  drow + ox;
-                switch (op) {
-                    case GGML_OP_POOL_AVG:     *out = 0;        break;
-                    case GGML_OP_POOL_MAX:     *out = -FLT_MAX; break;
-                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
-                }
-
-                const int ix = offset0 + ox * s0;
-                const int iy = offset1 + oy * s1;
-
-                for (int ky = 0; ky < k1; ++ky) {
-                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
-                    const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
-                    for (int kx = 0; kx < k0; ++kx) {
-                        int j = ix + kx;
-                        if (j < 0 || j >= src->ne[0]) continue;
-                        const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
-                        switch (op) {
-                            case GGML_OP_POOL_AVG:                     *out += srow_j; break;
-                            case GGML_OP_POOL_MAX: if (srow_j > *out)  *out  = srow_j; break;
-                            case GGML_OP_POOL_COUNT:               GGML_ABORT("fatal error");
-                        }
-                    }
-                }
-                switch (op) {
-                    case GGML_OP_POOL_AVG:           *out /= ka; break;
-                    case GGML_OP_POOL_MAX:                       break;
-                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
-                }
-            }
-        }
-
-        cdata  += src->nb[2];
-        dplane += pa;
-    }
-}
-
-// ggml_compute_forward_pool_2d_back
-
-static void ggml_compute_forward_pool_2d_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src  = dst->src[0];
-    const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst
-
-    assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    char       * cdata  = (char       *) dst->data;
-    const char * cdataf = (const char *) dstf->data;
-    const char * const data_end = cdata + ggml_nbytes(dst);
-
-    GGML_ASSERT(params->ith == 0);
-    memset(cdata, 0, ggml_nbytes(dst));
-
-    const int64_t px = src->ne[0];
-    const int64_t py = src->ne[1];
-    const int64_t pa = px * py;
-
-    const float * splane = (const float *) src->data;
-
-    const int ka = k0 * k1;
-    const int offset0 = -p0;
-    const int offset1 = -p1;
-
-    while (cdata < data_end) {
-        for (int oy = 0; oy < py; ++oy) {
-            const float * const srow = splane + oy * px;
-            for (int ox = 0; ox < px; ++ox) {
-                const float grad0 = srow[ox];
-
-                const int ix = offset0 + ox * s0;
-                const int iy = offset1 + oy * s1;
-
-                if (op == GGML_OP_POOL_MAX) {
-                    float maxval = -FLT_MAX;
-                    int kxmax = -1;
-                    int kymax = -1;
-
-                    for (int ky = 0; ky < k1; ++ky) {
-                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
-                            continue;
-                        }
-                        const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky));
-                        for (int kx = 0; kx < k0; ++kx) {
-                            int j = ix + kx;
-                            if (j < 0 || j >= dst->ne[0]) {
-                                continue;
-                            }
-
-                            const float val = dst->type == GGML_TYPE_F32 ?
-                                ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
-                            if (val <= maxval) {
-                                continue;
-                            }
-
-                            maxval = val;
-                            kxmax = kx;
-                            kymax = ky;
-                        }
-                    }
-
-                    if (kxmax == -1 || kymax == -1) {
-                        continue;
-                    }
-
-                    void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax));
-                    const int j = ix + kxmax;
-                    if (dst->type == GGML_TYPE_F32) {
-                        ((float *) drow)[j] += grad0;
-                    } else {
-                        ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
-                    }
-                } else if (op == GGML_OP_POOL_AVG) {
-                    const float grad = grad0 / ka;
-
-                    for (int ky = 0; ky < k1; ++ky) {
-                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
-                            continue;
-                        }
-                        void * drow = (void *)(cdata + dst->nb[1] * (iy + ky));
-                        for (int kx = 0; kx < k0; ++kx) {
-                            int j = ix + kx;
-                            if (j < 0 || j >= dst->ne[0]) {
-                                continue;
-                            }
-
-                            if (dst->type == GGML_TYPE_F32) {
-                                ((float *) drow)[j] += grad;
-                            } else {
-                                ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
-                            }
-                        }
-                    }
-                } else {
-                    GGML_ASSERT(false);
-                }
-            }
-        }
-
-        cdata  += dst->nb[2];
-        cdataf += dst->nb[2];
-        splane += pa;
-    }
-}
-
-// ggml_compute_forward_upscale
-
-static void ggml_compute_forward_upscale_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float sf0 = (float)ne0/src0->ne[0];
-    const float sf1 = (float)ne1/src0->ne[1];
-    const float sf2 = (float)ne2/src0->ne[2];
-    const float sf3 = (float)ne3/src0->ne[3];
-
-    // TODO: optimize
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3 / sf3;
-        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2 / sf2;
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / sf1;
-                for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / sf0;
-
-                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
-
-                    *y = *x;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_upscale(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_upscale_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-
-// ggml_compute_forward_pad
-
-static void ggml_compute_forward_pad_f32(
-    const struct ggml_compute_params * params,
-          struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float * dst_ptr = (float *) dst->data;
-
-    // TODO: optimize
-
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-                        dst_ptr[dst_idx] = *src_ptr;
-                    } else {
-                        dst_ptr[dst_idx] = 0;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_pad(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_pad_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-
-// ggml_compute_forward_arange
-
-static void ggml_compute_forward_arange_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const float start = ggml_get_op_params_f32(dst, 0);
-    const float stop  = ggml_get_op_params_f32(dst, 1);
-    const float step  = ggml_get_op_params_f32(dst, 2);
-
-    const int64_t steps = (int64_t) ceilf((stop - start) / step);
-
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    for (int64_t i = ith; i < steps; i+= nth) {
-        float value = start + step * i;
-        ((float *)dst->data)[i] = value;
-    }
-}
-
-static void ggml_compute_forward_arange(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_arange_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int dim = ggml_get_op_params_i32(dst, 0);
-    const int max_period = ggml_get_op_params_i32(dst, 1);
-
-    int half = dim / 2;
-
-    for (int64_t i = 0; i < ne00; i++) {
-        float * embed_data = (float *)((char *)  dst->data +  i*nb1);
-        for (int64_t j = ith; j < half; j += nth) {
-            float timestep = ((float *)src0->data)[i];
-            float freq = (float)expf(-logf(max_period) * j / half);
-            float arg = timestep * freq;
-            embed_data[j] = cosf(arg);
-            embed_data[j + half] = sinf(arg);
-        }
-        if (dim % 2 != 0 && ith == 0) {
-            embed_data[dim] = 0.f;
-        }
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_timestep_embedding_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_argsort
-
-static void ggml_compute_forward_argsort_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
-
-    for (int64_t i = ith; i < nr; i += nth) {
-        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
-        const float * src_data = (float *)((char *) src0->data + i*nb01);
-
-        for (int64_t j = 0; j < ne0; j++) {
-            dst_data[j] = j;
-        }
-
-        // C doesn't have a functional sort, so we do a bubble sort instead
-        for (int64_t j = 0; j < ne0; j++) {
-            for (int64_t k = j + 1; k < ne0; k++) {
-                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                    int32_t tmp = dst_data[j];
-                    dst_data[j] = dst_data[k];
-                    dst_data[k] = tmp;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_argsort(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argsort_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_flash_attn_ext
-
-static void ggml_compute_forward_flash_attn_ext_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const struct ggml_tensor * mask,
-        struct ggml_tensor * dst) {
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne2 == N);
-
-    // input tensor rows must be contiguous
-    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
-    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
-    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev0 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nev0 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t rk2 = neq2/nek2;
-    const int64_t rk3 = neq3/nek3;
-
-    const int64_t rv2 = neq2/nev2;
-    const int64_t rv3 = neq3/nev3;
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float scale         = 1.0f;
-    float max_bias      = 0.0f;
-    float logit_softcap = 0.0f;
-
-    memcpy(&scale,         (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias,      (float *) dst->op_params + 1, sizeof(float));
-    memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
-
-    if (logit_softcap != 0) {
-        scale /= logit_softcap;
-    }
-
-    const uint32_t n_head      = neq2;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-    enum ggml_type    const k_vec_dot_type = type_traits[k->type].vec_dot_type;
-    ggml_from_float_t const q_to_vec_dot   = type_traits[k_vec_dot_type].from_float;
-    ggml_vec_dot_t    const kq_vec_dot     = type_traits[k->type].vec_dot;
-    ggml_to_float_t   const v_to_float     = type_traits[v->type].to_float;
-
-    GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
-    GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
-
-    // loop over n_batch and n_head
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        const uint32_t h = iq2; // head index
-        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
-
-        float S = 0.0f;      // sum
-        float M = -INFINITY; // maximum KQ value
-
-        float       * VKQ32 = (float       *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator
-        float       * V32   =                 (VKQ32 + 1*D); // (temporary) FP32 V buffer
-        ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator
-        ggml_fp16_t * Q_q   = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16
-
-        if (v->type == GGML_TYPE_F16) {
-            memset(VKQ16, 0, D*sizeof(ggml_fp16_t));
-        } else {
-            memset(VKQ32, 0, D*sizeof(float));
-        }
-
-        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
-
-        // k indices
-        const int ik3 = iq3 / rk3;
-        const int ik2 = iq2 / rk2;
-
-        // v indices
-        const int iv3 = iq3 / rv3;
-        const int iv2 = iq2 / rv2;
-
-        const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
-        q_to_vec_dot(pq, Q_q, D);
-
-        // online softmax / attention
-        // loop over n_kv and n_head_kv
-        // ref: https://arxiv.org/pdf/2112.05682.pdf
-        for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
-            if (mv == -INFINITY) {
-                continue;
-            }
-
-            float s; // KQ value
-
-            const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
-            kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);
-
-            s = s*scale; // scale KQ value
-
-            if (logit_softcap != 0.0f) {
-                s = logit_softcap*tanhf(s);
-            }
-
-            s += mv; // apply mask
-
-            const float Mold = M;
-
-            float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
-            float vs = 1.0f; // post-softmax KQ value, expf(s - M)
-
-            const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
-
-            if (v->type == GGML_TYPE_F16) {
-                if (s > M) {
-                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
-                    M = s;
-                    ms = expf(Mold - M);
-
-                    // V = V*expf(Mold - M)
-                    ggml_vec_scale_f16(D, VKQ16, ms);
-                } else {
-                    // no new maximum, ms == 1.0f, vs != 1.0f
-                    vs = expf(s - M);
-                }
-
-                // V += v*expf(s - M)
-                ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs);
-            } else {
-                if (s > M) {
-                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
-                    M = s;
-                    ms = expf(Mold - M);
-
-                    // V = V*expf(Mold - M)
-                    ggml_vec_scale_f32(D, VKQ32, ms);
-                } else {
-                    // no new maximum, ms == 1.0f, vs != 1.0f
-                    vs = expf(s - M);
-                }
-
-                v_to_float(v_data, V32, D);
-
-                // V += v*expf(s - M)
-                ggml_vec_mad_f32(D, VKQ32, V32, vs);
-            }
-
-            S = S*ms + vs; // scale and increment sum with partial sum
-        }
-
-        if (v->type == GGML_TYPE_F16) {
-            for (int64_t d = 0; d < D; ++d) {
-                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
-            }
-        }
-
-        // V /= S
-        const float S_inv = 1.0f/S;
-        ggml_vec_scale_f32(D, VKQ32, S_inv);
-
-        // dst indices
-        const int i1 = iq1;
-        const int i2 = iq2;
-        const int i3 = iq3;
-
-        // original
-        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
-
-        // permute(0, 2, 1, 3)
-        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
-    }
-}
-
-static void ggml_compute_forward_flash_attn_ext(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const struct ggml_tensor * mask,
-        struct ggml_tensor * dst) {
-    switch (dst->op_params[3]) {
-        case GGML_PREC_DEFAULT:
-        case GGML_PREC_F32:
-            {
-                // uses F32 accumulators
-                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_flash_attn_back
-
-static void ggml_compute_forward_flash_attn_back_f32(
-        const struct ggml_compute_params * params,
-        const bool masked,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-    const struct ggml_tensor * d = dst->src[3];
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ned, d,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbd, d,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup  = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-    const int mxDM = MAX(D, Mup);
-
-    // GGML_ASSERT(ne0 == D);
-    // GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-    GGML_ASSERT(ned0 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-    GGML_ASSERT(ned1 == N);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (ith == 0) {
-        memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
-    }
-    ggml_barrier(params->threadpool);
-
-    const int64_t elem_q = ggml_nelements(q);
-    const int64_t elem_k = ggml_nelements(k);
-
-    enum ggml_type result_type = dst->type;
-    GGML_ASSERT(ggml_blck_size(result_type) == 1);
-    const size_t tsize = ggml_type_size(result_type);
-
-    const size_t offs_q = 0;
-    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-
-    void * grad_q = (char *) dst->data;
-    void * grad_k = (char *) dst->data + offs_k;
-    void * grad_v = (char *) dst->data + offs_v;
-
-    const size_t nbgq1 = nb0*neq0;
-    const size_t nbgq2 = nb0*neq0*neq1;
-    const size_t nbgq3 = nb0*neq0*neq1*neq2;
-
-    const size_t nbgk1 = nb0*nek0;
-    const size_t nbgk2 = nb0*nek0*nek1;
-    const size_t nbgk3 = nb0*nek0*nek1*neq2;
-
-    const size_t nbgv1 = nb0*nev0;
-    const size_t nbgv2 = nb0*nev0*nev1;
-    const size_t nbgv3 = nb0*nev0*nev1*neq2;
-
-    // parallelize by k rows using ggml_vec_dot_f32
-
-    // total rows in k
-    const int nr = nek2*nek3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    // how often k2 (and v2) is repeated in q2
-    int nrep = neq2/nek2;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int ik3 = ir/(nek2);
-        const int ik2 = ir - ik3*nek2;
-
-        const int iq3 = ik3;
-        const int id3 = ik3;
-        const int iv3 = ik3;
-        const int iv2 = ik2;
-
-        for (int irep = 0; irep < nrep; ++irep) {
-            const int iq2 = ik2 + irep*nek2;
-            const int id2 = iq2;
-
-            // (ik2 + irep*nek2) % nek2 == ik2
-            for (int iq1 = 0; iq1 < neq1; ++iq1) {
-                const int id1 = iq1;
-
-                // not sure about CACHE_LINE_SIZE_F32..
-                // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset?
-                float * S  = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32);
-                float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32);
-
-                for (int i = M; i < Mup; ++i) {
-                    S[i] = -INFINITY;
-                }
-
-                const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    // k indices
-                    const int ik1 = ic;
-
-                    // S indices
-                    const int i1 = ik1;
-
-                    ggml_vec_dot_f32(neq0,
-                            S + i1, 0,
-                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-                }
-
-                // scale
-                ggml_vec_scale_f32(masked_begin, S, scale);
-
-                for (int64_t i = masked_begin; i < M; i++) {
-                    S[i] = -INFINITY;
-                }
-
-                // softmax
-                // exclude known -INF S[..] values from max and loop
-                // dont forget to set their SM values to zero
-                {
-                    float max = -INFINITY;
-                    ggml_vec_max_f32(masked_begin, &max, S);
-
-                    ggml_float sum = 0.0;
-                    {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                        max = -max;
-                        vDSP_vsadd(SM, 1, &max, SM, 1, Mup);
-                        vvexpf(SM, SM, &Mup);
-                        ggml_vec_sum_f32(Mup, &sum, SM);
-#else
-                        sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
-#endif
-                    }
-
-                    assert(sum > 0.0);
-
-                    sum = 1.0/sum;
-                    ggml_vec_scale_f32(masked_begin, SM, sum);
-
-                }
-
-                // step-by-step explanation
-                {
-                    // forward-process                    shape      grads from backward process
-                    // parallel_for ik2,ik3:
-                    //  for irep:
-                    //   iq2 = ik2 + irep*nek2
-                    //   k[:D,:M,:,:]                     [D,M,:,:]  grad[k][:D,:M,ik2,ik3]  += grad[kcur]
-                    //   q[:D,:N,:,:]                     [D,N,:,:]  grad[q][:D,iq1,iq2,iq3] += grad[qcur]
-                    //   v[:M,:D,:,:]                     [M,D,:,:]  grad[v][:M,:D,iv2,iv3]  += grad[vcur]
-                    //   for iq1:
-                    //    kcur   = k[:D,:M,ik2,ik3]       [D,M,1,1]  grad[kcur] = grad[S1].T @ qcur
-                    //    qcur   = q[:D,iq1,iq2,iq3]      [D,1,1,1]  grad[qcur] = grad[S1]   @ kcur
-                    //    vcur   = v[:M,:D,iv2,iv3]       [M,D,1,1]  grad[vcur] = grad[S5].T @ S4
-                    //    S0     = -Inf                   [D,1,1,1]
-                    //   ~S1[i]  = dot(kcur[:D,i], qcur)
-                    //    S1     = qcur @ kcur.T          [M,1,1,1]  grad[S1]   = grad[S2] * scale
-                    //    S2     = S1 * scale             [M,1,1,1]  grad[S2]   = diag_mask_zero(grad[S3], P)
-                    //    S3     = diag_mask_inf(S2, P)   [M,1,1,1]  grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    //    S4     = softmax(S3)            [M,1,1,1]  grad[S4]   = grad[S5] @ vcur
-                    //   ~S5[i]  = dot(vcur[:,i], S4)
-                    //    S5     = S4 @ vcur.T            [D,1,1,1]  grad[S5]   = d[:D,id1,id2,id3]
-                    //   ~dst[i,iq1,iq2,iq3]  = S5[i]              ^
-                    //    dst[:D,iq1,iq2,iq3] = S5                 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3]
-                    // dst                               backward-/ grad[dst]                 = d
-                    //
-                    // output gradients with their dependencies:
-                    //
-                    // grad[kcur] = grad[S1].T @ qcur
-                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
-                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    // grad[S4]   = grad[S5] @ vcur
-                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
-                    // grad[qcur] = grad[S1]   @ kcur
-                    // grad[vcur] = grad[S5].T @ S4
-                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
-                    //
-                    // in post-order:
-                    //
-                    // S1         = qcur @ kcur.T
-                    // S2         = S1 * scale
-                    // S3         = diag_mask_inf(S2, P)
-                    // S4         = softmax(S3)
-                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
-                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
-                    // grad[qcur] = grad[S1]   @ kcur
-                    // grad[kcur] = grad[S1].T @ qcur
-                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
-                    //
-                    // using less variables (SM=S4):
-                    //
-                    // S             = diag_mask_inf(qcur @ kcur.T * scale, P)
-                    // SM            = softmax(S)
-                    // S             = d[:D,iq1,iq2,iq3] @ vcur
-                    // dot_SM_gradSM = dot(SM, S)
-                    // S             = SM * (S - dot(SM, S))
-                    // S             = diag_mask_zero(S, P) * scale
-                    //
-                    // grad[q][:D,iq1,iq2,iq3] += S   @ kcur
-                    // grad[k][:D,:M,ik2,ik3]  += S.T @ qcur
-                    // grad[v][:M,:D,iv2,iv3]  += d[:D,id1,id2,id3].T @ SM
-                }
-
-                // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
-                // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
-                // for ic:
-                //   S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3]
-                // exclude known future zero S[..] values from operation
-                ggml_vec_set_f32(masked_begin, S, 0);
-                for (int64_t ic = 0; ic < D; ++ic) {
-                    ggml_vec_mad_f32(masked_begin,
-                            S,
-                             (float *) ((char *) v->data + (          ic*nbv1  + iv2*nbv2 + iv3*nbv3)),
-                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
-                }
-
-                // S = SM * (S - dot(SM, S))
-                float dot_SM_gradSM = 0;
-                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
-                ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
-                ggml_vec_mul_f32 (masked_begin, S, S, SM);
-
-                // S = diag_mask_zero(S, P) * scale
-                // already done by above ggml_vec_set_f32
-
-                // exclude known zero S[..] values from operation
-                ggml_vec_scale_f32(masked_begin, S, scale);
-
-                // S    shape [M,1]
-                // SM   shape [M,1]
-                // kcur shape [D,M]
-                // qcur shape [D,1]
-                // vcur shape [M,D]
-
-                // grad[q][:D,iq1,iq2,iq3] += S @ kcur
-                // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M]
-                // for ic:
-                //  grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3]
-                // exclude known zero S[..] values from loop
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    ggml_vec_mad_f32(D,
-                            (float *) ((char *) grad_q  + (iq1*nbgq1 + iq2*nbgq2  + iq3*nbgq3)),
-                            (float *) ((char *) k->data + (ic*nbk1   + ik2*nbk2   + ik3*nbk3)),
-                            S[ic]);
-                }
-
-                // grad[k][:D,:M,iq2,iq3] += S.T @ qcur
-                // for ic:
-                //  grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0]
-                //  grad[k][:D,ic,iq2,iq3] += S[ic]     * qcur[:D,0]
-                // exclude known zero S[..] values from loop
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    ggml_vec_mad_f32(D,
-                            (float *) ((char *) grad_k  + (ic*nbgk1  + ik2*nbgk2  + ik3*nbgk3)),
-                            (float *) ((char *) q->data + (iq1*nbq1  + iq2*nbq2   + iq3*nbq3)),
-                            S[ic]);
-                }
-
-                // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T       @ SM
-                // for ic:
-                //  grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M]
-                //  grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3]         * SM[:M]
-                // exclude known zero SM[..] values from mad
-                for (int64_t ic = 0; ic < D; ++ic) {
-                    ggml_vec_mad_f32(masked_begin,
-                            (float *) ((char *) grad_v   + (          ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)),
-                            SM,
-                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2  + id3*nbd3)));
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_back(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-
-    switch (q->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_ssm_conv
-
-static void ggml_compute_forward_ssm_conv_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0]; // conv_x
-    const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc  = src1->ne[0]; // d_conv
-    const int ncs = src0->ne[0]; // d_conv - 1 + n_t
-    const int nr  = src0->ne[1]; // d_inner
-    const int n_t =  dst->ne[1]; // tokens per sequence
-    const int n_s =  dst->ne[2]; // number of sequences in the batch
-
-    GGML_ASSERT( dst->ne[0] == nr);
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT(src1->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-    const int ir  = ir1 - ir0;
-
-    for (int i3 = 0; i3 < n_s; ++i3) {
-        for (int i2 = 0; i2 < n_t; ++i2) {
-            // {d_conv - 1 + n_t, d_inner, n_seqs}
-            // sliding window
-            const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s}
-            const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner}
-            float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s}
-
-            // TODO: transpose the output for smaller strides for big batches?
-            // d_inner
-            for (int i1 = 0; i1 < ir; ++i1) {
-                // rowwise dot product
-                // NOTE: not using ggml_vec_dot_f32, because its sum is in double precision
-                float sumf = 0.0f;
-
-                // d_conv
-                for (int i0 = 0; i0 < nc; ++i0) {
-                    sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
-                }
-                x[i1] = sumf;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_ssm_conv(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    switch (dst->src[0]->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_ssm_conv_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_ssm_scan
-
-static void ggml_compute_forward_ssm_scan_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0]; // s
-    const struct ggml_tensor * src1 = dst->src[1]; // x
-    const struct ggml_tensor * src2 = dst->src[2]; // dt
-    const struct ggml_tensor * src3 = dst->src[3]; // A
-    const struct ggml_tensor * src4 = dst->src[4]; // B
-    const struct ggml_tensor * src5 = dst->src[5]; // C
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nc  = src0->ne[0]; // d_state
-    const int64_t nr  = src0->ne[1]; // d_inner
-    const int64_t n_t = src1->ne[1]; // number of tokens per sequence
-    const int64_t n_s = src0->ne[2]; // number of sequences in the batch
-
-    GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT(src1->nb[0] == sizeof(float));
-    GGML_ASSERT(src2->nb[0] == sizeof(float));
-    GGML_ASSERT(src3->nb[0] == sizeof(float));
-    GGML_ASSERT(src4->nb[0] == sizeof(float));
-    GGML_ASSERT(src5->nb[0] == sizeof(float));
-    // required for the dot product between s and C
-    GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
-    // required for per-sequence offsets for states
-    GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
-    // required to get correct offset for state destination (i.e. src1->nb[3])
-    GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-    const int ir  = ir1 - ir0;
-
-    for (int i3 = 0; i3 < n_s; ++i3) {
-        for (int i2 = 0; i2 < n_t; ++i2) {
-            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
-            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
-            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
-            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
-            const float * B  = (const float *) ((const char *) src4->data +  i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
-            const float * C  = (const float *) ((const char *) src5->data +  i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
-                  float * y  = (      float *) ((      char *) dst->data  + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
-                  float * s  = (      float *) ((      char *) dst->data  + ir0*(src0->nb[1]) + i3*(src0->nb[2]) +     src1->nb[3]);  // {d_state, d_inner, n_s}
-
-            // use the output as the source for the next token-wise iterations
-            if (i2 > 0) { s0 = s; }
-
-            // d_inner
-            for (int i1 = 0; i1 < ir; ++i1) {
-                // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
-                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
-                float x_dt = x[i1] * dt_soft_plus;
-                float sumf = 0.0f;
-                // d_state
-                for (int i0 = 0; i0 < nc; ++i0) {
-                    int i = i0 + i1*nc;
-                    // state = prev_state * dA + dB * x
-                    float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
-                    // y = rowwise_dotprod(state, C)
-                    sumf += state * C[i0];
-                    s[i] = state;
-                }
-                y[i1] = sumf;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_ssm_scan(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    switch (dst->src[0]->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_ssm_scan_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_win_part
-
-static void ggml_compute_forward_win_part_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    UNUSED(params);
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
-
-    const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t w    = ((const int32_t *)(dst->op_params))[2];
-
-    assert(ne00 == ne0);
-    assert(ne3  == nep0*nep1);
-
-    // TODO: optimize / multi-thread
-    for (int py = 0; py < nep1; ++py) {
-        for (int px = 0; px < nep0; ++px) {
-            const int64_t i3 = py*nep0 + px;
-            for (int64_t i2 = 0; i2 < ne2; ++i2) {
-                for (int64_t i1 = 0; i1 < ne1; ++i1) {
-                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                        const int64_t i02 = py*w + i2;
-                        const int64_t i01 = px*w + i1;
-                        const int64_t i00 = i0;
-
-                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0    + i1*ne0   + i0;
-                        const int64_t j =                  i02*ne01*ne00 + i01*ne00 + i00;
-
-                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
-                            ((float *) dst->data)[i] = 0.0f;
-                        } else {
-                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_win_part(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_win_part_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_win_unpart
-
-static void ggml_compute_forward_win_unpart_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    UNUSED(params);
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
-
-    const int32_t w = ((const int32_t *)(dst->op_params))[0];
-
-    // padding
-    const int px = (w - ne1%w)%w;
-    //const int py = (w - ne2%w)%w;
-
-    const int npx = (px + ne1)/w;
-    //const int npy = (py + ne2)/w;
-
-    assert(ne0 == ne00);
-
-    // TODO: optimize / multi-thread
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int ip2 = i2/w;
-                const int ip1 = i1/w;
-
-                const int64_t i02 = i2%w;
-                const int64_t i01 = i1%w;
-                const int64_t i00 = i0;
-
-                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
-                const int64_t j =                                  i2*ne1*ne0    + i1*ne0   + i0;
-
-                ((float *) dst->data)[j] = ((float *) src0->data)[i];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_win_unpart(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_win_unpart_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-//gmml_compute_forward_unary
-
-static void ggml_compute_forward_unary(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const enum ggml_unary_op op = ggml_get_unary_op(dst);
-
-    switch (op) {
-        case GGML_UNARY_OP_ABS:
-            {
-                ggml_compute_forward_abs(params, dst);
-            } break;
-        case GGML_UNARY_OP_SGN:
-            {
-                ggml_compute_forward_sgn(params, dst);
-            } break;
-        case GGML_UNARY_OP_NEG:
-            {
-                ggml_compute_forward_neg(params, dst);
-            } break;
-        case GGML_UNARY_OP_STEP:
-            {
-                ggml_compute_forward_step(params, dst);
-            } break;
-        case GGML_UNARY_OP_TANH:
-            {
-                ggml_compute_forward_tanh(params, dst);
-            } break;
-        case GGML_UNARY_OP_ELU:
-            {
-                ggml_compute_forward_elu(params, dst);
-            } break;
-        case GGML_UNARY_OP_RELU:
-            {
-                ggml_compute_forward_relu(params, dst);
-            } break;
-        case GGML_UNARY_OP_SIGMOID:
-            {
-                ggml_compute_forward_sigmoid(params, dst);
-            } break;
-        case GGML_UNARY_OP_GELU:
-            {
-                ggml_compute_forward_gelu(params, dst);
-            } break;
-        case GGML_UNARY_OP_GELU_QUICK:
-            {
-                ggml_compute_forward_gelu_quick(params, dst);
-            } break;
-        case GGML_UNARY_OP_SILU:
-            {
-                ggml_compute_forward_silu(params, dst);
-            } break;
-        case GGML_UNARY_OP_HARDSWISH:
-            {
-                ggml_compute_forward_hardswish(params, dst);
-            } break;
-        case GGML_UNARY_OP_HARDSIGMOID:
-            {
-                ggml_compute_forward_hardsigmoid(params, dst);
-            } break;
-        case GGML_UNARY_OP_EXP:
-            {
-                ggml_compute_forward_exp(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_get_rel_pos
-
-static void ggml_compute_forward_get_rel_pos_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    UNUSED(params);
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int64_t w = ne1;
-
-    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
-    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
-
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            const int64_t pos = (w - i1 - 1) + i2;
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rel_pos(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-            {
-                ggml_compute_forward_get_rel_pos_f16(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_add_rel_pos
-
-static void ggml_compute_forward_add_rel_pos_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
-
-    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
-    if (!inplace) {
-        if (params->ith == 0) {
-            memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
-        }
-        ggml_barrier(params->threadpool);
-    }
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
-
-    float * src1_data = (float *) src1->data;
-    float * src2_data = (float *) src2->data;
-    float * dst_data  = (float *) dst->data;
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // total patches in dst
-    const int np = ne13;
-
-    // patches per thread
-    const int dp = (np + nth - 1)/nth;
-
-    // patch range for this thread
-    const int ip0 = dp*ith;
-    const int ip1 = MIN(ip0 + dp, np);
-
-    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
-        for (int64_t i12 = 0; i12 < ne12; ++i12) {
-            for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
-                for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                    const int64_t jp0  = jp1 + i10;
-                    const float src1_e = src1_data[jp0];
-                    const float src2_e = src2_data[jp0];
-
-                    const int64_t jdh = jp0 * ne10;
-                    const int64_t jdw = jdh - (ne10 - 1) * i10;
-
-                    for (int64_t j = 0; j < ne10; ++j) {
-                        dst_data[jdh + j     ] += src2_e;
-                        dst_data[jdw + j*ne10] += src1_e;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_rel_pos(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add_rel_pos_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_rwkv_wkv
-
-static void ggml_compute_forward_rwkv_wkv_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    const size_t T = dst->src[1]->ne[3];
-    const size_t C = dst->ne[0];
-    const size_t H = dst->src[1]->ne[2];
-    const size_t n_seqs = dst->src[5]->ne[1];
-
-    float * dst_data = (float *) dst->data;
-    float * state = ((float *) dst->data) + C * T;
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    memset(dst_data, 0, T * C * sizeof(float));
-
-    float * k =          (float *) dst->src[0]->data;
-    float * v =          (float *) dst->src[1]->data;
-    float * r =          (float *) dst->src[2]->data;
-    float * time_faaaa = (float *) dst->src[3]->data;
-    float * time_decay = (float *) dst->src[4]->data;
-
-    size_t t_stride = H * (C / H);
-
-    size_t h_stride = C / H;
-    size_t h_stride_2d = (C / H) * (C / H);
-
-    // basically fused operations:
-    // dst = r @ (time_faaaa * (k @ v) + state),
-    // state = time_decay * state + (k @ v),
-    // recursive through each token
-    for (size_t t = 0; t < T; t++) {
-        size_t t_offset = t * t_stride;
-        size_t state_offset = (C / H) * C * (t / (T / n_seqs));
-        float * state_cur = state + state_offset;
-        float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
-
-        for (size_t h = 0; h < H; h++) {
-            size_t h_offset = h * h_stride;
-            size_t t_h_offset = t_offset + h_offset;
-            size_t h_2d_offset = h * h_stride_2d;
-
-            for (size_t i = 0; i < C / H; i++) {
-                size_t t_h_i_offset = t_h_offset + i;
-                size_t h_i_offset = h_offset + i;
-                size_t h_2d_i_offset = h_2d_offset + i * h_stride;
-
-                float k_val = k[t_h_i_offset];
-                float r_val = r[t_h_i_offset];
-                float time_faaaa_val = time_faaaa[h_i_offset];
-                // RWKV v6: different time_decay for each token.
-                float time_decay_val = time_decay[t_h_i_offset];
-
-                for (size_t j = 0; j < C / H; j ++) {
-                    size_t t_h_j_offset = t_h_offset + j;
-                    size_t h_2d_i_j_offset = h_2d_i_offset + j;
-
-                    float v_val = v[t_h_j_offset];
-                    float kv_val = v_val * k_val;
-                    float prev_state_val = state_prev[h_2d_i_j_offset];
-                    float temp_val = kv_val * time_faaaa_val + prev_state_val;
-                    dst_data[t_h_j_offset] += temp_val * r_val;
-                    state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rwkv_wkv(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rwkv_wkv_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_map_unary
-
-static void ggml_compute_forward_map_unary_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_map_unary(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_unary_f32(params, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_map_binary
-
-static void ggml_compute_forward_map_binary_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(src1));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_map_binary(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_binary_f32(params, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_map_custom1
-
-static void ggml_compute_forward_map_custom1_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-
-    const struct ggml_tensor * a = dst->src[0];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    fun(dst, a);
-}
-
-// ggml_compute_forward_map_custom2
-
-static void ggml_compute_forward_map_custom2_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    fun(dst, a, b);
-}
-
-// ggml_compute_forward_map_custom3
-
-static void ggml_compute_forward_map_custom3_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-    const struct ggml_tensor * c = dst->src[1];
-
-    if (params->ith != 0) {
-        return;
-    }
-
-    fun(dst, a, b, c);
-}
-
-// ggml_compute_forward_map_custom1
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];
-
-    struct ggml_map_custom1_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, a, params->ith, params->nth, p.userdata);
-}
-
-// ggml_compute_forward_map_custom2
-
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-
-    struct ggml_map_custom2_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, a, b, params->ith, params->nth, p.userdata);
-}
-
-// ggml_compute_forward_map_custom3
-
-static void ggml_compute_forward_map_custom3(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-    const struct ggml_tensor * c = dst->src[2];
-
-    struct ggml_map_custom3_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
-}
-
-// ggml_compute_forward_cross_entropy_loss
-
-static void ggml_compute_forward_cross_entropy_loss_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(ggml_are_same_shape(src0, src1));
-    GGML_ASSERT(ggml_is_scalar(dst));
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    // TODO: handle transposed/permuted matrices
-    const int64_t nc = src0->ne[0];
-    const int64_t nr = ggml_nrows(src0);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    float * sums =  (float *) params->wdata;
-    float * st   = ((float *) params->wdata) + nth + ith*nc;
-    float sum_thread = 0.0f;
-
-    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    for (int64_t i1 = ir0; i1 < ir1; ++i1) {
-        const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]);
-        const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]);
-
-#ifndef NDEBUG
-        for (int64_t i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(s0[i]));
-            assert(!isnan(s1[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, s0);
-        const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max);
-        assert(sum_softmax >= 0.0);
-
-        ggml_vec_add1_f32(nc, st, st, -sum_softmax);
-        ggml_vec_mul_f32(nc, st, st, s1);
-
-        float sum_st = 0.0f;
-        ggml_vec_sum_f32(nc, &sum_st, st);
-        sum_thread += sum_st;
-
-#ifndef NDEBUG
-        for (int64_t i = 0; i < nc; ++i) {
-            assert(!isnan(st[i]));
-            assert(!isinf(st[i]));
-        }
-#endif
-    }
-    sums[ith] = sum_thread;
-    ggml_barrier(params->threadpool);
-
-    if (ith == 0) {
-        float * dp = (float *) dst->data;
-        ggml_vec_sum_f32(nth, dp, sums);
-        dp[0] *= -1.0f / (float) nr;
-    }
-}
-
-static void ggml_compute_forward_cross_entropy_loss(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cross_entropy_loss_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_cross_entropy_loss_back
-
-static void ggml_compute_forward_cross_entropy_loss_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * opt0 = dst->src[2];
-
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(opt0));
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int64_t ith = params->ith;
-    const int64_t nth = params->nth;
-
-    // TODO: handle transposed/permuted matrices
-    const int64_t nc = src0->ne[0];
-    const int64_t nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr;
-
-    for (int64_t i1 = ir0; i1 < ir1; i1++) {
-        float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
-        float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
-
-#ifndef NDEBUG
-        for (int64_t i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(s0[i]));
-            assert(!isnan(s1[i]));
-        }
-#endif
-
-        // soft_max
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, s0);
-        ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
-        assert(sum > 0.0);
-        ggml_vec_scale_f32(nc, ds0, 1.0/sum);
-
-        // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
-        ggml_vec_sub_f32(nc, ds0, ds0, s1);
-        ggml_vec_scale_f32(nc, ds0, d_by_nr);
-
-#ifndef NDEBUG
-        for (int64_t i = 0; i < nc; ++i) {
-            assert(!isnan(ds0[i]));
-            assert(!isinf(ds0[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_cross_entropy_loss_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-static void ggml_compute_forward_opt_step_adamw_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0        = dst->src[0];
-    const struct ggml_tensor * src0_grad   = dst->src[1];
-    const struct ggml_tensor * src0_grad_m = dst->src[2];
-    const struct ggml_tensor * src0_grad_v = dst->src[3];
-    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    /* const float   gnorm = 1.0f; */
-    int64_t       iter;   memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
-    const float   alpha = ggml_get_op_params_f32(dst, 2);
-    const float   beta1 = ggml_get_op_params_f32(dst, 3);
-    const float   beta2 = ggml_get_op_params_f32(dst, 4);
-    const float   eps   = ggml_get_op_params_f32(dst, 5);
-    const float   wd    = ggml_get_op_params_f32(dst, 6);
-
-    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
-    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        const int64_t i03 = ir/(ne02*ne01);
-        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        const size_t offset = i03*nb03 + i02*nb02 + i01*nb01;
-
-        float       * w = (float       *) ((char       *) src0->data        + offset); // weight
-        const float * g = (const float *) ((const char *) src0_grad->data   + offset); // grad
-        float       * m = (float       *) ((char       *) src0_grad_m->data + offset);
-        float       * v = (float       *) ((char       *) src0_grad_v->data + offset);
-
-        for (int i00 = 0; i00 < ne00; ++i00) {
-            m[i00] = m[i00]*beta1 +        g[i00]*(1.0f - beta1);
-            v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
-
-            const float mh =       m[i00]*beta1h;
-            const float vh = sqrtf(v[i00]*beta2h) + eps;
-
-            // The weight decay is applied independently of the Adam momenta m and v.
-            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
-            // See: https://arxiv.org/pdf/1711.05101v3.pdf
-            w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
-        }
-    }
-
-    ggml_barrier(params->threadpool);
-    if (ith != 0) {
-        return;
-    }
-
-    iter++;
-    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
-}
-
-static void ggml_compute_forward_opt_step_adamw(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_opt_step_adamw_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-/////////////////////////////////
-
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    GGML_ASSERT(params);
-
-    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
-        return;
-    }
-
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                ggml_compute_forward_dup(params, tensor);
-            } break;
-        case GGML_OP_ADD:
-            {
-                ggml_compute_forward_add(params, tensor);
-            } break;
-        case GGML_OP_ADD1:
-            {
-                ggml_compute_forward_add1(params, tensor);
-            } break;
-        case GGML_OP_ACC:
-            {
-                ggml_compute_forward_acc(params, tensor);
-            } break;
-        case GGML_OP_SUB:
-            {
-                ggml_compute_forward_sub(params, tensor);
-            } break;
-        case GGML_OP_MUL:
-            {
-                ggml_compute_forward_mul(params, tensor);
-            } break;
-        case GGML_OP_DIV:
-            {
-                ggml_compute_forward_div(params, tensor);
-            } break;
-        case GGML_OP_SQR:
-            {
-                ggml_compute_forward_sqr(params, tensor);
-            } break;
-        case GGML_OP_SQRT:
-            {
-                ggml_compute_forward_sqrt(params, tensor);
-            } break;
-        case GGML_OP_LOG:
-            {
-                ggml_compute_forward_log(params, tensor);
-            } break;
-        case GGML_OP_SIN:
-            {
-                ggml_compute_forward_sin(params, tensor);
-            } break;
-        case GGML_OP_COS:
-            {
-                ggml_compute_forward_cos(params, tensor);
-            } break;
-        case GGML_OP_SUM:
-            {
-                ggml_compute_forward_sum(params, tensor);
-            } break;
-        case GGML_OP_SUM_ROWS:
-            {
-                ggml_compute_forward_sum_rows(params, tensor);
-            } break;
-        case GGML_OP_MEAN:
-            {
-                ggml_compute_forward_mean(params, tensor);
-            } break;
-        case GGML_OP_ARGMAX:
-            {
-                ggml_compute_forward_argmax(params, tensor);
-            } break;
-        case GGML_OP_COUNT_EQUAL:
-            {
-                ggml_compute_forward_count_equal(params, tensor);
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                ggml_compute_forward_repeat(params, tensor);
-            } break;
-        case GGML_OP_REPEAT_BACK:
-            {
-                ggml_compute_forward_repeat_back(params, tensor);
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                ggml_compute_forward_concat(params, tensor);
-            } break;
-        case GGML_OP_SILU_BACK:
-            {
-                ggml_compute_forward_silu_back(params, tensor);
-            } break;
-        case GGML_OP_NORM:
-            {
-                ggml_compute_forward_norm(params, tensor);
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                ggml_compute_forward_rms_norm(params, tensor);
-            } break;
-        case GGML_OP_RMS_NORM_BACK:
-            {
-                ggml_compute_forward_rms_norm_back(params, tensor);
-            } break;
-        case GGML_OP_GROUP_NORM:
-            {
-                ggml_compute_forward_group_norm(params, tensor);
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                ggml_compute_forward_mul_mat(params, tensor);
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                ggml_compute_forward_mul_mat_id(params, tensor);
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                ggml_compute_forward_out_prod(params, tensor);
-            } break;
-        case GGML_OP_SCALE:
-            {
-                ggml_compute_forward_scale(params, tensor);
-            } break;
-        case GGML_OP_SET:
-            {
-                ggml_compute_forward_set(params, tensor);
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_compute_forward_cpy(params, tensor);
-            } break;
-        case GGML_OP_CONT:
-            {
-                ggml_compute_forward_cont(params, tensor);
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                ggml_compute_forward_reshape(params, tensor);
-            } break;
-        case GGML_OP_VIEW:
-            {
-                ggml_compute_forward_view(params, tensor);
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                ggml_compute_forward_permute(params, tensor);
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                ggml_compute_forward_transpose(params, tensor);
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                ggml_compute_forward_get_rows(params, tensor);
-            } break;
-        case GGML_OP_GET_ROWS_BACK:
-            {
-                ggml_compute_forward_get_rows_back(params, tensor);
-            } break;
-        case GGML_OP_DIAG:
-            {
-                ggml_compute_forward_diag(params, tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                ggml_compute_forward_diag_mask_inf(params, tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-            {
-                ggml_compute_forward_diag_mask_zero(params, tensor);
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                ggml_compute_forward_soft_max(params, tensor);
-            } break;
-        case GGML_OP_SOFT_MAX_BACK:
-            {
-                ggml_compute_forward_soft_max_back(params, tensor);
-            } break;
-        case GGML_OP_ROPE:
-            {
-                ggml_compute_forward_rope(params, tensor);
-            } break;
-        case GGML_OP_ROPE_BACK:
-            {
-                ggml_compute_forward_rope_back(params, tensor);
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                ggml_compute_forward_clamp(params, tensor);
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                ggml_compute_forward_conv_transpose_1d(params, tensor);
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                ggml_compute_forward_im2col(params, tensor);
-            } break;
-        case GGML_OP_IM2COL_BACK:
-            {
-                ggml_compute_forward_im2col_back_f32(params, tensor);
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                ggml_compute_forward_conv_transpose_2d(params, tensor);
-            } break;
-        case GGML_OP_POOL_1D:
-            {
-                ggml_compute_forward_pool_1d(params, tensor);
-            } break;
-        case GGML_OP_POOL_2D:
-            {
-                ggml_compute_forward_pool_2d(params, tensor);
-            } break;
-        case GGML_OP_POOL_2D_BACK:
-            {
-                ggml_compute_forward_pool_2d_back(params, tensor);
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                ggml_compute_forward_upscale(params, tensor);
-            } break;
-        case GGML_OP_PAD:
-            {
-                ggml_compute_forward_pad(params, tensor);
-            } break;
-        case GGML_OP_ARANGE:
-            {
-                ggml_compute_forward_arange(params, tensor);
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                ggml_compute_forward_timestep_embedding(params, tensor);
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                ggml_compute_forward_argsort(params, tensor);
-            } break;
-        case GGML_OP_LEAKY_RELU:
-            {
-                ggml_compute_forward_leaky_relu(params, tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN_EXT:
-            {
-                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
-                ggml_compute_forward_flash_attn_back(params, masked, tensor);
-            } break;
-        case GGML_OP_SSM_CONV:
-            {
-                ggml_compute_forward_ssm_conv(params, tensor);
-            } break;
-        case GGML_OP_SSM_SCAN:
-            {
-                ggml_compute_forward_ssm_scan(params, tensor);
-            } break;
-        case GGML_OP_WIN_PART:
-            {
-                ggml_compute_forward_win_part(params, tensor);
-            } break;
-        case GGML_OP_WIN_UNPART:
-            {
-                ggml_compute_forward_win_unpart(params, tensor);
-            } break;
-        case GGML_OP_UNARY:
-            {
-                ggml_compute_forward_unary(params, tensor);
-            } break;
-        case GGML_OP_GET_REL_POS:
-            {
-                ggml_compute_forward_get_rel_pos(params, tensor);
-            } break;
-        case GGML_OP_ADD_REL_POS:
-            {
-                ggml_compute_forward_add_rel_pos(params, tensor);
-            } break;
-        case GGML_OP_RWKV_WKV:
-            {
-                ggml_compute_forward_rwkv_wkv(params, tensor);
-            } break;
-        case GGML_OP_MAP_UNARY:
-            {
-                ggml_unary_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_unary(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_BINARY:
-            {
-                ggml_binary_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_binary(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM1_F32:
-            {
-                ggml_custom1_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1_f32(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM2_F32:
-            {
-                ggml_custom2_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2_f32(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM3_F32:
-            {
-                ggml_custom3_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3_f32(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM1:
-            {
-                ggml_compute_forward_map_custom1(params, tensor);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM2:
-            {
-                ggml_compute_forward_map_custom2(params, tensor);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                ggml_compute_forward_map_custom3(params, tensor);
-            }
-            break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                ggml_compute_forward_cross_entropy_loss(params, tensor);
-            }
-            break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                ggml_compute_forward_cross_entropy_loss_back(params, tensor);
-            }
-            break;
-        case GGML_OP_OPT_STEP_ADAMW:
-            {
-                ggml_compute_forward_opt_step_adamw(params, tensor);
-            }
-            break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
@@ -18870,7 +6320,6 @@ void ggml_build_opt_adamw(
     }
 }
 
-
 static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
     void * ptr = *p;
     ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
@@ -18994,6 +6443,19 @@ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgrap
     return result;
 }
 
+struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
+    if (ggml_is_empty(tensor)) {
+        return tensor;
+    }
+    if (tensor->buffer) {
+        ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
+    } else {
+        GGML_ASSERT(tensor->data);
+        memset(tensor->data, 0, ggml_nbytes(tensor));
+    }
+    return tensor;
+}
+
 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     GGML_ASSERT(cgraph->grads != NULL);
 
@@ -19058,1096 +6520,6 @@ void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tenso
     cgraph->n_nodes++;
 }
 
-// Android's libc implementation "bionic" does not support setting affinity
-#if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
-    if (!ggml_is_numa()) {
-        return;
-    }
-
-    int node_num;
-    int rv;
-    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
-
-    switch(g_state.numa.numa_strategy) {
-        case GGML_NUMA_STRATEGY_DISTRIBUTE:
-            // run thread on node_num thread_n / (threads per node)
-            node_num = thread_n % g_state.numa.n_nodes;
-            break;
-        case GGML_NUMA_STRATEGY_ISOLATE:
-            // run thread on current_node
-            node_num = g_state.numa.current_node;
-            break;
-        case GGML_NUMA_STRATEGY_NUMACTL:
-            // use the cpuset that numactl gave us
-            rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
-            if (rv) {
-                fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
-            }
-            return;
-        default:
-            return;
-    }
-
-    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
-
-    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
-    CPU_ZERO_S(setsize, cpus);
-    for (size_t i = 0; i < node->n_cpus; ++i) {
-        CPU_SET_S(node->cpus[i], setsize, cpus);
-    }
-
-    rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
-    if (rv) {
-            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
-    }
-
-    CPU_FREE(cpus);
-}
-
-static void clear_numa_thread_affinity(void) {
-    if (!ggml_is_numa()) {
-        return;
-    }
-
-    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
-
-    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
-    CPU_ZERO_S(setsize, cpus);
-    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
-        CPU_SET_S(i, setsize, cpus);
-    }
-
-    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
-    if (rv) {
-        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
-    }
-
-    CPU_FREE(cpus);
-}
-#else
-// TODO: Windows etc.
-// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n);  }
-static void clear_numa_thread_affinity(void) {}
-#endif
-
-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-    int n_tasks = 0;
-
-    if (ggml_is_empty(node)) {
-        // no need to multi-thread a no-op
-        n_tasks = 1;
-        return n_tasks;
-    }
-
-    switch (node->op) {
-        case GGML_OP_CPY:
-        case GGML_OP_DUP:
-        case GGML_OP_CONT:
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_ACC:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_SUB:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_SIN:
-        case GGML_OP_COS:
-        case GGML_OP_SUM:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_COUNT_EQUAL:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_REPEAT:
-        case GGML_OP_REPEAT_BACK:
-        case GGML_OP_LEAKY_RELU:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(node)) {
-                case GGML_UNARY_OP_ABS:
-                case GGML_UNARY_OP_SGN:
-                case GGML_UNARY_OP_NEG:
-                case GGML_UNARY_OP_STEP:
-                case GGML_UNARY_OP_TANH:
-                case GGML_UNARY_OP_ELU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_SIGMOID:
-                case GGML_UNARY_OP_HARDSWISH:
-                case GGML_UNARY_OP_HARDSIGMOID:
-                case GGML_UNARY_OP_EXP:
-                    {
-                        n_tasks = 1;
-                    } break;
-
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_SILU:
-                    {
-                        n_tasks = n_threads;
-                    } break;
-                default:
-                    GGML_ABORT("fatal error");
-            }
-            break;
-        case GGML_OP_SILU_BACK:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_NORM:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_RMS_NORM_BACK:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_CONCAT:
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-        case GGML_OP_OUT_PROD:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                // FIXME: get_rows can use additional threads, but the cost of launching additional threads
-                // decreases performance with GPU offloading
-                //n_tasks = n_threads;
-                n_tasks = 1;
-            } break;
-        case GGML_OP_SCALE:
-        case GGML_OP_SET:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_GET_ROWS_BACK:
-        case GGML_OP_DIAG:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX_BACK:
-        case GGML_OP_ROPE:
-        case GGML_OP_ROPE_BACK:
-        case GGML_OP_ADD_REL_POS:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                n_tasks = 1; //TODO
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
-            } break;
-        case GGML_OP_IM2COL:
-        case GGML_OP_IM2COL_BACK:
-        case GGML_OP_CONV_TRANSPOSE_1D:
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_POOL_1D:
-        case GGML_OP_POOL_2D:
-        case GGML_OP_POOL_2D_BACK:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_ARANGE:
-        case GGML_OP_TIMESTEP_EMBEDDING:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_FLASH_ATTN_EXT:
-        case GGML_OP_FLASH_ATTN_BACK:
-        case GGML_OP_SSM_CONV:
-        case GGML_OP_SSM_SCAN:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_WIN_PART:
-        case GGML_OP_WIN_UNPART:
-        case GGML_OP_GET_REL_POS:
-        case GGML_OP_RWKV_WKV:
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-        case GGML_OP_MAP_CUSTOM1_F32:
-        case GGML_OP_MAP_CUSTOM2_F32:
-        case GGML_OP_MAP_CUSTOM3_F32:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_MAP_CUSTOM1:
-            {
-                struct ggml_map_custom1_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_MAP_CUSTOM2:
-            {
-                struct ggml_map_custom2_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                struct ggml_map_custom3_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-        case GGML_OP_OPT_STEP_ADAMW:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_NONE:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ABORT("fatal error");
-            }
-        default:
-            {
-                fprintf(stderr, "%s: op not implemented: ", __func__);
-                if (node->op < GGML_OP_COUNT) {
-                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
-                } else {
-                    fprintf(stderr, "%d\n", node->op);
-                }
-                GGML_ABORT("fatal error");
-            }
-    }
-
-    assert(n_tasks > 0);
-
-    return n_tasks;
-}
-
-static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
-
-#if defined(_WIN32)
-#include "windows.h"
-
-// TODO: support > 64 CPUs
-bool ggml_thread_apply_affinity(bool * mask) {
-    HANDLE    h = GetCurrentThread();
-    uint64_t  bitmask = 0ULL;
-
-    assert(GGML_MAX_N_THREADS >= 64);
-
-    for (int32_t i = 0; i < 8; i++) {
-        int32_t idx = i * 8;
-        uint8_t val = 0;
-        val |= mask[idx + 0] << 0;
-        val |= mask[idx + 1] << 1;
-        val |= mask[idx + 2] << 2;
-        val |= mask[idx + 3] << 3;
-        val |= mask[idx + 4] << 4;
-        val |= mask[idx + 5] << 5;
-        val |= mask[idx + 6] << 6;
-        val |= mask[idx + 7] << 7;
-        bitmask |= (uint64_t)val << idx;
-    }
-
-    for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
-        if (mask[i]) {
-            fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
-            break;
-        }
-    }
-
-    DWORD_PTR m = (DWORD_PTR)bitmask;
-
-    m = SetThreadAffinityMask(h, m);
-
-    return m != 0;
-}
-
-static bool ggml_thread_apply_priority(int32_t prio) {
-    // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
-    // This is up to the applications.
-    DWORD p = THREAD_PRIORITY_NORMAL;
-    switch (prio) {
-        case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
-        case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
-        case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
-        case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
-    }
-
-    if (prio == GGML_SCHED_PRIO_NORMAL) {
-        // Keep inherited policy/priority
-        return true;
-    }
-
-    if (!SetThreadPriority(GetCurrentThread(), p)) {
-        fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
-        return false;
-    }
-
-    return true;
-}
-
-#elif defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/resource.h>
-
-static bool ggml_thread_apply_affinity(const bool * mask) {
-    // Not supported on Apple platforms
-    UNUSED(mask);
-    return true;
-}
-
-static bool ggml_thread_apply_priority(int32_t prio) {
-    struct sched_param p;
-    int32_t policy = SCHED_OTHER;
-    switch (prio) {
-        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
-        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
-        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
-        case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
-    }
-
-    if (prio == GGML_SCHED_PRIO_NORMAL) {
-        // Keep inherited policy/priority
-        return true;
-    }
-
-    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
-    if (err != 0) {
-        fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
-        return false;
-    }
-
-    return true;
-}
-
-#elif defined(__gnu_linux__)
-// TODO: this may not work on BSD, to be verified
-
-static bool ggml_thread_apply_affinity(const bool * mask) {
-    cpu_set_t cpuset;
-    int err;
-
-    CPU_ZERO(&cpuset);
-
-    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
-        if (mask[i]) {
-            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
-            CPU_SET(i, &cpuset);
-        }
-    }
-
-#ifdef __ANDROID__
-    err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
-    if (err < 0) {
-        err = errno;
-    }
-#else
-    err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
-#endif
-    if (err != 0) {
-        fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
-        return false;
-    }
-
-    return true;
-}
-
-static bool ggml_thread_apply_priority(int32_t prio) {
-    struct sched_param p;
-    int32_t policy = SCHED_OTHER;
-    switch (prio) {
-        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
-        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
-        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
-        case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
-    }
-
-    if (prio == GGML_SCHED_PRIO_NORMAL) {
-        // Keep inherited policy/priority
-        return true;
-    }
-
-    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
-    if (err != 0) {
-        fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
-        return false;
-    }
-
-    return true;
-}
-
-#else // unsupported platforms
-
-static bool ggml_thread_apply_affinity(const bool * mask) {
-    UNUSED(mask);
-    return true;
-}
-
-static bool ggml_thread_apply_priority(int32_t prio) {
-    UNUSED(prio);
-    return true;
-}
-
-#endif
-
-static bool ggml_thread_cpumask_is_valid(const bool * mask) {
-    for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
-        if (mask[i]) { return true; }
-    }
-    return false;
-}
-
-static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
-    if (!strict) {
-        memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
-        return;
-    } else {
-        memset(local_mask, 0, GGML_MAX_N_THREADS);
-        int32_t base_idx = *iter;
-        for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
-            int32_t idx = base_idx + i;
-            if (idx >= GGML_MAX_N_THREADS) {
-                // Just a cheaper modulo
-                idx -= GGML_MAX_N_THREADS;
-            }
-            if (global_mask[idx]) {
-                local_mask[idx] = 1;
-                *iter = idx + 1;
-                return;
-            }
-        }
-    }
-}
-
-void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
-    if (!threadpool) return;
-
-    const int n_threads = threadpool->n_threads_max;
-
-#ifndef GGML_USE_OPENMP
-    struct ggml_compute_state* workers = threadpool->workers;
-
-    ggml_mutex_lock(&threadpool->mutex);
-
-    threadpool->stop = true;
-    threadpool->pause = false;
-
-    ggml_cond_broadcast(&threadpool->cond);
-    ggml_mutex_unlock(&threadpool->mutex);
-
-    for (int j = 1; j < n_threads; j++) {
-        int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
-        GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
-        UNUSED(rc);
-    }
-
-    ggml_mutex_destroy(&threadpool->mutex);
-    ggml_cond_destroy(&threadpool->cond);
-#endif // GGML_USE_OPENMP
-
-    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
-    ggml_aligned_free(threadpool->workers, workers_size);
-    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
-}
-
-#ifndef GGML_USE_OPENMP
-// pause/resume must be called under mutex
-static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
-    GGML_PRINT_DEBUG("Pausing threadpool\n");
-    threadpool->pause = true;
-    ggml_cond_broadcast(&threadpool->cond);
-}
-
-static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
-    GGML_PRINT_DEBUG("Resuming threadpool\n");
-    threadpool->pause = false;
-    ggml_cond_broadcast(&threadpool->cond);
-}
-#endif
-
-void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
-#ifndef GGML_USE_OPENMP
-    ggml_mutex_lock(&threadpool->mutex);
-    if (!threadpool->pause) {
-       ggml_threadpool_pause_locked(threadpool);
-    }
-    ggml_mutex_unlock(&threadpool->mutex);
-#else
-    UNUSED(threadpool);
-#endif
-}
-
-void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
-#ifndef GGML_USE_OPENMP
-    ggml_mutex_lock(&threadpool->mutex);
-    if (threadpool->pause) {
-       ggml_threadpool_resume_locked(threadpool);
-    }
-    ggml_mutex_unlock(&threadpool->mutex);
-#else
-    UNUSED(threadpool);
-#endif
-}
-
-struct ggml_cplan ggml_graph_plan(
-          const struct ggml_cgraph * cgraph,
-                               int   n_threads,
-            struct ggml_threadpool * threadpool) {
-
-    if (threadpool == NULL) {
-        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
-    }
-    if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
-    }
-
-    size_t work_size = 0;
-
-    struct ggml_cplan cplan;
-    memset(&cplan, 0, sizeof(struct ggml_cplan));
-
-    int max_tasks = 1;
-
-    // thread scheduling for the different operations + work buffer size estimation
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-        max_tasks = MAX(max_tasks, n_tasks);
-
-        size_t cur = 0;
-
-        switch (node->op) {
-            case GGML_OP_CPY:
-            case GGML_OP_DUP:
-                {
-                    if (ggml_is_quantized(node->type) ||
-                        // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
-                        (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
-                        (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_ADD:
-            case GGML_OP_ADD1:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_ACC:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_COUNT_EQUAL:
-                {
-                    cur = ggml_type_size(node->type)*n_tasks;
-                } break;
-            case GGML_OP_MUL_MAT:
-                {
-                    const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
-
-                    if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
-                    }
-                } break;
-            case GGML_OP_MUL_MAT_ID:
-                {
-                    cur = 0;
-                    const struct ggml_tensor * src0 = node->src[0];
-                    const struct ggml_tensor * src1 = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
-                    if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
-                    }
-                    const int n_as = src0->ne[2];
-                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
-                } break;
-            case GGML_OP_OUT_PROD:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_SOFT_MAX:
-            case GGML_OP_ROPE:
-                {
-                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_1D:
-                {
-                    GGML_ASSERT(node->src[0]->ne[3] == 1);
-                    GGML_ASSERT(node->src[1]->ne[2] == 1);
-                    GGML_ASSERT(node->src[1]->ne[3] == 1);
-
-                    const int64_t ne00 = node->src[0]->ne[0];  // K
-                    const int64_t ne01 = node->src[0]->ne[1];  // Cout
-                    const int64_t ne02 = node->src[0]->ne[2];  // Cin
-
-                    const int64_t ne10 = node->src[1]->ne[0];  // L
-                    const int64_t ne11 = node->src[1]->ne[1];  // Cin
-
-                    if ((node->src[0]->type == GGML_TYPE_F16 ||
-                         node->src[0]->type == GGML_TYPE_BF16) &&
-                        node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
-                        cur += sizeof(ggml_fp16_t)*ne10*ne11;
-                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                               node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(float)*ne00*ne01*ne02;
-                        cur += sizeof(float)*ne10*ne11;
-                    } else {
-                        GGML_ABORT("fatal error");
-                    }
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_2D:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // W
-                    const int64_t ne01 = node->src[0]->ne[1]; // H
-                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
-                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
-
-                    const int64_t ne10 = node->src[1]->ne[0]; // W
-                    const int64_t ne11 = node->src[1]->ne[1]; // H
-                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
-
-                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
-                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
-                } break;
-            case GGML_OP_FLASH_ATTN_EXT:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // D
-
-                    cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
-                } break;
-            case GGML_OP_FLASH_ATTN_BACK:
-                {
-                    const int64_t    D = node->src[0]->ne[0];
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-
-            case GGML_OP_CROSS_ENTROPY_LOSS:
-                {
-                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
-                } break;
-            case GGML_OP_COUNT:
-                {
-                    GGML_ABORT("fatal error");
-                }
-            default:
-                break;
-        }
-
-        work_size = MAX(work_size, cur);
-    }
-
-    if (work_size > 0) {
-        work_size += CACHE_LINE_SIZE*(n_threads);
-    }
-
-    cplan.threadpool = threadpool;
-    cplan.n_threads  = MIN(max_tasks, n_threads);
-    cplan.work_size  = work_size;
-    cplan.work_data  = NULL;
-
-    return cplan;
-}
-
-static thread_ret_t ggml_graph_compute_thread(void * data) {
-    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-    struct ggml_threadpool    * tp    = state->threadpool;
-
-    const struct ggml_cgraph * cgraph = tp->cgraph;
-    const struct ggml_cplan  * cplan  = tp->cplan;
-
-    set_numa_thread_affinity(state->ith);
-
-    struct ggml_compute_params params = {
-        /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
-        /*.wsize     =*/ cplan->work_size,
-        /*.wdata     =*/ cplan->work_data,
-        /*.threadpool=*/ tp,
-    };
-
-    for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
-        struct ggml_tensor * node = cgraph->nodes[node_n];
-
-        ggml_compute_forward(&params, node);
-
-        if (state->ith == 0 && cplan->abort_callback &&
-                cplan->abort_callback(cplan->abort_callback_data)) {
-            tp->abort = true;
-            tp->ec    = GGML_STATUS_ABORTED;
-        }
-
-        ggml_barrier(state->threadpool);
-    }
-
-    return 0;
-}
-
-#ifndef GGML_USE_OPENMP
-
-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
-// check if thread is ready to proceed (exit from polling or sleeping)
-static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-
-    if (state->pending || threadpool->stop || threadpool->pause) { return true; }
-
-    // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending    = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
-    }
-
-    return state->pending;
-}
-
-// sync thread state after polling
-static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
-    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
-    #ifdef GGML_TSAN_ENABLED
-    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
-    #else
-    atomic_thread_fence(memory_order_seq_cst);
-    #endif
-    UNUSED(state);
-}
-
-static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-
-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
-    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
-    // Perhaps, we can adjust it dynamically based on load and things.
-    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
-
-    for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
-        // No new work. Keep polling.
-        ggml_thread_cpu_relax();
-    }
-
-    return state->pending;
-}
-
-static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-
-    if (ggml_graph_compute_poll_for_work(state)) {
-        ggml_graph_compute_thread_sync(state);
-        return state->pending;
-    }
-
-    ggml_mutex_lock_shared(&threadpool->mutex);
-    while (!ggml_graph_compute_thread_ready(state)) {
-        // No new work. Wait for the signal.
-        GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
-        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
-    }
-    ggml_mutex_unlock_shared(&threadpool->mutex);
-
-    return state->pending;
-}
-
-static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
-    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-    struct ggml_threadpool * threadpool = state->threadpool;
-
-    ggml_thread_apply_priority(threadpool->prio);
-    if (ggml_thread_cpumask_is_valid(state->cpumask)) {
-        ggml_thread_apply_affinity(state->cpumask);
-    }
-
-    while (true) {
-        // Check if we need to sleep
-        while (threadpool->pause) {
-            GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
-            ggml_mutex_lock_shared(&threadpool->mutex);
-            if (threadpool->pause) {
-                ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
-            }
-            GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
-            ggml_mutex_unlock_shared(&threadpool->mutex);
-        }
-
-        // This needs to be checked for after the cond_wait
-        if (threadpool->stop) break;
-
-        // Check if there is new work
-        // The main thread is the only one that can dispatch new work
-
-        ggml_graph_compute_check_for_work(state);
-        if (state->pending) {
-            state->pending = false;
-
-            ggml_graph_compute_thread(state);
-        }
-    }
-
-    return (thread_ret_t) 0;
-}
-
-// Start processing new graph
-static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
-{
-    // Always take the mutex here because the worker threads are doing hybrid poll/wait
-
-    ggml_mutex_lock(&threadpool->mutex);
-
-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
-
-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
-
-    // Indicate the graph is ready to be processed
-    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
-
-    if (threadpool->pause) {
-       // Update main thread prio and affinity to match the threadpool settings
-       ggml_thread_apply_priority(threadpool->prio);
-       if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
-           ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
-       }
-
-       // resume does cond broadcast
-       ggml_threadpool_resume_locked(threadpool);
-    } else {
-       ggml_cond_broadcast(&threadpool->cond);
-    }
-
-    ggml_mutex_unlock(&threadpool->mutex);
-}
-
-#endif // GGML_USE_OPENMP
-
-void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
-    p->n_threads  = n_threads;
-    p->prio       = 0;     // default priority (usually means normal or inherited)
-    p->poll       = 50;    // hybrid-polling enabled
-    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
-    p->paused     = false; // threads are ready to go
-    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
-}
-
-struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
-    struct ggml_threadpool_params p;
-    ggml_threadpool_params_init(&p, n_threads);
-    return p;
-}
-
-bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
-    if (p0->n_threads      != p1->n_threads  )    return false;
-    if (p0->prio           != p1->prio       )    return false;
-    if (p0->poll           != p1->poll       )    return false;
-    if (p0->strict_cpu     != p1->strict_cpu )    return false;
-    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
-}
-
-static struct ggml_threadpool * ggml_threadpool_new_impl(
-    struct ggml_threadpool_params * tpp,
-               struct ggml_cgraph * cgraph,
-                struct ggml_cplan * cplan) {
-
-    struct ggml_threadpool * threadpool =
-        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
-    {
-        threadpool->cgraph           = cgraph;
-        threadpool->cplan            = cplan;
-        threadpool->n_graph          = 0;
-        threadpool->n_barrier        = 0;
-        threadpool->n_barrier_passed = 0;
-        threadpool->current_chunk    = 0;
-        threadpool->stop             = false;
-        threadpool->pause            = tpp->paused;
-        threadpool->abort            = false;
-        threadpool->workers          = NULL;
-        threadpool->n_threads_max    = tpp->n_threads;
-        threadpool->n_threads_cur    = tpp->n_threads;
-        threadpool->poll             = tpp->poll;
-        threadpool->prio             = tpp->prio;
-        threadpool->ec               = GGML_STATUS_SUCCESS;
-    }
-
-    // Allocate and init workers state
-    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
-
-    memset(workers, 0, workers_size);
-    for (int j = 0; j < tpp->n_threads; j++) {
-        workers[j].threadpool = threadpool;
-        workers[j].ith        = j;
-    }
-
-    threadpool->workers = workers;
-
-#ifndef GGML_USE_OPENMP
-    ggml_mutex_init(&threadpool->mutex);
-    ggml_cond_init(&threadpool->cond);
-
-    // Spin the threads for all workers, and update CPU placements.
-    // Place the main thread last (towards the higher numbered CPU cores).
-
-    int32_t cpumask_iter = 0;
-
-    for (int j = 1; j < tpp->n_threads; j++) {
-        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
-
-        int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
-        GGML_ASSERT(rc == 0);
-    }
-
-    ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
-
-    if (!threadpool->pause) {
-        // Update main thread prio and affinity at the start, otherwise we'll do it in resume
-        ggml_thread_apply_priority(threadpool->prio);
-        if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
-            ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
-        }
-    }
-#endif // GGML_USE_OPENMP
-
-    return threadpool;
-}
-
-struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
-    return ggml_threadpool_new_impl(tpp, NULL, NULL);
-}
-
-enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
-    GGML_ASSERT(cplan);
-    GGML_ASSERT(cplan->n_threads > 0);
-    GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
-
-    int n_threads                               = cplan->n_threads;
-    struct ggml_threadpool * threadpool = cplan->threadpool;
-
-    bool disposable_threadpool = false;
-
-    if (threadpool == NULL) {
-        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
-        disposable_threadpool = true;
-
-        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
-        threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
-    } else {
-        // Reset some of the parameters that need resetting
-        // No worker threads should be accessing the parameters below at this stage
-        threadpool->cgraph           = cgraph;
-        threadpool->cplan            = cplan;
-        threadpool->current_chunk    = 0;
-        threadpool->abort            = false;
-        threadpool->ec               = GGML_STATUS_SUCCESS;
-    }
-
-#ifdef GGML_USE_OPENMP
-    if (n_threads > 1) {
-        #pragma omp parallel num_threads(n_threads)
-        {
-            #pragma omp single
-            {
-                // update the number of threads from the actual number of threads that we got from OpenMP
-                n_threads = omp_get_num_threads();
-                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
-            }
-
-            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
-        }
-    } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
-        ggml_graph_compute_thread(&threadpool->workers[0]);
-    }
-#else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
-    }
-
-    // Kick all threads to start the new graph
-    ggml_graph_compute_kickoff(threadpool, n_threads);
-
-    // This is a work thread too
-    ggml_graph_compute_thread(&threadpool->workers[0]);
-#endif
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    enum ggml_status ret = threadpool->ec;
-
-    if (disposable_threadpool) {
-        ggml_threadpool_free(threadpool);
-    }
-
-    return ret;
-}
-
-enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    return ggml_graph_compute(cgraph, &cplan);
-}
-
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * leaf = cgraph->leafs[i];
@@ -20168,490 +6540,6 @@ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const ch
     return NULL;
 }
 
-static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
-    const int64_t * ne = tensor->ne;
-    const size_t  * nb = tensor->nb;
-
-    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
-            ggml_type_name(tensor->type),
-            ggml_op_name  (tensor->op),
-            ggml_n_dims(tensor),
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
-            tensor->data,
-            tensor->name);
-}
-
-static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
-    const int64_t * ne = tensor->ne;
-    const size_t  * nb = tensor->nb;
-
-    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
-            arg,
-            ggml_type_name(tensor->type),
-            ggml_op_name  (tensor->op),
-            ggml_n_dims(tensor),
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
-            tensor->data,
-            tensor->name);
-}
-
-void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    uint64_t size_eval = 0;
-
-    // compute size of intermediate results
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
-    }
-
-    // print
-    {
-        FILE * fout = stdout;
-
-        fprintf(fout, "\n");
-        fprintf(fout, "%-16s %8x\n", "magic",        GGML_FILE_MAGIC);
-        fprintf(fout, "%-16s %8d\n", "version",      GGML_FILE_VERSION);
-        fprintf(fout, "%-16s %8d\n", "leafs",        cgraph->n_leafs);
-        fprintf(fout, "%-16s %8d\n", "nodes",        cgraph->n_nodes);
-        fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
-
-        // header
-        fprintf(fout, "\n");
-        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
-                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
-
-        for (int i = 0; i < cgraph->n_leafs; ++i) {
-            ggml_graph_export_leaf(cgraph->leafs[i], fout);
-
-            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
-            GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
-            GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
-        }
-
-        // header
-        fprintf(fout, "\n");
-        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
-                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
-
-        for (int i = 0; i < cgraph->n_nodes; ++i) {
-            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
-
-            for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                if (cgraph->nodes[i]->src[j]) {
-                    ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
-                }
-            }
-
-            fprintf(fout, "\n");
-        }
-
-        fprintf(fout, "\n");
-    }
-
-    // write binary data
-    {
-        FILE * fout = ggml_fopen(fname, "wb");
-
-        if (!fout) {
-            fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
-            return;
-        }
-
-        // header
-        {
-            const uint32_t magic   = GGML_FILE_MAGIC;
-            const uint32_t version = GGML_FILE_VERSION;
-            const uint32_t n_leafs = cgraph->n_leafs;
-            const uint32_t n_nodes = cgraph->n_nodes;
-
-            fwrite(&magic,     sizeof(uint32_t), 1, fout);
-            fwrite(&version,   sizeof(uint32_t), 1, fout);
-            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
-            fwrite(&n_nodes,   sizeof(uint32_t), 1, fout);
-            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
-        }
-
-        // leafs
-        {
-            for (int i = 0; i < cgraph->n_leafs; ++i) {
-                const struct ggml_tensor * tensor = cgraph->leafs[i];
-
-                const uint32_t type   = tensor->type;
-                const uint32_t op     = tensor->op;
-                const int32_t  flags  = tensor->flags;
-
-                fwrite(&type,   sizeof(uint32_t), 1, fout);
-                fwrite(&op,     sizeof(uint32_t), 1, fout);
-                fwrite(&flags,  sizeof(int32_t),  1, fout);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    const uint64_t ne = tensor->ne[j];
-                    const uint64_t nb = tensor->nb[j];
-
-                    fwrite(&ne, sizeof(uint64_t), 1, fout);
-                    fwrite(&nb, sizeof(uint64_t), 1, fout);
-                }
-
-                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
-                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
-
-                // dump the data
-                // TODO: pad this to 32 byte boundary
-                {
-                    const size_t size = ggml_nbytes(tensor);
-
-                    fwrite(tensor->data, sizeof(char), size, fout);
-                }
-            }
-        }
-
-        // nodes
-        {
-            for (int i = 0; i < cgraph->n_nodes; ++i) {
-                const struct ggml_tensor * tensor = cgraph->nodes[i];
-
-                const uint32_t type   = tensor->type;
-                const uint32_t op     = tensor->op;
-                const int32_t  flags  = tensor->flags;
-
-                fwrite(&type,   sizeof(uint32_t), 1, fout);
-                fwrite(&op,     sizeof(uint32_t), 1, fout);
-                fwrite(&flags,  sizeof(int32_t),  1, fout);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    const uint64_t ne = tensor->ne[j];
-                    const uint64_t nb = tensor->nb[j];
-
-                    fwrite(&ne, sizeof(uint64_t), 1, fout);
-                    fwrite(&nb, sizeof(uint64_t), 1, fout);
-                }
-
-                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
-                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
-
-                // output the op arguments
-                {
-                    struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
-
-                    for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                        args[j] = tensor->src[j];
-                    }
-
-                    for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                        if (args[j]) {
-                            int32_t idx = -1;
-
-                            // check if leaf
-                            {
-                                for (int k = 0; k < cgraph->n_leafs; ++k) {
-                                    if (args[j] == cgraph->leafs[k]) {
-                                        idx = k;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            // check if node
-                            if (idx == -1) {
-                                for (int k = 0; k < cgraph->n_nodes; ++k) {
-                                    if (args[j] == cgraph->nodes[k]) {
-                                        idx = cgraph->n_leafs + k;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            if (idx == -1) {
-                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
-                                fclose(fout);
-                                return;
-                            }
-
-                            fwrite(&idx, sizeof(int32_t), 1, fout);
-                        } else {
-                            const int32_t nul = -1;
-
-                            fwrite(&nul, sizeof(int32_t), 1, fout);
-                        }
-                    }
-                }
-
-                // dump the data
-                // TODO: pad this to 32 byte boundary
-                if ((flags & GGML_TENSOR_FLAG_PARAM)) {
-                    const size_t size = ggml_nbytes(tensor);
-
-                    fwrite(tensor->data, sizeof(char), size, fout);
-                }
-            }
-        }
-
-        fclose(fout);
-    }
-}
-
-struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
-    assert(*ctx_data == NULL);
-    assert(*ctx_eval == NULL);
-
-    struct ggml_cgraph * result = NULL;
-
-    struct ggml_tensor * data = NULL;
-
-    // read file into data
-    {
-        FILE * fin = ggml_fopen(fname, "rb");
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
-            return result;
-        }
-
-        size_t fsize = 0;
-
-        fseek(fin, 0, SEEK_END);
-        fsize = ftell(fin);
-        fseek(fin, 0, SEEK_SET);
-
-        // create the data context
-        {
-            const size_t overhead = 1*ggml_tensor_overhead();
-
-            struct ggml_init_params params = {
-                .mem_size   = fsize + overhead,
-                .mem_buffer = NULL,
-                .no_alloc   = false,
-            };
-
-            *ctx_data = ggml_init(params);
-
-            if (!*ctx_data) {
-                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
-                fclose(fin);
-                return result;
-            }
-        }
-
-        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
-
-        {
-            const size_t ret = fread(data->data, sizeof(char), fsize, fin);
-            if (ret != fsize) {
-                fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
-                fclose(fin);
-                return result;
-            }
-        }
-
-        fclose(fin);
-    }
-
-    // populate result
-    {
-        char * ptr = (char *) data->data;
-
-        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
-
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
-            return result;
-        }
-
-        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
-
-        if (version != GGML_FILE_VERSION) {
-            fprintf(stderr, "%s: invalid version number\n", __func__);
-            return result;
-        }
-
-        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
-        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
-        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
-        const int     graph_size = MAX(n_leafs, n_nodes);
-
-        // create the data context
-        {
-            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
-
-            struct ggml_init_params params = {
-                .mem_size   = size_eval + overhead,
-                .mem_buffer = NULL,
-                .no_alloc   = true,
-            };
-
-            *ctx_eval = ggml_init(params);
-
-            if (!*ctx_eval) {
-                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
-                return result;
-            }
-        }
-
-        result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
-
-        result->n_leafs = n_leafs;
-        result->n_nodes = n_nodes;
-
-
-        // leafs
-        {
-            uint32_t type;
-            uint32_t op;
-            int32_t  flags;
-
-            for (uint32_t i = 0; i < n_leafs; ++i) {
-                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
-                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-                flags  = *(const int32_t  *) ptr; ptr += sizeof(flags);
-
-                int64_t ne[GGML_MAX_DIMS];
-                size_t  nb[GGML_MAX_DIMS];
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    uint64_t ne_cur;
-                    uint64_t nb_cur;
-
-                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
-                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
-
-                    ne[j] = ne_cur;
-                    nb[j] = nb_cur;
-                }
-
-                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
-
-                tensor->op    = (enum ggml_op) op;
-                tensor->flags = flags;
-
-                memcpy(tensor->name,      ptr, GGML_MAX_NAME);      ptr += GGML_MAX_NAME;
-                memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    tensor->nb[j] = nb[j];
-                }
-
-                tensor->data = (void *) ptr; ptr += ggml_nbytes(tensor);
-
-                result->leafs[i] = tensor;
-
-                fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
-            }
-        }
-
-        ggml_set_no_alloc(*ctx_eval, false);
-
-        // nodes
-        {
-            uint32_t type;
-            uint32_t op;
-            int32_t  flags;
-
-            for (uint32_t i = 0; i < n_nodes; ++i) {
-                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
-                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-                flags  = *(const int32_t  *) ptr; ptr += sizeof(flags);
-
-                enum ggml_op eop = (enum ggml_op) op;
-
-                int64_t ne[GGML_MAX_DIMS];
-                size_t  nb[GGML_MAX_DIMS];
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    uint64_t ne_cur;
-                    uint64_t nb_cur;
-
-                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
-                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
-
-                    ne[j] = ne_cur;
-                    nb[j] = nb_cur;
-                }
-
-                const char * ptr_name      = ptr; ptr += GGML_MAX_NAME;
-                const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
-
-                const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
-
-                struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
-
-                // parse args
-                for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                    const int32_t arg_idx = ptr_arg_idx[j];
-
-                    if (arg_idx == -1) {
-                        continue;
-                    }
-
-                    if (arg_idx < result->n_leafs) {
-                        args[j] = result->leafs[arg_idx];
-                    } else {
-                        args[j] = result->nodes[arg_idx - result->n_leafs];
-                    }
-                }
-
-                // create the tensor
-                // "view" operations are handled differently
-                // TODO: handle inplace ops - currently a copy is always made
-
-                struct ggml_tensor * tensor = NULL;
-
-                switch (eop) {
-                    // TODO: implement other view ops
-                    case GGML_OP_RESHAPE:
-                        {
-                            tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
-                        } break;
-                    case GGML_OP_VIEW:
-                        {
-                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
-
-                            size_t offs;
-                            memcpy(&offs, ptr_op_params, sizeof(offs));
-
-                            tensor->data = ((char *) tensor->data) + offs;
-                        } break;
-                    case GGML_OP_TRANSPOSE:
-                        {
-                            tensor = ggml_transpose(*ctx_eval, args[0]);
-                        } break;
-                    case GGML_OP_PERMUTE:
-                        {
-                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
-                        } break;
-                    default:
-                        {
-                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
-
-                            tensor->op = eop;
-                        } break;
-                }
-
-                memcpy(tensor->name,      ptr_name,      GGML_MAX_NAME);
-                memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    tensor->nb[j] = nb[j];
-                }
-
-                for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                    tensor->src[j] = args[j];
-                }
-
-                result->nodes[i] = tensor;
-
-                // TODO tensor data is be duplicated due to ggml_new_tensor call above
-                if (flags & GGML_TENSOR_FLAG_PARAM) {
-                    tensor->data = (void *) ptr; ptr += ggml_nbytes(tensor);
-                }
-
-                fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
-            }
-        }
-    }
-
-    return result;
-}
-
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     GGML_LOG_INFO("=== GRAPH ===\n");
 
@@ -20799,15 +6687,17 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         if (ggml_nelements(node) < 5 && node->data != NULL) {
             fprintf(fp, " | (");
             for (int j = 0; j < ggml_nelements(node); j++) {
-                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
-                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
-                }
-                else if (node->type == GGML_TYPE_F32 ||
-                         node->type == GGML_TYPE_F16 ||
-                         node->type == GGML_TYPE_BF16) {
-                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
-                }
-                else {
+                // FIXME: use ggml-backend to obtain the tensor data
+                //if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                //    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+                //}
+                //else if (node->type == GGML_TYPE_F32 ||
+                //         node->type == GGML_TYPE_F16 ||
+                //         node->type == GGML_TYPE_BF16) {
+                //    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+                //}
+                //else
+                {
                     fprintf(fp, "#");
                 }
                 if (j < ggml_nelements(node) - 1) {
@@ -20852,918 +6742,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to set tensor from array
-        for (int64_t j = 0; j < ne; ++j) {
-            ggml_set_f32_1d(ps[p], j, x[i++]);
-        }
-    }
-}
-
-static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            x[i++] = ggml_get_f32_1d(ps[p], j);
-        }
-    }
-}
-
-static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
-    int64_t i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
-        }
-    }
-}
-
-static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) {
-    int64_t i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale;
-        }
-    }
-}
-
-//
-// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
-//
-// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
-//
-
-static enum ggml_opt_result ggml_opt_adam(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    GGML_ASSERT(ggml_is_scalar(f));
-    GGML_ASSERT(f->type == GGML_TYPE_F32);
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int64_t nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) {
-        int iter = opt->iter;
-        ggml_opt_init(opt->ctx, opt, params, nx);
-        opt->iter = iter;
-    }
-
-    // constants
-    float sched = params.adam.sched;
-    const float alpha = params.adam.alpha;
-    const float decay = params.adam.decay * alpha;
-    const float beta1 = params.adam.beta1;
-    const float beta2 = params.adam.beta2;
-    const float eps   = params.adam.eps;
-    const float gclip = params.adam.gclip;
-    const int decay_min_ndim = params.adam.decay_min_ndim;
-    const int n_accum = MAX(1, params.n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    float * g  = opt->adam.g->data;  // gradients
-    float * m  = opt->adam.m->data;  // first moment
-    float * v  = opt->adam.v->data;  // second moment
-
-    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
-
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    bool cancel = false;
-
-    // compute the function value
-    float fx = 0;
-    ggml_set_zero(opt->adam.g);
-    for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-        if (callback) {
-            callback(callback_data, accum_step, &sched, &cancel);
-            if (cancel) {
-                return GGML_OPT_RESULT_CANCEL;
-            }
-        }
-        // ggml_graph_reset  (gf);
-        ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(gb, &cplan);
-        ggml_opt_acc_grad(np, ps, g, accum_norm);
-        fx += ggml_get_f32_1d(f, 0);
-    }
-    fx *= accum_norm;
-
-    opt->adam.fx_prev = fx;
-    opt->adam.fx_best = opt->adam.fx_prev;
-    if (pf) {
-        pf[opt->iter % params.past] = opt->adam.fx_prev;
-    }
-
-    opt->loss_before = opt->adam.fx_prev;
-    opt->loss_after  = opt->adam.fx_prev;
-
-    // initialize
-    if (opt->just_initialized) {
-        opt->adam.n_no_improvement = 0;
-        opt->just_initialized = false;
-    }
-
-    float * fx_best = &opt->adam.fx_best;
-    float * fx_prev = &opt->adam.fx_prev;
-    int * n_no_improvement = &opt->adam.n_no_improvement;
-
-    int iter0 = opt->iter;
-
-    // run the optimizer
-    for (int t = 0; t < params.adam.n_iter; ++t) {
-        opt->iter = iter0 + t + 1;
-        GGML_PRINT_DEBUG  ("=== iter %d ===\n", t);
-
-        GGML_PRINT_DEBUG  ("f      = %10.6f\n", ggml_get_f32_1d(f, 0));
-        GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0));
-        GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0));
-
-        for (int i = 0; i < np; ++i) {
-            GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i,
-                    ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
-        }
-
-        const int64_t t_start_wall = ggml_time_us();
-        const int64_t t_start_cpu = ggml_cycles();
-        UNUSED(t_start_wall);
-        UNUSED(t_start_cpu);
-
-        {
-            float gnorm = 1.0f;
-            if (gclip > 0.0f) {
-                // gradient clipping
-                ggml_float sum = 0.0;
-                for (int64_t i = 0; i < nx; ++i) {
-                    sum += (ggml_float)(g[i]*g[i]);
-                }
-                ggml_float norm = sqrt(sum);
-                if (norm > (ggml_float) gclip) {
-                    gnorm = (float) ((ggml_float) gclip / norm);
-                }
-            }
-            const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
-            const float beta2h =        1.0f/(1.0f - powf(beta2, opt->iter));
-            int64_t i = 0;
-            for (int p = 0; p < np; ++p) {
-                const int64_t ne = ggml_nelements(ps[p]);
-                const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
-                for (int64_t j = 0; j < ne; ++j) {
-                    float x  = ggml_get_f32_1d(ps[p], j);
-                    float g_ = g[i]*gnorm;
-                    m[i] = m[i]*beta1 +    g_*(1.0f - beta1);
-                    v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2);
-                    float mh = m[i]*beta1h;
-                    float vh = v[i]*beta2h;
-                    vh = sqrtf(vh) + eps;
-                    x  = x*(1.0f - p_decay) - mh/vh;
-                    ggml_set_f32_1d(ps[p], j, x);
-                    ++i;
-                }
-            }
-        }
-
-        fx = 0;
-        ggml_set_zero(opt->adam.g);
-        for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-            if (callback) {
-                callback(callback_data, accum_step, &sched, &cancel);
-                if (cancel) {
-                    return GGML_OPT_RESULT_CANCEL;;
-                }
-            }
-            // ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
-            ggml_opt_acc_grad(np, ps, g, accum_norm);
-            fx += ggml_get_f32_1d(f, 0);
-        }
-        fx *= accum_norm;
-
-        opt->loss_after = fx;
-
-        // check convergence
-        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
-            GGML_PRINT_DEBUG("converged\n");
-
-            return GGML_OPT_RESULT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= iter0 + t) {
-                const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-
-            pf[(iter0 + t)%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx_best[0] > fx) {
-                fx_best[0] = fx;
-                n_no_improvement[0] = 0;
-            } else {
-                ++n_no_improvement[0];
-
-                if (n_no_improvement[0] >= params.max_no_improvement) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-        }
-
-        fx_prev[0] = fx;
-
-        {
-            const int64_t t_end_cpu = ggml_cycles();
-            GGML_PRINT_DEBUG("time iter:      %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
-            UNUSED(t_end_cpu);
-
-            const int64_t t_end_wall = ggml_time_us();
-            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
-            UNUSED(t_end_wall);
-        }
-    }
-
-    return GGML_OPT_RESULT_DID_NOT_CONVERGE;
-}
-
-//
-// L-BFGS
-//
-// the L-BFGS implementation below is based on the following implementation:
-//
-//   https://github.com/chokkan/liblbfgs
-//
-
-struct ggml_lbfgs_iteration_data {
-    float alpha;
-    float ys;
-    float * s;
-    float * y;
-};
-
-static enum ggml_opt_result linesearch_backtracking(
-        const struct ggml_opt_params * params,
-        int nx,
-        float * x,
-        float * fx,
-        float * g,
-        float * d,
-        float * step,
-        const float * xp,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gb,
-        struct ggml_cplan  * cplan,
-        const int np,
-        struct ggml_tensor * ps[],
-        bool * cancel,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    int count = 0;
-
-    float width  = 0.0f;
-    float dg     = 0.0f;
-    float finit  = 0.0f;
-    float dginit = 0.0f;
-    float dgtest = 0.0f;
-
-    const float dec = 0.5f;
-    const float inc = 2.1f;
-
-    const int n_accum = MAX(1, params->n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    if (*step <= 0.f) {
-        return GGML_LINESEARCH_INVALID_PARAMETERS;
-    }
-
-    // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
-
-    // make sure that d points to a descent direction
-    if (0 < dginit) {
-        return GGML_LINESEARCH_FAIL;
-    }
-
-    // initialize local variables
-    finit = *fx;
-    dgtest = params->lbfgs.ftol*dginit;
-
-    while (true) {
-        ggml_vec_cpy_f32(nx, x, xp);
-        ggml_vec_mad_f32(nx, x, d, *step);
-
-        // evaluate the function and gradient values
-        {
-            ggml_opt_set_params(np, ps, x);
-
-            *fx = 0;
-            memset(g, 0, sizeof(float)*nx);
-            for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-                if (callback) {
-                    // LBFG-S does not support learning rate -> ignore learning schedule
-                    float sched = 0;
-                    callback(callback_data, accum_step, &sched, cancel);
-                    if (*cancel) {
-                        return GGML_OPT_RESULT_CANCEL;
-                    }
-                }
-                // ggml_graph_reset  (gf);
-                ggml_set_f32      (f->grad, 1.0f);
-                ggml_graph_compute(gb, cplan);
-                ggml_opt_acc_grad(np, ps, g, accum_norm);
-                *fx += ggml_get_f32_1d(f, 0);
-            }
-            *fx *= accum_norm;
-
-        }
-
-        ++count;
-
-        if (*fx > finit + (*step)*dgtest) {
-            width = dec;
-        } else {
-            // Armijo condition is satisfied
-            if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
-                return count;
-            }
-
-            ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
-
-            // check the Wolfe condition
-            if (dg < params->lbfgs.wolfe * dginit) {
-                width = inc;
-            } else {
-                if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
-                    // regular Wolfe conditions
-                    return count;
-                }
-
-                if(dg > -params->lbfgs.wolfe*dginit) {
-                    width = dec;
-                } else {
-                    // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
-                    return count;
-                }
-            }
-        }
-
-        if (*step < params->lbfgs.min_step) {
-            return GGML_LINESEARCH_MINIMUM_STEP;
-        }
-        if (*step > params->lbfgs.max_step) {
-            return GGML_LINESEARCH_MAXIMUM_STEP;
-        }
-        if (params->lbfgs.max_linesearch <= count) {
-            return GGML_LINESEARCH_MAXIMUM_ITERATIONS;
-        }
-
-        (*step) *= width;
-    }
-
-    GGML_ABORT("line search failed");
-
-    //return GGML_LINESEARCH_FAIL;
-}
-
-static enum ggml_opt_result ggml_opt_lbfgs(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
-        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
-        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
-            return GGML_OPT_RESULT_INVALID_WOLFE;
-        }
-    }
-
-    const int m = params.lbfgs.m;
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) {
-        int iter = opt->iter;
-        ggml_opt_init(ctx, opt, params, nx);
-        opt->iter = iter;
-    }
-
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    float * x  = opt->lbfgs.x->data;  // current parameters
-    float * xp = opt->lbfgs.xp->data; // previous parameters
-    float * g  = opt->lbfgs.g->data;  // current gradient
-    float * gp = opt->lbfgs.gp->data; // previous gradient
-    float * d  = opt->lbfgs.d->data;  // search direction
-
-    float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
-
-    const int n_accum = MAX(1, params.n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    float fx    = 0.0f; // cost function value
-    float xnorm = 0.0f; // ||x||
-    float gnorm = 0.0f; // ||g||
-
-    // initialize x from the graph nodes
-    ggml_opt_get_params(np, ps, x);
-
-    // the L-BFGS memory
-    float * lm_alpha = opt->lbfgs.lmal->data;
-    float * lm_ys    = opt->lbfgs.lmys->data;
-    float * lm_s     = opt->lbfgs.lms->data;
-    float * lm_y     = opt->lbfgs.lmy->data;
-
-    bool cancel = false;
-
-    // evaluate the function value and its gradient
-    {
-        ggml_opt_set_params(np, ps, x);
-
-        fx = 0;
-        memset(g, 0, sizeof(float)*nx);
-        for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-            if (callback) {
-                // LBFG-S does not support learning rate -> ignore learning schedule
-                float sched = 0;
-                callback(callback_data, accum_step, &sched, &cancel);
-                if (cancel) {
-                    return GGML_OPT_RESULT_CANCEL;
-                }
-            }
-            // ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
-            ggml_opt_acc_grad(np, ps, g, accum_norm);
-            fx += ggml_get_f32_1d(f, 0);
-        }
-        fx *= accum_norm;
-
-        opt->loss_before = fx;
-        opt->loss_after  = fx;
-    }
-
-    // search direction = -gradient
-    ggml_vec_neg_f32(nx, d, g);
-
-    // ||x||, ||g||
-    ggml_vec_norm_f32(nx, &xnorm, x);
-    ggml_vec_norm_f32(nx, &gnorm, g);
-
-    if (xnorm < 1.0f) {
-        xnorm = 1.0f;
-    }
-
-    // already optimized
-    if (gnorm/xnorm <= params.lbfgs.eps) {
-        return GGML_OPT_RESULT_OK;
-    }
-
-    if (opt->just_initialized) {
-        if (pf) {
-            pf[0] = fx;
-        }
-        opt->lbfgs.fx_best = fx;
-
-        // initial step
-        ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d);
-        opt->lbfgs.j                = 0;
-        opt->lbfgs.k                = 1;
-        opt->lbfgs.end              = 0;
-        opt->lbfgs.n_no_improvement = 0;
-        opt->just_initialized       = false;
-    }
-
-    float * fx_best        = &opt->lbfgs.fx_best;
-    float * step           = &opt->lbfgs.step;
-    int * j                = &opt->lbfgs.j;
-    int * k                = &opt->lbfgs.k;
-    int * end              = &opt->lbfgs.end;
-    int * n_no_improvement = &opt->lbfgs.n_no_improvement;
-
-    int ls     = 0;
-    int bound  = 0;
-
-    float ys   = 0.0f;
-    float yy   = 0.0f;
-    float beta = 0.0f;
-
-    int it = 0;
-
-    while (true) {
-        // store the current position and gradient vectors
-        ggml_vec_cpy_f32(nx, xp, x);
-        ggml_vec_cpy_f32(nx, gp, g);
-
-        // TODO: instead of passing &cancel here, use the return code of the linesearch
-        //       to determine if the optimization should be cancelled
-        //       this is a simple change, but not doing this atm, since I don't have a nice
-        //       way to test and don't want to break something with so many changes lined up
-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
-        if (cancel) {
-            return GGML_OPT_RESULT_CANCEL;
-        }
-
-        if (ls < 0) {
-            // linesearch failed - go back to the previous point and return
-            ggml_vec_cpy_f32(nx, x, xp);
-            ggml_vec_cpy_f32(nx, g, gp);
-
-            return ls;
-        }
-
-        opt->loss_after = fx;
-
-        ggml_vec_norm_f32(nx, &xnorm, x);
-        ggml_vec_norm_f32(nx, &gnorm, g);
-
-        GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0));
-
-        if (xnorm < 1.0f) {
-            xnorm = 1.0f;
-        }
-        if (gnorm/xnorm <= params.lbfgs.eps) {
-            // converged
-            return GGML_OPT_RESULT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= k[0]) {
-                const float rate = (pf[k[0]%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-
-            pf[k[0]%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx < fx_best[0]) {
-                fx_best[0] = fx;
-                n_no_improvement[0] = 0;
-            } else {
-                n_no_improvement[0]++;
-
-                if (n_no_improvement[0] >= params.max_no_improvement) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-        }
-
-        if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
-            // reached the maximum number of iterations
-            return GGML_OPT_RESULT_DID_NOT_CONVERGE;
-        }
-
-        // update vectors s and y:
-        //   s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
-        //   y_{k+1} = g_{k+1} - g_{k}.
-        //
-        ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp);
-        ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp);
-
-        // compute scalars ys and yy:
-        //     ys = y^t \cdot s    -> 1 / \rho.
-        //     yy = y^t \cdot y.
-        //
-        ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
-        ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
-
-        lm_ys[end[0]] = ys;
-
-        // find new search direction
-        //   ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS
-
-        bound = (m <= k[0]) ? m : k[0];
-        k[0]++;
-        it++;
-        end[0] = (end[0] + 1)%m;
-
-        // initialize search direction with -g
-        ggml_vec_neg_f32(nx, d, g);
-
-        j[0] = end[0];
-        for (int i = 0; i < bound; ++i) {
-            j[0] = (j[0] + m - 1) % m;
-            // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
-            lm_alpha[j[0]] /= lm_ys[j[0]];
-            // q_{i} = q_{i+1} - \alpha_{i} y_{i}
-            ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
-        }
-
-        ggml_vec_scale_f32(nx, d, ys/yy);
-
-        for (int i = 0; i < bound; ++i) {
-            // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
-            beta /= lm_ys[j[0]];
-            // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
-            ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
-            j[0] = (j[0] + 1)%m;
-        }
-
-        step[0] = 1.0;
-    }
-
-    GGML_ABORT("lbfgs failed");
-
-    //return GGML_OPT_RESULT_DID_NOT_CONVERGE;
-}
-
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
-    struct ggml_opt_params result;
-
-    switch (type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                result = (struct ggml_opt_params) {
-                    .type       = GGML_OPT_TYPE_ADAM,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads  = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
-                    .past       = 0,
-                    .delta      = 1e-5f,
-
-                    .max_no_improvement = 100,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .adam = {
-                        .n_iter = 10000,
-                        .sched  = 1.000f,
-                        .decay  = 0.0f,
-                        .decay_min_ndim = 2,
-                        .alpha  = 0.001f,
-                        .beta1  = 0.9f,
-                        .beta2  = 0.999f,
-                        .eps    = 1e-8f,
-                        .eps_f  = 1e-5f,
-                        .eps_g  = 1e-3f,
-                        .gclip  = 0.0f,
-                    },
-                };
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                result = (struct ggml_opt_params) {
-                    .type       = GGML_OPT_TYPE_LBFGS,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads  = 1,
-                    .past       = 0,
-                    .delta      = 1e-5f,
-
-                    .max_no_improvement = 0,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .lbfgs = {
-                        .m              = 6,
-                        .n_iter         = 100,
-                        .max_linesearch = 20,
-
-                        .eps      = 1e-5f,
-                        .ftol     = 1e-4f,
-                        .wolfe    = 0.9f,
-                        .min_step = 1e-20f,
-                        .max_step = 1e+20f,
-
-                        .linesearch = GGML_LINESEARCH_DEFAULT,
-                    },
-                };
-            } break;
-    }
-
-    return result;
-}
-
-GGML_API void ggml_opt_init(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        int64_t nx) {
-    opt->ctx = ctx;
-    opt->params = params;
-    opt->iter = 0;
-    opt->nx = nx;
-    opt->just_initialized = true;
-    if (opt->ctx == NULL) {
-        struct ggml_init_params ctx_opt_params;
-        if (opt->params.type == GGML_OPT_TYPE_ADAM) {
-            ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
-            if (opt->params.past > 0) {
-                ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
-            }
-        } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
-            ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
-            if (opt->params.past > 0) {
-                ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
-            }
-        }
-        ctx_opt_params.mem_buffer = NULL;
-        ctx_opt_params.no_alloc   = false;
-
-        opt->ctx = ggml_init(ctx_opt_params);
-    }
-    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                opt->adam.g  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.m  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.v  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.pf = params.past > 0
-                    ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
-                    : NULL;
-                ggml_set_zero(opt->adam.m);
-                ggml_set_zero(opt->adam.v);
-                if (opt->adam.pf) {
-                    ggml_set_zero(opt->adam.pf);
-                }
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                opt->lbfgs.x  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.g  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.d  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.pf = params.past > 0
-                    ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
-                    : NULL;
-                opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
-                opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
-                opt->lbfgs.lms  = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
-                opt->lbfgs.lmy  = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
-                ggml_set_zero(opt->lbfgs.x);
-                ggml_set_zero(opt->lbfgs.xp);
-                ggml_set_zero(opt->lbfgs.g);
-                ggml_set_zero(opt->lbfgs.gp);
-                ggml_set_zero(opt->lbfgs.d);
-                if (opt->lbfgs.pf) {
-                    ggml_set_zero(opt->lbfgs.pf);
-                }
-                ggml_set_zero(opt->lbfgs.lmal);
-                ggml_set_zero(opt->lbfgs.lmys);
-                ggml_set_zero(opt->lbfgs.lms);
-                ggml_set_zero(opt->lbfgs.lmy);
-            } break;
-    }
-}
-
-enum ggml_opt_result ggml_opt(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f) {
-    bool free_ctx = false;
-    if (ctx == NULL) {
-        struct ggml_init_params params_ctx = {
-            .mem_size   = 16*1024*1024,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
-
-        ctx = ggml_init(params_ctx);
-        if (ctx == NULL) {
-            return GGML_OPT_RESULT_NO_CONTEXT;
-        }
-
-        free_ctx = true;
-    }
-
-    enum ggml_opt_result result = GGML_OPT_RESULT_OK;
-
-    struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
-
-    ggml_opt_init(ctx, opt, params, 0);
-    result = ggml_opt_resume(ctx, opt, f);
-
-    if (free_ctx) {
-        ggml_free(ctx);
-    }
-
-    return result;
-}
-
-enum ggml_opt_result ggml_opt_resume(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_tensor * f) {
-
-    // build forward + backward compute graphs
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
-    ggml_build_forward_expand(gf, f);
-
-    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
-    ggml_build_backward_expand(ctx, gf, gb, false);
-
-    return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
-}
-
-enum ggml_opt_result ggml_opt_resume_g(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-
-    GGML_ASSERT(f->grad && "ggml_set_param must be called for at least one ancestor");
-
-    // build forward + backward compute graphs
-    enum ggml_opt_result result = GGML_OPT_RESULT_OK;
-
-    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
-            } break;
-    }
-
-    if (opt->params.print_forward_graph) {
-        ggml_graph_print   (gf);
-        ggml_graph_dump_dot(gf, NULL, "opt-forward.dot");
-    }
-
-    if (opt->params.print_backward_graph) {
-        ggml_graph_print   (gb);
-        ggml_graph_dump_dot(gb, gf, "opt-backward.dot");
-    }
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 void ggml_set_input(struct ggml_tensor * tensor) {
     tensor->flags |= GGML_TENSOR_FLAG_INPUT;
 }
@@ -23247,22 +8225,6 @@ int ggml_cpu_has_fma(void) {
 #endif
 }
 
-int ggml_cpu_has_neon(void) {
-#if defined(__ARM_ARCH)
-    return ggml_arm_arch_features.has_neon;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_sve(void) {
-#if defined(__ARM_ARCH)
-    return ggml_arm_arch_features.has_sve;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_arm_fma(void) {
 #if defined(__ARM_FEATURE_FMA)
     return 1;
@@ -23403,22 +8365,6 @@ int ggml_cpu_has_vsx(void) {
 #endif
 }
 
-int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_ARCH)
-    return ggml_arm_arch_features.has_i8mm;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_get_sve_cnt(void) {
-#if defined(__ARM_ARCH)
-    return ggml_arm_arch_features.sve_cnt;
-#else
-    return 0;
-#endif
-}
-
 void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
diff --git a/include/llama.h b/include/llama.h
index 24005548d..ccb48f73c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-backend.h"
 
 #include <stddef.h>
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
index 131d7c177..3df6e1f42 100644
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -11,6 +11,7 @@
 #include <type_traits>
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 
 constexpr int kVecSize = 1 << 16;
 
@@ -136,7 +137,7 @@ int main(int argc, char** argv) {
 
     auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
 
-    const auto * funcs = ggml_get_type_traits(ggml_type);
+    const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
 
     Stat simple, ggml;
 
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index 88e66ea13..e9af8a363 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -9,6 +9,7 @@
 #include <array>
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -236,7 +237,8 @@ int main(int argc, char** argv) {
     int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
     int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
 
-    const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
+    const auto * funcs = ggml_get_type_traits(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
+    const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
 
     std::vector<block_q4_0> q40;
     std::vector<block_q4_1> q41;
@@ -282,10 +284,10 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
+            const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type);
             vdot->from_float(y1.data(), q8.data(), kVecSize);
-            if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-            else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+            if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+            else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
         }
         sumq += result;
         t2 = std::chrono::high_resolution_clock::now();
diff --git a/spm-headers/ggml-cpu.h b/spm-headers/ggml-cpu.h
new file mode 120000
index 000000000..66e629607
--- /dev/null
+++ b/spm-headers/ggml-cpu.h
@@ -0,0 +1 @@
+../ggml/include/ggml-cpu.h
\ No newline at end of file
diff --git a/src/llama.cpp b/src/llama.cpp
index 3f534596e..3e563d811 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21900,6 +21900,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }
 
 const char * llama_print_system_info(void) {
+    ggml_cpu_init(); // some ARM features are detected at runtime
+
     static std::string s;
 
     s  = "";
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 2e3ad79f0..46346cbd0 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -16,6 +16,7 @@
 
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
 
diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp
index cf54237db..d85bf912b 100644
--- a/tests/test-barrier.cpp
+++ b/tests/test-barrier.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-backend.h"
 
 #include <chrono>
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 2200ad93d..c712dba7f 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -1,5 +1,6 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #include <cfloat>
 #include <cmath>
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index d50417ba0..000e60adf 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -1,6 +1,7 @@
 // Unit tests for quantization specific functions - quantize, dequantize and dot product
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #undef NDEBUG
 #include <assert.h>
@@ -78,18 +79,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 
 // Total dot product error
 static float dot_product_error(
-    const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
+    const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
 ) {
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+    const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
 
     qfns->from_float(test_data1, tmp_q1.data(), test_size);
     vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+    qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -132,6 +133,7 @@ int main(int argc, char * argv[]) {
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
         const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
 
         // deprecated - skip
         if (qfns->blck_size == 0) {
@@ -166,7 +168,7 @@ int main(int argc, char * argv[]) {
                 printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
             }
 
-            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+            const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
             const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
                                             type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
                                           ? MAX_DOT_PRODUCT_ERROR_LOWBIT
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index bdbdd90a8..221424de8 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -1,6 +1,7 @@
 // Benchmark quantization specific functions on synthetic data
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #undef NDEBUG
 #include <algorithm>
@@ -271,6 +272,7 @@ int main(int argc, char * argv[]) {
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
         const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
@@ -328,7 +330,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+                        const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
                         vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
@@ -346,7 +348,7 @@ int main(int argc, char * argv[]) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
                         float result;
-                        qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                        qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                         return result;
                     };
                     size_t quantized_size = ggml_row_size(type, size);
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 246bb227d..4656b30f0 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #include <cmath>
 #include <cstdio>

From e2292aaa17cf8530b0d0d899909588c3a095799d Mon Sep 17 00:00:00 2001
From: Plamen Minev <pacominev@gmail.com>
Date: Fri, 1 Nov 2024 16:55:10 +0200
Subject: [PATCH 152/396] metal : fix minor string leaks (ggml/1004)

---
 ggml/src/ggml-metal.m | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index a2b4d49d5..f9bd6faa4 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -450,7 +450,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
                     GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                     return NULL;
                 }
+
+#if !__has_feature(objc_arc)
+                [options release];
+#endif
             }
+#if GGML_METAL_EMBED_LIBRARY
+            [src release];
+#endif // GGML_METAL_EMBED_LIBRARY
         }
     }
 

From 284e5b0275cc1292096e72e808e41d17e8cdf019 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Sat, 2 Nov 2024 05:09:12 -0400
Subject: [PATCH 153/396] cmake : make it possible linking ggml as external lib
 (ggml/1003)

---
 ggml/src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 82b81cf12..34b81bd7f 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1396,7 +1396,7 @@ if (EMSCRIPTEN)
 endif()
 
 target_compile_definitions(ggml PUBLIC    ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC  ../include)
+target_include_directories(ggml PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
 target_link_directories   (ggml PRIVATE   ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump

From ce027adfb3b131f0d2368294fc276bb0e342b3f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 4 Nov 2024 10:33:37 +0200
Subject: [PATCH 154/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 48863847c..020c60f34 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-bb78a40dc60e04c626bac2b65840b509988e990d
+a099cb514d6687e436a5a423d1fb0448be0feb20

From 329ed914c959c510d076fb06b43eeb3f7b804d6f Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Mon, 4 Nov 2024 19:08:22 +0800
Subject: [PATCH 155/396] CANN: adjust backend registry refactor. (#10158)

remove buffer->iface.get_name that used in cann as it was removed in backend registry refactor PR.
---
 ggml/src/ggml-cann.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index f8ac11e41..776340881 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
 
     return buffer;

From f8e58135cff1c373df2934306f9c9da99673c2ed Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 4 Nov 2024 13:43:32 +0200
Subject: [PATCH 156/396] metal : move dequantize templates to beginning of MSL
 source (#0)

---
 ggml/src/ggml-metal.metal | 874 +++++++++++++++++++-------------------
 1 file changed, 436 insertions(+), 438 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 57eb34f13..3eb976633 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -12,6 +12,442 @@ using namespace metal;
 
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 
+constexpr constant static float kvalues_iq4nl_f[16] = {
+    -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
+};
+
+// NOTE: this is not dequantizing - we are simply fitting the template
+template <typename type4x4>
+void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+    float4x4 temp = *(((device float4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
+template <typename type4x4>
+void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
+    half4x4 temp = *(((device half4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
+template <typename type4x4>
+void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.h * xb->d;
+    const ushort mask0 = il ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i=0;i<8;i++) {
+        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
+        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb->m;
+    const ushort mask0 = il ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i=0;i<8;i++) {
+        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
+        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
+    const float d = xb->d;
+    const float md = -16.h * xb->d;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[i/2][2*(i%2)+0] = d * x0 + md;
+        reg[i/2][2*(i%2)+1] = d * x1 + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
+    const float d = xb->d;
+    const float m = xb->m;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[i/2][2*(i%2)+0] = d * x0 + m;
+        reg[i/2][2*(i%2)+1] = d * x1 + m;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
+    device const int8_t * qs = ((device const int8_t *)xb->qs);
+    const half d = xb->d;
+
+    for (int i = 0; i < 16; i++) {
+        reg[i/4][i%4] = (qs[i + 16*il] * d);
+    }
+}
+
+template <typename type4x4>
+void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
+    const float d = xb->d;
+    const float min = xb->dmin;
+    device const uint8_t * q = (device const uint8_t *)xb->qs;
+    float dl, ml;
+    uint8_t sc = xb->scales[il];
+
+    q = q + 32*(il/8) + 16*(il&1);
+    il = (il/2)%4;
+
+    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
+    const half d_all = xb->d;
+    device const uint8_t * q = (device const uint8_t *)xb->qs;
+    device const uint8_t * h = (device const uint8_t *)xb->hmask;
+    device const int8_t * scales = (device const int8_t *)xb->scales;
+
+    q = q + 32 * (il/8) + 16 * (il&1);
+    h = h + 16 * (il&1);
+    uint8_t m = 1 << (il/2);
+    uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
+                                 ((il/4)>0 ? 12  : 3);
+    uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
+    uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
+    int16_t  dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
+                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
+    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    const float ml = 4.f * dl;
+
+    il = (il/2) & 3;
+    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl *= coef;
+
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
+    }
+}
+
+static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
+    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
+                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
+}
+
+template <typename type4x4>
+void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
+    device const uchar * q = xb->qs;
+
+    short is = (il/4) * 2;
+    q = q + (il/4) * 32 + 16 * (il&1);
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const float d   = il < 2 ? xb->d : xb->d / 16.h;
+    const float min = xb->dmin;
+    const float dl = d * sc[0];
+    const float ml = min * sc[1];
+
+    const ushort mask = il<2 ? 0x0F : 0xF0;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
+    device const uint8_t * q  = xb->qs;
+    device const uint8_t * qh = xb->qh;
+
+    short is = (il/4) * 2;
+    q  = q + 32 * (il/4) + 16 * (il&1);
+    qh = qh + 16 * (il&1);
+    uint8_t ul = 1 << (il/2);
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const float d = il < 2 ? xb->d : xb->d / 16.f;
+    const float min = xb->dmin;
+    const float dl = d * sc[0];
+    const float ml = min * sc[1];
+
+    const ushort mask  = il<2 ? 0x0F : 0xF0;
+    const float qh_val = il<2 ? 16.f : 256.f;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
+    const half d_all = xb->d;
+    device const uint8_t * ql = (device const uint8_t *)xb->ql;
+    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const int8_t * scales = (device const int8_t *)xb->scales;
+
+    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
+    qh = qh + 32*(il/8) + 16*(il&1);
+    float sc = scales[(il%2) + 2 * ((il/2))];
+    il = (il/2) & 3;
+
+    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
+    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
+    const float       coef = il>1 ? 1.f/16.f          : 1.f;
+    const float ml = d_all * sc * 32.f;
+    const float dl = d_all * sc * coef;
+    for (int i = 0; i < 16; ++i) {
+        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
+                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
+        reg[i/4][i%4] = dl * q - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
+    device const uint16_t * q2 = xb->qs + 4*ib32;
+    const uint32_t aux32_g = q2[0] | (q2[1] << 16);
+    const uint32_t aux32_s = q2[2] | (q2[3] << 16);
+    thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
+    const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
+    constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
+    uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+    grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
+    signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
+    for (int i = 0; i < 8; ++i) {
+        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint16_t * q2 = xb->qs + 4*ib32;
+    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
+    constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
+    uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+    grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
+    signs = ksigns_iq2xs[q2[2*il+1] >> 9];
+    for (int i = 0; i < 8; ++i) {
+        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * q3 = xb->qs + 8*ib32;
+    device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
+    uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+    grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
+    grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
+    signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * qs = xb->qs + 8*ib32;
+    device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
+    const uint8_t qh = xb->qh[ib32] >> 4*il;
+    const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
+        reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
+    }
+    grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
+    grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
+        reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint8_t * signs = qs + QK_K/8;
+    const uint8_t qh = xb->qh[ib32] >> 4*il;
+    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
+        reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    const float d = xb->d;
+    device const uint8_t  * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint16_t * qh = xb->qh;
+    const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
+    const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
+    const uint16_t h = qh[ib32] >> 6*il;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) + ml;
+        reg[1][i] = dl * (grid1[i] >>  4) + ml;
+        reg[2][i] = dl * (grid2[i] & 0xf) + ml;
+        reg[3][i] = dl * (grid2[i] >>  4) + ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    device const uint16_t * sc = (device const uint16_t *)xb->scales;
+
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const float d = scale.f16;
+
+    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint8_t * qh = xb->qh + 2*ib32 + il;
+
+    const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
+    const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+    const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
+        reg[1][i] = dl * (grid1[i] >>  4) + ml1;
+        reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
+        reg[3][i] = dl * (grid2[i] >>  4) + ml2;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
+    const float d = xb->d;
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    for (int i = 0; i < 4; ++i) {
+        aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
+        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
+        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
+        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
+        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
+    const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)xb->d * (ls - 32);
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    for (int i = 0; i < 4; ++i) {
+        aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
+        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
+        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
+        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
+        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
+    }
+}
+
 enum ggml_sort_order {
     GGML_SORT_ORDER_ASC,
     GGML_SORT_ORDER_DESC,
@@ -3339,10 +3775,6 @@ static inline int best_index_int8(int n, constant float * val, float x) {
     return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
 }
 
-constexpr constant static float kvalues_iq4nl_f[16] = {
-    -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
-};
-
 kernel void kernel_cpy_f32_iq4_nl(
         device const float * src0,
         device        void * dst,
@@ -5457,440 +5889,6 @@ kernel void kernel_mul_mv_iq4_xs_f32(
     kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 
-//============================= templates and their specializations =============================
-
-// NOTE: this is not dequantizing - we are simply fitting the template
-template <typename type4x4>
-void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
-    float4x4 temp = *(((device float4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
-}
-
-template <typename type4x4>
-void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
-    half4x4 temp = *(((device half4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
-}
-
-template <typename type4x4>
-void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
-    const float d1 = il ? (xb->d / 16.h) : xb->d;
-    const float d2 = d1 / 256.f;
-    const float md = -8.h * xb->d;
-    const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = mask0 << 8;
-
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
-        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
-    const float d1 = il ? (xb->d / 16.h) : xb->d;
-    const float d2 = d1 / 256.f;
-    const float  m = xb->m;
-    const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = mask0 << 8;
-
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
-        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
-    const float d = xb->d;
-    const float md = -16.h * xb->d;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + md;
-        reg[i/2][2*(i%2)+1] = d * x1 + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
-    const float d = xb->d;
-    const float m = xb->m;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + m;
-        reg[i/2][2*(i%2)+1] = d * x1 + m;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
-    device const int8_t * qs = ((device const int8_t *)xb->qs);
-    const half d = xb->d;
-
-    for (int i = 0; i < 16; i++) {
-        reg[i/4][i%4] = (qs[i + 16*il] * d);
-    }
-}
-
-template <typename type4x4>
-void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
-    const float d = xb->d;
-    const float min = xb->dmin;
-    device const uint8_t * q = (device const uint8_t *)xb->qs;
-    float dl, ml;
-    uint8_t sc = xb->scales[il];
-
-    q = q + 32*(il/8) + 16*(il&1);
-    il = (il/2)%4;
-
-    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
-    const half d_all = xb->d;
-    device const uint8_t * q = (device const uint8_t *)xb->qs;
-    device const uint8_t * h = (device const uint8_t *)xb->hmask;
-    device const int8_t * scales = (device const int8_t *)xb->scales;
-
-    q = q + 32 * (il/8) + 16 * (il&1);
-    h = h + 16 * (il&1);
-    uint8_t m = 1 << (il/2);
-    uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
-                                 ((il/4)>0 ? 12  : 3);
-    uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
-    uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
-    int16_t  dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
-                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
-    const float ml = 4.f * dl;
-
-    il = (il/2) & 3;
-    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    dl *= coef;
-
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
-    }
-}
-
-static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
-    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
-                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
-}
-
-template <typename type4x4>
-void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
-    device const uchar * q = xb->qs;
-
-    short is = (il/4) * 2;
-    q = q + (il/4) * 32 + 16 * (il&1);
-    il = il & 3;
-    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d   = il < 2 ? xb->d : xb->d / 16.h;
-    const float min = xb->dmin;
-    const float dl = d * sc[0];
-    const float ml = min * sc[1];
-
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
-    device const uint8_t * q  = xb->qs;
-    device const uint8_t * qh = xb->qh;
-
-    short is = (il/4) * 2;
-    q  = q + 32 * (il/4) + 16 * (il&1);
-    qh = qh + 16 * (il&1);
-    uint8_t ul = 1 << (il/2);
-    il = il & 3;
-    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d = il < 2 ? xb->d : xb->d / 16.f;
-    const float min = xb->dmin;
-    const float dl = d * sc[0];
-    const float ml = min * sc[1];
-
-    const ushort mask  = il<2 ? 0x0F : 0xF0;
-    const float qh_val = il<2 ? 16.f : 256.f;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
-    const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
-    device const int8_t * scales = (device const int8_t *)xb->scales;
-
-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
-    float sc = scales[(il%2) + 2 * ((il/2))];
-    il = (il/2) & 3;
-
-    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
-    const float       coef = il>1 ? 1.f/16.f          : 1.f;
-    const float ml = d_all * sc * 32.f;
-    const float dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
-    device const uint16_t * q2 = xb->qs + 4*ib32;
-    const uint32_t aux32_g = q2[0] | (q2[1] << 16);
-    const uint32_t aux32_s = q2[2] | (q2[3] << 16);
-    thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
-    const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
-    constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-    grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
-    for (int i = 0; i < 8; ++i) {
-        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint16_t * q2 = xb->qs + 4*ib32;
-    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
-    constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
-    uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-    grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
-    signs = ksigns_iq2xs[q2[2*il+1] >> 9];
-    for (int i = 0; i < 8; ++i) {
-        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint8_t * q3 = xb->qs + 8*ib32;
-    device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
-    const uint32_t aux32 = gas[0] | (gas[1] << 16);
-    const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
-    constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
-    constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
-    uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    for (int i = 0; i < 4; ++i) {
-        reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
-        reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
-    }
-    grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
-    grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
-    signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
-    for (int i = 0; i < 4; ++i) {
-        reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
-        reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint8_t * qs = xb->qs + 8*ib32;
-    device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
-    const uint8_t qh = xb->qh[ib32] >> 4*il;
-    const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
-    constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
-    constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
-    for (int i = 0; i < 4; ++i) {
-        reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
-        reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
-    }
-    grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
-    grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
-    for (int i = 0; i < 4; ++i) {
-        reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
-        reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
-    device const uint8_t * signs = qs + QK_K/8;
-    const uint8_t qh = xb->qh[ib32] >> 4*il;
-    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
-    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
-    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
-        reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const int ib32 = il/2;
-    il = il%2;
-    const float d = xb->d;
-    device const uint8_t  * qs = xb->qs + 4*ib32 + 2*il;
-    device const uint16_t * qh = xb->qh;
-    const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
-    const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
-    const uint16_t h = qh[ib32] >> 6*il;
-    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
-    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
-    for (int i = 0; i < 4; ++i) {
-        reg[0][i] = dl * (grid1[i] & 0xf) + ml;
-        reg[1][i] = dl * (grid1[i] >>  4) + ml;
-        reg[2][i] = dl * (grid2[i] & 0xf) + ml;
-        reg[3][i] = dl * (grid2[i] >>  4) + ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const int ib32 = il/2;
-    il = il%2;
-    device const uint16_t * sc = (device const uint16_t *)xb->scales;
-
-    iq1m_scale_t scale;
-    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-    const float d = scale.f16;
-
-    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
-    device const uint8_t * qh = xb->qh + 2*ib32 + il;
-
-    const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
-    const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
-    const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
-    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
-    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
-    for (int i = 0; i < 4; ++i) {
-        reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
-        reg[1][i] = dl * (grid1[i] >>  4) + ml1;
-        reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
-        reg[3][i] = dl * (grid2[i] >>  4) + ml2;
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
-    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
-    const float d = xb->d;
-    uint32_t aux32;
-    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
-    for (int i = 0; i < 4; ++i) {
-        aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
-        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
-        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
-        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
-        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
-    const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
-    const float d = (float)xb->d * (ls - 32);
-    uint32_t aux32;
-    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
-    for (int i = 0; i < 4; ++i) {
-        aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
-        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
-        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
-        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
-        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
-    }
-}
-
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
 kernel void kernel_get_rows_q(
         device const  void * src0,

From 05697f670b1ea28b80c39854832ea53527f75c55 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 4 Nov 2024 13:49:34 +0200
Subject: [PATCH 157/396] metal : simplify f16 and f32 dequant kernels (#0)

---
 ggml/src/ggml-metal.metal | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 3eb976633..ff9d37490 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -19,18 +19,12 @@ constexpr constant static float kvalues_iq4nl_f[16] = {
 // NOTE: this is not dequantizing - we are simply fitting the template
 template <typename type4x4>
 void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
-    float4x4 temp = *(((device float4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
+    reg = (type4x4)(*src);
 }
 
 template <typename type4x4>
 void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
-    half4x4 temp = *(((device half4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
+    reg = (type4x4)(*src);
 }
 
 template <typename type4x4>

From ea02c753ebf9342114cb173f10b3ffc2af1e7d04 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 4 Nov 2024 13:10:23 +0100
Subject: [PATCH 158/396] cuda : clear error after changing peer access
 (#10153)

---
 ggml/src/ggml-cuda.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index b57f1b3b7..e68e40550 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                     cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
                     if (err != cudaErrorPeerAccessAlreadyEnabled) {
                         CUDA_CHECK(err);
+                    } else {
+                        // reset the error
+                        cudaGetLastError();
                     }
                 } else {
                     cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                     if (err != cudaErrorPeerAccessNotEnabled) {
                         CUDA_CHECK(err);
+                    } else {
+                        // reset the error
+                        cudaGetLastError();
                     }
                 }
             }

From 6a066b9978533e2ab9890b7f4f8c0262d91798b3 Mon Sep 17 00:00:00 2001
From: snadampal <87143774+snadampal@users.noreply.github.com>
Date: Mon, 4 Nov 2024 09:08:33 -0600
Subject: [PATCH 159/396] fix build break on arm64 linux (#10166)

This fixes the build break from the recent changes
to move the CPU backend to separate files
https://github.com/ggerganov/llama.cpp/pull/10144
---
 ggml/src/ggml-quants.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7aa6dce89..f792406e1 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4,7 +4,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-
+#include "ggml-cpu.h"
 
 #include <math.h>
 #include <string.h>

From 9e0ecfb697d297355e43c20559d29bcc71beb0c3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Mon, 4 Nov 2024 16:33:29 +0100
Subject: [PATCH 160/396] server : clarify /slots endpoint, add is_processing
 (#10162)

* server : clarify /slots endpoint, add is_processing

* fix tests
---
 examples/server/README.md                     | 11 +++++------
 examples/server/server.cpp                    | 16 ++++++++--------
 examples/server/tests/features/steps/steps.py | 10 +++++-----
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 1629e456b..15f95db1e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
 
 ### GET `/slots`: Returns the current slots processing state
 
-This endpoint can be disabled with `--no-slots`
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+
+This endpoint is disabled by default and can be enabled with `--slots`
 
 If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
 
@@ -709,6 +712,7 @@ Example:
         "grammar": "",
         "id": 0,
         "ignore_eos": false,
+        "is_processing": false,
         "logit_bias": [],
         "min_p": 0.05000000074505806,
         "mirostat": 0,
@@ -741,7 +745,6 @@ Example:
             "temperature"
         ],
         "seed": 42,
-        "state": 1,
         "stop": [
             "\n"
         ],
@@ -755,10 +758,6 @@ Example:
 ]
 ```
 
-Possible values for `slot[i].state` are:
-- `0`: SLOT_STATE_IDLE
-- `1`: SLOT_STATE_PROCESSING
-
 ### GET `/metrics`: Prometheus compatible metrics exporter
 
 This endpoint is only accessible if `--metrics` is set.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8531a784d..f0b89b22c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1566,11 +1566,11 @@ struct server_context {
 
                     for (server_slot & slot : slots) {
                         json slot_data = get_formated_generation(slot);
-                        slot_data["id"]         = slot.id;
-                        slot_data["id_task"]    = slot.id_task;
-                        slot_data["state"]      = slot.state;
-                        slot_data["prompt"]     = common_detokenize(ctx, slot.prompt_tokens);
-                        slot_data["next_token"] = {
+                        slot_data["id"]            = slot.id;
+                        slot_data["id_task"]       = slot.id_task;
+                        slot_data["is_processing"] = slot.is_processing();
+                        slot_data["prompt"]        = common_detokenize(ctx, slot.prompt_tokens);
+                        slot_data["next_token"]    = {
                             {"has_next_token", slot.has_next_token},
                             {"has_new_line",   slot.has_new_line},
                             {"n_remain",       slot.n_remaining},
@@ -1581,10 +1581,10 @@ struct server_context {
                             {"stopping_word",  slot.stopping_word},
                         };
 
-                        if (slot_data["state"] == SLOT_STATE_IDLE) {
-                            n_idle_slots++;
-                        } else {
+                        if (slot.is_processing()) {
                             n_processing_slots++;
+                        } else {
+                            n_idle_slots++;
                         }
 
                         slots_data.push_back(slot_data);
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 2e418d8aa..687b163f4 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
     match expected_slot_status_string:
         case 'idle':
-            expected_slot_status = 0
+            expected_slot_status = False
         case 'busy':
-            expected_slot_status = 1
+            expected_slot_status = True
         case _:
             assert False, "unknown status"
 
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
+    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
                       for slot_id in range(context.n_slots)]
     await request_slots_status(context, expected_slots)
 
@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
                 if status_code == 503 and status_code == expected_http_status_code:
                     return
                 if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
+                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
                     if ((slots_idle is None or slots_idle == n_slots_idle)
                         and (slots_processing is None or slots_processing == n_slots_processing)):
                         return

From 401558b7ba7a08175c153cd3607230f63c8a528e Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 4 Nov 2024 17:34:08 +0100
Subject: [PATCH 161/396] ggml : fix q4xx mat mul, increase ggml_aligned_malloc
 alignment (#10167)

---
 ggml/src/ggml-cpu.c | 5 ++---
 ggml/src/ggml.c     | 9 ++++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c
index 4b8ffb629..09ba49b13 100644
--- a/ggml/src/ggml-cpu.c
+++ b/ggml/src/ggml-cpu.c
@@ -304,6 +304,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
     },
     [GGML_TYPE_Q8_0] = {
+        .from_float_to_mat        = quantize_mat_q8_0,
         .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -13692,9 +13693,7 @@ void ggml_cpu_init(void) {
                     uint16_t u16;
                     ggml_fp16_t fp16;
                 } u = {i};
-                // FIXME: this table is used in conversion functions outside of compute
-                // current code depends on ggml_init initializing this table
-                float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+                float f = GGML_FP16_TO_FP32(u.fp16);
                 ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7dc3340a1..1ccf78d98 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -220,8 +220,10 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 
 
 void * ggml_aligned_malloc(size_t size) {
+    const int alignment = 64;
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
-    return _aligned_malloc(size, TENSOR_ALIGNMENT);
+    return _aligned_malloc(size, alignment);
 #else
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
@@ -229,8 +231,9 @@ void * ggml_aligned_malloc(size_t size) {
     }
     void * aligned_memory = NULL;
   #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+    int result = hbw_posix_memalign(&aligned_memory, alignment, size);
   #elif TARGET_OS_OSX
+    GGML_UNUSED(alignment);
     kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
     int result = EFAULT;
     switch (alloc_status) {
@@ -248,7 +251,7 @@ void * ggml_aligned_malloc(size_t size) {
             break;
     }
   #else
-    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+    int result = posix_memalign(&aligned_memory, alignment, size);
   #endif
     if (result != 0) {
         // Handle allocation failure

From d5a409e57fe8bd24fef597ab8a31110d390a6392 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 4 Nov 2024 20:06:58 +0100
Subject: [PATCH 162/396] ggml : fix gelu tables initialization (#10172)

---
 ggml/src/ggml-cpu.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c
index 09ba49b13..0cb5b824a 100644
--- a/ggml/src/ggml-cpu.c
+++ b/ggml/src/ggml-cpu.c
@@ -13678,6 +13678,13 @@ int ggml_cpu_get_sve_cnt(void) {
 }
 
 void ggml_cpu_init(void) {
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
     ggml_critical_section_start();
 
     static bool is_first_call = true;
@@ -13685,8 +13692,7 @@ void ggml_cpu_init(void) {
     if (is_first_call) {
         // initialize GELU, Quick GELU, SILU and EXP F32 tables
         {
-            // FIXME: this may be called before ggml_init
-            //const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
             for (int i = 0; i < (1 << 16); ++i) {
                 union {
@@ -13698,9 +13704,9 @@ void ggml_cpu_init(void) {
                 ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             }
 
-            //const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
-            //GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
         }
 
 #if defined(__ARM_ARCH)

From 340736477651095a98a3b10e19b038ec62593a1d Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Mon, 4 Nov 2024 22:06:31 +0000
Subject: [PATCH 163/396] Q6_K AVX improvements (#10118)

* q6_k instruction reordering attempt

* better subtract method

* should be theoretically faster

small improvement with shuffle lut, likely because all loads are already done at that stage

* optimize bit fiddling

* handle -32 offset separately. bsums exists for a reason!

* use shift

* Update ggml-quants.c

* have to update ci macos version to 13 as 12 doesnt work now. 13 is still x86
---
 .github/workflows/build.yml |  2 +-
 ggml/src/ggml-quants.c      | 87 ++++++++++++++++---------------------
 2 files changed, 38 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 423173b97..02dcee963 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -92,7 +92,7 @@ jobs:
           name: llama-bin-macos-arm64.zip
 
   macOS-latest-cmake-x64:
-    runs-on: macos-12
+    runs-on: macos-13
 
     steps:
       - name: Clone
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index f792406e1..82a463f27 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9104,10 +9104,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #elif defined __AVX__
 
-    const __m128i m4 = _mm_set1_epi8(0xF);
     const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m32s = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
+    const __m128i m15 = _mm_set1_epi8(15);
 
     __m256 acc = _mm256_setzero_ps();
 
@@ -9119,12 +9117,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const uint8_t * restrict qh = x[i].qh;
         const int8_t  * restrict q8 = y[i].qs;
 
+        // handle the q6_k -32 offset separately using bsums
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
         const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
+        const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
+        const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
+        const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
 
         __m128i sumi_0 = _mm_setzero_si128();
         __m128i sumi_1 = _mm_setzero_si128();
 
-        __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+        int is = 0;
+
         for (int j = 0; j < QK_K/128; ++j) {
 
             const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
@@ -9132,26 +9138,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
             const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
             const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
-            const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
-            const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
-            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
-            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
+            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
+            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
+            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
 
             const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
 
-            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
-            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
-            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
-            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
-            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
-            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
-            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
-            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
+            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
+            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
+            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
+            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
+            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
+            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
+            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
+            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
 
             const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
             const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
@@ -9162,15 +9168,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
             const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
 
-            __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
-            __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
-
             __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
             __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
             __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
@@ -9180,32 +9177,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
             __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
 
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;
 
             p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
             p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
             p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
-            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
+            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
             p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
-            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
+            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
 
             sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
@@ -9214,8 +9199,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         }
 
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
+        sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
+        sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
+        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
     *s = hsum_float_8(acc);

From a9e8a9a0306a8093eef93b0022d9f45510490072 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 4 Nov 2024 23:17:01 +0100
Subject: [PATCH 164/396] ggml : fix arch check in bf16_to_fp32 (#10164)

---
 ggml/src/ggml.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1ccf78d98..e6a7824ba 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -395,6 +395,8 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
                                     16)));
         }
     }
+#endif
+#if defined(__AVX2__)
     if (ggml_cpu_has_avx2()) {
         for (; i + 8 <= n; i += 8) {
             _mm256_storeu_ps(y + i,

From b8deef0ec0af5febac1d2cfd9119ff330ed0b762 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Tue, 5 Nov 2024 05:23:04 -0700
Subject: [PATCH 165/396] llama : add <|tool_call|> formatting to Granite
 template (#10177)

Branch: GraniteToolCallTemplate

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/llama.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 3e563d811..0cdf0c073 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21799,8 +21799,11 @@ static int32_t llama_chat_apply_template_internal(
         // IBM Granite template
         for (const auto & message : chat) {
             std::string role(message->role);
-            ss << "<|start_of_role|>" << role << "<|end_of_role|>"
-               << message->content << "<|end_of_text|>\n";
+            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+            if (role == "assistant_tool_call") {
+                ss << "<|tool_call|>";
+            }
+            ss << message->content << "<|end_of_text|>\n";
         }
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";

From a1eaf6a9600bb1608753420ba886a3b0a208ffc0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 6 Nov 2024 10:24:23 +0200
Subject: [PATCH 166/396] metal : add quantized FA support (#10149)

* metal : add quantized FA (vec) support

ggml-ci

* metal : add quantized FA (non-vec) support

* metal : fix support check

ggml-ci

* metal : clean-up

* metal : clean-up (cont)

* metal : fix shared memory calc + reduce smem + comments

* metal : float-correctness

* metal : minor [no ci]
---
 ggml/src/ggml-metal.m     | 302 +++++++++++++++++++++----
 ggml/src/ggml-metal.metal | 456 +++++++++++++++++++++++++-------------
 2 files changed, 567 insertions(+), 191 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index f9bd6faa4..aee354cdd 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -255,9 +255,49 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
-  //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,     // https://github.com/ggerganov/llama.cpp/issues/7261
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
-  //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
     GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
@@ -710,9 +750,49 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,         support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        support_simdgroup_mm);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,       flash_attn_ext_q4_0_h64,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,       flash_attn_ext_q4_0_h80,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,       flash_attn_ext_q4_0_h96,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112,      flash_attn_ext_q4_0_h112,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128,      flash_attn_ext_q4_0_h128,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,      flash_attn_ext_q4_0_h256,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,       flash_attn_ext_q4_1_h64,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,       flash_attn_ext_q4_1_h80,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,       flash_attn_ext_q4_1_h96,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112,      flash_attn_ext_q4_1_h112,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128,      flash_attn_ext_q4_1_h128,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,      flash_attn_ext_q4_1_h256,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,       flash_attn_ext_q5_0_h64,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,       flash_attn_ext_q5_0_h80,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,       flash_attn_ext_q5_0_h96,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112,      flash_attn_ext_q5_0_h112,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128,      flash_attn_ext_q5_0_h128,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,      flash_attn_ext_q5_0_h256,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,       flash_attn_ext_q5_1_h64,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,       flash_attn_ext_q5_1_h80,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,       flash_attn_ext_q5_1_h96,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112,      flash_attn_ext_q5_1_h112,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128,      flash_attn_ext_q5_1_h128,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,      flash_attn_ext_q5_1_h256,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,       flash_attn_ext_q8_0_h64,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,       flash_attn_ext_q8_0_h80,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,       flash_attn_ext_q8_0_h96,        support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112,      flash_attn_ext_q8_0_h112,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,      flash_attn_ext_q8_0_h128,       support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,      flash_attn_ext_q8_0_h256,       support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,  flash_attn_ext_vec_q4_0_h128,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,  flash_attn_ext_vec_q4_1_h128,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,  flash_attn_ext_vec_q5_0_h128,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,  flash_attn_ext_vec_q5_1_h128,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,  flash_attn_ext_vec_q8_0_h128,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,  flash_attn_ext_vec_q4_0_h256,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,  flash_attn_ext_vec_q4_1_h256,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,  flash_attn_ext_vec_q5_0_h256,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,  flash_attn_ext_vec_q5_1_h256,   support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,  flash_attn_ext_vec_q8_0_h256,   support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,                   cpy_f32_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,                   cpy_f16_f16,                    true);
@@ -869,13 +949,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_LEAKY_RELU:
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
-            if (op->src[1]->type != GGML_TYPE_F16) {
-                return false;
-            }
-            if (op->src[2]->type != GGML_TYPE_F16) {
-                return false;
-            }
-            if (op->src[0]->ne[0] == 256) {
+            if (op->src[1]->type != op->src[2]->type) {
                 return false;
             }
             return support_simdgroup_mm; // TODO: over-restricted for vec-kernels
@@ -2822,6 +2896,7 @@ static void ggml_metal_encode_node(
                 GGML_ASSERT(ne11 % 32 == 0);
 
                 GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                GGML_ASSERT(src1->type == src2->type);
 
                 GGML_ASSERT(ggml_are_same_shape (src1, src2));
 
@@ -2869,26 +2944,154 @@ static void ggml_metal_encode_node(
                 bool use_vec_kernel = false;
 
                 if (ne01 >= 4 || (ne00%128 != 0)) {
-                    switch (ne00) {
-                        case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
-                        case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
-                        case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
-                        case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
-                        case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
-                                  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
+                    switch (src1->type) {
+                        case GGML_TYPE_F16:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
                         default:
-                                  {
-                                      GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
-                                      GGML_LOG_ERROR("add template specialization for this size\n");
-                                      GGML_ABORT("add template specialization for this size");
-                                  }
+                            {
+                                GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                GGML_LOG_ERROR("add template specialization for this type\n");
+                                GGML_ABORT("add template specialization for this type");
+                            }
                     }
                 } else {
                     use_vec_kernel = true;
 
                     switch (ne00) {
-                        case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
-                                  //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
+                        case 128:
+                            {
+                                switch (src1->type) {
+                                    case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
+                                    case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break;
+                                    case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break;
+                                    case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break;
+                                    case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128].pipeline; break;
+                                    case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128].pipeline; break;
+                                    default:
+                                        {
+                                            GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                            GGML_LOG_ERROR("add template specialization for this type\n");
+                                            GGML_ABORT("add template specialization for this type");
+                                        }
+                                }
+                            } break;
+                        case 256:
+                            {
+                                switch (src1->type) {
+                                    case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
+                                    case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break;
+                                    case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break;
+                                    case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break;
+                                    case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256].pipeline; break;
+                                    case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256].pipeline; break;
+                                    default:
+                                        {
+                                            GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                            GGML_LOG_ERROR("add template specialization for this type\n");
+                                            GGML_ABORT("add template specialization for this type");
+                                        }
+                                }
+                            } break;
                         default:
                                   {
                                       GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
@@ -2942,10 +3145,16 @@ static void ggml_metal_encode_node(
                     GGML_ASSERT(nqptg  % 8  == 0);
                     GGML_ASSERT(ncpsg  % 32 == 0);
 
+                    // 16*32*(nsg)
+                    // the shared memory needed for the simdgroups to load the KV cache
+                    // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
+                    //
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+
                     int64_t nsgmax = 2;
 
                     while (true) {
-                        const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2);
+                        const size_t smem = FATTN_SMEM(nsgmax);
                         if (smem > device.maxThreadgroupMemoryLength) {
                             break;
                         }
@@ -2956,16 +3165,15 @@ static void ggml_metal_encode_node(
                     // simdgroups per threadgroup (a.k.a. warps)
                     const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
 
-                    const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2);
+                    const size_t smem = FATTN_SMEM(nsg);
 
-                    //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength);
+                    //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg);
                     GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength);
-
-                    [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
-
+                    [encoder setThreadgroupMemoryLength:smem atIndex:0];
+#undef FATTN_SMEM
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
                 } else {
-                    // half1x4 kernel
+                    // half4x4 kernel
                     const int64_t nqptg = 1;  // queries per threadgroup    !! sync with kernel template arguments !!
                     const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
 
@@ -2973,8 +3181,28 @@ static void ggml_metal_encode_node(
                     GGML_ASSERT(nqptg  % 1  == 0);
                     GGML_ASSERT(ncpsg  % 32 == 0);
 
+                    // ne00 + 2*ncpsg*(nsg)
+                    // for each query, we load it as f16 in shared memory (ne00)
+                    // and store the attention scores (nqptg x ncpsg) as f32
+                    //
+                    // 2*ne00*(nsg)
+                    // each simdgroup has a full f32 head vector in shared mem to accumulate results
+                    //
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16))
+
+                    int64_t nsgmax = 2;
+
+                    while (true) {
+                        const size_t smem = FATTN_SMEM(nsgmax);
+                        if (smem > device.maxThreadgroupMemoryLength) {
+                            break;
+                        }
+                        nsgmax *= 2;
+                    }
+                    nsgmax /= 2;
+
                     // simdgroups per threadgroup (a.k.a. warps)
-                    const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32));
+                    const int64_t nsgt = MAX(2, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
 
                     int64_t nsg = 1;
                     while (nsg <= nsgt) {
@@ -2982,12 +3210,12 @@ static void ggml_metal_encode_node(
                     }
                     nsg /= 2;
 
-                    const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2);
+                    const size_t smem = FATTN_SMEM(nsg);
 
-                    //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength);
+                    //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg);
                     GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength);
-                    [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
-
+                    [encoder setThreadgroupMemoryLength:smem atIndex:0];
+#undef FATTN_SMEM
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
                 }
             } break;
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index ff9d37490..b9ea9f08e 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2723,46 +2723,10 @@ kernel void kernel_leaky_relu_f32(
     dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
 }
 
-typedef void (flash_attn_ext_f16_t)(
-        device const  char * q,
-        device const  char * k,
-        device const  char * v,
-        device const  char * mask,
-        device       float * dst,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant   int64_t & ne13,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant  uint64_t & nb21,
-        constant  uint64_t & nb22,
-        constant  uint64_t & nb23,
-        constant  uint64_t & nb31,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant     float & scale,
-        constant     float & max_bias,
-        constant     float & m0,
-        constant     float & m1,
-        constant  uint32_t & n_head_log2,
-        constant     float & logit_softcap,
-        threadgroup   half * shared,
-        uint3  tgpig[[threadgroup_position_in_grid]],
-        uint3  tpitg[[thread_position_in_threadgroup]],
-        uint3    ntg[[threads_per_threadgroup]],
-        ushort tiisg[[thread_index_in_simdgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]]);
-
 // ref: https://arxiv.org/pdf/2307.08691.pdf
-template<int64_t D, int64_t Q = 8, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
-kernel void kernel_flash_attn_ext_f16(
+// D - head size, Q - queries per threadgroup, KV - key/value processed per each simdgroup, C - cache items per threadgroup
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &), short D, short Q = 8, short KV = 8, short C = 32>
+kernel void kernel_flash_attn_ext(
         device const  char * q,
         device const  char * k,
         device const  char * v,
@@ -2800,15 +2764,15 @@ kernel void kernel_flash_attn_ext_f16(
         ushort sgitg[[simdgroup_index_in_threadgroup]]) {
     const short nsg = ntg.y; // number of simdgroups
 
-    const short iq3 = tgpig[2];
-    const short iq2 = tgpig[1];
-    const short iq1 = tgpig[0]*Q;
+    const int iq3 = tgpig[2];
+    const int iq2 = tgpig[1];
+    const int iq1 = tgpig[0]*Q;
 
-    const short D4 = D/4;
-    const short D8 = D/8;
-  //const short Q8 = Q/8;
-    const short NW = N_SIMDWIDTH;
-    const short SH = (C + Q); // shared memory per simdgroup in (half)
+    const short D4  = D/4;
+    const short D8  = D/8;
+    const short D16 = D/16;
+    const short NW  = N_SIMDWIDTH;
+    const short SH  = (C + Q); // shared memory per simdgroup in (half)
 
     const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
     const short TF = T/2;        // shared memory size per query in (float)
@@ -2818,6 +2782,9 @@ kernel void kernel_flash_attn_ext_f16(
     threadgroup half4 * sq4 = (threadgroup half4 *) (shared +              0*D); // same as above but in half4
     threadgroup float * ss  = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
 
+    threadgroup half    * skv  = (threadgroup half    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K and V in shared memory
+    threadgroup half4x4 * skv4 = (threadgroup half4x4 *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in half4x4
+
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
     simdgroup_half8x8 lo[D8];
 
@@ -2849,25 +2816,28 @@ kernel void kernel_flash_attn_ext_f16(
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     {
-        float S[Q] = { [0 ... Q-1] = 0.0h };
+        float S[Q] = { [0 ... Q-1] = 0.0f };
         float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 };
 
+        // thread indices inside the simdgroup
+        const short tx = tiisg%4;
+        const short ty = tiisg/4;
+
         // assume K and V are same shape
         const short ne22 = ne12;
         const short ne23 = ne13;
 
-        // broadcast
+        // broadcast k
         const short rk2 = ne02/ne12;
         const short rk3 = ne03/ne13;
 
-        const short rv2 = ne02/ne22;
-        const short rv3 = ne03/ne23;
-
-        // k indices
         const short ik2 = iq2/rk2;
         const short ik3 = iq3/rk3;
 
-        // v indices
+        // broadcast v
+        const short rv2 = ne02/ne22;
+        const short rv3 = ne03/ne23;
+
         const short iv2 = iq2/rv2;
         const short iv3 = iq3/rv3;
 
@@ -2906,13 +2876,59 @@ kernel void kernel_flash_attn_ext_f16(
                 for (short cc = 0; cc < C/8; ++cc) {
                     simdgroup_float8x8 mqk = make_filled_simdgroup_matrix<float, 8>(0.h);
 
-                    device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
+                    // this is compile-time check, so it does not have runtime overhead
+                    if (is_same<block_q, half4x4>::value) {
+                        // we can read directly from global memory
+                        device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
 
-                    for (short i = 0; i < D8; ++i) {
-                        simdgroup_half8x8 mk;
-                        simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
+                        for (short i = 0; i < D8; ++i) {
+                            simdgroup_half8x8 mk;
+                            simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
 
-                        simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
+                            simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
+                        }
+                    } else {
+                        for (short ii = 0; ii < D16; ii += 4) {
+                            device const block_q * pk4 = (device const block_q *) ((device const char *) k + ((ic + 8*cc + ty)*nb11 + ik2*nb12 + ik3*nb13));
+
+                            if (D16%4 == 0) {
+                                // the head is evenly divisible by 4*16 = 64, so no need for bound checks
+                                half4x4 tmp;
+                                dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
+                                skv4[4*ty + tx] = tmp;
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+#pragma unroll
+                                for (short k = 0; k < 4; ++k) {
+                                    simdgroup_half8x8 mk;
+
+                                    simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
+
+                                    simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
+                                }
+                            } else {
+                                if (ii + tx < D16) {
+                                    half4x4 tmp;
+                                    dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
+                                    skv4[4*ty + tx] = tmp;
+                                }
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+                                for (short k = 0; k < 4 && ii + k < D16; ++k) {
+                                    simdgroup_half8x8 mk;
+
+                                    simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
+
+                                    simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
+                                }
+                            }
+                        }
                     }
 
                     simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
@@ -2977,16 +2993,61 @@ kernel void kernel_flash_attn_ext_f16(
             // O = O + (Q*K^T)*V
             {
                 for (short cc = 0; cc < C/8; ++cc) {
-                    device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));
+                    simdgroup_float8x8 ms;
+                    simdgroup_load(ms, ss + 8*cc, TF, 0, false);
 
-                    for (short i = 0; i < D8; ++i) {
-                        simdgroup_half8x8 mk;
-                        simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false);
+                    if (is_same<block_q, half4x4>::value) {
+                        // we can read directly from global memory
+                        device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));
+#pragma unroll
+                        for (short i = 0; i < D8; ++i) {
+                            simdgroup_half8x8 mv;
+                            simdgroup_load(mv, pv + i*8, nb21/sizeof(half), 0, false);
 
-                        simdgroup_float8x8 mv;
-                        simdgroup_load(mv, ss + 8*cc, TF, 0, false);
+                            simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
+                        }
+                    } else {
+                        for (short ii = 0; ii < D16; ii += 4) {
+                            device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 8*cc + ty)*nb21 + iv2*nb22 + iv3*nb23));
 
-                        simdgroup_multiply_accumulate(lo[i], mv, mk, lo[i]);
+                            if (D16%4 == 0) {
+                                // no need for bound checks
+                                half4x4 tmp;
+                                dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
+                                skv4[4*ty + tx] = tmp;
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+#pragma unroll
+                                for (short k = 0; k < 4; ++k) {
+                                    simdgroup_half8x8 mv;
+
+                                    simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+
+                                    simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                                }
+                            } else {
+                                if (ii + tx < D16) {
+                                    half4x4 tmp;
+                                    dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
+                                    skv4[4*ty + tx] = tmp;
+                                }
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+                                for (short k = 0; k < 4 && ii + k < D16; ++k) {
+                                    simdgroup_half8x8 mv;
+
+                                    simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+
+                                    simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                                }
+                            }
+                        }
                     }
                 }
             }
@@ -3003,7 +3064,7 @@ kernel void kernel_flash_attn_ext_f16(
 
     // reduce the warps sequentially
     for (short sg = 1; sg < nsg; ++sg) {
-        float S = { 0.0h };
+        float S = { 0.0f };
         float M = { -FLT_MAX/2 };
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -3082,15 +3143,54 @@ kernel void kernel_flash_attn_ext_f16(
     }
 }
 
-template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>;
-template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>;
-template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
-template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
-template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
-//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
+typedef decltype(kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 64>) flash_attn_ext_t;
 
-template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
-kernel void kernel_flash_attn_ext_vec_f16(
+template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 64>;
+template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 80>;
+template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 96>;
+template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 112>;
+template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 128>;
+template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 64>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 80>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 96>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 112>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 128>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 64>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 80>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 96>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 112>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 128>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 256>;
+
+// NOTE: can use half instead of float precision for some extra perf
+// D - head size, Q - queries per threadgroup, C - cache items per threadgroup
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &), short D, short Q = 1, short C = 32>
+kernel void kernel_flash_attn_ext_vec(
         device const  char * q,
         device const  char * k,
         device const  char * v,
@@ -3128,36 +3228,27 @@ kernel void kernel_flash_attn_ext_vec_f16(
         ushort sgitg[[simdgroup_index_in_threadgroup]]) {
     const short nsg = ntg.y; // number of simdgroups
 
-    const short iq3 = tgpig[2];
-    const short iq2 = tgpig[1];
-    const short iq1 = tgpig[0];
+    const int iq3 = tgpig[2];
+    const int iq2 = tgpig[1];
+    const int iq1 = tgpig[0];
 
-    const short D4 = D/4;
-    const short NW = N_SIMDWIDTH;
-    const short SH = (C + Q); // shared memory per simdgroup in (half)
+    const short D4  = D/4;
+    const short D16 = D/16;
+    const short NW  = N_SIMDWIDTH;
+    const short NW4 = NW/4;
+    const short SH  = C; // shared memory per simdgroup in (half)
 
     const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
 
-    float slope = 1.0f;
-
-    // ALiBi
-    if (max_bias > 0.0f) {
-        const uint32_t h = iq2;
-
-        const float base = h < n_head_log2 ? m0 : m1;
-        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-        slope = pow(base, exp);
-    }
-
-  //threadgroup half   * sq  = (threadgroup half   *) (shared +              0*D); // holds the query data
-    threadgroup half4  * sq4 = (threadgroup half4  *) (shared +              0*D); // same as above but in half4
-    threadgroup float  * ss  = (threadgroup float  *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
-    threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4
-    threadgroup half4  * sr4 = (threadgroup half4  *) (shared +   sgitg*D  + 1*T); // scratch buffer for the results
+  //threadgroup half     * sq   = (threadgroup half     *) (shared +              0*D); // holds the query data
+    threadgroup half4    * sq4  = (threadgroup half4    *) (shared +              0*D); // same as above but in half4
+    threadgroup half4x4  * sq44 = (threadgroup half4x4  *) (shared +              0*D); // same as above but in half4x4
+    threadgroup float    * ss   = (threadgroup float    *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention
+    threadgroup float4   * ss4  = (threadgroup float4   *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4
+    threadgroup float4x4 * sr44 = (threadgroup float4x4 *) (shared + 2*sgitg*D  + Q*T); // scratch buffer for the results
 
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
-    half4 lo[D4/NW];
+    float4x4 lo[D16/NW4];
 
     // load heads from Q to shared memory
     device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03));
@@ -3171,8 +3262,8 @@ kernel void kernel_flash_attn_ext_vec_f16(
     }
 
     // zero out lo
-    for (short i = tiisg; i < D4; i += NW) {
-        lo[i/NW] = 0.0h;
+    for (short i = 0; i < D16/NW4; i += NW4) {
+        lo[i] = float4x4(0.0f);
     }
 
     // zero out shared memory SH
@@ -3183,38 +3274,52 @@ kernel void kernel_flash_attn_ext_vec_f16(
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     {
-        float S = { 0.0h };
-        float M = { -FLT_MAX/2 };
+        float S = 0.0f;
+        float M = -FLT_MAX/2;
+
+        // thread indices inside the simdgroup
+        const short tx = tiisg%8;
+        const short ty = tiisg/8;
 
         // assume K and V are same shape
         const short ne22 = ne12;
         const short ne23 = ne13;
 
-        // broadcast
+        // broadcast k
         const short rk2 = ne02/ne12;
         const short rk3 = ne03/ne13;
 
+        const short ik2 = iq2/rk2;
+        const short ik3 = iq3/rk3;
+
+        // broadcast v
         const short rv2 = ne02/ne22;
         const short rv3 = ne03/ne23;
 
-        // k indices
-        const short ik2 = iq2 / rk2;
-        const short ik3 = iq3 / rk3;
-
-        // v indices
-        const short iv2 = iq2 / rv2;
-        const short iv3 = iq3 / rv3;
+        const short iv2 = iq2/rv2;
+        const short iv3 = iq3/rv3;
 
         // load the queries from shared memory into local memory
-        float4 mq[D4/NW];
+        float4x4 mq[D16/NW4];
 
-        for (short ii = 0; ii < D4; ii += NW) {
-            short i = ii + tiisg;
-            mq[ii/NW] = (float4) sq4[i];
+        for (short ii = 0; ii < D16; ii += NW4) {
+            mq[ii/NW4] = (float4x4) sq44[ii + tx];
         }
 
         // pointer to the mask
-        device const half4 * mp4 = (device const half4 *) (mask + iq1*nb31);
+        device const half * mp = (device const half *) (mask + iq1*nb31);
+
+        float slope = 1.0f;
+
+        // ALiBi
+        if (max_bias > 0.0f) {
+            const uint32_t h = iq2;
+
+            const float base = h < n_head_log2 ? m0 : m1;
+            const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+            slope = pow(base, exp);
+        }
 
         // loop over the KV cache
         // each simdgroup handles blocks of Q rows and C columns
@@ -3226,47 +3331,54 @@ kernel void kernel_flash_attn_ext_vec_f16(
 
             // Q*K^T
             {
-#pragma unroll
+                // each simdgroup processes 1 query and 4 keys
                 for (short cc = 0; cc < C/4; ++cc) {
-                    float4 mqk = { 0.0h };
+                    float mqk = 0.0;
 
-                    device const half4 * pk4 = (device const half4 *) ((device const char *) k + ((ic + 4*cc)*nb11 + ik2*nb12 + ik3*nb13));
+                    device const block_q * pk = (device const block_q *) ((device const char *) k + ((ic + 4*cc + ty)*nb11 + ik2*nb12 + ik3*nb13));
 
 #pragma unroll
-                    for (short ii = 0; ii < D4; ii += NW) {
-                        const short i = ii + tiisg;
+                    for (short ii = 0; ii < D16; ii += NW4) {
+                        const short i = ii + tx;
 
                         float4x4 mk;
-                        mk[0] = (float4) pk4[i + 0*(nb11/8)];
-                        mk[1] = (float4) pk4[i + 1*(nb11/8)];
-                        mk[2] = (float4) pk4[i + 2*(nb11/8)];
-                        mk[3] = (float4) pk4[i + 3*(nb11/8)];
+                        dequantize_func(pk + i/nl, i%nl, mk);
 
-                        mqk += (float4) (mq[ii/NW] * mk);
+                        mqk +=
+                            dot(mq[ii/NW4][0], mk[0]) +
+                            dot(mq[ii/NW4][1], mk[1]) +
+                            dot(mq[ii/NW4][2], mk[2]) +
+                            dot(mq[ii/NW4][3], mk[3]);
                     }
 
-                    // reduce the results from the threads in the simdgroup
-                    mqk += simd_shuffle_down(mqk, 16);
-                    mqk += simd_shuffle_down(mqk,  8);
+                    // simdgroup reduce
+                    // [ 0 ..  7] -> [ 0]
+                    // [ 8 .. 15] -> [ 8]
+                    // [16 .. 23] -> [16]
+                    // [24 .. 31] -> [24]
+                  //mqk += simd_shuffle_down(mqk, 16);
+                  //mqk += simd_shuffle_down(mqk,  8);
                     mqk += simd_shuffle_down(mqk,  4);
                     mqk += simd_shuffle_down(mqk,  2);
                     mqk += simd_shuffle_down(mqk,  1);
 
                     // mqk = mqk*scale + mask*slope
-                    if (tiisg == 0) {
+                    if (tx == 0) {
                         mqk *= scale;
 
                         if (logit_softcap != 0.0f) {
                             mqk = logit_softcap*precise::tanh(mqk);
                         }
 
-                        mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f;
+                        mqk += (mask != q) ? ((float) mp[ic + 4*cc + ty])*slope : (float) 0.0f;
 
-                        ss4[cc] = mqk;
+                        ss[4*cc + ty] = mqk;
                     }
                 }
             }
 
+            simdgroup_barrier(mem_flags::mem_threadgroup);
+
             // online softmax
             {
                 const short p = tiisg;
@@ -3286,29 +3398,32 @@ kernel void kernel_flash_attn_ext_vec_f16(
 
                 // O = diag(ms)*O
 #pragma unroll
-                for (short ii = 0; ii < D4; ii += NW) {
-                    lo[ii/NW] *= ms;
+                for (short ii = 0; ii < D16; ii += NW4) {
+                    lo[ii/NW4] *= ms;
                 }
             }
 
+            simdgroup_barrier(mem_flags::mem_threadgroup);
+
             // O = O + (Q*K^T)*V
             {
 #pragma unroll
                 for (short cc = 0; cc < C/4; ++cc) {
-                    device const half4 * pv4 = (device const half4 *) ((device const char *) v + ((ic + 4*cc)*nb21 + iv2*nb22 + iv3*nb23));
+                    device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 4*cc + ty)*nb21 + iv2*nb22 + iv3*nb23));
+
+                    const float4x4 lss(ss[4*cc + ty]);
 
 #pragma unroll
-                    for (short ii = 0; ii < D4; ii += NW) {
-                        const short i = ii + tiisg;
+                    for (short ii = 0; ii < D16; ii += NW4) {
+                        const short i = ii + tx;
 
-                        lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
-                        lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
-                        lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
-                        lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
+                        float4x4 mv;
+                        dequantize_func(pv4 + i/nl, i%nl, mv);
+
+                        lo[ii/NW4] += mv*lss;
                     }
                 }
             }
-
         }
 
         // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
@@ -3318,10 +3433,32 @@ kernel void kernel_flash_attn_ext_vec_f16(
         }
     }
 
+    // simdgroup reduce
+    // [ 0,  8, 16, 24] -> [ 0]
+    // [ 1,  9, 17, 25] -> [ 1]
+    // [ 2, 10, 18, 26] -> [ 2]
+    // [ 3, 11, 19, 27] -> [ 3]
+    // [ 4, 12, 20, 28] -> [ 4]
+    // [ 5, 13, 21, 29] -> [ 5]
+    // [ 6, 14, 22, 30] -> [ 6]
+    // [ 7, 15, 23, 31] -> [ 7]
+    for (short ii = 0; ii < D16; ii += NW4) {
+        lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 16);
+        lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0],  8);
+
+        lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 16);
+        lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1],  8);
+
+        lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 16);
+        lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2],  8);
+
+        lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 16);
+        lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3],  8);
+    }
+
     // store results to shared memory
-    for (short ii = 0; ii < D4; ii += NW) {
-        short i = ii + tiisg;
-        sr4[i] = lo[ii/NW];
+    for (short i = tiisg; i < D16; i += NW4) {
+        sr44[i] = lo[i/NW4];
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -3348,30 +3485,41 @@ kernel void kernel_flash_attn_ext_vec_f16(
             }
 
             // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
-            for (short ii = 0; ii < D4; ii += NW) {
-                short i = ii + tiisg;
-                sr4[i] = sr4[i]*ms0 + sr4[i + r*D4]*ms1;
+            for (short i = tiisg; i < D16; i += NW) {
+                sr44[i] = sr44[i]*ms0 + sr44[i + r*D16]*ms1;
             }
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    device float4 * dst4 = (device float4 *) dst;
+    device float4x4 * dst44 = (device float4x4 *) dst;
 
     // final rescale with 1/S and store to global memory
     if (sgitg == 0) {
         const float S = ss[0];
 
-        for (short ii = 0; ii < D4; ii += NW) {
-            short i = ii + tiisg;
-            dst4[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D4 + i] = (float4) sr4[i]/S;
+        for (short i = tiisg; i < D16; i += NW) {
+            dst44[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = sr44[i]/S;
         }
     }
 }
 
-template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
-//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
+typedef decltype(kernel_flash_attn_ext_vec<half4x4, 1, dequantize_f16, 64>) flash_attn_ext_vec_t;
+
+template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<half4x4,    1, dequantize_f16,  128>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_0, 2, dequantize_q4_0, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_1, 2, dequantize_q4_1, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_0, 2, dequantize_q5_0, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_1, 2, dequantize_q5_1, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q8_0, 2, dequantize_q8_0, 128>;
+
+template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<half4x4,    1, dequantize_f16,  256>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_0, 2, dequantize_q4_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_1, 2, dequantize_q4_1, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_0, 2, dequantize_q5_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_1, 2, dequantize_q5_1, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q8_0, 2, dequantize_q8_0, 256>;
 
 template<typename T0, typename T1>
 kernel void kernel_cpy(

From 1dc04b2deed2d2f2ae3aff9b14ae29674dee1fb8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 6 Nov 2024 11:20:10 +0200
Subject: [PATCH 167/396] ggml : adjust is_first_call init value (#10193)

ggml-ci
---
 ggml/src/ggml.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index e6a7824ba..266a0d6f0 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1407,11 +1407,11 @@ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const str
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
-    static bool is_first_call = false;
+    static bool is_first_call = true;
 
     ggml_critical_section_start();
 
-    if (!is_first_call) {
+    if (is_first_call) {
         // initialize time system (required on Windows)
         ggml_time_init();
 
@@ -1422,7 +1422,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             } u = {i};
             ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
         }
-        is_first_call = true;
+
+        is_first_call = false;
     }
 
     ggml_critical_section_end();

From 94d8cb8be13b7c4d04eeca5a2b956b9148e6f222 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 6 Nov 2024 12:10:07 +0100
Subject: [PATCH 168/396] metal : fix from ptr buffer name (#10189)

---
 ggml/src/ggml-metal.m | 5 +++--
 src/llama.cpp         | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index aee354cdd..9966a9e2f 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -4072,7 +4072,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }
 
-    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
@@ -4082,7 +4082,8 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
 }
 
 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
+            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
 
     UNUSED(dev);
 }
diff --git a/src/llama.cpp b/src/llama.cpp
index 0cdf0c073..6719edb38 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9134,7 +9134,7 @@ static bool llm_load_tensors(
 
     // print memory requirements per buffer type
     for (auto & buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name

From b11f9ba9b8ce319f04b88afe40d264e6b7f4ba46 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 6 Nov 2024 13:29:01 +0200
Subject: [PATCH 169/396] server : remove hack for extra parallel slot (#10187)

ggml-ci
---
 examples/server/server.cpp | 53 +++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f0b89b22c..1c7f0fd1d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -378,8 +378,8 @@ struct server_queue {
     std::condition_variable condition_tasks;
 
     // callback functions
-    std::function<void(server_task&)> callback_new_task;
-    std::function<void(void)>         callback_update_slots;
+    std::function<void(server_task)> callback_new_task;
+    std::function<void(void)>        callback_update_slots;
 
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
@@ -431,7 +431,7 @@ struct server_queue {
     }
 
     // Register function to process a new task
-    void on_new_task(std::function<void(server_task &)> callback) {
+    void on_new_task(std::function<void(server_task)> callback) {
         callback_new_task = std::move(callback);
     }
 
@@ -481,7 +481,7 @@ struct server_queue {
                 lock.unlock();
 
                 QUE_DBG("processing task, id = %d\n", task.id);
-                callback_new_task(task);
+                callback_new_task(std::move(task));
             }
 
             // all tasks in the current loop is processed, slots data is now ready
@@ -644,17 +644,12 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // reserve one extra sequence (seq_id == 0) for extra features
-        params.n_parallel += 1;
-
         common_init_result llama_init = common_init_from_params(params);
 
         model = llama_init.model;
         ctx   = llama_init.context;
         loras = llama_init.lora_adapters;
 
-        params.n_parallel -= 1; // but be sneaky about it
-
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
             return false;
@@ -1288,16 +1283,16 @@ struct server_context {
 
     void send_embedding(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
-        res.id       = slot.id_task;
-        res.error    = false;
-        res.stop     = true;
+        res.id    = slot.id_task;
+        res.error = false;
+        res.stop  = true;
 
         const int n_embd = llama_n_embd(model);
 
         std::vector<float> embd_res(n_embd, 0.0f);
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
@@ -1332,12 +1327,12 @@ struct server_context {
 
     void send_rerank(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
-        res.id       = slot.id_task;
-        res.error    = false;
-        res.stop     = true;
+        res.id    = slot.id_task;
+        res.error = false;
+        res.stop  = true;
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
@@ -1510,7 +1505,7 @@ struct server_context {
     // Functions to process the task
     //
 
-    void process_single_task(const server_task & task) {
+    void process_single_task(server_task task) {
         switch (task.type) {
             case SERVER_TASK_TYPE_INFERENCE:
                 {
@@ -1646,7 +1641,7 @@ struct server_context {
                     std::string filename = task.data.at("filename");
                     std::string filepath = task.data.at("filepath");
 
-                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
 
                     const int64_t t_end = ggml_time_us();
                     const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -1688,7 +1683,7 @@ struct server_context {
 
                     slot->cache_tokens.resize(slot->n_ctx);
                     size_t token_count = 0;
-                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
                     if (nread == 0) {
                         slot->cache_tokens.resize(0);
                         send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
@@ -1731,7 +1726,7 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
                     slot->cache_tokens.clear();
 
                     server_task_result result;
@@ -1808,8 +1803,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past,        -n_discard);
+                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1836,7 +1831,7 @@ struct server_context {
 
             slot.i_batch = batch.n_tokens;
 
-            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
             slot.n_past += 1;
 
@@ -1983,8 +1978,8 @@ struct server_context {
 
                                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                                            llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1,     kv_shift);
+                                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, -1,     kv_shift);
 
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -2033,9 +2028,9 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
 
                         // there is no common part left
                         slot.n_past = 0;
@@ -2048,7 +2043,7 @@ struct server_context {
 
                     // add prompt tokens for processing in the current batch
                     while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);

From 5c333e014059122245c318e7ed4ec27d1085573c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 6 Nov 2024 19:53:51 +0200
Subject: [PATCH 170/396] metal : add BF16 support (#8439)

* ggml : add initial BF16 support

ggml-ci

* metal : add mul_mat_id BF16 support

ggml-ci

* metal : check for bfloat support on the Metal device

ggml-ci

* metal : better var names [no ci]

* metal : do not build bfloat kernels when not supported

ggml-ci

* metal : try to fix BF16 support check

ggml-ci

* metal : this should correctly check bfloat support
---
 common/common.cpp          |   3 +
 ggml/src/ggml-metal.m      | 438 ++++++++++++++++++++++---------------
 ggml/src/ggml-metal.metal  |  58 ++++-
 tests/test-backend-ops.cpp |   2 +-
 4 files changed, 317 insertions(+), 184 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c8cbaae11..19674af15 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1003,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9966a9e2f..f13adee38 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -36,16 +36,18 @@ static struct ggml_backend_metal_device_context {
     id<MTLDevice> mtl_device;
     int           mtl_device_ref_count;
 
-    bool support_simdgroup_reduction;
-    bool support_simdgroup_mm;
+    bool has_simdgroup_reduction;
+    bool has_simdgroup_mm;
+    bool has_bfloat;
 
     char name[128];
 } g_ggml_ctx_dev_main = {
-    /*.mtl_device                  =*/ nil,
-    /*.mtl_device_ref_count        =*/ 0,
-    /*.support_simdgroup_reduction =*/ false,
-    /*.support_simdgroup_mm        =*/ false,
-    /*.name                        =*/ "",
+    /*.mtl_device              =*/ nil,
+    /*.mtl_device_ref_count    =*/ 0,
+    /*.has_simdgroup_reduction =*/ false,
+    /*.has_simdgroup_mm        =*/ false,
+    /*.has_bfloat              =*/ false,
+    /*.name                    =*/ "",
 };
 
 // acquire
@@ -55,10 +57,13 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
 
-        ctx->support_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
-        ctx->support_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+        ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+        ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
-        ctx->support_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+        ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+
+        ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+        ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
 
         strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1);
     }
@@ -120,6 +125,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,
@@ -146,10 +152,14 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
     GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
@@ -170,10 +180,11 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
   //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,
   //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,
@@ -195,6 +206,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,
@@ -216,6 +228,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
@@ -300,8 +313,11 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,
     GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
     GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16,
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,
@@ -480,7 +496,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
                 // dictionary of preprocessor macros
                 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
-                MTLCompileOptions* options = [MTLCompileOptions new];
+                MTLCompileOptions * options = [MTLCompileOptions new];
                 options.preprocessorMacros = prep;
 
                 //[options setFastMathEnabled:false];
@@ -530,9 +546,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         }
     }
 
-    GGML_LOG_INFO("%s: simdgroup reduction support   = %s\n", __func__, ctx_dev->support_simdgroup_reduction ? "true" : "false");
-    GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx_dev->support_simdgroup_mm ? "true" : "false");
-    GGML_LOG_INFO("%s: hasUnifiedMemory              = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction     ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm            ? "true" : "false");
+    GGML_LOG_INFO("%s: bfloat                = %s\n", __func__, ctx_dev->has_bfloat                  ? "true" : "false");
+    GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
 
     ctx->capture_next_compute = false;
     ctx->capture_started = false;
@@ -578,8 +595,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
             GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
         }
 
-        const bool support_simdgroup_mm        = ctx_dev->support_simdgroup_mm;
-        const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction;
+        const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
+        const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
+        const bool has_bfloat              = ctx_dev->has_bfloat;
 
         // simd_sum and simd_max requires MTLGPUFamilyApple7
 
@@ -607,14 +625,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,                  gelu_quick_4,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                          silu,                           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4,                        silu_4,                         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                  soft_max_f16,                   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                soft_max_f16_4,                 support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                  soft_max_f32,                   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,                soft_max_f32_4,                 support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                  soft_max_f16,                   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                soft_max_f16_4,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                  soft_max_f32,                   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,                soft_max_f32_4,                 has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,                 diag_mask_inf,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,               diag_mask_inf_8,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,                  get_rows_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,                  get_rows_f16,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16,                 get_rows_bf16,                  has_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,                 get_rows_q4_0,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,                 get_rows_q4_1,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,                 get_rows_q5_0,                  true);
@@ -635,101 +654,108 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,               get_rows_iq4_nl,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,               get_rows_iq4_xs,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,                  get_rows_i32,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                      rms_norm,                       support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                    group_norm,                     support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                      rms_norm,                       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                    group_norm,                     has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                          norm,                           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                  ssm_conv_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                  ssm_scan_f32,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,                mul_mv_f16_f16,                 support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,           mul_mv_f16_f32_1row,            support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,             mul_mv_f16_f32_l4,              support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,               mul_mv_q4_0_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,               mul_mv_q4_1_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,               mul_mv_q5_0_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,               mul_mv_q5_1_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,               mul_mv_q8_0_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,               mul_mv_q2_K_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,               mul_mv_q3_K_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,               mul_mv_q4_K_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,               mul_mv_q5_K_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,               mul_mv_q6_K_f32,                support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,            mul_mv_iq2_xxs_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,             mul_mv_iq2_xs_f32,              support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,            mul_mv_iq3_xxs_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,              mul_mv_iq3_s_f32,               support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,              mul_mv_iq2_s_f32,               support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,              mul_mv_iq1_s_f32,               support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,              mul_mv_iq1_m_f32,               support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,             mul_mv_iq4_nl_f32,              support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,             mul_mv_iq4_xs_f32,              support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,             mul_mv_id_f32_f32,              support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,             mul_mv_id_f16_f16,              support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,             mul_mv_id_f16_f32,              support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,        mul_mv_id_f16_f32_1row,         support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,          mul_mv_id_f16_f32_l4,           support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,            mul_mv_id_q4_0_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,            mul_mv_id_q4_1_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,            mul_mv_id_q5_0_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,            mul_mv_id_q5_1_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,            mul_mv_id_q8_0_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,            mul_mv_id_q2_K_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,            mul_mv_id_q3_K_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,            mul_mv_id_q4_K_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,            mul_mv_id_q5_K_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,            mul_mv_id_q6_K_f32,             support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,         mul_mv_id_iq2_xxs_f32,          support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,          mul_mv_id_iq2_xs_f32,           support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,         mul_mv_id_iq3_xxs_f32,          support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,           mul_mv_id_iq3_s_f32,            support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,           mul_mv_id_iq2_s_f32,            support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,           mul_mv_id_iq1_s_f32,            support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,           mul_mv_id_iq1_m_f32,            support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,          mul_mv_id_iq4_nl_f32,           support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,          mul_mv_id_iq4_xs_f32,           support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                mul_mm_f32_f32,                 support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                mul_mm_f16_f32,                 support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,               mul_mm_q4_0_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,               mul_mm_q4_1_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,               mul_mm_q5_0_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,               mul_mm_q5_1_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,               mul_mm_q8_0_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,               mul_mm_q2_K_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,               mul_mm_q3_K_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,               mul_mm_q4_K_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,               mul_mm_q5_K_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,               mul_mm_q6_K_f32,                support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,            mul_mm_iq2_xxs_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,             mul_mm_iq2_xs_f32,              support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,            mul_mm_iq3_xxs_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,              mul_mm_iq3_s_f32,               support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,              mul_mm_iq2_s_f32,               support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,              mul_mm_iq1_s_f32,               support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,              mul_mm_iq1_m_f32,               support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,             mul_mm_iq4_nl_f32,              support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,             mul_mm_iq4_xs_f32,              support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,             mul_mm_id_f32_f32,              support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,             mul_mm_id_f16_f32,              support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,            mul_mm_id_q4_0_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,            mul_mm_id_q4_1_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,            mul_mm_id_q5_0_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,            mul_mm_id_q5_1_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,            mul_mm_id_q8_0_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,            mul_mm_id_q2_K_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,            mul_mm_id_q3_K_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,            mul_mm_id_q4_K_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,            mul_mm_id_q5_K_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,            mul_mm_id_q6_K_f32,             support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,         mul_mm_id_iq2_xxs_f32,          support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,          mul_mm_id_iq2_xs_f32,           support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,         mul_mm_id_iq3_xxs_f32,          support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,           mul_mm_id_iq3_s_f32,            support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,           mul_mm_id_iq2_s_f32,            support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,           mul_mm_id_iq1_s_f32,            support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,           mul_mm_id_iq1_m_f32,            support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,          mul_mm_id_iq4_nl_f32,           support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,          mul_mm_id_iq4_xs_f32,           support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,               mul_mv_bf16_f32,                has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,          mul_mv_bf16_f32_1row,           has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,            mul_mv_bf16_f32_l4,             has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,              mul_mv_bf16_bf16,               has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,           mul_mv_f16_f32_1row,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,             mul_mv_f16_f32_l4,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,                mul_mv_f16_f16,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,               mul_mv_q4_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,               mul_mv_q4_1_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,               mul_mv_q5_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,               mul_mv_q5_1_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,               mul_mv_q8_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,               mul_mv_q2_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,               mul_mv_q3_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,               mul_mv_q4_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,               mul_mv_q5_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,               mul_mv_q6_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,            mul_mv_iq2_xxs_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,             mul_mv_iq2_xs_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,            mul_mv_iq3_xxs_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,              mul_mv_iq3_s_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,              mul_mv_iq2_s_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,              mul_mv_iq1_s_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,              mul_mv_iq1_m_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,             mul_mv_iq4_nl_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,             mul_mv_iq4_xs_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,             mul_mv_id_f32_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,             mul_mv_id_f16_f32,              has_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,        mul_mv_id_f16_f32_1row,         has_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,          mul_mv_id_f16_f32_l4,           has_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,             mul_mv_id_f16_f16,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32,            mul_mv_id_bf16_f32,             has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,            mul_mv_id_q4_0_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,            mul_mv_id_q4_1_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,            mul_mv_id_q5_0_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,            mul_mv_id_q5_1_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,            mul_mv_id_q8_0_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,            mul_mv_id_q2_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,            mul_mv_id_q3_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,            mul_mv_id_q4_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,            mul_mv_id_q5_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,            mul_mv_id_q6_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,         mul_mv_id_iq2_xxs_f32,          has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,          mul_mv_id_iq2_xs_f32,           has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,         mul_mv_id_iq3_xxs_f32,          has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,           mul_mv_id_iq3_s_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,           mul_mv_id_iq2_s_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,           mul_mv_id_iq1_s_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,           mul_mv_id_iq1_m_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,          mul_mv_id_iq4_nl_f32,           has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,          mul_mv_id_iq4_xs_f32,           has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                mul_mm_f32_f32,                 has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                mul_mm_f16_f32,                 has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,               mul_mm_bf16_f32,                has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,               mul_mm_q4_0_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,               mul_mm_q4_1_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,               mul_mm_q5_0_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,               mul_mm_q5_1_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,               mul_mm_q8_0_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,               mul_mm_q2_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,               mul_mm_q3_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,               mul_mm_q4_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,               mul_mm_q5_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,               mul_mm_q6_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,            mul_mm_iq2_xxs_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,             mul_mm_iq2_xs_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,            mul_mm_iq3_xxs_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,              mul_mm_iq3_s_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,              mul_mm_iq2_s_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,              mul_mm_iq1_s_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,              mul_mm_iq1_m_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,             mul_mm_iq4_nl_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,             mul_mm_iq4_xs_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,             mul_mm_id_f32_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,             mul_mm_id_f16_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,            mul_mm_id_bf16_f32,             has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,            mul_mm_id_q4_0_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,            mul_mm_id_q4_1_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,            mul_mm_id_q5_0_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,            mul_mm_id_q5_1_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,            mul_mm_id_q8_0_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,            mul_mm_id_q2_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,            mul_mm_id_q3_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,            mul_mm_id_q4_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,            mul_mm_id_q5_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,            mul_mm_id_q6_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,         mul_mm_id_iq2_xxs_f32,          has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,          mul_mm_id_iq2_xs_f32,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,         mul_mm_id_iq3_xxs_f32,          has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,           mul_mm_id_iq3_s_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,           mul_mm_id_iq2_s_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,           mul_mm_id_iq1_s_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,           mul_mm_id_iq1_m_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,          mul_mm_id_iq4_nl_f32,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,          mul_mm_id_iq4_xs_f32,           has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                 rope_norm_f32,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
@@ -745,58 +771,61 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,          argsort_f32_i32_desc,           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,                leaky_relu_f32,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,         support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,         support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,         support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,       flash_attn_ext_q4_0_h64,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,       flash_attn_ext_q4_0_h80,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,       flash_attn_ext_q4_0_h96,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112,      flash_attn_ext_q4_0_h112,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128,      flash_attn_ext_q4_0_h128,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,      flash_attn_ext_q4_0_h256,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,       flash_attn_ext_q4_1_h64,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,       flash_attn_ext_q4_1_h80,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,       flash_attn_ext_q4_1_h96,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112,      flash_attn_ext_q4_1_h112,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128,      flash_attn_ext_q4_1_h128,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,      flash_attn_ext_q4_1_h256,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,       flash_attn_ext_q5_0_h64,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,       flash_attn_ext_q5_0_h80,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,       flash_attn_ext_q5_0_h96,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112,      flash_attn_ext_q5_0_h112,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128,      flash_attn_ext_q5_0_h128,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,      flash_attn_ext_q5_0_h256,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,       flash_attn_ext_q5_1_h64,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,       flash_attn_ext_q5_1_h80,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,       flash_attn_ext_q5_1_h96,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112,      flash_attn_ext_q5_1_h112,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128,      flash_attn_ext_q5_1_h128,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,      flash_attn_ext_q5_1_h256,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,       flash_attn_ext_q8_0_h64,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,       flash_attn_ext_q8_0_h80,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,       flash_attn_ext_q8_0_h96,        support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112,      flash_attn_ext_q8_0_h112,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,      flash_attn_ext_q8_0_h128,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,      flash_attn_ext_q8_0_h256,       support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,  flash_attn_ext_vec_q4_0_h128,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,  flash_attn_ext_vec_q4_1_h128,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,  flash_attn_ext_vec_q5_0_h128,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,  flash_attn_ext_vec_q5_1_h128,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,  flash_attn_ext_vec_q8_0_h128,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,  flash_attn_ext_vec_q4_0_h256,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,  flash_attn_ext_vec_q4_1_h256,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,  flash_attn_ext_vec_q5_0_h256,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,  flash_attn_ext_vec_q5_1_h256,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,  flash_attn_ext_vec_q8_0_h256,   support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,         has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,         has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,         has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,       flash_attn_ext_q4_0_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,       flash_attn_ext_q4_0_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,       flash_attn_ext_q4_0_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112,      flash_attn_ext_q4_0_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128,      flash_attn_ext_q4_0_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,      flash_attn_ext_q4_0_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,       flash_attn_ext_q4_1_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,       flash_attn_ext_q4_1_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,       flash_attn_ext_q4_1_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112,      flash_attn_ext_q4_1_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128,      flash_attn_ext_q4_1_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,      flash_attn_ext_q4_1_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,       flash_attn_ext_q5_0_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,       flash_attn_ext_q5_0_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,       flash_attn_ext_q5_0_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112,      flash_attn_ext_q5_0_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128,      flash_attn_ext_q5_0_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,      flash_attn_ext_q5_0_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,       flash_attn_ext_q5_1_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,       flash_attn_ext_q5_1_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,       flash_attn_ext_q5_1_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112,      flash_attn_ext_q5_1_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128,      flash_attn_ext_q5_1_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,      flash_attn_ext_q5_1_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,       flash_attn_ext_q8_0_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,       flash_attn_ext_q8_0_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,       flash_attn_ext_q8_0_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112,      flash_attn_ext_q8_0_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,      flash_attn_ext_q8_0_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,      flash_attn_ext_q8_0_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,  flash_attn_ext_vec_q4_0_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,  flash_attn_ext_vec_q4_1_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,  flash_attn_ext_vec_q5_0_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,  flash_attn_ext_vec_q5_1_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,  flash_attn_ext_vec_q8_0_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,  flash_attn_ext_vec_q4_0_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,  flash_attn_ext_vec_q4_1_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,  flash_attn_ext_vec_q5_0_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,  flash_attn_ext_vec_q5_1_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,  flash_attn_ext_vec_q8_0_h256,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,                   cpy_f32_f32,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,                   cpy_f16_f16,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,                  cpy_f32_bf16,                   has_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,                   cpy_f16_f32,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,                   cpy_f16_f16,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32,                  cpy_bf16_f32,                   has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16,                 cpy_bf16_bf16,                  has_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,                  cpy_f32_q8_0,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,                  cpy_f32_q4_0,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,                  cpy_f32_q4_1,                   true);
@@ -886,15 +915,18 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
 }
 
 static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) {
-    for (size_t i = 0, n = 3; i < n; ++i) {
-        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
-            return false;
+    const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
+    const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
+    const bool has_bfloat              = ctx_dev->has_bfloat;
+
+    if (!has_bfloat) {
+        for (size_t i = 0, n = 3; i < n; ++i) {
+            if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+                return false;
+            }
         }
     }
 
-    const bool support_simdgroup_mm        = ctx_dev->support_simdgroup_mm;
-    const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction;
-
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -932,7 +964,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
-            return support_simdgroup_reduction;
+            return has_simdgroup_reduction;
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
             return true;
@@ -952,13 +984,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             if (op->src[1]->type != op->src[2]->type) {
                 return false;
             }
-            return support_simdgroup_mm; // TODO: over-restricted for vec-kernels
+            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_SSM_CONV:
         case GGML_OP_SSM_SCAN:
             return true;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-            return support_simdgroup_reduction &&
+            return has_simdgroup_reduction &&
                 (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
         case GGML_OP_CPY:
         case GGML_OP_DUP:
@@ -969,6 +1001,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                         switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
+                           case GGML_TYPE_BF16:
                            case GGML_TYPE_Q8_0:
                            case GGML_TYPE_Q4_0:
                            case GGML_TYPE_Q4_1:
@@ -981,10 +1014,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                         }
                     case GGML_TYPE_F16:
                         switch (op->type) {
-                           case GGML_TYPE_F32:
-                           case GGML_TYPE_F16:
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_F16:
                                 return true;
-                           default:
+                            default:
+                                return false;
+                        }
+                    case GGML_TYPE_BF16:
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_BF16:
+                                return true;
+                            default:
                                 return false;
                         }
                     default:
@@ -1855,6 +1896,7 @@ static void ggml_metal_encode_node(
                             switch (src0->type) {
                                 case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
                                 case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+                                case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
                                 default: break;
                             }
 
@@ -1863,6 +1905,7 @@ static void ggml_metal_encode_node(
                             switch (src0->type) {
                                 case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
                                 case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                                case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32   ].pipeline; break;
                                 case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
                                 case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
                                 case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
@@ -1940,6 +1983,25 @@ static void ggml_metal_encode_node(
                                             nrows = 4;
                                         }
                                     } break;
+                                case GGML_TYPE_BF16:
+                                    {
+                                        nth0 = 32;
+                                        nth1 = 1;
+                                        if (src1t == GGML_TYPE_F32) {
+                                            if (ne11 * ne12 < 4) {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
+                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
+                                                nrows = ne11;
+                                            } else {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
+                                                nrows = 4;
+                                            }
+                                        } else {
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
+                                            nrows = 4;
+                                        }
+                                    } break;
                                 case GGML_TYPE_Q4_0:
                                     {
                                         nth0 = 8;
@@ -2158,12 +2220,12 @@ static void ggml_metal_encode_node(
                 if ([device supportsFamily:MTLGPUFamilyApple7] &&
                         ne00 % 32 == 0 && ne00 >= 64 &&
                         dst_rows > dst_rows_min) {
-
                     // some Metal matrix data types require aligned pointers
                     // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
                     switch (src0->type) {
-                        case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
-                        case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8  == 0); break;
+                        case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+                        case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+                        case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
                         default: break;
                     }
 
@@ -2172,6 +2234,7 @@ static void ggml_metal_encode_node(
                     switch (src0->type) {
                         case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
                         case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
+                        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32   ].pipeline; break;
                         case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
                         case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
                         case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
@@ -2241,6 +2304,13 @@ static void ggml_metal_encode_node(
                                 nth1 = 1;
                                 pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
                             } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                nth0 = 32;
+                                nth1 = 1;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32].pipeline;
+                            } break;
                         case GGML_TYPE_Q4_0:
                             {
                                 nth0 = 8;
@@ -2438,6 +2508,7 @@ static void ggml_metal_encode_node(
                 switch (src0->type) {
                     case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
                     case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
+                    case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16   ].pipeline; break;
                     case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
                     case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
                     case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
@@ -3237,6 +3308,7 @@ static void ggml_metal_encode_node(
                             switch (dstt) {
                                 case GGML_TYPE_F32:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break;
                                 case GGML_TYPE_F16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break;
+                                case GGML_TYPE_BF16:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_BF16].pipeline; break;
                                 case GGML_TYPE_Q8_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
                                 case GGML_TYPE_Q4_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
                                 case GGML_TYPE_Q4_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
@@ -3254,6 +3326,14 @@ static void ggml_metal_encode_node(
                                 default: GGML_ABORT("not implemented");
                             };
                         } break;
+                    case GGML_TYPE_BF16:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_F32].pipeline; break;
+                                case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16].pipeline; break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            };
+                        } break;
                     default: GGML_ABORT("not implemented");
                 }
 
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index b9ea9f08e..16b5da3ff 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -12,6 +12,20 @@ using namespace metal;
 
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 
+// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+//
+// cmd:
+//   .../usr/bin/metal -dM -E -c                             ggml/src/ggml-metal.metal
+//   .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal
+//
+#if __METAL_VERSION__ < 310
+#define GGML_METAL_NO_BFLOAT
+#endif
+
+#if !defined(GGML_METAL_NO_BFLOAT)
+typedef matrix<bfloat, 4, 4> bfloat4x4;
+#endif
+
 constexpr constant static float kvalues_iq4nl_f[16] = {
     -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
 };
@@ -27,6 +41,13 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
     reg = (type4x4)(*src);
 }
 
+#if !defined(GGML_METAL_NO_BFLOAT)
+template <typename type4x4>
+void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
+    reg = (type4x4)(*src);
+}
+#endif
+
 template <typename type4x4>
 void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
     device const uint16_t * qs = ((device const uint16_t *)xb + 1);
@@ -2041,6 +2062,10 @@ typedef decltype(kernel_mul_mv<half, half4, half, half4>) mul_mv_t;
 template [[host_name("kernel_mul_mv_f32_f32")]]   kernel mul_mv_t kernel_mul_mv<float,  float4,  float,  float4>;
 template [[host_name("kernel_mul_mv_f16_f32")]]   kernel mul_mv_t kernel_mul_mv<half,   half4,   float,  float4>;
 template [[host_name("kernel_mul_mv_f16_f16")]]   kernel mul_mv_t kernel_mul_mv<half,   half4,   half,   half4>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_mul_mv_bf16_f32")]]  kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, float,  float4>;
+template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, bfloat, bfloat4>;
+#endif
 
 template<typename T, typename T4>
 kernel void kernel_mul_mv_1row(
@@ -2110,6 +2135,9 @@ kernel void kernel_mul_mv_1row(
 typedef decltype(kernel_mul_mv_1row<half, half4>) mul_mv_1row_t;
 
 template [[host_name("kernel_mul_mv_f16_f32_1row")]]  kernel mul_mv_1row_t kernel_mul_mv_1row<half,   half4>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row<bfloat, bfloat4>;
+#endif
 
 // Assumes row size (ne00) is a multiple of 4
 template<typename T, typename T4>
@@ -2169,6 +2197,9 @@ kernel void kernel_mul_mv_l4(
 typedef decltype(kernel_mul_mv_l4<half, half4>) mul_mv_l4_t;
 
 template [[host_name("kernel_mul_mv_f16_f32_l4")]]  kernel mul_mv_l4_t kernel_mul_mv_l4<half, half4>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_mul_mv_bf16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4<bfloat, bfloat4>;
+#endif
 
 static float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
@@ -3565,10 +3596,17 @@ kernel void kernel_cpy(
 
 typedef decltype(kernel_cpy<float, float>) kernel_cpy_t;
 
-template [[host_name("kernel_cpy_f32_f32")]]  kernel kernel_cpy_t kernel_cpy<float,  float>;
-template [[host_name("kernel_cpy_f32_f16")]]  kernel kernel_cpy_t kernel_cpy<float,  half>;
-template [[host_name("kernel_cpy_f16_f16")]]  kernel kernel_cpy_t kernel_cpy<half,   half>;
-template [[host_name("kernel_cpy_f16_f32")]]  kernel kernel_cpy_t kernel_cpy<half,   float>;
+template [[host_name("kernel_cpy_f32_f32")]]   kernel kernel_cpy_t kernel_cpy<float,  float>;
+template [[host_name("kernel_cpy_f32_f16")]]   kernel kernel_cpy_t kernel_cpy<float,  half>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_cpy_f32_bf16")]]  kernel kernel_cpy_t kernel_cpy<float,  bfloat>;
+#endif
+template [[host_name("kernel_cpy_f16_f32")]]   kernel kernel_cpy_t kernel_cpy<half,   float>;
+template [[host_name("kernel_cpy_f16_f16")]]   kernel kernel_cpy_t kernel_cpy<half,   half>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_cpy_bf16_f32")]]  kernel kernel_cpy_t kernel_cpy<bfloat, float>;
+template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy<bfloat, bfloat>;
+#endif
 
 kernel void kernel_cpy_f32_q8_0(
         device const float * src0,
@@ -6473,6 +6511,9 @@ typedef decltype(kernel_get_rows_f<float>) get_rows_f_t;
 
 template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_f_t kernel_get_rows_f<float>;
 template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_f_t kernel_get_rows_f<half>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f<bfloat>;
+#endif
 
 typedef decltype(kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>) get_rows_q_t;
 
@@ -6504,6 +6545,9 @@ typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, de
 
 template [[host_name("kernel_mul_mm_f32_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32>;
 template [[host_name("kernel_mul_mm_f16_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_mul_mm_bf16_f32")]]    kernel mat_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16>;
+#endif
 template [[host_name("kernel_mul_mm_q4_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0>;
 template [[host_name("kernel_mul_mm_q4_1_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1>;
 template [[host_name("kernel_mul_mm_q5_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0>;
@@ -6532,6 +6576,9 @@ typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
 
 template [[host_name("kernel_mul_mm_id_f32_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<float4x4,      1,     dequantize_f32>;
 template [[host_name("kernel_mul_mm_id_f16_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<half4x4,       1,     dequantize_f16>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_mul_mm_id_bf16_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<bfloat4x4,     1,     dequantize_bf16>;
+#endif
 template [[host_name("kernel_mul_mm_id_q4_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0,    2,     dequantize_q4_0>;
 template [[host_name("kernel_mul_mm_id_q4_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1,    2,     dequantize_q4_1>;
 template [[host_name("kernel_mul_mm_id_q5_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0,    2,     dequantize_q5_0>;
@@ -6755,6 +6802,9 @@ typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float
 
 template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
 template [[host_name("kernel_mul_mv_id_f16_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<half, half4, float, float4>>>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_mul_mv_id_bf16_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<bfloat, bfloat4, float, float4>>>;
+#endif
 template [[host_name("kernel_mul_mv_id_q8_0_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_q4_0_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
 template [[host_name("kernel_mul_mv_id_q4_1_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 46346cbd0..6cc77edab 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3599,7 +3599,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             for (int n_mats : {4}) {
                 for (int n_used : {2}) {
                     for (bool b : {false}) {
-                        for (int n : {1}) {
+                        for (int n : {1, 32}) {
                             int m = 512;
                             int k = 256;
                             test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));

From 3bcd40b3c593d14261fb2abfabad3c0fb5b9e318 Mon Sep 17 00:00:00 2001
From: Zhiyuan Li <lizhiyuan@uniartisan.com>
Date: Thu, 7 Nov 2024 18:19:10 +1100
Subject: [PATCH 171/396] Optimize RWKV6 Operator Naming and Implement
 Multi-core CPU/ SYCL Acceleration (#10133)

* rwkv6: rename to wkv6

* rwkv6: support avx2 avx512 armv8 armv9

* rwkv6: update cuda file name

* rwkv6: rename params

* wkv on sycl

* sycl: add some ops

* sycl: Enhance OP support judgment

* wkv6: drop armv9 and tranfer to GGML style

ggml-ci

* sync : ggml

* update the function to use appropriate types

* fix define error

* Update ggml/src/ggml-cpu.c

* add appropriate asserts

* move element-wise functions outside

* put the declaration outside the loop

* rewrite to be more inline with the common pattern for distributing threads

* use recommended way GGML_TENSOR_LOCALS

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Diego Devesa <slarengh@gmail.com>
Co-authored-by: Plamen Minev <pacominev@gmail.com>
Co-authored-by: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Co-authored-by: Meng, Hengyu <airdldl@163.com>
---
 docs/backend/SYCL.md                        |    2 +-
 ggml/include/ggml.h                         |    4 +-
 ggml/src/ggml-cpu.c                         |  208 +++-
 ggml/src/ggml-cuda.cu                       |    8 +-
 ggml/src/ggml-cuda/rwkv-wkv.cuh             |    5 -
 ggml/src/ggml-cuda/{rwkv-wkv.cu => wkv6.cu} |    6 +-
 ggml/src/ggml-cuda/wkv6.cuh                 |    5 +
 ggml/src/ggml-sycl.cpp                      | 1121 +++----------------
 ggml/src/ggml-sycl/backend.hpp              |    3 +
 ggml/src/ggml-sycl/common.cpp               |   40 +
 ggml/src/ggml-sycl/common.hpp               |  258 +++++
 ggml/src/ggml-sycl/concat.cpp               |    1 +
 ggml/src/ggml-sycl/element_wise.cpp         | 1011 +++++++++++++++++
 ggml/src/ggml-sycl/element_wise.hpp         |   76 ++
 ggml/src/ggml-sycl/outprod.cpp              |   55 +
 ggml/src/ggml-sycl/outprod.hpp              |   11 +
 ggml/src/ggml-sycl/presets.hpp              |    6 +
 ggml/src/ggml-sycl/wkv6.cpp                 |  138 +++
 ggml/src/ggml-sycl/wkv6.hpp                 |   10 +
 ggml/src/ggml.c                             |   12 +-
 src/llama.cpp                               |    8 +-
 tests/test-backend-ops.cpp                  |   16 +-
 22 files changed, 1977 insertions(+), 1027 deletions(-)
 delete mode 100644 ggml/src/ggml-cuda/rwkv-wkv.cuh
 rename ggml/src/ggml-cuda/{rwkv-wkv.cu => wkv6.cu} (93%)
 create mode 100644 ggml/src/ggml-cuda/wkv6.cuh
 create mode 100644 ggml/src/ggml-sycl/element_wise.cpp
 create mode 100644 ggml/src/ggml-sycl/element_wise.hpp
 create mode 100644 ggml/src/ggml-sycl/outprod.cpp
 create mode 100644 ggml/src/ggml-sycl/outprod.hpp
 create mode 100644 ggml/src/ggml-sycl/wkv6.cpp
 create mode 100644 ggml/src/ggml-sycl/wkv6.hpp

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index ea34182e4..bc8c0f886 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -377,7 +377,7 @@ found 2 SYCL devices:
 
 |Chosen Device ID|Setting|
 |-|-|
-|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
+|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
 |1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
 |0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
 
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 8a0bcbff8..0d143d2fe 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -509,7 +509,7 @@ extern "C" {
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
         GGML_OP_ADD_REL_POS,
-        GGML_OP_RWKV_WKV,
+        GGML_OP_RWKV_WKV6,
 
         GGML_OP_UNARY,
 
@@ -1819,7 +1819,7 @@ extern "C" {
             struct ggml_tensor  * pw,
             struct ggml_tensor  * ph);
 
-    GGML_API struct ggml_tensor * ggml_rwkv_wkv(
+    GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
             struct ggml_context * ctx,
             struct ggml_tensor  * k,
             struct ggml_tensor  * v,
diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c
index 0cb5b824a..98c3e21ae 100644
--- a/ggml/src/ggml-cpu.c
+++ b/ggml/src/ggml-cpu.c
@@ -11642,24 +11642,30 @@ static void ggml_compute_forward_add_rel_pos(
     }
 }
 
-// ggml_compute_forward_rwkv_wkv
+// ggml_compute_forward_rwkv_wkv6
 
-static void ggml_compute_forward_rwkv_wkv_f32(
+static void ggml_compute_forward_rwkv_wkv6_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
-    const size_t T = dst->src[1]->ne[3];
-    const size_t C = dst->ne[0];
-    const size_t H = dst->src[1]->ne[2];
-    const size_t n_seqs = dst->src[5]->ne[1];
+    const int64_t T = dst->src[1]->ne[3];
+    const int64_t C = dst->ne[0];
+    const int64_t HEADS = dst->src[1]->ne[2];
+    const int64_t n_seqs = dst->src[5]->ne[1];
+    const int64_t head_size = C / HEADS;
 
     float * dst_data = (float *) dst->data;
     float * state = ((float *) dst->data) + C * T;
 
-    if (params->ith != 0) {
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    if (ith >= HEADS) {
         return;
     }
 
-    memset(dst_data, 0, T * C * sizeof(float));
+    const int h_start = (HEADS * ith) / nth;
+    const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ?
+                (HEADS * (ith + 1)) / nth : HEADS;
 
     float * k =          (float *) dst->src[0]->data;
     float * v =          (float *) dst->src[1]->data;
@@ -11667,54 +11673,160 @@ static void ggml_compute_forward_rwkv_wkv_f32(
     float * time_faaaa = (float *) dst->src[3]->data;
     float * time_decay = (float *) dst->src[4]->data;
 
-    size_t t_stride = H * (C / H);
+    size_t t_stride = HEADS * head_size; // Same to C
 
-    size_t h_stride = C / H;
-    size_t h_stride_2d = (C / H) * (C / H);
+    size_t h_stride = C / HEADS;
+    GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS
+    size_t h_stride_2d = head_size * head_size;
 
-    // basically fused operations:
-    // dst = r @ (time_faaaa * (k @ v) + state),
-    // state = time_decay * state + (k @ v),
-    // recursive through each token
-    for (size_t t = 0; t < T; t++) {
-        size_t t_offset = t * t_stride;
-        size_t state_offset = (C / H) * C * (t / (T / n_seqs));
-        float * state_cur = state + state_offset;
-        float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
+    if (ith == 0) {
+        memset(dst_data, 0, T * C * sizeof(float));
+    }
+    ggml_barrier(params->threadpool);
 
-        for (size_t h = 0; h < H; h++) {
-            size_t h_offset = h * h_stride;
-            size_t t_h_offset = t_offset + h_offset;
-            size_t h_2d_offset = h * h_stride_2d;
 
-            for (size_t i = 0; i < C / H; i++) {
-                size_t t_h_i_offset = t_h_offset + i;
-                size_t h_i_offset = h_offset + i;
-                size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+    #if defined(__AVX__) && !defined(__AVX512F__)
+        #define GGML_F32X GGML_F32x8
+        #define GGML_F32X_SET1 GGML_F32x8_SET1
+        #define GGML_F32X_LOAD GGML_F32x8_LOAD
+        #define GGML_F32X_STORE GGML_F32x8_STORE
+        #define GGML_F32X_MUL GGML_F32x8_MUL
+        #define GGML_F32X_FMA GGML_F32x8_FMA
+        #define WKV_VECTOR_SIZE 8
+    #elif defined(__AVX512F__)
+        #define GGML_F32X GGML_F32x16
+        #define GGML_F32X_SET1 GGML_F32x16_SET1
+        #define GGML_F32X_LOAD GGML_F32x16_LOAD
+        #define GGML_F32X_STORE GGML_F32x16_STORE
+        #define GGML_F32X_MUL GGML_F32x16_MUL
+        #define GGML_F32X_FMA GGML_F32x16_FMA
+        #define WKV_VECTOR_SIZE 16
+    #elif defined(__ARM_NEON) && defined(__aarch64__)
+        #define GGML_F32X GGML_F32x4
+        #define GGML_F32X_SET1 GGML_F32x4_SET1
+        #define GGML_F32X_LOAD GGML_F32x4_LOAD
+        #define GGML_F32X_STORE GGML_F32x4_STORE
+        #define GGML_F32X_MUL GGML_F32x4_MUL
+        #define GGML_F32X_FMA GGML_F32x4_FMA
+        #define WKV_VECTOR_SIZE 4
+    #endif
 
-                float k_val = k[t_h_i_offset];
-                float r_val = r[t_h_i_offset];
-                float time_faaaa_val = time_faaaa[h_i_offset];
-                // RWKV v6: different time_decay for each token.
-                float time_decay_val = time_decay[t_h_i_offset];
+    #ifdef WKV_VECTOR_SIZE
+        const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
 
-                for (size_t j = 0; j < C / H; j ++) {
-                    size_t t_h_j_offset = t_h_offset + j;
-                    size_t h_2d_i_j_offset = h_2d_i_offset + j;
+        for (int64_t t = 0; t < T; t++) {
+            size_t t_offset = t * t_stride;
+            size_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
 
-                    float v_val = v[t_h_j_offset];
-                    float kv_val = v_val * k_val;
-                    float prev_state_val = state_prev[h_2d_i_j_offset];
-                    float temp_val = kv_val * time_faaaa_val + prev_state_val;
-                    dst_data[t_h_j_offset] += temp_val * r_val;
-                    state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
+            for (int64_t h = h_start; h < h_end; h++) {
+                size_t h_offset = h * h_stride;
+                size_t t_h_offset = t_offset + h_offset;
+                size_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    size_t t_h_i_offset = t_h_offset + i;
+                    size_t h_i_offset = h_offset + i;
+                    size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float k_val = k[t_h_i_offset];
+                    float r_val = r[t_h_i_offset];
+                    float time_faaaa_val = time_faaaa[h_i_offset];
+                    float time_decay_val = time_decay[t_h_i_offset];
+
+                    // Broadcast scalar values to vectors
+                    GGML_F32X k_vec = GGML_F32X_SET1(k_val);
+                    GGML_F32X r_vec = GGML_F32X_SET1(r_val);
+                    GGML_F32X time_faaaa_vec = GGML_F32X_SET1(time_faaaa_val);
+                    GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);
+
+                    for (int64_t j = 0; j < vec_count; j++) {
+                        size_t base_j = j * WKV_VECTOR_SIZE;
+                        size_t t_h_j_offset = t_h_offset + base_j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
+
+                        // Load x elements at once
+                        GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]);
+                        GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]);
+                        GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]);
+
+                        // Compute kv = v * k
+                        GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec);
+
+                        // Compute temp = kv * time_faaaa + prev_state
+                        GGML_F32X temp_vec = GGML_F32X_FMA(prev_state_vec, kv_vec, time_faaaa_vec);
+
+                        // Update dst: dst += temp * r
+                        dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, r_vec);
+                        GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec);
+
+                        // Update state: state = prev_state * time_decay + kv
+                        GGML_F32X new_state_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, time_decay_vec);
+                        GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], new_state_vec);
+                    }
+
+                    // Handle remaining elements, this will not be used.
+                    for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
+                        size_t t_h_j_offset = t_h_offset + j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + j;
+                        float v_val = v[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        float temp_val = kv_val * time_faaaa_val + prev_state_val;
+                        dst_data[t_h_j_offset] += temp_val * r_val;
+                        state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
+                    }
                 }
             }
         }
-    }
+
+    #else
+        // basically fused operations:
+        // dst = r @ (time_faaaa * (k @ v) + state),
+        // state = time_decay * state + (k @ v),
+        // recursive through each token
+        for (int64_t t = 0; t < T; t++) {
+            size_t t_offset = t * t_stride;
+            size_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                size_t h_offset = h * h_stride;
+                size_t t_h_offset = t_offset + h_offset;
+                size_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    size_t t_h_i_offset = t_h_offset + i;
+                    size_t h_i_offset = h_offset + i;
+                    size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float k_val = k[t_h_i_offset];
+                    float r_val = r[t_h_i_offset];
+                    float time_faaaa_val = time_faaaa[h_i_offset];
+                    // RWKV v6: different time_decay for each token.
+                    float time_decay_val = time_decay[t_h_i_offset];
+
+                    for (int64_t j = 0; j < head_size; j++) {
+                        size_t t_h_j_offset = t_h_offset + j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float v_val = v[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        float temp_val = kv_val * time_faaaa_val + prev_state_val;
+                        dst_data[t_h_j_offset] += temp_val * r_val;
+                        state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
+                    }
+                }
+            }
+        }
+    #endif
 }
 
-static void ggml_compute_forward_rwkv_wkv(
+
+static void ggml_compute_forward_rwkv_wkv6(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
 
@@ -11723,7 +11835,7 @@ static void ggml_compute_forward_rwkv_wkv(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_rwkv_wkv_f32(params, dst);
+                ggml_compute_forward_rwkv_wkv6_f32(params, dst);
             } break;
         default:
             {
@@ -12475,9 +12587,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_add_rel_pos(params, tensor);
             } break;
-        case GGML_OP_RWKV_WKV:
+        case GGML_OP_RWKV_WKV6:
             {
-                ggml_compute_forward_rwkv_wkv(params, tensor);
+                ggml_compute_forward_rwkv_wkv6(params, tensor);
             } break;
         case GGML_OP_MAP_UNARY:
             {
@@ -12775,7 +12887,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
         case GGML_OP_GET_REL_POS:
-        case GGML_OP_RWKV_WKV:
+        case GGML_OP_RWKV_WKV6:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index e68e40550..e27c8e87d 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -36,7 +36,7 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
-#include "ggml-cuda/rwkv-wkv.cuh"
+#include "ggml-cuda/wkv6.cuh"
 
 #include <algorithm>
 #include <array>
@@ -2319,8 +2319,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
-        case GGML_OP_RWKV_WKV:
-            ggml_cuda_op_rwkv_wkv(ctx, dst);
+        case GGML_OP_RWKV_WKV6:
+            ggml_cuda_op_rwkv_wkv6(ctx, dst);
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             ggml_cuda_cross_entropy_loss_back(ctx, dst);
@@ -3153,7 +3153,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_RWKV_WKV:
+        case GGML_OP_RWKV_WKV6:
             return true;
         case GGML_OP_FLASH_ATTN_EXT: {
 #ifndef FLASH_ATTN_AVAILABLE
diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cuh b/ggml/src/ggml-cuda/rwkv-wkv.cuh
deleted file mode 100644
index 13795247f..000000000
--- a/ggml/src/ggml-cuda/rwkv-wkv.cuh
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_WKV_BLOCK_SIZE 64
-
-void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cu b/ggml/src/ggml-cuda/wkv6.cu
similarity index 93%
rename from ggml/src/ggml-cuda/rwkv-wkv.cu
rename to ggml/src/ggml-cuda/wkv6.cu
index 098e92d35..42578341a 100644
--- a/ggml/src/ggml-cuda/rwkv-wkv.cu
+++ b/ggml/src/ggml-cuda/wkv6.cu
@@ -1,5 +1,5 @@
 #include "common.cuh"
-#include "rwkv-wkv.cuh"
+#include "wkv6.cuh"
 
 static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
     const int tid = threadIdx.x;
@@ -64,7 +64,7 @@ static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const
     }
 }
 
-void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const float * k_d  = (const float *)dst->src[0]->data;
     const float * v_d  = (const float *)dst->src[1]->data;
     const float * r_d  = (const float *)dst->src[2]->data;
@@ -83,7 +83,7 @@ void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
     GGML_ASSERT(C % H == 0);
-    GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE);
+    GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE); // The current cuda kernel is designed for RWKV6, HEAD_SIZE == 64
 
     rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
 }
diff --git a/ggml/src/ggml-cuda/wkv6.cuh b/ggml/src/ggml-cuda/wkv6.cuh
new file mode 100644
index 000000000..a7124ee51
--- /dev/null
+++ b/ggml/src/ggml-cuda/wkv6.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_WKV_BLOCK_SIZE 64
+
+void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index a62c67f4f..255bc64c6 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -1194,272 +1194,8 @@ typedef void (*ggml_sycl_op_mul_mat_t)(
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const queue_ptr &stream);
-typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const queue_ptr &main_stream);
 
-static __dpct_inline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
 
-static __dpct_inline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __dpct_inline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __dpct_inline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) /
-                   ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) %
-                   ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0;
-         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-static void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void gelu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f * xi *
-             (1.0f +
-              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
-}
-
-static void silu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
-}
-
-static void gelu_quick_f32(const float *x, float *dst, int k,
-                           const sycl::nd_item<3> &item_ct1) {
-    const float GELU_QUICK_COEF = -1.702f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
-}
-
-static void tanh_f32(const float *x, float *dst, int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::tanh((float)(x[i]));
-}
-
-static void relu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0);
-}
-
-static void hardsigmoid_f32(const float * x, float * dst, const int k,
-                            const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
-}
-
-static void hardswish_f32(const float * x, float * dst, const int k,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
-}
-
-static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
-             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
-}
-
-static void sqr_f32(const float * x, float * dst, const int k,
-                    const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-static void upscale_f32(const float  *x, float *dst, const int nb00, const int nb01,
-                        const int nb02, const int nb03, const int ne10, const int ne11,
-                        const int ne12, const int ne13, const float sf0, const float sf1,
-                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
-    int index = item_ct1.get_local_id(0) +
-               item_ct1.get_group(0) * item_ct1.get_local_range(0);
-    if (index >= ne10 * ne11 * ne12 * ne13) {
-        return;
-    }
-    // operation
-    int i10 = index % ne10;
-    int i11 = (index / ne10) % ne11;
-    int i12 = (index / (ne10 * ne11)) % ne12;
-    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
-
-    int i00 = i10 / sf0;
-    int i01 = i11 / sf1;
-    int i02 = i12 / sf2;
-    int i03 = i13 / sf3;
-
-    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
-}
-
-static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
-                    const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
-        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
-                         item_ct1.get_group(0) * ne00 * ne01;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
 
 template<int QUANT_BLOCK_TILE>
 static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
@@ -2148,297 +1884,6 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
     (void) dst;
 }
 
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_sycl {
-    template <typename src0_t, typename src1_t, typename dst_t>
-    void operator()(ggml_backend_sycl_context & ctx,
-                    const struct ggml_tensor *src0,
-                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
-                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
-                    queue_ptr stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] = std::min<unsigned int>(
-                ne1, block_size / (unsigned int)block_dims[2]);
-            block_dims[0] = std::min(
-                std::min<unsigned int>(
-                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
-                                   (unsigned int)block_dims[1]),
-                64U);
-
-            sycl::range<3> block_nums(
-                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                (ne1 + block_dims[1] - 1) / block_dims[1],
-                (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(),
-                                                 {sycl::aspect::fp16});
-
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
-                                              sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(
-                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
-                                s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:16: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(),
-                                             {sycl::aspect::fp16});
-
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
-                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
-        }
-    }
-};
-
-static void acc_f32_sycl(const float *x, const float *y, float *dst,
-                         const int n_elements, const int ne10, const int ne11,
-                         const int ne12, const int nb1, const int nb2,
-                         const int offset, queue_ptr stream) {
-    int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
-                    item_ct1);
-        });
-}
-
-static void gelu_f32_sycl(const float *x, float *dst, const int k,
-                          queue_ptr stream) {
-    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void silu_f32_sycl(const float *x, float *dst, const int k,
-                          queue_ptr stream) {
-    const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            silu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
-                                queue_ptr stream) {
-    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_quick_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void tanh_f32_sycl(const float *x, float *dst, const int k,
-                          queue_ptr stream) {
-    const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            tanh_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void relu_f32_sycl(const float *x, float *dst, const int k,
-                          queue_ptr stream) {
-    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            relu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
-                                 queue_ptr stream) {
-    const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            hardsigmoid_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void hardswish_f32_sycl(const float *x, float *dst, const int k,
-                               queue_ptr stream) {
-    const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            hardswish_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
-                                const float negative_slope,
-                                queue_ptr stream) {
-    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
-        });
-}
-
-static void sqr_f32_sycl(const float *x, float *dst, const int k,
-                         queue_ptr stream) {
-    const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            sqr_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
-                             const int nb02, const int nb03, const int ne10, const int ne11,
-                             const int ne12, const int ne13, const float sf0, const float sf1,
-                             const float sf2, const float sf3, queue_ptr stream) {
-    int dst_size = ne10 * ne11 * ne12 * ne13;
-    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<1> item_ct1) {
-            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
-        });
-}
-
-static void pad_f32_sycl(const float *x, float *dst, const int ne00,
-                         const int ne01, const int ne02, const int ne0,
-                         const int ne1, const int ne2, queue_ptr stream) {
-    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
-        });
-}
 
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
@@ -2816,6 +2261,58 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
     }
 }
 
+static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
+                               const int nrows, queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    const size_t shared_mem = 256 * sizeof(float);
+
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::local_accessor<float, 1> shared_data(
+            sycl::range<1>(shared_mem/sizeof(float)), cgh);
+        sycl::local_accessor<int, 1> shared_indices(
+            sycl::range<1>(shared_mem/sizeof(float)), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                const int tid = item_ct1.get_local_id(2);
+                const int row = item_ct1.get_global_id(1);
+
+                float max_val = -INFINITY;
+                int max_idx = -1;
+
+                for (int col = tid; col < ncols; col += 256) {
+                    float val = x[row * ncols + col];
+                    if (val > max_val) {
+                        max_val = val;
+                        max_idx = col;
+                    }
+                }
+
+                shared_data[tid] = max_val;
+                shared_indices[tid] = max_idx;
+                item_ct1.barrier(sycl::access::fence_space::local_space);
+
+                for (int stride = 256/2; stride > 0; stride >>= 1) {
+                    if (tid < stride) {
+                        float val1 = shared_data[tid];
+                        float val2 = shared_data[tid + stride];
+                        if (val2 > val1) {
+                            shared_data[tid] = val2;
+                            shared_indices[tid] = shared_indices[tid + stride];
+                        }
+                    }
+                    item_ct1.barrier(sycl::access::fence_space::local_space);
+                }
+
+
+                if (tid == 0) {
+                    dst[row] = shared_indices[0];
+                }
+            });
+    });
+}
 static void diag_mask_inf_f32_sycl(const float *x, float *dst,
                                    const int ncols_x, const int nrows_x,
                                    const int rows_per_channel, const int n_past,
@@ -2946,33 +2443,6 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
     }
 }
 
-template <class op>
-inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd,
-                                   const queue_ptr &main_stream) {
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
-             (sycl::half *)dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
-             main_stream);
-    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
-        op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
-             main_stream);
-    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
-        op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
-             main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
-    }
-}
 
 static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
@@ -2986,230 +2456,6 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens
     (void) src1_d;
 }
 
-inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    (void) dst;
-}
-
-inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                     const ggml_tensor *src1, ggml_tensor *dst,
-                                     const float *src0_dd, const float *src1_dd,
-                                     float *dst_dd,
-                                     const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const float sf0 = (float)dst->ne[0]/src0->ne[0];
-    const float sf1 = (float)dst->ne[1]/src0->ne[1];
-    const float sf2 = (float)dst->ne[2]/src0->ne[2];
-    const float sf3 = (float)dst->ne[3]/src0->ne[3];
-
-    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
-                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
-                     main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_sycl(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
 
 inline void ggml_sycl_op_mul_mat_sycl(
     ggml_backend_sycl_context & ctx,
@@ -3379,6 +2625,23 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
     (void) src1_dd;
 }
 
+inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const queue_ptr &main_stream) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne = ggml_nelements(src0);
+
+    sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                   const ggml_tensor *src1, ggml_tensor *dst,
                                   const float *src0_dd, const float *src1_dd,
@@ -3419,6 +2682,25 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten
     (void) src1_dd;
 }
 
+inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst, const float *src0_dd,
@@ -3489,46 +2771,6 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso
     (void) src1_dd;
 }
 
-static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_sycl_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    // dd = data device
-    float * src0_ddf = (float *) src0->data;
-    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
-    float *  dst_ddf = (float *) dst->data;
-
-    ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
-    ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
-    ggml_sycl_pool_alloc<float>  dst_f(ctx.pool());
-
-    ggml_sycl_set_device(ctx.device);
-    queue_ptr main_stream = ctx.stream();
-    // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
-        // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
-
-    // do the computation
-    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    // print_ggml_tensor("tensor", dst);
-}
-catch (sycl::exception const &exc) {
-
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     static bool peer_access_enabled = false;
 
@@ -3908,115 +3150,24 @@ static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, const ggml_tenso
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
-static void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
 static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_norm);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
-static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_group_norm);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
-
 static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
+static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_group_norm);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
 static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
@@ -4632,6 +3783,11 @@ static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_im2col);
 }
 
+static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum);
+}
+
 static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum_rows);
@@ -4642,6 +3798,11 @@ static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argsort);
 }
 
+static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argmax);
+}
+
 static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -4673,6 +3834,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
     ggml_sycl_func_t func;
 
     switch (tensor->op) {
+        case GGML_OP_ARGMAX:
+            func = ggml_sycl_argmax;
+            break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             func = ggml_sycl_op_conv_transpose_1d;
             break;
@@ -4686,19 +3850,32 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
             func = ggml_sycl_dup;
             break;
         case GGML_OP_ADD:
+        case GGML_OP_ADD1: // TODO: more efficient implementation
             func = ggml_sycl_add;
             break;
+        case GGML_OP_SUB:
+            func = ggml_sycl_sub;
+            break;
         case GGML_OP_ACC:
             func = ggml_sycl_acc;
             break;
         case GGML_OP_MUL:
             func = ggml_sycl_mul;
             break;
+        case GGML_OP_LOG:
+            func = ggml_sycl_log;
+            break;
         case GGML_OP_DIV:
             func = ggml_sycl_div;
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_NEG:
+                    func = ggml_sycl_neg;
+                    break;
+                case GGML_UNARY_OP_STEP:
+                    func = ggml_sycl_step;
+                    break;
                 case GGML_UNARY_OP_GELU:
                     func = ggml_sycl_gelu;
                     break;
@@ -4714,12 +3891,18 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
                 case GGML_UNARY_OP_RELU:
                     func = ggml_sycl_relu;
                     break;
+                case GGML_UNARY_OP_SIGMOID:
+                    func = ggml_sycl_sigmoid;
+                    break;
                 case GGML_UNARY_OP_HARDSIGMOID:
                     func = ggml_sycl_hardsigmoid;
                     break;
                 case GGML_UNARY_OP_HARDSWISH:
                     func = ggml_sycl_hardswish;
                     break;
+                case GGML_UNARY_OP_EXP:
+                    func = ggml_sycl_exp;
+                    break;
                 default:
                     return false;
             }
@@ -4757,12 +3940,24 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
             }
             func = ggml_sycl_mul_mat_id;
             break;
+        case GGML_OP_OUT_PROD:
+            func = ggml_sycl_op_out_prod;
+            break;
         case GGML_OP_SCALE:
             func = ggml_sycl_scale;
             break;
         case GGML_OP_SQR:
             func = ggml_sycl_sqr;
             break;
+        case GGML_OP_SQRT:
+            func = ggml_sycl_sqrt;
+            break;
+        case GGML_OP_SIN:
+            func = ggml_sycl_sin;
+            break;
+        case GGML_OP_COS:
+            func = ggml_sycl_cos;
+            break;
         case GGML_OP_CLAMP:
             func = ggml_sycl_clamp;
             break;
@@ -4794,6 +3989,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
         case GGML_OP_POOL_2D:
             func = ggml_sycl_pool2d;
             break;
+        case GGML_OP_SUM:
+            func = ggml_sycl_sum;
+            break;
         case GGML_OP_SUM_ROWS:
             func = ggml_sycl_sum_rows;
             break;
@@ -4803,6 +4001,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
         case GGML_OP_TIMESTEP_EMBEDDING:
             func = ggml_sycl_op_timestep_embedding;
             break;
+        case GGML_OP_RWKV_WKV6:
+            func = ggml_sycl_op_rwkv_wkv6;
+            break;
         default:
             return false;
     }
@@ -5125,13 +4326,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             } break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_EXP:
                     return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
@@ -5168,6 +4373,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 }
                 return true;
             } break;
+        case GGML_OP_OUT_PROD:
+            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
             {
                 switch (op->src[0]->type) {
@@ -5213,10 +4420,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CONCAT:
             {
                 ggml_type src0_type = op->src[0]->type;
-                int dim = op->op_params[0];
-                return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16 && dim == 2;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
             } break;
         case GGML_OP_DUP:
+        case GGML_OP_ARGMAX:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_REPEAT:
@@ -5225,11 +4432,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_LOG:
+        case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
         case GGML_OP_CLAMP:
             return true;
         case GGML_OP_CONT:
@@ -5243,6 +4456,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
+        case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
@@ -5251,6 +4465,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_PAD:
         case GGML_OP_LEAKY_RELU:
         case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_RWKV_WKV6:
             return true;
         default:
             return false;
@@ -5268,9 +4483,23 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_
     return buft_ctx->device == sycl_ctx->device;
 }
 
+static int64_t get_op_batch_size(const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_GET_ROWS:
+            return op->ne[1]; // this will increse the speed of prefill in test
+        case GGML_OP_MUL_MAT:
+            return op->ne[1];
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_ROPE:
+            return op->ne[2];
+        default:
+            return ggml_nrows(op);
+    }
+}
+
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     const int min_batch_size = 32;
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
+    return get_op_batch_size(op) >= min_batch_size;
     GGML_UNUSED(dev);
 }
 
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index d21b5f8dd..85748a5b4 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -26,5 +26,8 @@
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #include "im2col.hpp"
+#include "wkv6.hpp"
+#include "outprod.hpp"
+#include "element_wise.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index cf5291b31..97ab2003c 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -62,3 +62,43 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
   }
   return sycl_down_blk_size;
 }
+
+void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_sycl_op_flatten_t op) try {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+
+    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+
+    // dd = data device
+    float * src0_ddf = (float *) src0->data;
+    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
+    float *  dst_ddf = (float *) dst->data;
+
+    ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
+    ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
+    ggml_sycl_pool_alloc<float>  dst_f(ctx.pool());
+
+    ggml_sycl_set_device(ctx.device);
+    queue_ptr main_stream = ctx.stream();
+    // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
+        // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
+
+    // do the computation
+    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    // print_ggml_tensor("tensor", dst);
+}
+catch (sycl::exception const &exc) {
+
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index bc0faa867..4549fa5e9 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -404,4 +404,262 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
 
 int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
 
+typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const queue_ptr &main_stream);
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1));
+    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) /
+                   ne3;
+    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) %
+                   ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0;
+         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_sycl {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(ggml_backend_sycl_context & ctx,
+                    const struct ggml_tensor *src0,
+                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
+                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
+                    queue_ptr stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] = std::min<unsigned int>(
+                ne1, block_size / (unsigned int)block_dims[2]);
+            block_dims[0] = std::min(
+                std::min<unsigned int>(
+                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
+                                   (unsigned int)block_dims[1]),
+                64U);
+
+            sycl::range<3> block_nums(
+                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                (ne1 + block_dims[1] - 1) / block_dims[1],
+                (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(),
+                                                 {sycl::aspect::fp16});
+
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(
+                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
+                                s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:16: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(),
+                                             {sycl::aspect::fp16});
+
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+                                            ne2, ne3, ne10, ne11, ne12, ne13,
+                                            s1, s2, s3, s11, s12, s13,
+                                            item_ct1);
+                    });
+            }
+        }
+    }
+};
+
+template <class op>
+inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd,
+                                   const queue_ptr &main_stream) {
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
+             (sycl::half *)dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
+             main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+
+void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_sycl_op_flatten_t op);
+
 #endif // GGML_SYCL_COMMON_HPP
diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp
index 632eedb9d..c90c452d8 100644
--- a/ggml/src/ggml-sycl/concat.cpp
+++ b/ggml/src/ggml-sycl/concat.cpp
@@ -106,6 +106,7 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
           concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
         });
     break;
+  // dim >=2 will be dispatched to the default path
   default:
     stream->parallel_for(
         sycl::nd_range<3>(gridDim *
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
new file mode 100644
index 000000000..e5cd736eb
--- /dev/null
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -0,0 +1,1011 @@
+#include "common.hpp"
+#include "element_wise.hpp"
+
+void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+void gelu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f * xi *
+             (1.0f +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+}
+
+void silu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+}
+
+void gelu_quick_f32(const float *x, float *dst, int k,
+                           const sycl::nd_item<3> &item_ct1) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+}
+
+void tanh_f32(const float *x, float *dst, int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::tanh((float)(x[i]));
+}
+
+void relu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0);
+}
+
+void sigmoid_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
+}
+
+void sqrt_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::sqrt(x[i]);
+}
+
+void sin_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::sin(x[i]);
+}
+
+void cos_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::cos(x[i]);
+}
+
+void hardsigmoid_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+void hardswish_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+void exp_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::exp(x[i]);
+}
+
+void log_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    float xi = x[i];
+    if (xi <= 0) {
+        dst[i] = -INFINITY;
+    } else {
+        dst[i] = sycl::log(xi);
+    }
+}
+
+void neg_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = -x[i];
+}
+
+void step_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] > 0.0f;
+}
+
+void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
+             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+}
+
+void sqr_f32(const float * x, float * dst, const int k,
+                    const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
+void upscale_f32(const float  *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+               item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
+        return;
+    }
+    // operation
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+}
+
+void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+                    const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
+        item_ct1.get_group(0) < ne02) {
+        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                         item_ct1.get_group(0) * ne00 * ne01;
+            dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+
+
+void acc_f32_sycl(const float *x, const float *y, float *dst,
+                         const int n_elements, const int ne10, const int ne11,
+                         const int ne12, const int nb1, const int nb2,
+                         const int offset, queue_ptr stream) {
+    int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+                    item_ct1);
+        });
+}
+
+void gelu_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_f32(x, dst, k, item_ct1);
+        });
+}
+
+void silu_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            silu_f32(x, dst, k, item_ct1);
+        });
+}
+
+void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
+                                queue_ptr stream) {
+    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_quick_f32(x, dst, k, item_ct1);
+        });
+}
+
+void tanh_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            tanh_f32(x, dst, k, item_ct1);
+        });
+}
+
+void relu_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            relu_f32(x, dst, k, item_ct1);
+        });
+}
+
+void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
+                                 queue_ptr stream) {
+    const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            hardsigmoid_f32(x, dst, k, item_ct1);
+        });
+}
+
+void hardswish_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            hardswish_f32(x, dst, k, item_ct1);
+        });
+}
+
+void exp_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            exp_f32(x, dst, k, item_ct1);
+        });
+}
+
+void log_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            log_f32(x, dst, k, item_ct1);
+        });
+}
+
+void neg_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            neg_f32(x, dst, k, item_ct1);
+        });
+}
+
+void step_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            step_f32(x, dst, k, item_ct1);
+        });
+}
+
+void sigmoid_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sigmoid_f32(x, dst, k, item_ct1);
+        });
+}
+
+void sqrt_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqrt_f32(x, dst, k, item_ct1);
+        });
+}
+
+void sin_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sin_f32(x, dst, k, item_ct1);
+        });
+}
+
+void cos_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            cos_f32(x, dst, k, item_ct1);
+        });
+}
+
+void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
+                                const float negative_slope,
+                                queue_ptr stream) {
+    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+        });
+}
+
+void sqr_f32_sycl(const float *x, float *dst, const int k,
+                         queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqr_f32(x, dst, k, item_ct1);
+        });
+}
+
+void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
+        });
+}
+
+void pad_f32_sycl(const float *x, float *dst, const int ne00,
+                         const int ne01, const int ne02, const int ne0,
+                         const int ne1, const int ne2, queue_ptr stream) {
+    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+        });
+}
+
+inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1, ggml_tensor *dst,
+                                     const float *src0_dd, const float *src1_dd,
+                                     float *dst_dd,
+                                     const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
+
+    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                     main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_sycl(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+
+void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqrt);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sin);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_cos);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sigmoid);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+
+void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_exp);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_log);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_neg);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_step);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sub);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
new file mode 100644
index 000000000..8152edf58
--- /dev/null
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -0,0 +1,76 @@
+#ifndef GGML_SYCL_ELEMENTWISE_HPP
+#define GGML_SYCL_ELEMENTWISE_HPP
+
+#include "common.hpp"
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+
+void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ELEMENTWISE_HPP
diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp
new file mode 100644
index 000000000..c2779df0e
--- /dev/null
+++ b/ggml/src/ggml-sycl/outprod.cpp
@@ -0,0 +1,55 @@
+#include <sycl/sycl.hpp>
+#include "outprod.hpp"
+
+
+void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // Get SYCL queue
+    dpct::queue_ptr stream = ctx.stream();
+
+    // Dimension checks
+    GGML_ASSERT(ne01 == ne11);  // Inner dimensions must match
+    GGML_ASSERT(ne0 == ne00);   // Output rows match src0 rows
+    GGML_ASSERT(ne1 == ne10);   // Output cols match src1 cols
+
+    // Get data pointers
+    const float* src0_d = (const float*)src0->data;
+    const float* src1_d = (const float*)src1->data;
+    float* dst_d = (float*)dst->data;
+
+    // GEMM parameters
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+
+    // Handle transposition of src1
+    const bool src1_T = ggml_is_transposed(src1);
+    const oneapi::mkl::transpose src1_op =
+        src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
+    const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
+
+    try {
+        // Perform matrix multiplication using oneMKL GEMM
+        oneapi::mkl::blas::gemm(*stream,
+            oneapi::mkl::transpose::nontrans, src1_op,
+            ne0, ne1, ne01,
+            alpha,
+            src0_d, ne00,
+            src1_d, ldb,
+            beta,
+            dst_d, ne0);
+    }
+    catch (sycl::exception const& exc) {
+        std::cerr << exc.what() << std::endl;
+        GGML_ASSERT(false);
+    }
+}
diff --git a/ggml/src/ggml-sycl/outprod.hpp b/ggml/src/ggml-sycl/outprod.hpp
new file mode 100644
index 000000000..9c042738a
--- /dev/null
+++ b/ggml/src/ggml-sycl/outprod.hpp
@@ -0,0 +1,11 @@
+#ifndef GGML_SYCL_OUTPROD_HPP
+#define GGML_SYCL_OUTPROD_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst);
+
+
+#endif // GGML_SYCL_OUTPROD_HPP
+
diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp
index 340ab8e93..af1890727 100644
--- a/ggml/src/ggml-sycl/presets.hpp
+++ b/ggml/src/ggml-sycl/presets.hpp
@@ -25,6 +25,11 @@
 #define SYCL_RELU_BLOCK_SIZE 256
 #define SYCL_HARDSIGMOID_BLOCK_SIZE 256
 #define SYCL_HARDSWISH_BLOCK_SIZE 256
+#define SYCL_EXP_BLOCK_SIZE 256
+#define SYCL_NEG_BLOCK_SIZE 256
+#define SYCL_SIGMOID_BLOCK_SIZE 256
+#define SYCL_SQRT_BLOCK_SIZE 256
+#define SYCL_SIN_BLOCK_SIZE 256
 #define SYCL_SQR_BLOCK_SIZE 256
 #define SYCL_CPY_BLOCK_SIZE 32
 #define SYCL_SCALE_BLOCK_SIZE 256
@@ -41,6 +46,7 @@
 #define SYCL_ACC_BLOCK_SIZE 256
 #define SYCL_IM2COL_BLOCK_SIZE 256
 #define SYCL_POOL2D_BLOCK_SIZE 256
+#define SYCL_ARGMAX_BLOCK_SIZE 256
 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
 #define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 
diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp
new file mode 100644
index 000000000..4c737f4bf
--- /dev/null
+++ b/ggml/src/ggml-sycl/wkv6.cpp
@@ -0,0 +1,138 @@
+#include <sycl/sycl.hpp>
+#include "wkv6.hpp"
+
+constexpr int WKV_BLOCK_SIZE = 64;  // Matching CUDA_WKV_BLOCK_SIZE
+
+// Helper function for the main kernel
+static void rwkv_wkv_f32_kernel(
+    const int B, const int T, const int C, const int H,
+    const float* k, const float* v, const float* r,
+    const float* tf, const float* td, const float* s,
+    float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) {
+
+    const int tid = item_ct1.get_local_id(2);
+    const int bid = item_ct1.get_group(2);
+
+    const int head_size = WKV_BLOCK_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    // Set up shared memory pointers
+    float* _k = shared_mem;
+    float* _r = _k + head_size;
+    float* _tf = _r + head_size;
+    float* _td = _tf + head_size;
+
+    // Local state array
+    float state[WKV_BLOCK_SIZE];
+
+    // Load initial state
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    // Sync threads before shared memory operations
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Load time-mixing parameters
+    _tf[tid] = tf[head_i * head_size + tid];
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Main sequence processing loop
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
+         t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid;
+         t += C) {
+
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        // Load current timestep data to shared memory
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        const float _v = v[t];
+        float y = 0;
+
+        // Process in chunks of 4 for better vectorization
+        sycl::float4 k4, r4, tf4, td4, s4, kv4;
+        #pragma unroll
+        for (int j = 0; j < head_size; j += 4) {
+            // Load data in vec4 chunks
+            k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
+            r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
+            tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
+            td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
+            s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]);
+
+            // Compute key-value product
+            sycl::float4 kv4 = k4 * _v;
+
+            // Accumulate weighted sum
+            y += sycl::dot(r4, tf4 * kv4 + s4);
+
+            // Update state
+            s4 = s4 * td4 + kv4;
+
+            // Store updated state
+            state[j] = s4.x();
+            state[j+1] = s4.y();
+            state[j+2] = s4.z();
+            state[j+3] = s4.w();
+        }
+
+        dst[t] = y;
+    }
+
+    // Save final state
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+
+    const float* k_d = (const float*)dst->src[0]->data;
+    const float* v_d = (const float*)dst->src[1]->data;
+    const float* r_d = (const float*)dst->src[2]->data;
+    const float* tf_d = (const float*)dst->src[3]->data;
+    const float* td_d = (const float*)dst->src[4]->data;
+    const float* s_d = (const float*)dst->src[5]->data;
+    float* dst_d = (float*)dst->data;
+
+    const int64_t B = dst->src[5]->ne[1];
+    const int64_t T = dst->src[0]->ne[3];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[2];
+
+    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64
+
+    dpct::queue_ptr stream = ctx.stream();
+
+    // Calculate execution configuration
+    const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td
+    sycl::range<3> block_dims(1, 1, C / H);
+    sycl::range<3> grid_dims(1, 1, B * H);
+
+    // Submit kernel
+    stream->submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rwkv_wkv_f32_kernel(
+                    B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
+                    item_ct1, shared_mem_acc.get_pointer()
+                );
+            });
+    });
+}
diff --git a/ggml/src/ggml-sycl/wkv6.hpp b/ggml/src/ggml-sycl/wkv6.hpp
new file mode 100644
index 000000000..ddfa3377b
--- /dev/null
+++ b/ggml/src/ggml-sycl/wkv6.hpp
@@ -0,0 +1,10 @@
+#ifndef GGML_SYCL_WKV6_HPP
+#define GGML_SYCL_WKV6_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+    const ggml_tensor *src1, ggml_tensor * dst);
+
+
+#endif // GGML_SYCL_WKV6_HPP
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 266a0d6f0..bc034015f 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -975,7 +975,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "WIN_UNPART",
     "GET_REL_POS",
     "ADD_REL_POS",
-    "RWKV_WKV",
+    "RWKV_WKV6",
 
     "UNARY",
 
@@ -1070,7 +1070,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "win_unpart(x)",
     "get_rel_pos(x)",
     "add_rel_pos(x)",
-    "rwkv_wkv(k, v, r, tf, td, s)",
+    "rwkv_wkv6(k, v, r, tf, td, s)",
 
     "unary(x)",
 
@@ -4503,9 +4503,9 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
     return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
 }
 
-// ggml_rwkv_wkv
+// ggml_rwkv_wkv6
 
-struct ggml_tensor * ggml_rwkv_wkv(
+struct ggml_tensor * ggml_rwkv_wkv6(
         struct ggml_context * ctx,
         struct ggml_tensor  * k,
         struct ggml_tensor  * v,
@@ -4537,7 +4537,7 @@ struct ggml_tensor * ggml_rwkv_wkv(
     const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    result->op     = GGML_OP_RWKV_WKV;
+    result->op     = GGML_OP_RWKV_WKV6;
     result->src[0] = k;
     result->src[1] = v;
     result->src[2] = r;
@@ -6084,7 +6084,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_GET_REL_POS:
         case GGML_OP_ADD_REL_POS:
-        case GGML_OP_RWKV_WKV:
+        case GGML_OP_RWKV_WKV6:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
diff --git a/src/llama.cpp b/src/llama.cpp
index 6719edb38..034441e1f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7011,7 +7011,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_TIME_MIX_LERP_R,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_G,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_FIRST,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV}},
+    {LLM_TENSOR_TIME_MIX_FIRST,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_NORM_2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_OUT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -7127,7 +7127,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
                 op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
             } break;
-        case GGML_OP_RWKV_WKV:
+        case GGML_OP_RWKV_WKV6:
             {
                 // FIXME
                 const int64_t S = 123;
@@ -7140,7 +7140,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor  * tf = w;
                 ggml_tensor  * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
                 ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
-                op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
+                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
             } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
@@ -10083,7 +10083,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
     v = ggml_transpose(ctx, v);
     r = ggml_transpose(ctx, r);
 
-    struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+    struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
     cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
     *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6cc77edab..9d48a2717 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1614,8 +1614,8 @@ struct test_ssm_scan : public test_case {
     }
 };
 
-// GGML_OP_RWKV_WKV
-struct test_rwkv_wkv : public test_case {
+// GGML_OP_RWKV_WKV6
+struct test_rwkv_wkv6 : public test_case {
     const ggml_type type;
 
     const int64_t head_count;
@@ -1627,7 +1627,7 @@ struct test_rwkv_wkv : public test_case {
         return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
     }
 
-    test_rwkv_wkv(ggml_type type = GGML_TYPE_F32,
+    test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
             int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
         : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
 
@@ -1639,7 +1639,7 @@ struct test_rwkv_wkv : public test_case {
         ggml_tensor * tf  = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
         ggml_tensor * td  = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
         ggml_tensor * s   = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
-        ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s);
+        ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
         return out;
     }
 };
@@ -3499,10 +3499,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
 
-    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1));
-    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1));
-    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4));
-    test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
 
 #if 1
     for (ggml_type type_a : base_types) {

From 2319126a70b541f8670225a04a38202bbdccbedb Mon Sep 17 00:00:00 2001
From: snadampal <87143774+snadampal@users.noreply.github.com>
Date: Thu, 7 Nov 2024 02:02:08 -0600
Subject: [PATCH 172/396] fix q4_0_8_8 format for corrupted tokens issue
 (#10198)

Co-authored-by: EC2 Default User <ec2-user@ip-172-31-62-167.us-west-2.compute.internal>
---
 ggml/src/ggml-cpu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c
index 98c3e21ae..de1de18ec 100644
--- a/ggml/src/ggml-cpu.c
+++ b/ggml/src/ggml-cpu.c
@@ -409,6 +409,8 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .gemm                     = ggml_gemm_q4_0_4x8_q8_0,
     },
     [GGML_TYPE_Q4_0_8_8] = {
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
         .ncols                    = 8,
         .gemv                     = ggml_gemv_q4_0_8x8_q8_0,

From 5107e8cea35be46a27cfc940e6841c0cf81c0525 Mon Sep 17 00:00:00 2001
From: wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
Date: Thu, 7 Nov 2024 08:20:25 -0700
Subject: [PATCH 173/396] DRY: Fixes clone functionality (#10192)

---
 src/llama-sampling.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c2cfe0a77..fd8ca8a9e 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1876,8 +1876,11 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_dry *) smpl->ctx;
 
-    // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
-    auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+    llama_vocab dummy_vocab;
+
+    // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
+    auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+
     // Copy the state, including the processed breakers
     {
         auto * result_ctx = (llama_sampler_dry *) result->ctx;

From 60e17ce23c2740369af6304113a2dfa0454eaf26 Mon Sep 17 00:00:00 2001
From: Faisal Zaghloul <quic_fzaghlou@quicinc.com>
Date: Thu, 7 Nov 2024 11:46:12 -0500
Subject: [PATCH 174/396] Remove identical wte/etw logic for jais (#10203)

---
 convert_hf_to_gguf.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 76ee6cef5..39afa5ef4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3748,10 +3748,7 @@ class JaisModel(Model):
 
         # Embeddings scale
         self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
         if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True   # Hack (?)
             self.embeddings_scale = self.hparams['mup_embeddings_scale']
         elif 'embeddings_scale' in self.hparams:
             self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3808,10 +3805,7 @@ class JaisModel(Model):
 
         if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
             tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
         elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
             tensors.append((new_name, data_torch * self.width_scale))
         else:
             tensors.append((new_name, data_torch))

From 97404c4a0374cac45c8c34a32d13819de1dd023d Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 7 Nov 2024 18:16:08 +0100
Subject: [PATCH 175/396] ggml : add ggml-cpu.h to the public headers (#10204)

---
 ggml/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index cfa6e3f70..6866a25d3 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -218,12 +218,12 @@ include(CMakePackageConfigHelpers)
 # all public headers
 set(GGML_PUBLIC_HEADERS
     include/ggml.h
+    include/ggml-cpu.h
     include/ggml-alloc.h
     include/ggml-backend.h
     include/ggml-blas.h
     include/ggml-cann.h
     include/ggml-cuda.h
-    include/ggml.h
     include/ggml-kompute.h
     include/ggml-metal.h
     include/ggml-rpc.h

From a2c6fd747c77fe183e2f556a4a2f1fb0a0be4c7b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 7 Nov 2024 23:07:55 +0200
Subject: [PATCH 176/396] scripts : sync update

---
 scripts/sync-ggml-am.sh | 88 ++++++++++-------------------------------
 scripts/sync-ggml.sh    | 46 +++++----------------
 2 files changed, 30 insertions(+), 104 deletions(-)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index fba29b935..06a04745b 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -114,46 +114,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
 
     # replace filenames:
     #
-    # CMakelists.txt          -> ggml/CMakeLists.txt
-    # src/CMakeLists.txt      -> ggml/src/CMakeLists.txt
-    # cmake/FindSIMD.cmake    -> ggml/cmake/FindSIMD.cmake
+    # CMakelists.txt       -> ggml/CMakeLists.txt
+    # src/CMakeLists.txt   -> ggml/src/CMakeLists.txt
+    # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
     #
-    # src/ggml.c              -> ggml/src/ggml.c
-    # src/ggml-aarch64.c      -> ggml/src/ggml-aarch64.c
-    # src/ggml-aarch64.h      -> ggml/src/ggml-aarch64.h
-    # src/ggml-alloc.c        -> ggml/src/ggml-alloc.c
-    # src/ggml-amx/*          -> ggml/src/ggml-amx/
-    # src/ggml-amx.cpp        -> ggml/src/ggml-amx.cpp
-    # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
-    # src/ggml-backend.cpp    -> ggml/src/ggml-backend.cpp
-    # src/ggml-cann/*         -> ggml/src/ggml-cann/
-    # src/ggml-cann.cpp       -> ggml/src/ggml-cann.cpp
-    # src/ggml-common.h       -> ggml/src/ggml-common.h
-    # src/ggml-cuda/*         -> ggml/src/ggml-cuda/
-    # src/ggml-cuda.cu        -> ggml/src/ggml-cuda.cu
-    # src/ggml-impl.h         -> ggml/src/ggml-impl.h
-    # src/ggml-kompute.cpp    -> ggml/src/ggml-kompute.cpp
-    # src/ggml-metal.m        -> ggml/src/ggml-metal.m
-    # src/ggml-quants.c       -> ggml/src/ggml-quants.c
-    # src/ggml-quants.h       -> ggml/src/ggml-quants.h
-    # src/ggml-rpc.cpp        -> ggml/src/ggml-rpc.cpp
-    # src/ggml-sycl/*         -> ggml/src/ggml-sycl/
-    # src/ggml-sycl.cpp       -> ggml/src/ggml-sycl.cpp
-    # src/ggml-vulkan.cpp     -> ggml/src/ggml-vulkan.cpp
-    # src/vulkan-shaders/*    -> ggml/src/vulkan-shaders/
+    # src/ggml*.c          -> ggml/src/ggml*.c
+    # src/ggml*.cpp        -> ggml/src/ggml*.cpp
+    # src/ggml*.h          -> ggml/src/ggml*.h
+    # src/ggml*.cu         -> ggml/src/ggml*.cu
+    # src/ggml*.m          -> ggml/src/ggml*.m
+    # src/ggml-amx/*       -> ggml/src/ggml-amx/
+    # src/ggml-cann/*      -> ggml/src/ggml-cann/
+    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/
+    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/
+    # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
     #
-    # include/ggml.h         -> ggml/include/ggml.h
-    # include/ggml-alloc.h   -> ggml/include/ggml-alloc.h
-    # include/ggml-amx.h     -> ggml/include/ggml-amx.h
-    # include/ggml-backend.h -> ggml/include/ggml-backend.h
-    # include/ggml-blas.h    -> ggml/include/ggml-blas.h
-    # include/ggml-cann.h    -> ggml/include/ggml-cann.h
-    # include/ggml-cuda.h    -> ggml/include/ggml-cuda.h
-    # include/ggml-kompute.h -> ggml/include/ggml-kompute.h
-    # include/ggml-metal.h   -> ggml/include/ggml-metal.h
-    # include/ggml-rpc.h     -> ggml/include/ggml-rpc.h
-    # include/ggml-sycl.h    -> ggml/include/ggml-sycl.h
-    # include/ggml-vulkan.h  -> ggml/include/ggml-vulkan.h
+    # include/ggml*.h -> ggml/include/ggml*.h
     #
     # tests/test-opt.cpp           -> tests/test-opt.cpp
     # tests/test-grad0.cpp         -> tests/test-grad0.cpp
@@ -168,41 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
         -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
         -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \
         -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
         -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index f5d87324a..8192a8673 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -4,43 +4,17 @@ cp -rpv ../ggml/CMakeLists.txt       ./ggml/CMakeLists.txt
 cp -rpv ../ggml/src/CMakeLists.txt   ./ggml/src/CMakeLists.txt
 cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 
-cp -rpv ../ggml/src/ggml.c              ./ggml/src/ggml.c
-cp -rpv ../ggml/src/ggml-aarch64.c      ./ggml/src/ggml-aarch64.c
-cp -rpv ../ggml/src/ggml-aarch64.h      ./ggml/src/ggml-aarch64.h
-cp -rpv ../ggml/src/ggml-alloc.c        ./ggml/src/ggml-alloc.c
-cp -rpv ../ggml/src/ggml-amx/*          ./ggml/src/ggml-amx/
-cp -rpv ../ggml/src/ggml-amx.cpp        ./ggml/src/ggml-amx.cpp
-cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
-cp -rpv ../ggml/src/ggml-backend.cpp    ./ggml/src/ggml-backend.cpp
-cp -rpv ../ggml/src/ggml-cann/*         ./ggml/src/ggml-cann/
-cp -rpv ../ggml/src/ggml-cann.cpp       ./ggml/src/ggml-cann.cpp
-cp -rpv ../ggml/src/ggml-common.h       ./ggml/src/ggml-common.h
-cp -rpv ../ggml/src/ggml-cuda/*         ./ggml/src/ggml-cuda/
-cp -rpv ../ggml/src/ggml-cuda.cu        ./ggml/src/ggml-cuda.cu
-cp -rpv ../ggml/src/ggml-impl.h         ./ggml/src/ggml-impl.h
-cp -rpv ../ggml/src/ggml-kompute.cpp    ./ggml/src/ggml-kompute.cpp
-cp -rpv ../ggml/src/ggml-metal.m        ./ggml/src/ggml-metal.m
-cp -rpv ../ggml/src/ggml-metal.metal    ./ggml/src/ggml-metal.metal
-cp -rpv ../ggml/src/ggml-quants.c       ./ggml/src/ggml-quants.c
-cp -rpv ../ggml/src/ggml-quants.h       ./ggml/src/ggml-quants.h
-cp -rpv ../ggml/src/ggml-rpc.cpp        ./ggml/src/ggml-rpc.cpp
-cp -rpv ../ggml/src/ggml-sycl/*         ./ggml/src/ggml-sycl/
-cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml/src/ggml-sycl.cpp
-cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml/src/ggml-vulkan.cpp
-cp -rpv ../ggml/src/vulkan-shaders/*    ./ggml/src/vulkan-shaders/
+cp -rpv ../ggml/src/ggml*.c          ./ggml/src/
+cp -rpv ../ggml/src/ggml*.cpp        ./ggml/src/
+cp -rpv ../ggml/src/ggml*.h          ./ggml/src/
+cp -rpv ../ggml/src/ggml*.cu         ./ggml/src/
+cp -rpv ../ggml/src/ggml*.m          ./ggml/src/
+cp -rpv ../ggml/src/ggml-cann/*      ./ggml/src/ggml-cann/
+cp -rpv ../ggml/src/ggml-cuda/*      ./ggml/src/ggml-cuda/
+cp -rpv ../ggml/src/ggml-sycl/*      ./ggml/src/ggml-sycl/
+cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
 
-cp -rpv ../ggml/include/ggml.h         ./ggml/include/ggml.h
-cp -rpv ../ggml/include/ggml-alloc.h   ./ggml/include/ggml-alloc.h
-cp -rpv ../ggml/include/ggml-amx.h     ./ggml/include/ggml-amx.h
-cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
-cp -rpv ../ggml/include/ggml-blas.h    ./ggml/include/ggml-blas.h
-cp -rpv ../ggml/include/ggml-cann.h    ./ggml/include/ggml-cann.h
-cp -rpv ../ggml/include/ggml-cuda.h    ./ggml/include/ggml-cuda.h
-cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
-cp -rpv ../ggml/include/ggml-metal.h   ./ggml/include/ggml-metal.h
-cp -rpv ../ggml/include/ggml-rpc.h     ./ggml/include/ggml-rpc.h
-cp -rpv ../ggml/include/ggml-sycl.h    ./ggml/include/ggml-sycl.h
-cp -rpv ../ggml/include/ggml-vulkan.h  ./ggml/include/ggml-vulkan.h
+cp -rpv ../ggml/include/ggml*.h ./ggml/include/
 
 cp -rpv ../ggml/tests/test-opt.cpp           ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp         ./tests/test-grad0.cpp

From 3b08828674f561c78af182d47fc0636fc3ccd1e9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 7 Nov 2024 23:08:24 +0200
Subject: [PATCH 177/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 020c60f34..e82984f49 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-a099cb514d6687e436a5a423d1fb0448be0feb20
+89952d649e0c5cabbb9ff8c4906f5a843a789fb2

From eec4d71737b32f312e0082b671629a0368e1a20d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 7 Nov 2024 23:11:36 +0200
Subject: [PATCH 178/396] scripts : add amx to sync-ggml.sh [no ci]

---
 scripts/sync-ggml.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 8192a8673..f29554c82 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -9,6 +9,7 @@ cp -rpv ../ggml/src/ggml*.cpp        ./ggml/src/
 cp -rpv ../ggml/src/ggml*.h          ./ggml/src/
 cp -rpv ../ggml/src/ggml*.cu         ./ggml/src/
 cp -rpv ../ggml/src/ggml*.m          ./ggml/src/
+cp -rpv ../ggml/src/ggml-amx/*       ./ggml/src/ggml-amx/
 cp -rpv ../ggml/src/ggml-cann/*      ./ggml/src/ggml-cann/
 cp -rpv ../ggml/src/ggml-cuda/*      ./ggml/src/ggml-cuda/
 cp -rpv ../ggml/src/ggml-sycl/*      ./ggml/src/ggml-sycl/

From a71d81cf8c1afb26b166f897c94ee1581f9fac7d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 7 Nov 2024 17:31:10 -0400
Subject: [PATCH 179/396] server : revamp chat UI with vuejs and daisyui
 (#10175)

* server : simple chat UI with vuejs and daisyui

* move old files to legacy folder

* embed deps into binary

* basic markdown support

* add conversation history, save to localStorage

* fix bg-base classes

* save theme preferences

* fix tests

* regenerate, edit, copy buttons

* small fixes

* docs: how to use legacy ui

* better error handling

* make CORS preflight more explicit

* add GET method for CORS

* fix tests

* clean up a bit

* better auto scroll

* small fixes

* use collapse-arrow

* fix closeAndSaveConfigDialog

* small fix

* remove console.log

* fix style for <pre> element

* lighter bubble color (less distract when reading)
---
 .editorconfig                                 |    10 +
 Makefile                                      |    17 +-
 examples/server/CMakeLists.txt                |    17 +-
 examples/server/README.md                     |    10 +
 examples/server/chat.mjs                      |     2 +-
 examples/server/deps.sh                       |    19 +-
 examples/server/public/completion.js          |    29 +-
 examples/server/public/deps_daisyui.min.css   |    13 +
 examples/server/public/deps_markdown-it.js    |  8442 +++++++
 examples/server/public/deps_tailwindcss.js    |    82 +
 .../server/public/deps_vue.esm-browser.js     | 18160 ++++++++++++++++
 examples/server/public/index.html             |  1851 +-
 .../{public => public_legacy}/colorthemes.css |     0
 examples/server/public_legacy/completion.js   |   209 +
 .../{public => public_legacy}/favicon.ico     |   Bin
 .../{public => public_legacy}/index-new.html  |     0
 examples/server/public_legacy/index.html      |  1303 ++
 .../server/{public => public_legacy}/index.js |     0
 .../json-schema-to-grammar.mjs                |     0
 examples/server/public_legacy/loading.html    |    12 +
 .../prompt-formats.js                         |     0
 .../{public => public_legacy}/style.css       |     0
 .../system-prompts.js                         |     0
 .../theme-beeninorder.css                     |     0
 .../theme-ketivah.css                         |     0
 .../theme-mangotango.css                      |     0
 .../theme-playground.css                      |     0
 .../theme-polarnight.css                      |     0
 .../theme-snowstorm.css                       |     0
 examples/server/server.cpp                    |    71 +-
 .../server/tests/features/security.feature    |     2 +-
 grammars/README.md                            |     2 +-
 tests/run-json-schema-to-grammar.mjs          |     2 +-
 33 files changed, 28884 insertions(+), 1369 deletions(-)
 create mode 100644 examples/server/public/deps_daisyui.min.css
 create mode 100644 examples/server/public/deps_markdown-it.js
 create mode 100644 examples/server/public/deps_tailwindcss.js
 create mode 100644 examples/server/public/deps_vue.esm-browser.js
 rename examples/server/{public => public_legacy}/colorthemes.css (100%)
 create mode 100644 examples/server/public_legacy/completion.js
 rename examples/server/{public => public_legacy}/favicon.ico (100%)
 rename examples/server/{public => public_legacy}/index-new.html (100%)
 create mode 100644 examples/server/public_legacy/index.html
 rename examples/server/{public => public_legacy}/index.js (100%)
 rename examples/server/{public => public_legacy}/json-schema-to-grammar.mjs (100%)
 create mode 100644 examples/server/public_legacy/loading.html
 rename examples/server/{public => public_legacy}/prompt-formats.js (100%)
 rename examples/server/{public => public_legacy}/style.css (100%)
 rename examples/server/{public => public_legacy}/system-prompts.js (100%)
 rename examples/server/{public => public_legacy}/theme-beeninorder.css (100%)
 rename examples/server/{public => public_legacy}/theme-ketivah.css (100%)
 rename examples/server/{public => public_legacy}/theme-mangotango.css (100%)
 rename examples/server/{public => public_legacy}/theme-playground.css (100%)
 rename examples/server/{public => public_legacy}/theme-polarnight.css (100%)
 rename examples/server/{public => public_legacy}/theme-snowstorm.css (100%)

diff --git a/.editorconfig b/.editorconfig
index f88f8da67..eac38a15f 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -24,6 +24,16 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
+[examples/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
+[examples/server/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
 
diff --git a/Makefile b/Makefile
index eb1da90f1..b9131eae5 100644
--- a/Makefile
+++ b/Makefile
@@ -1455,22 +1455,13 @@ llama-server: \
 	examples/server/server.cpp \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
-	examples/server/colorthemes.css.hpp \
-	examples/server/style.css.hpp \
-	examples/server/theme-beeninorder.css.hpp \
-	examples/server/theme-ketivah.css.hpp \
-	examples/server/theme-mangotango.css.hpp \
-	examples/server/theme-playground.css.hpp \
-	examples/server/theme-polarnight.css.hpp \
-	examples/server/theme-snowstorm.css.hpp \
 	examples/server/index.html.hpp \
-	examples/server/index-new.html.hpp \
-	examples/server/index.js.hpp \
 	examples/server/completion.js.hpp \
-	examples/server/system-prompts.js.hpp \
-	examples/server/prompt-formats.js.hpp \
-	examples/server/json-schema-to-grammar.mjs.hpp \
 	examples/server/loading.html.hpp \
+	examples/server/deps_daisyui.min.css.hpp \
+	examples/server/deps_markdown-it.js.hpp \
+	examples/server/deps_tailwindcss.js.hpp \
+	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 3e717e882..93e876f5a 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -15,22 +15,13 @@ set(TARGET_SRCS
     httplib.h
 )
 set(PUBLIC_ASSETS
-    colorthemes.css
-    style.css
-    theme-beeninorder.css
-    theme-ketivah.css
-    theme-mangotango.css
-    theme-playground.css
-    theme-polarnight.css
-    theme-snowstorm.css
     index.html
-    index-new.html
-    index.js
     completion.js
-    system-prompts.js
-    prompt-formats.js
-    json-schema-to-grammar.mjs
     loading.html
+    deps_daisyui.min.css
+    deps_markdown-it.js
+    deps_tailwindcss.js
+    deps_vue.esm-browser.js
 )
 
 foreach(asset ${PUBLIC_ASSETS})
diff --git a/examples/server/README.md b/examples/server/README.md
index 15f95db1e..562494077 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -928,6 +928,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
 }
 ```
 
+### Legacy completion web UI
+
+A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
+
+For example:
+
+```sh
+./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
+```
+
 ### Extending or building alternative Web Front End
 
 You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs
index a79c8a3cd..4fef5655a 100644
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,7 +1,7 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
 import { readFileSync } from 'node:fs'
-import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+import { SchemaConverter }  from './public_legacy/json-schema-to-grammar.mjs'
 
 const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
diff --git a/examples/server/deps.sh b/examples/server/deps.sh
index d28378901..1ff80d056 100755
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 PUBLIC=$DIR/public
 
 echo "download js bundle files"
-curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
-echo >> $PUBLIC/index.js # add newline
+
+# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
+
+curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
+echo >> $PUBLIC/deps_tailwindcss.js # add newline
+
+curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
+curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
+echo >> $PUBLIC/deps_daisyui.min.css # add newline
+
+curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
+echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
+
+curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
+echo >> $PUBLIC/deps_markdown-it.js # add newline
+
+ls -lah $PUBLIC
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index 36818f764..54a0f22f5 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -1,12 +1,16 @@
 const paramDefaults = {
   stream: true,
-  n_predict: 500,
   temperature: 0.2,
-  stop: ["</s>"]
 };
 
 let generation_settings = null;
 
+export class CompletionError extends Error {
+  constructor(message, name, data) {
+    super(message);
+    this.name = name;
+  }
+};
 
 // Completes the prompt as a generator. Recommended for most use cases.
 //
@@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {
 
   const completionParams = { ...paramDefaults, ...params, prompt };
 
-  const response = await fetch(`${api_url}/completion`, {
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
     method: 'POST',
     body: JSON.stringify(completionParams),
     headers: {
@@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
     signal: controller.signal,
   });
 
+  const status = response.status;
+  if (status !== 200) {
+    try {
+      const body = await response.json();
+      if (body && body.error && body.error.message) {
+        throw new CompletionError(body.error.message, 'ServerError');
+      }
+    } catch (err) {
+      throw new CompletionError(err.message, 'ServerError');
+    }
+  }
+
   const reader = response.body.getReader();
   const decoder = new TextDecoder();
 
@@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
       for (const line of lines) {
         const match = regex.exec(line);
         if (match) {
-          result[match[1]] = match[2]
+          result[match[1]] = match[2];
+          if (result.data === '[DONE]') {
+            cont = false;
+            break;
+          }
+
           // since we know this is llama.cpp, let's just decode the json in data
           if (result.data) {
             result.data = JSON.parse(result.data);
diff --git a/examples/server/public/deps_daisyui.min.css b/examples/server/public/deps_daisyui.min.css
new file mode 100644
index 000000000..bc8529651
--- /dev/null
+++ b/examples/server/public/deps_daisyui.min.css
@@ -0,0 +1,13 @@
+.alert{display:grid;width:100%;grid-auto-flow:row;align-content:flex-start;align-items:center;justify-items:center;gap:1rem;text-align:center}@media (min-width:640px){.alert{grid-auto-flow:column;grid-template-columns:auto minmax(auto,1fr);justify-items:start;text-align:start}}.artboard{width:100%}.avatar{position:relative;display:inline-flex}.avatar>div{display:block;aspect-ratio:1/1;overflow:hidden}.avatar img{height:100%;width:100%;object-fit:cover}.avatar.placeholder>div{display:flex;align-items:center;justify-content:center}.badge{display:inline-flex;align-items:center;justify-content:center;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);height:1.25rem;font-size:.875rem;line-height:1.25rem;width:fit-content;padding-left:.563rem;padding-right:.563rem}.btm-nav{position:fixed;bottom:0;left:0;right:0;display:flex;width:100%;flex-direction:row;align-items:center;justify-content:space-around;padding-bottom:env(safe-area-inset-bottom)}.btm-nav>*{position:relative;display:flex;height:100%;flex-basis:100%;cursor:pointer;flex-direction:column;align-items:center;justify-content:center;gap:.25rem}.breadcrumbs{max-width:100%;overflow-x:auto}.breadcrumbs>ol,.breadcrumbs>ul{display:flex;align-items:center;white-space:nowrap;min-height:min-content}.breadcrumbs>ol>li,.breadcrumbs>ul>li{display:flex;align-items:center}.breadcrumbs>ol>li>a,.breadcrumbs>ul>li>a{display:flex;cursor:pointer;align-items:center}@media(hover:hover){.breadcrumbs>ol>li>a:hover,.breadcrumbs>ul>li>a:hover{text-decoration-line:underline}}.btn{display:inline-flex;height:3rem;min-height:3rem;flex-shrink:0;cursor:pointer;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;border-radius:var(--rounded-btn,.5rem);border-color:transparent;padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1em}.btn-disabled,.btn:disabled,.btn[disabled]{pointer-events:none}.btn-square{height:3rem;width:3rem;padding:0}.btn-circle{height:3rem;width:3rem;border-radius:9999px;padding:0}:where(.btn:is(input[type=checkbox])),:where(.btn:is(input[type=radio])){width:auto;appearance:none}.btn:is(input[type=checkbox]):after,.btn:is(input[type=radio]):after{--tw-content:attr(aria-label);content:var(--tw-content)}.card{position:relative;display:flex;flex-direction:column}.card:focus{outline:2px solid transparent;outline-offset:2px}.card-body{display:flex;flex:1 1 auto;flex-direction:column}.card-body :where(p){flex-grow:1}.card-actions{display:flex;flex-wrap:wrap;align-items:flex-start;gap:.5rem}.card figure{display:flex;align-items:center;justify-content:center}.card.image-full{display:grid}.card.image-full:before{position:relative;content:""}.card.image-full:before,.card.image-full>*{grid-column-start:1;grid-row-start:1}.card.image-full>figure img{height:100%;object-fit:cover}.card.image-full>.card-body{position:relative}.carousel{display:inline-flex;overflow-x:scroll;scroll-snap-type:x mandatory;scroll-behavior:smooth}.carousel-vertical{flex-direction:column;overflow-y:scroll;scroll-snap-type:y mandatory}.carousel-item{box-sizing:content-box;display:flex;flex:none;scroll-snap-align:start}.carousel-start .carousel-item{scroll-snap-align:start}.carousel-center .carousel-item{scroll-snap-align:center}.carousel-end .carousel-item{scroll-snap-align:end}.chat{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));column-gap:.75rem;padding-top:.25rem;padding-bottom:.25rem}.chat-image{grid-row:span 2/span 2;align-self:flex-end}.chat-header{grid-row-start:1;font-size:.875rem;line-height:1.25rem}.chat-footer{grid-row-start:3;font-size:.875rem;line-height:1.25rem}.chat-bubble{position:relative;display:block;width:fit-content;padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;max-width:90%}.chat-bubble:before{position:absolute;bottom:0;height:.75rem;width:.75rem;background-color:inherit;content:"";mask-size:contain;mask-repeat:no-repeat;mask-position:center}.chat-start{place-items:start;grid-template-columns:auto 1fr}.chat-start .chat-header{grid-column-start:2}.chat-start .chat-footer{grid-column-start:2}.chat-start .chat-image{grid-column-start:1}.chat-start .chat-bubble{grid-column-start:2}.chat-start .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}[dir=rtl] .chat-start .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}.chat-end{place-items:end;grid-template-columns:1fr auto}.chat-end .chat-header{grid-column-start:1}.chat-end .chat-footer{grid-column-start:1}.chat-end .chat-image{grid-column-start:2}.chat-end .chat-bubble{grid-column-start:1}.chat-end .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}[dir=rtl] .chat-end .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}.checkbox{flex-shrink:0}.collapse:not(td):not(tr):not(colgroup){visibility:visible}.collapse{position:relative;display:grid;overflow:hidden;grid-template-rows:auto 0fr;transition:grid-template-rows .2s}.collapse-content,.collapse-title,.collapse>input[type=checkbox],.collapse>input[type=radio]{grid-column-start:1;grid-row-start:1}.collapse>input[type=checkbox],.collapse>input[type=radio]{appearance:none;opacity:0}.collapse-content{visibility:hidden;grid-column-start:1;grid-row-start:2;min-height:0;transition:visibility .2s}.collapse-open,.collapse:focus:not(.collapse-close),.collapse[open]{grid-template-rows:auto 1fr}.collapse:not(.collapse-close):has(>input[type=checkbox]:checked),.collapse:not(.collapse-close):has(>input[type=radio]:checked){grid-template-rows:auto 1fr}.collapse-open>.collapse-content,.collapse:focus:not(.collapse-close)>.collapse-content,.collapse:not(.collapse-close)>input[type=checkbox]:checked~.collapse-content,.collapse:not(.collapse-close)>input[type=radio]:checked~.collapse-content,.collapse[open]>.collapse-content{visibility:visible;min-height:fit-content}:root .countdown{line-height:1em}.countdown{display:inline-flex}.countdown>*{height:1em;display:inline-block;overflow-y:hidden}.countdown>:before{position:relative;content:"00\A 01\A 02\A 03\A 04\A 05\A 06\A 07\A 08\A 09\A 10\A 11\A 12\A 13\A 14\A 15\A 16\A 17\A 18\A 19\A 20\A 21\A 22\A 23\A 24\A 25\A 26\A 27\A 28\A 29\A 30\A 31\A 32\A 33\A 34\A 35\A 36\A 37\A 38\A 39\A 40\A 41\A 42\A 43\A 44\A 45\A 46\A 47\A 48\A 49\A 50\A 51\A 52\A 53\A 54\A 55\A 56\A 57\A 58\A 59\A 60\A 61\A 62\A 63\A 64\A 65\A 66\A 67\A 68\A 69\A 70\A 71\A 72\A 73\A 74\A 75\A 76\A 77\A 78\A 79\A 80\A 81\A 82\A 83\A 84\A 85\A 86\A 87\A 88\A 89\A 90\A 91\A 92\A 93\A 94\A 95\A 96\A 97\A 98\A 99\A";white-space:pre;top:calc(var(--value) * -1em)}.diff{position:relative;display:grid;width:100%;overflow:hidden;container-type:inline-size;grid-template-columns:auto 1fr}.diff-resizer{position:relative;top:50%;z-index:1;height:3rem;width:25rem;min-width:1rem;max-width:calc(100cqi - 1rem);resize:horizontal;overflow:hidden;opacity:0;transform-origin:100% 100%;scale:4;translate:1.5rem -1.5rem;clip-path:inset(calc(100% - .75rem) 0 0 calc(100% - .75rem))}.diff-item-1,.diff-item-2,.diff-resizer{position:relative;grid-column-start:1;grid-row-start:1}.diff-item-1:after{pointer-events:none;position:absolute;bottom:0;right:1px;top:50%;z-index:1;height:2rem;width:2rem;--tw-content:'';content:var(--tw-content);translate:50% -50%}.diff-item-2{overflow:hidden}.diff-item-1>*,.diff-item-2>*{pointer-events:none;position:absolute;bottom:0;left:0;top:0;height:100%;width:100cqi;max-width:none;object-fit:cover;object-position:center}.divider{display:flex;flex-direction:row;align-items:center;align-self:stretch}.divider:after,.divider:before{height:.125rem;width:100%;flex-grow:1;--tw-content:'';content:var(--tw-content)}.divider-start:before{display:none}.divider-end:after{display:none}.drawer{position:relative;display:grid;grid-auto-columns:max-content auto}.drawer-content{grid-column-start:2;grid-row-start:1;min-width:0}.drawer-side{pointer-events:none;position:fixed;inset-inline-start:0;top:0;grid-column-start:1;grid-row-start:1;display:grid;width:100%;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(1,minmax(0,1fr));align-items:flex-start;justify-items:start;overflow-x:hidden;overflow-y:hidden;overscroll-behavior:contain;height:100vh;height:100dvh}.drawer-side>.drawer-overlay{position:sticky;top:0;place-self:stretch}.drawer-side>*{grid-column-start:1;grid-row-start:1}.drawer-side>:not(.drawer-overlay){transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);will-change:transform;transform:translateX(-100%)}[dir=rtl] .drawer-side>:not(.drawer-overlay){transform:translateX(100%)}.drawer-toggle{position:fixed;height:0;width:0;appearance:none;opacity:0}.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible;overflow-y:auto}.drawer-toggle:checked~.drawer-side>:not(.drawer-overlay){transform:translateX(0)}.drawer-end{grid-auto-columns:auto max-content}.drawer-end>.drawer-toggle~.drawer-content{grid-column-start:1}.drawer-end>.drawer-toggle~.drawer-side{grid-column-start:2;justify-items:end}.drawer-end>.drawer-toggle~.drawer-side>:not(.drawer-overlay){transform:translateX(100%)}[dir=rtl] .drawer-end>.drawer-toggle~.drawer-side>:not(.drawer-overlay){transform:translateX(-100%)}.drawer-end>.drawer-toggle:checked~.drawer-side>:not(.drawer-overlay){transform:translateX(0)}.dropdown{position:relative;display:inline-block}.dropdown>:not(summary):focus{outline:2px solid transparent;outline-offset:2px}.dropdown .dropdown-content{position:absolute}.dropdown:is(:not(details)) .dropdown-content{visibility:hidden;opacity:0}.dropdown-end .dropdown-content{inset-inline-end:0}.dropdown-left .dropdown-content{bottom:auto;inset-inline-end:100%;top:0}.dropdown-right .dropdown-content{bottom:auto;inset-inline-start:100%;top:0}.dropdown-bottom .dropdown-content{bottom:auto;top:100%}.dropdown-top .dropdown-content{bottom:100%;top:auto}.dropdown-end.dropdown-right .dropdown-content{bottom:0;top:auto}.dropdown-end.dropdown-left .dropdown-content{bottom:0;top:auto}.dropdown.dropdown-open .dropdown-content,.dropdown:focus-within .dropdown-content,.dropdown:not(.dropdown-hover):focus .dropdown-content{visibility:visible;opacity:1}@media (hover:hover){.dropdown.dropdown-hover:hover .dropdown-content{visibility:visible;opacity:1}}.dropdown:is(details) summary::-webkit-details-marker{display:none}.file-input{height:3rem;flex-shrink:1;padding-inline-end:1rem;font-size:.875rem;line-height:1.25rem;line-height:2}.file-input::file-selector-button{margin-inline-end:1rem;display:inline-flex;height:100%;flex-shrink:0;cursor:pointer;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1.25rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);line-height:1em}.footer{display:grid;width:100%;grid-auto-flow:row;place-items:start}.footer>*{display:grid;place-items:start}.footer-center{place-items:center;text-align:center}.footer-center>*{place-items:center}@media (min-width:48rem){.footer{grid-auto-flow:column}.footer-center{grid-auto-flow:row dense}}.form-control{display:flex;flex-direction:column}.label{display:flex;user-select:none;align-items:center;justify-content:space-between}.hero{display:grid;width:100%;place-items:center;background-size:cover;background-position:center}.hero>*{grid-column-start:1;grid-row-start:1}.hero-overlay{grid-column-start:1;grid-row-start:1;height:100%;width:100%}.hero-content{z-index:0;display:flex;align-items:center;justify-content:center}.indicator{position:relative;display:inline-flex;width:max-content}.indicator :where(.indicator-item){z-index:1;position:absolute;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));white-space:nowrap}.input{flex-shrink:1;appearance:none;height:3rem;padding-left:1rem;padding-right:1rem;font-size:.875rem;line-height:1.25rem;line-height:2}.input-md[type=number]::-webkit-inner-spin-button,.input[type=number]::-webkit-inner-spin-button{margin-top:-1rem;margin-bottom:-1rem;margin-inline-end:-1rem}.input-xs[type=number]::-webkit-inner-spin-button{margin-top:-.25rem;margin-bottom:-.25rem;margin-inline-end:0}.input-sm[type=number]::-webkit-inner-spin-button{margin-top:0;margin-bottom:0;margin-inline-end:0}.input-lg[type=number]::-webkit-inner-spin-button{margin-top:-1.5rem;margin-bottom:-1.5rem;margin-inline-end:-1.5rem}.join{display:inline-flex;align-items:stretch}.join :where(.join-item){border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:not(:first-child):not(:last-child),.join :not(:first-child):not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:first-child:not(:last-child),.join :first-child:not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0}.join .dropdown .join-item:first-child:not(:last-child),.join :first-child:not(:last-child) .dropdown .join-item{border-start-end-radius:inherit;border-end-end-radius:inherit}.join :where(.join-item:first-child:not(:last-child)),.join :where(:first-child:not(:last-child).join-item){border-end-start-radius:inherit;border-start-start-radius:inherit}.join .join-item:last-child:not(:first-child),.join :last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0}.join :where(.join-item:last-child:not(:first-child)),.join :where(:last-child:not(:first-child).join-item){border-start-end-radius:inherit;border-end-end-radius:inherit}@supports not selector(:has(*)){:where(.join*){border-radius:inherit}}@supports selector(:has(*)){:where(.join:has(.join-item)){border-radius:inherit}}.kbd{display:inline-flex;align-items:center;justify-content:center}.link{cursor:pointer;text-decoration-line:underline}.link-hover{text-decoration-line:none}@media(hover:hover){.link-hover:hover{text-decoration-line:underline}}.mask{mask-size:contain;mask-repeat:no-repeat;mask-position:center}.mask-half-1{mask-size:200%;mask-position:left}.mask-half-1:where([dir=rtl],[dir=rtl]*){mask-position:right}.mask-half-2{mask-size:200%;mask-position:right}.mask-half-2:where([dir=rtl],[dir=rtl]*){mask-position:left}.menu{display:flex;flex-direction:column;flex-wrap:wrap;font-size:.875rem;line-height:1.25rem}.menu :where(liul){position:relative;white-space:nowrap}.menu :where(li:not(.menu-title)>:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){display:grid;grid-auto-flow:column;align-content:flex-start;align-items:center;gap:.5rem;grid-auto-columns:minmax(auto,max-content) auto max-content;user-select:none}.menu li.disabled{cursor:not-allowed;user-select:none}.menu :where(li>.menu-dropdown:not(.menu-dropdown-show)){display:none}:where(.menuli){position:relative;display:flex;flex-shrink:0;flex-direction:column;flex-wrap:wrap;align-items:stretch}:where(.menuli) .badge{justify-self:end}.mockup-code{position:relative;overflow:hidden;overflow-x:auto}.mockup-code pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.mockup-window{position:relative;overflow:hidden;overflow-x:auto}.mockup-window pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.mockup-browser{position:relative;overflow:hidden;overflow-x:auto}.mockup-browser pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.modal{pointer-events:none;position:fixed;inset:0;margin:0;display:grid;height:100%;max-height:none;width:100%;max-width:none;justify-items:center;padding:0;opacity:0;overscroll-behavior:contain;z-index:999}.modal-scroll{overscroll-behavior:auto}:where(.modal){align-items:center}.modal-box{max-height:calc(100vh - 5em)}.modal-open,.modal-toggle:checked+.modal,.modal:target,.modal[open]{pointer-events:auto;visibility:visible;opacity:1}.modal-action{display:flex}.modal-toggle{position:fixed;height:0;width:0;appearance:none;opacity:0}:root:has(:is(.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open])){overflow:hidden;scrollbar-gutter:stable}.navbar{display:flex;align-items:center}:where(.navbar>:not(script,style)){display:inline-flex;align-items:center}.navbar-start{width:50%;justify-content:flex-start}.navbar-center{flex-shrink:0}.navbar-end{width:50%;justify-content:flex-end}.progress{position:relative;width:100%;appearance:none;overflow:hidden}.radial-progress{position:relative;display:inline-grid;height:var(--size);width:var(--size);place-content:center;border-radius:9999px;background-color:transparent;vertical-align:middle;box-sizing:content-box}.radial-progress::-moz-progress-bar{appearance:none;background-color:transparent}.radial-progress::-webkit-progress-value{appearance:none;background-color:transparent}.radial-progress::-webkit-progress-bar{appearance:none;background-color:transparent}.radial-progress:after,.radial-progress:before{position:absolute;border-radius:9999px;content:""}.radial-progress:before{inset:0;background:radial-gradient(farthest-side,currentColor 98%,#0000) top/var(--thickness) var(--thickness) no-repeat,conic-gradient(currentColor calc(var(--value) * 1%),#0000 0);-webkit-mask:radial-gradient(farthest-side,#0000 calc(99% - var(--thickness)),#000 calc(100% - var(--thickness)));mask:radial-gradient(farthest-side,#0000 calc(99% - var(--thickness)),#000 calc(100% - var(--thickness)))}.radial-progress:after{inset:calc(50% - var(--thickness)/ 2);transform:rotate(calc(var(--value) * 3.6deg - 90deg)) translate(calc(var(--size)/ 2 - 50%))}.radio{flex-shrink:0}.range{height:1.5rem;width:100%;cursor:pointer}.range:focus{outline:0}.rating{position:relative;display:inline-flex}.rating :where(input){cursor:pointer;border-radius:0}.select{display:inline-flex;cursor:pointer;user-select:none;appearance:none;height:3rem;min-height:3rem;padding-inline-start:1rem;padding-inline-end:2.5rem;font-size:.875rem;line-height:1.25rem;line-height:2}.select[multiple]{height:auto}.stack{display:inline-grid}.stack>*{grid-column-start:1;grid-row-start:1;transform:translateY(10%) scale(.9);z-index:1}.stack>:nth-child(2){transform:translateY(5%) scale(.95);z-index:2}.stack>:nth-child(1){transform:translateY(0) scale(1);z-index:3}.stats{display:inline-grid}:where(.stats){grid-auto-flow:column}.stat{display:inline-grid;width:100%;grid-template-columns:repeat(1,1fr)}.stat-figure{grid-column-start:2;grid-row:span 3/span 3;grid-row-start:1;place-self:center;justify-self:end}.stat-title{grid-column-start:1;white-space:nowrap}.stat-value{grid-column-start:1;white-space:nowrap}.stat-desc{grid-column-start:1;white-space:nowrap}.stat-actions{grid-column-start:1;white-space:nowrap}.steps{display:inline-grid;grid-auto-flow:column;overflow:hidden;overflow-x:auto;counter-reset:step;grid-auto-columns:1fr}.steps .step{display:grid;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(2,minmax(0,1fr));place-items:center;text-align:center}.swap{position:relative;display:inline-grid;user-select:none;place-content:center}.swap>*{grid-column-start:1;grid-row-start:1}.swap input{appearance:none}.swap .swap-indeterminate,.swap .swap-on,.swap input:indeterminate~.swap-on{opacity:0}.swap input:checked~.swap-off,.swap input:indeterminate~.swap-off,.swap-active .swap-off{opacity:0}.swap input:checked~.swap-on,.swap input:indeterminate~.swap-indeterminate,.swap-active .swap-on{opacity:1}.tabs{display:grid;align-items:flex-end}.tabs-lifted:has(.tab-content[class*=" rounded-"]) .tab:first-child:not(:is(.tab-active,[aria-selected=true])),.tabs-lifted:has(.tab-content[class^=rounded-]) .tab:first-child:not(:is(.tab-active,[aria-selected=true])){border-bottom-color:transparent}.tab{position:relative;grid-row-start:1;display:inline-flex;height:2rem;cursor:pointer;user-select:none;appearance:none;flex-wrap:wrap;align-items:center;justify-content:center;text-align:center;font-size:.875rem;line-height:1.25rem;line-height:2;--tab-padding:1rem}.tab:is(input[type=radio]){width:auto;border-bottom-right-radius:0;border-bottom-left-radius:0}.tab:is(input[type=radio]):after{--tw-content:attr(aria-label);content:var(--tw-content)}.tab:not(input):empty{cursor:default;grid-column-start:span 9999}.tab-content{grid-column-start:1;grid-column-end:span 9999;grid-row-start:2;margin-top:calc(var(--tab-border) * -1);display:none;border-color:transparent;border-width:var(--tab-border,0)}:checked+.tab-content:nth-child(2),:is(.tab-active,[aria-selected=true])+.tab-content:nth-child(2){border-start-start-radius:0}:is(.tab-active,[aria-selected=true])+.tab-content,input.tab:checked+.tab-content{display:block}.table{position:relative;width:100%}.table :where(.table-pin-rowstheadtr){position:sticky;top:0;z-index:1;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table :where(.table-pin-rowstfoottr){position:sticky;bottom:0;z-index:1;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table :where(.table-pin-colstrth){position:sticky;left:0;right:0;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table-zebra tbody tr:nth-child(even) :where(.table-pin-colstrth){--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}.textarea{min-height:3rem;flex-shrink:1;padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;font-size:.875rem;line-height:1.25rem;line-height:2}.timeline{position:relative;display:flex}:where(.timeline>li){position:relative;display:grid;flex-shrink:0;align-items:center;grid-template-rows:var(--timeline-row-start,minmax(0,1fr)) auto var(--timeline-row-end,minmax(0,1fr));grid-template-columns:var(--timeline-col-start,minmax(0,1fr)) auto var(--timeline-col-end,minmax(0,1fr))}.timeline>li>hr{width:100%;border-width:0}:where(.timeline>li>hr):first-child{grid-column-start:1;grid-row-start:2}:where(.timeline>li>hr):last-child{grid-column-start:3;grid-column-end:none;grid-row-start:2;grid-row-end:auto}.timeline-start{grid-column-start:1;grid-column-end:4;grid-row-start:1;grid-row-end:2;margin:.25rem;align-self:flex-end;justify-self:center}.timeline-middle{grid-column-start:2;grid-row-start:2}.timeline-end{grid-column-start:1;grid-column-end:4;grid-row-start:3;grid-row-end:4;margin:.25rem;align-self:flex-start;justify-self:center}.toast{position:fixed;display:flex;min-width:fit-content;flex-direction:column;white-space:nowrap}.toggle{flex-shrink:0}.alert{border-radius:var(--rounded-box,1rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));padding:1rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-b2,oklch(var(--b2)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1));background-color:var(--alert-bg)}.alert-info{border-color:var(--fallback-in,oklch(var(--in)/.2));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-in,oklch(var(--in)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-success{border-color:var(--fallback-su,oklch(var(--su)/.2));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-su,oklch(var(--su)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-warning{border-color:var(--fallback-wa,oklch(var(--wa)/.2));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)));--alert-bg:var(--fallback-wa,oklch(var(--wa)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-error{border-color:var(--fallback-er,oklch(var(--er)/.2));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-er,oklch(var(--er)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.avatar-group{display:flex;overflow:hidden}.avatar-group :where(.avatar){overflow:hidden;border-radius:9999px;border-width:4px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)))}.badge{border-radius:var(--rounded-badge,1.9rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.badge-neutral{--tw-border-opacity:1;border-color:var(--fallback-n,oklch(var(--n)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.badge-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.badge-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.badge-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.badge-info{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.badge-success{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.badge-warning{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.badge-error{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.badge-ghost{--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.badge-outline{border-color:currentColor;--tw-border-opacity:0.5;background-color:transparent;color:currentColor}.badge-outline.badge-neutral{--tw-text-opacity:1;color:var(--fallback-n,oklch(var(--n)/var(--tw-text-opacity)))}.badge-outline.badge-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}.badge-outline.badge-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}.badge-outline.badge-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}.badge-outline.badge-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}.badge-outline.badge-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}.badge-outline.badge-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}.badge-outline.badge-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}.btm-nav{height:4rem;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));color:currentColor}.btm-nav>*{border-color:currentColor}.btm-nav>:not(.active){padding-top:.125rem}.btm-nav>:where(.active){border-top-width:2px;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.btm-nav>.disabled,.btm-nav>[disabled]{pointer-events:none;--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.btm-nav>.disabled:hover,.btm-nav>[disabled]:hover{pointer-events:none;--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.btm-nav>* .label{font-size:1rem;line-height:1.5rem}.breadcrumbs{padding-top:.5rem;padding-bottom:.5rem}.breadcrumbs>ol>li>a:focus,.breadcrumbs>ul>li>a:focus{outline:2px solid transparent;outline-offset:2px}.breadcrumbs>ol>li>a:focus-visible,.breadcrumbs>ul>li>a:focus-visible{outline:2px solid currentColor;outline-offset:2px}.breadcrumbs>ol>li+:before,.breadcrumbs>ul>li+:before{content:"";margin-left:.5rem;margin-right:.75rem;display:block;height:.375rem;width:.375rem;--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));opacity:.4;border-top:1px solid;border-right:1px solid;background-color:transparent}[dir=rtl] .breadcrumbs>ol>li+:before,[dir=rtl] .breadcrumbs>ul>li+:before{--tw-rotate:-135deg}.btn{gap:.5rem;font-weight:600;text-decoration-line:none;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);border-width:var(--border-btn,1px);transition-property:color,background-color,border-color,opacity,box-shadow,transform}@media (prefers-reduced-motion:no-preference){.btn{animation:button-pop var(--animation-btn,.25s) ease-out}}.btn:active:focus,.btn:active:hover{animation:button-pop 0s ease-out;transform:scale(var(--btn-focus-scale,.97))}.btn{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));text-decoration-line:none;--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:var(--fallback-bc,oklch(var(--bc)/1));background-color:oklch(var(--btn-color,var(--b2)) / var(--tw-bg-opacity));--tw-bg-opacity:1;border-color:oklch(var(--btn-color,var(--b2)) / var(--tw-border-opacity));--tw-border-opacity:1}@supports not (color:oklch(0% 0 0)){.btn{background-color:var(--btn-color,var(--fallback-b2));border-color:var(--btn-color,var(--fallback-b2))}}@media (hover:hover){.btn:hover{--tw-border-opacity:1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn:hover{background-color:color-mix(in oklab,oklch(var(--btn-color,var(--b2)) / var(--tw-bg-opacity,1)) 90%,#000);border-color:color-mix(in oklab,oklch(var(--btn-color,var(--b2)) / var(--tw-border-opacity,1)) 90%,#000)}}@supports not (color:oklch(0% 0 0)){.btn:hover{background-color:var(--btn-color,var(--fallback-b2));border-color:var(--btn-color,var(--fallback-b2))}}}@supports (color:color-mix(in oklab,black,black)){.btn-active{background-color:color-mix(in oklab,oklch(var(--btn-color,var(--b3)) / var(--tw-bg-opacity,1)) 90%,#000);border-color:color-mix(in oklab,oklch(var(--btn-color,var(--b3)) / var(--tw-border-opacity,1)) 90%,#000)}}.btn:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px}.btn-primary{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}@supports (color:oklch(0% 0 0)){.btn-primary{--btn-color:var(--p)}}@supports not (color:oklch(0% 0 0)){.btn-primary{--btn-color:var(--fallback-p)}}.btn-secondary{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}@supports (color:oklch(0% 0 0)){.btn-secondary{--btn-color:var(--s)}}@supports not (color:oklch(0% 0 0)){.btn-secondary{--btn-color:var(--fallback-s)}}.btn-accent{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}@supports (color:oklch(0% 0 0)){.btn-accent{--btn-color:var(--a)}}@supports not (color:oklch(0% 0 0)){.btn-accent{--btn-color:var(--fallback-a)}}.btn-neutral{--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));outline-color:var(--fallback-n,oklch(var(--n)/1))}@supports (color:oklch(0% 0 0)){.btn-neutral{--btn-color:var(--n)}}@supports not (color:oklch(0% 0 0)){.btn-neutral{--btn-color:var(--fallback-n)}}.btn-info{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}@supports (color:oklch(0% 0 0)){.btn-info{--btn-color:var(--in)}}@supports not (color:oklch(0% 0 0)){.btn-info{--btn-color:var(--fallback-in)}}.btn-success{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}@supports (color:oklch(0% 0 0)){.btn-success{--btn-color:var(--su)}}@supports not (color:oklch(0% 0 0)){.btn-success{--btn-color:var(--fallback-su)}}.btn-warning{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}@supports (color:oklch(0% 0 0)){.btn-warning{--btn-color:var(--wa)}}@supports not (color:oklch(0% 0 0)){.btn-warning{--btn-color:var(--fallback-wa)}}.btn-error{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}@supports (color:oklch(0% 0 0)){.btn-error{--btn-color:var(--er)}}@supports not (color:oklch(0% 0 0)){.btn-error{--btn-color:var(--fallback-er)}}.btn.glass{--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn.glass:hover{--glass-opacity:25%;--glass-border-opacity:15%}}.btn.glass.btn-active{--glass-opacity:25%;--glass-border-opacity:15%}.btn-ghost{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn-ghost:hover{border-color:transparent}@supports (color:oklch(0% 0 0)){.btn-ghost:hover{background-color:var(--fallback-bc,oklch(var(--bc)/.2))}}}.btn-ghost.btn-active{border-color:transparent;background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.btn-link{border-color:transparent;background-color:transparent;--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)));text-decoration-line:underline;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn-link:hover{border-color:transparent;background-color:transparent;text-decoration-line:underline}}.btn-link.btn-active{border-color:transparent;background-color:transparent;text-decoration-line:underline}.btn-outline{border-color:currentColor;background-color:transparent;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}@media (hover:hover){.btn-outline:hover{--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}}.btn-outline.btn-active{--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}.btn-outline.btn-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-primary:hover{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-primary:hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}}.btn-outline.btn-primary.btn-active{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-primary.btn-active{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}.btn-outline.btn-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-secondary:hover{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-secondary:hover{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000)}}}.btn-outline.btn-secondary.btn-active{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-secondary.btn-active{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000)}}.btn-outline.btn-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-accent:hover{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-accent:hover{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000)}}}.btn-outline.btn-accent.btn-active{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-accent.btn-active{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000)}}.btn-outline.btn-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-success:hover{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-success:hover{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000)}}}.btn-outline.btn-success.btn-active{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-success.btn-active{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000)}}.btn-outline.btn-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-info:hover{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-info:hover{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000)}}}.btn-outline.btn-info.btn-active{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-info.btn-active{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000)}}.btn-outline.btn-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-warning:hover{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-warning:hover{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000)}}}.btn-outline.btn-warning.btn-active{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-warning.btn-active{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000)}}.btn-outline.btn-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-error:hover{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-error:hover{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000)}}}.btn-outline.btn-error.btn-active{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-error.btn-active{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000)}}.btn.btn-disabled,.btn:disabled,.btn[disabled]{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.btn-disabled:hover,.btn:disabled:hover,.btn[disabled]:hover{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.btn:is(input[type=checkbox]:checked),.btn:is(input[type=radio]:checked){--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@media (hover:hover){@supports (color:color-mix(in oklab,black,black)){.btn:is(input[type=checkbox]:checked):hover,.btn:is(input[type=radio]:checked):hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}}.btn:is(input[type=checkbox]:checked):focus-visible,.btn:is(input[type=radio]:checked):focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}@keyframes button-pop{0%{transform:scale(var(--btn-focus-scale,.98))}40%{transform:scale(1.02)}100%{transform:scale(1)}}.card{border-radius:var(--rounded-box,1rem)}.card :where(figure:first-child){overflow:hidden;border-start-start-radius:inherit;border-start-end-radius:inherit;border-end-start-radius:unset;border-end-end-radius:unset}.card :where(figure:last-child){overflow:hidden;border-start-start-radius:unset;border-start-end-radius:unset;border-end-start-radius:inherit;border-end-end-radius:inherit}.card:focus-visible{outline:2px solid currentColor;outline-offset:2px}.card.bordered{border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.card-bordered{border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.card.compact .card-body{padding:1rem;font-size:.875rem;line-height:1.25rem}.card-body{padding:var(--padding-card,2rem);display:flex;flex-direction:column;gap:.5rem}.card-title{display:flex;align-items:center;gap:.5rem;font-size:1.25rem;line-height:1.75rem;font-weight:600}.card.image-full:before{z-index:10;border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));opacity:.75}.card.image-full>.card-body{z-index:20;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.card.image-full :where(figure){overflow:hidden;border-radius:inherit}.carousel{-ms-overflow-style:none;scrollbar-width:none}.carousel::-webkit-scrollbar{display:none}.chat-bubble{border-radius:var(--rounded-box,1rem);min-height:2.75rem;min-width:2.75rem}.chat-bubble{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.chat-bubble-primary{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.chat-bubble-secondary{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.chat-bubble-accent{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.chat-bubble-info{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.chat-bubble-success{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.chat-bubble-warning{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.chat-bubble-error{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.chat-start .chat-bubble{border-end-start-radius:0}.chat-start .chat-bubble:before{inset-inline-start:-.749rem}.chat-end .chat-bubble{border-end-end-radius:0}.chat-end .chat-bubble:before{inset-inline-start:99.9%}.checkbox{--chkbg:var(--fallback-bc,oklch(var(--bc)/1));--chkfg:var(--fallback-b1,oklch(var(--b1)/1));height:1.5rem;width:1.5rem;cursor:pointer;appearance:none;border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2}.checkbox:focus{box-shadow:none}.checkbox:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.checkbox:disabled{border-width:0}.checkbox:checked,.checkbox[aria-checked=true]{background-repeat:no-repeat;animation:checkmark var(--animation-input,.2s) ease-out;background-color:var(--chkbg);background-image:linear-gradient(-45deg,transparent 65%,var(--chkbg) 65.99%),linear-gradient(45deg,transparent 75%,var(--chkbg) 75.99%),linear-gradient(-45deg,var(--chkbg) 40%,transparent 40.99%),linear-gradient(45deg,var(--chkbg) 30%,var(--chkfg) 30.99%,var(--chkfg) 40%,transparent 40.99%),linear-gradient(-45deg,var(--chkfg) 50%,var(--chkbg) 50.99%)}.checkbox:indeterminate{--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-repeat:no-repeat;animation:checkmark var(--animation-input,.2s) ease-out;background-image:linear-gradient(90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(-90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(0deg,var(--chkbg) 43%,var(--chkfg) 43%,var(--chkfg) 57%,var(--chkbg) 57%)}.checkbox-primary{--chkbg:var(--fallback-p,oklch(var(--p)/1));--chkfg:var(--fallback-pc,oklch(var(--pc)/1));--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-primary:hover{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}}.checkbox-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.checkbox-primary:checked,.checkbox-primary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.checkbox-secondary{--chkbg:var(--fallback-s,oklch(var(--s)/1));--chkfg:var(--fallback-sc,oklch(var(--sc)/1));--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-secondary:hover{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}}.checkbox-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.checkbox-secondary:checked,.checkbox-secondary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.checkbox-accent{--chkbg:var(--fallback-a,oklch(var(--a)/1));--chkfg:var(--fallback-ac,oklch(var(--ac)/1));--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-accent:hover{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}}.checkbox-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.checkbox-accent:checked,.checkbox-accent[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.checkbox-success{--chkbg:var(--fallback-su,oklch(var(--su)/1));--chkfg:var(--fallback-suc,oklch(var(--suc)/1));--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-success:hover{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}}.checkbox-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.checkbox-success:checked,.checkbox-success[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.checkbox-warning{--chkbg:var(--fallback-wa,oklch(var(--wa)/1));--chkfg:var(--fallback-wac,oklch(var(--wac)/1));--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-warning:hover{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}}.checkbox-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.checkbox-warning:checked,.checkbox-warning[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.checkbox-info{--chkbg:var(--fallback-in,oklch(var(--in)/1));--chkfg:var(--fallback-inc,oklch(var(--inc)/1));--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-info:hover{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}}.checkbox-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.checkbox-info:checked,.checkbox-info[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.checkbox-error{--chkbg:var(--fallback-er,oklch(var(--er)/1));--chkfg:var(--fallback-erc,oklch(var(--erc)/1));--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-error:hover{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}}.checkbox-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.checkbox-error:checked,.checkbox-error[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.checkbox:disabled{cursor:not-allowed;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.2}@keyframes checkmark{0%{background-position-y:5px}50%{background-position-y:-2px}100%{background-position-y:0}}.checkbox-mark{display:none}.collapse{width:100%;border-radius:var(--rounded-box,1rem)}details.collapse{width:100%}details.collapse summary{position:relative;display:block}details.collapse summary::-webkit-details-marker{display:none}.collapse:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}details.collapse summary{outline:2px solid transparent;outline-offset:2px}.collapse:has(.collapse-title:focus-visible),.collapse:has(>input[type=checkbox]:focus-visible),.collapse:has(>input[type=radio]:focus-visible){outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse-arrow>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;--tw-translate-y:-100%;--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:150ms;transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;top:1.9rem;inset-inline-end:1.4rem;content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.collapse-plus>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);top:.9rem;inset-inline-end:1.4rem;content:"+";pointer-events:none}.collapse:not(.collapse-open):not(.collapse-close)>.collapse-title,.collapse:not(.collapse-open):not(.collapse-close)>input[type=checkbox],.collapse:not(.collapse-open):not(.collapse-close)>input[type=radio]:not(:checked){cursor:pointer}.collapse:focus:not(.collapse-open):not(.collapse-close):not(.collapse[open])>.collapse-title{cursor:unset}.collapse-title{position:relative}:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){z-index:1}.collapse-title,:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){width:100%;padding:1rem;padding-inline-end:3rem;min-height:3.75rem;transition:background-color .2s ease-out}.collapse-content{padding-left:1rem;padding-right:1rem;cursor:unset;transition:padding .2s ease-out,background-color .2s ease-out}.collapse-open>:where(.collapse-content),.collapse:focus:not(.collapse-close)>:where(.collapse-content),.collapse:not(.collapse-close)>:where(input[type=checkbox]:checked~.collapse-content),.collapse:not(.collapse-close)>:where(input[type=radio]:checked~.collapse-content),.collapse[open]>:where(.collapse-content){padding-bottom:1rem;transition:padding .2s ease-out,background-color .2s ease-out}.collapse-arrow:focus:not(.collapse-close)>.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after,.collapse-open.collapse-arrow>.collapse-title:after,.collapse[open].collapse-arrow>.collapse-title:after{--tw-translate-y:-50%;--tw-rotate:225deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.collapse-open.collapse-plus>.collapse-title:after,.collapse-plus:focus:not(.collapse-close)>.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after,.collapse[open].collapse-plus>.collapse-title:after{content:"−"}.countdown>:before{text-align:center;transition:all 1s cubic-bezier(1,0,0,1)}.diff-item-1:after{border-radius:9999px;border-width:2px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)));background-color:var(--fallback-b1,oklch(var(--b1)/.5));--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-style:solid;outline-offset:-3px;outline-color:var(--fallback-bc,oklch(var(--bc)/.05));--tw-backdrop-blur:blur(8px);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);translate:50% -50%}.diff-item-2{border-right-width:2px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)))}.divider{margin-top:1rem;margin-bottom:1rem;height:1rem;white-space:nowrap}.divider:after,.divider:before{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.divider:not(:empty){gap:1rem}.divider-neutral:after,.divider-neutral:before{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)))}.divider-primary:after,.divider-primary:before{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.divider-secondary:after,.divider-secondary:before{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.divider-accent:after,.divider-accent:before{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.divider-success:after,.divider-success:before{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.divider-warning:after,.divider-warning:before{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.divider-info:after,.divider-info:before{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.divider-error:after,.divider-error:before{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.drawer{width:100%}.drawer-side>.drawer-overlay{cursor:pointer;background-color:transparent;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1)}.drawer-toggle:checked~.drawer-side>.drawer-overlay{background-color:#0006}.drawer-toggle:focus-visible~.drawer-content label.drawer-button{outline-style:solid;outline-width:2px;outline-offset:2px}.dropdown:is(:not(details)) .dropdown-content{transform-origin:top;--tw-scale-x:.95;--tw-scale-y:.95;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1)}.dropdown-bottom .dropdown-content{transform-origin:top}.dropdown-top .dropdown-content{transform-origin:bottom}.dropdown-left .dropdown-content{transform-origin:right}.dropdown-right .dropdown-content{transform-origin:left}.dropdown.dropdown-open .dropdown-content,.dropdown:focus .dropdown-content,.dropdown:focus-within .dropdown-content{--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@media (hover:hover){.dropdown.dropdown-hover:hover .dropdown-content{--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}}.file-input{overflow:hidden;border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));font-size:1rem;line-height:1.5rem}.file-input::file-selector-button{border-style:solid;--tw-border-opacity:1;border-color:var(--fallback-n,oklch(var(--n)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));font-weight:600;text-transform:uppercase;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));text-decoration-line:none;border-width:var(--border-btn,1px);animation:button-pop var(--animation-btn,.25s) ease-out}.file-input-bordered{--tw-border-opacity:0.2}.file-input:focus{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.file-input-ghost{--tw-bg-opacity:0.05}.file-input-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.file-input-ghost::file-selector-button{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor}.file-input-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.file-input-primary:focus{outline-color:var(--fallback-p,oklch(var(--p)/1))}.file-input-primary::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.file-input-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.file-input-secondary:focus{outline-color:var(--fallback-s,oklch(var(--s)/1))}.file-input-secondary::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.file-input-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.file-input-accent:focus{outline-color:var(--fallback-a,oklch(var(--a)/1))}.file-input-accent::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.file-input-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.file-input-info:focus{outline-color:var(--fallback-in,oklch(var(--in)/1))}.file-input-info::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.file-input-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.file-input-success:focus{outline-color:var(--fallback-su,oklch(var(--su)/1))}.file-input-success::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.file-input-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.file-input-warning:focus{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.file-input-warning::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.file-input-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.file-input-error:focus{outline-color:var(--fallback-er,oklch(var(--er)/1))}.file-input-error::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.file-input-disabled,.file-input[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));--tw-text-opacity:0.2}.file-input-disabled::placeholder,.file-input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.file-input-disabled::file-selector-button,.file-input[disabled]::file-selector-button{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}.footer{column-gap:1rem;row-gap:2.5rem;font-size:.875rem;line-height:1.25rem}.footer>*{gap:.5rem}.footer-title{margin-bottom:.5rem;font-weight:700;text-transform:uppercase;opacity:.6}.label{padding-left:.25rem;padding-right:.25rem;padding-top:.5rem;padding-bottom:.5rem}.label-text{font-size:.875rem;line-height:1.25rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.label-text-alt{font-size:.75rem;line-height:1rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}@media(hover:hover){.label a:hover{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}}.hero-overlay{background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.5}.hero-content{max-width:80rem;gap:1rem;padding:1rem}.input{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));font-size:1rem;line-height:1.5rem}.input input{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));background-color:transparent}.input input:focus{outline:2px solid transparent;outline-offset:2px}.input[list]::-webkit-calendar-picker-indicator{line-height:1em}.input-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:focus,.input:focus-within{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input-ghost{--tw-bg-opacity:0.05}.input-ghost:focus,.input-ghost:focus-within{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.input-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.input-primary:focus,.input-primary:focus-within{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.input-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.input-secondary:focus,.input-secondary:focus-within{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.input-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.input-accent:focus,.input-accent:focus-within{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.input-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.input-info:focus,.input-info:focus-within{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.input-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.input-success:focus,.input-success:focus-within{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.input-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.input-warning:focus,.input-warning:focus-within{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.input-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.input-error:focus,.input-error:focus-within{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.input-disabled,.input:disabled,.input:has(>input[disabled]),.input[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.input-disabled::placeholder,.input:disabled::placeholder,.input:has(>input[disabled])::placeholder,.input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.input:has(>input[disabled])>input[disabled]{cursor:not-allowed}.input::-webkit-date-and-time-value{text-align:inherit}.join{border-radius:var(--rounded-btn,.5rem)}.join>:where(:not(:first-child)){margin-top:0;margin-bottom:0;margin-inline-start:-1px}.join>:where(:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1)}.join-item:focus{isolation:isolate}.kbd{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2;--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:.5rem;padding-right:.5rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));border-bottom-width:2px;min-height:2.2em;min-width:2.2em}.link-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-primary:hover{color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 80%,#000)}}}.link-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-secondary:hover{color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 80%,#000)}}}.link-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-accent:hover{color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 80%,#000)}}}.link-neutral{--tw-text-opacity:1;color:var(--fallback-n,oklch(var(--n)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-neutral:hover{color:color-mix(in oklab,var(--fallback-n,oklch(var(--n)/1)) 80%,#000)}}}.link-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-success:hover{color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 80%,#000)}}}.link-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-info:hover{color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 80%,#000)}}}.link-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-warning:hover{color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 80%,#000)}}}.link-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-error:hover{color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 80%,#000)}}}.link:focus{outline:2px solid transparent;outline-offset:2px}.link:focus-visible{outline:2px solid currentColor;outline-offset:2px}.loading{pointer-events:none;display:inline-block;aspect-ratio:1/1;width:1.5rem;background-color:currentColor;mask-size:100%;mask-repeat:no-repeat;mask-position:center;mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-spinner{mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-dots{mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E")}.loading-ring{mask-image:url("data:image/svg+xml,%3Csvg width='44' height='44' viewBox='0 0 44 44' xmlns='http://www.w3.org/2000/svg' stroke='%23fff'%3E%3Cg fill='none' fill-rule='evenodd' stroke-width='2'%3E%3Ccircle cx='22' cy='22' r='1'%3E%3Canimate attributeName='r' begin='0s' dur='1.8s' values='1; 20' calcMode='spline' keyTimes='0; 1' keySplines='0.165, 0.84, 0.44, 1' repeatCount='indefinite' /%3E%3Canimate attributeName='stroke-opacity' begin='0s' dur='1.8s' values='1; 0' calcMode='spline' keyTimes='0; 1' keySplines='0.3, 0.61, 0.355, 1' repeatCount='indefinite' /%3E%3C/circle%3E%3Ccircle cx='22' cy='22' r='1'%3E%3Canimate attributeName='r' begin='-0.9s' dur='1.8s' values='1; 20' calcMode='spline' keyTimes='0; 1' keySplines='0.165, 0.84, 0.44, 1' repeatCount='indefinite' /%3E%3Canimate attributeName='stroke-opacity' begin='-0.9s' dur='1.8s' values='1; 0' calcMode='spline' keyTimes='0; 1' keySplines='0.3, 0.61, 0.355, 1' repeatCount='indefinite' /%3E%3C/circle%3E%3C/g%3E%3C/svg%3E")}.loading-ball{mask-image:url("data:image/svg+xml,%0A%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_rXNP%7Banimation:spinner_YeBj .8s infinite%7D@keyframes spinner_YeBj%7B0%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);cy:5px%7D46.875%25%7Bcy:20px;rx:4px;ry:4px%7D50%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);cy:20.5px;rx:4.8px;ry:3px%7D53.125%25%7Brx:4px;ry:4px%7D100%25%7Bcy:5px%7D%7D%3C/style%3E%3Cellipse class='spinner_rXNP' cx='12' cy='5' rx='4' ry='4'/%3E%3C/svg%3E")}.loading-bars{mask-image:url("data:image/svg+xml,%0A%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_hzlK%7Banimation:spinner_vc4H .8s linear infinite;animation-delay:-.8s%7D.spinner_koGT%7Banimation-delay:-.65s%7D.spinner_YF1u%7Banimation-delay:-.5s%7D@keyframes spinner_vc4H%7B0%25%7By:1px;height:22px%7D93.75%25%7By:5px;height:14px;opacity:.2%7D%7D%3C/style%3E%3Crect class='spinner_hzlK' x='1' y='1' width='6' height='22'/%3E%3Crect class='spinner_hzlK spinner_koGT' x='9' y='1' width='6' height='22'/%3E%3Crect class='spinner_hzlK spinner_YF1u' x='17' y='1' width='6' height='22'/%3E%3C/svg%3E")}.loading-infinity{mask-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' style='shape-rendering: auto;' width='200px' height='200px' viewBox='0 0 100 100' preserveAspectRatio='xMidYMid'%3E%3Cpath fill='none' stroke='%230a0a0a' stroke-width='10' stroke-dasharray='205.271142578125 51.317785644531256' d='M24.3 30C11.4 30 5 43.3 5 50s6.4 20 19.3 20c19.3 0 32.1-40 51.4-40 C88.6 30 95 43.3 95 50s-6.4 20-19.3 20C56.4 70 43.6 30 24.3 30z' stroke-linecap='round' style='transform:scale(0.8);transform-origin:50px 50px'%3E%3Canimate attributeName='stroke-dashoffset' repeatCount='indefinite' dur='2s' keyTimes='0;1' values='0;256.58892822265625'%3E%3C/animate%3E%3C/path%3E%3C/svg%3E")}.loading-xs{width:1rem}.loading-sm{width:1.25rem}.loading-md{width:1.5rem}.loading-lg{width:2.5rem}.mask-squircle{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M100 0C20 0 0 20 0 100s20 100 100 100 100-20 100-100S180 0 100 0Z'/%3e%3c/svg%3e")}.mask-decagon{mask-image:url("data:image/svg+xml,%3csvg width='192' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 0 58.779 19.098 36.327 50v61.804l-36.327 50L96 200l-58.779-19.098-36.327-50V69.098l36.327-50z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-diamond{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m100 0 100 100-100 100L0 100z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-heart{mask-image:url("data:image/svg+xml,%3csvg width='200' height='185' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M100 184.606a15.384 15.384 0 0 1-8.653-2.678C53.565 156.28 37.205 138.695 28.182 127.7 8.952 104.264-.254 80.202.005 54.146.308 24.287 24.264 0 53.406 0c21.192 0 35.869 11.937 44.416 21.879a2.884 2.884 0 0 0 4.356 0C110.725 11.927 125.402 0 146.594 0c29.142 0 53.098 24.287 53.4 54.151.26 26.061-8.956 50.122-28.176 73.554-9.023 10.994-25.383 28.58-63.165 54.228a15.384 15.384 0 0 1-8.653 2.673Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-hexagon{mask-image:url("data:image/svg+xml,%3csvg width='182' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M.3 65.486c0-9.196 6.687-20.063 14.211-25.078l61.86-35.946c8.36-5.016 20.899-5.016 29.258 0l61.86 35.946c8.36 5.015 14.211 15.882 14.211 25.078v71.055c0 9.196-6.687 20.063-14.211 25.079l-61.86 35.945c-8.36 4.18-20.899 4.18-29.258 0L14.51 161.62C6.151 157.44.3 145.737.3 136.54V65.486Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-hexagon-2{mask-image:url("data:image/svg+xml,%3csvg width='200' height='182' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M64.786 181.4c-9.196 0-20.063-6.687-25.079-14.21L3.762 105.33c-5.016-8.36-5.016-20.9 0-29.259l35.945-61.86C44.723 5.851 55.59 0 64.786 0h71.055c9.196 0 20.063 6.688 25.079 14.211l35.945 61.86c4.18 8.36 4.18 20.899 0 29.258l-35.945 61.86c-4.18 8.36-15.883 14.211-25.079 14.211H64.786Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-circle{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle fill='black' cx='100' cy='100' r='100' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram{mask-image:url("data:image/svg+xml,%3csvg width='200' height='154' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M46.154 0H200l-46.154 153.846H0z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-2{mask-image:url("data:image/svg+xml,%3csvg width='200' height='154' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M153.846 0H0l46.154 153.846H200z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-3{mask-image:url("data:image/svg+xml,%3csvg width='154' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M.077 47.077v153.846l153.846-46.154V.923z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-4{mask-image:url("data:image/svg+xml,%3csvg width='154' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M153.923 47.077v153.846L.077 154.77V.923z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-pentagon{mask-image:url("data:image/svg+xml,%3csvg width='192' height='181' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 0 95.106 69.098-36.327 111.804H37.22L.894 69.098z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-square{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M0 0h200v200H0z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-star{mask-image:url("data:image/svg+xml,%3csvg width='192' height='180' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 137.263-58.779 42.024 22.163-68.389L.894 68.481l72.476-.243L96 0l22.63 68.238 72.476.243-58.49 42.417 22.163 68.389z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-star-2{mask-image:url("data:image/svg+xml,%3csvg width='192' height='180' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 153.044-58.779 26.243 7.02-63.513L.894 68.481l63.117-13.01L96 0l31.989 55.472 63.117 13.01-43.347 47.292 7.02 63.513z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle{mask-image:url("data:image/svg+xml,%3csvg width='174' height='149' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m87 148.476-86.603.185L43.86 74.423 87 0l43.14 74.423 43.463 74.238z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-2{mask-image:url("data:image/svg+xml,%3csvg width='174' height='150' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m87 .738 86.603-.184-43.463 74.238L87 149.214 43.86 74.792.397.554z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-3{mask-image:url("data:image/svg+xml,%3csvg width='150' height='174' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m149.369 87.107.185 86.603-74.239-43.463L.893 87.107l74.422-43.14L149.554.505z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-4{mask-image:url("data:image/svg+xml,%3csvg width='150' height='174' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M.631 87.107.446.505l74.239 43.462 74.422 43.14-74.422 43.14L.446 173.71z' fill-rule='evenodd'/%3e%3c/svg%3e")}.menu{padding:.5rem}:where(.menuli:empty){--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;margin:.5rem 1rem;height:1px}.menu :where(liul){margin-inline-start:1rem;padding-inline-start:.5rem}.menu :where(liul):before{position:absolute;bottom:.75rem;inset-inline-start:0;top:.75rem;width:1px;--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;content:""}.menu :where(li:not(.menu-title)>:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){border-radius:var(--rounded-btn,.5rem);padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;text-align:start;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);text-wrap:balance}:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(summary,.active,.btn).focus,:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(summary,.active,.btn):focus,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn).focus,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn):focus{cursor:pointer;background-color:var(--fallback-bc,oklch(var(--bc)/.1));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));outline:2px solid transparent;outline-offset:2px}@media (hover:hover){:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{cursor:pointer;outline:2px solid transparent;outline-offset:2px}@supports (color:oklch(0% 0 0)){:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}}}.menu li>:not(ul,.menu-title,details,.btn).active,.menu li>:not(ul,.menu-title,details,.btn):active,.menu li>details>summary:active{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}@media(hover:hover){.menu li>:not(ul,.menu-title,details,.btn).active,.menu li>:not(ul,.menu-title,details,.btn):active,.menu li>details>summary:active{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}}.menu li.disabled{color:var(--fallback-bc,oklch(var(--bc)/.3))}.menu :where(li>details>summary)::-webkit-details-marker{display:none}.menu :where(li>.menu-dropdown-toggle):after,.menu :where(li>details>summary):after{justify-self:end;display:block;margin-top:-.5rem;height:.5rem;width:.5rem;transform:rotate(45deg);transition-property:transform,margin-top;transition-duration:.3s;transition-timing-function:cubic-bezier(.4,0,.2,1);content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.menu :where(li>.menu-dropdown-toggle.menu-dropdown-show):after,.menu :where(li>details[open]>summary):after{transform:rotate(225deg);margin-top:0}.menu-title{padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;font-size:.875rem;line-height:1.25rem;font-weight:700;color:var(--fallback-bc,oklch(var(--bc)/.4))}.mockup-code{min-width:18rem;border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));padding-top:1.25rem;padding-bottom:1.25rem;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));direction:ltr}.mockup-code:before{content:"";margin-bottom:1rem;display:block;height:.75rem;width:.75rem;border-radius:9999px;opacity:.3;box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-code pre{padding-right:1.25rem}.mockup-code pre:before{content:"";margin-right:2ch}.mockup-code pre[data-prefix]:before{content:attr(data-prefix);width:2rem;opacity:.5}.mockup-window{display:flex;flex-direction:column;border-radius:var(--rounded-box,1rem);padding-top:1.25rem}.mockup-window:before{content:"";margin-bottom:1rem;display:block;aspect-ratio:1/1;height:.75rem;flex-shrink:0;align-self:flex-start;border-radius:9999px;opacity:.3}.mockup-window:where([dir=rtl],[dir=rtl]*):before{align-self:flex-end}.mockup-window:before{box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-phone{display:inline-block;border:4px solid #444;border-radius:50px;background-color:#000;padding:10px;margin:0 auto;overflow:hidden}.mockup-phone .camera{position:relative;top:0;left:0;background:#000;height:25px;width:150px;margin:0 auto;border-bottom-left-radius:17px;border-bottom-right-radius:17px;z-index:11}.mockup-phone .camera:before{content:"";position:absolute;top:35%;left:50%;width:50px;height:4px;border-radius:5px;background-color:#0c0b0e;transform:translate(-50%,-50%)}.mockup-phone .camera:after{content:"";position:absolute;top:20%;left:70%;width:8px;height:8px;border-radius:5px;background-color:#0f0b25}.mockup-phone .display{overflow:hidden;border-radius:40px;margin-top:-25px}.mockup-browser{border-radius:var(--rounded-box,1rem)}.mockup-browser .mockup-browser-toolbar{margin-top:.75rem;margin-bottom:.75rem;display:inline-flex;width:100%;align-items:center;padding-right:1.4em}.mockup-browser .mockup-browser-toolbar:where([dir=rtl],[dir=rtl]*){flex-direction:row-reverse}.mockup-browser .mockup-browser-toolbar:before{content:"";margin-right:4.8rem;display:inline-block;aspect-ratio:1/1;height:.75rem;border-radius:9999px;opacity:.3;box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-browser .mockup-browser-toolbar .input{position:relative;margin-left:auto;margin-right:auto;display:block;height:1.75rem;width:24rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:2rem;direction:ltr}.mockup-browser .mockup-browser-toolbar .input:before{content:"";position:absolute;left:.5rem;top:50%;aspect-ratio:1/1;height:.75rem;--tw-translate-y:-50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:2px;border-color:currentColor;opacity:.6}.mockup-browser .mockup-browser-toolbar .input:after{content:"";position:absolute;left:1.25rem;top:50%;height:.5rem;--tw-translate-y:25%;--tw-rotate:-45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:1px;border-color:currentColor;opacity:.6}.modal{background-color:transparent;color:inherit;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity,visibility;overflow-y:hidden;overscroll-behavior:contain}.modal::backdrop,.modal:not(dialog:not(.modal-open)){background-color:#0006;animation:modal-pop .2s ease-out}.modal-backdrop{z-index:-1;grid-column-start:1;grid-row-start:1;display:grid;align-self:stretch;justify-self:stretch;color:transparent}.modal-box{grid-column-start:1;grid-row-start:1;width:91.666667%;max-width:32rem;--tw-scale-x:.9;--tw-scale-y:.9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box,1rem);border-bottom-left-radius:var(--rounded-box,1rem);border-top-left-radius:var(--rounded-box,1rem);border-top-right-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding:1.5rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);box-shadow:rgba(0,0,0,.25) 0 25px 50px -12px;overflow-y:auto;overscroll-behavior:contain}.modal-open .modal-box,.modal-toggle:checked+.modal .modal-box,.modal:target .modal-box,.modal[open] .modal-box{--tw-translate-y:0px;--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.modal-action{margin-top:1.5rem;justify-content:flex-end}.modal-action>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}@keyframes modal-pop{0%{opacity:0}}.navbar{padding:var(--navbar-padding,.5rem);min-height:4rem;width:100%}.progress{height:.5rem;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.progress::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)))}.progress-primary::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.progress-secondary::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.progress-accent::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.progress-info::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.progress-success::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.progress-warning::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.progress-error::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.progress:indeterminate{--progress-color:var(--fallback-bc,oklch(var(--bc)/1))}.progress-primary:indeterminate{--progress-color:var(--fallback-p,oklch(var(--p)/1))}.progress-secondary:indeterminate{--progress-color:var(--fallback-s,oklch(var(--s)/1))}.progress-accent:indeterminate{--progress-color:var(--fallback-a,oklch(var(--a)/1))}.progress-info:indeterminate{--progress-color:var(--fallback-in,oklch(var(--in)/1))}.progress-success:indeterminate{--progress-color:var(--fallback-su,oklch(var(--su)/1))}.progress-warning:indeterminate{--progress-color:var(--fallback-wa,oklch(var(--wa)/1))}.progress-error:indeterminate{--progress-color:var(--fallback-er,oklch(var(--er)/1))}.progress::-webkit-progress-bar{border-radius:var(--rounded-box,1rem);background-color:transparent}.progress::-webkit-progress-value{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)))}.progress-primary::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.progress-secondary::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.progress-accent::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.progress-info::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.progress-success::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.progress-warning::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.progress-error::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.progress:indeterminate{background-image:repeating-linear-gradient(90deg,var(--progress-color) -1%,var(--progress-color) 10%,transparent 10%,transparent 90%);background-size:200%;background-position-x:15%;animation:progress-loading 5s ease-in-out infinite}.progress:indeterminate::-moz-progress-bar{background-color:transparent;background-image:repeating-linear-gradient(90deg,var(--progress-color) -1%,var(--progress-color) 10%,transparent 10%,transparent 90%);background-size:200%;background-position-x:15%;animation:progress-loading 5s ease-in-out infinite}@keyframes progress-loading{50%{background-position-x:-115%}}.radial-progress{--value:0;--size:5rem;--thickness:calc(var(--size) / 10)}.radial-progress:after{background-color:currentColor}.radio{--chkbg:var(--bc);height:1.5rem;width:1.5rem;cursor:pointer;appearance:none;border-radius:9999px;border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2}.radio:focus{box-shadow:none}.radio:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.radio:checked,.radio[aria-checked=true]{--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-image:none;animation:radiomark var(--animation-input,.2s) ease-out;box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}.radio-primary{--chkbg:var(--p);--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}@media(hover:hover){.radio-primary:hover{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}}.radio-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.radio-primary:checked,.radio-primary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.radio-secondary{--chkbg:var(--s);--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}@media(hover:hover){.radio-secondary:hover{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}}.radio-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.radio-secondary:checked,.radio-secondary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.radio-accent{--chkbg:var(--a);--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}@media(hover:hover){.radio-accent:hover{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}}.radio-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.radio-accent:checked,.radio-accent[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.radio-success{--chkbg:var(--su);--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}@media(hover:hover){.radio-success:hover{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}}.radio-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.radio-success:checked,.radio-success[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.radio-warning{--chkbg:var(--wa);--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}@media(hover:hover){.radio-warning:hover{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}}.radio-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.radio-warning:checked,.radio-warning[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.radio-info{--chkbg:var(--in);--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}@media(hover:hover){.radio-info:hover{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}}.radio-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.radio-info:checked,.radio-info[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.radio-error{--chkbg:var(--er);--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}@media(hover:hover){.radio-error:hover{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}}.radio-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.radio-error:checked,.radio-error[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.radio:disabled{cursor:not-allowed;opacity:.2}@keyframes radiomark{0%{box-shadow:0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset}50%{box-shadow:0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset}100%{box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}}.radio-mark{display:none}.range{appearance:none;-webkit-appearance:none;--range-shdw:var(--fallback-bc,oklch(var(--bc)/1));overflow:hidden;border-radius:var(--rounded-box,1rem);background-color:transparent}.range:focus-visible::-webkit-slider-thumb{--focus-shadow:0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 2rem var(--range-shdw) inset}.range:focus-visible::-moz-range-thumb{--focus-shadow:0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 2rem var(--range-shdw) inset}.range::-webkit-slider-runnable-track{height:.5rem;width:100%;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-moz-range-track{height:.5rem;width:100%;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-webkit-slider-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box,1rem);border-style:none;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));appearance:none;-webkit-appearance:none;top:50%;color:var(--range-shdw);transform:translateY(-50%);--filler-size:100rem;--filler-offset:0.6rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow,0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range::-moz-range-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box,1rem);border-style:none;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));top:50%;color:var(--range-shdw);--filler-size:100rem;--filler-offset:0.5rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow,0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range-primary{--range-shdw:var(--fallback-p,oklch(var(--p)/1))}.range-secondary{--range-shdw:var(--fallback-s,oklch(var(--s)/1))}.range-accent{--range-shdw:var(--fallback-a,oklch(var(--a)/1))}.range-success{--range-shdw:var(--fallback-su,oklch(var(--su)/1))}.range-warning{--range-shdw:var(--fallback-wa,oklch(var(--wa)/1))}.range-info{--range-shdw:var(--fallback-in,oklch(var(--in)/1))}.range-error{--range-shdw:var(--fallback-er,oklch(var(--er)/1))}.rating input{appearance:none;-webkit-appearance:none}.rating :where(input){animation:rating-pop var(--animation-input,.25s) ease-out;height:1.5rem;width:1.5rem;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-bg-opacity:1}.rating .rating-hidden{width:.5rem;background-color:transparent}.rating input[type=radio]:checked{background-image:none}.rating input:checked~input,.rating input[aria-checked=true]~input{--tw-bg-opacity:0.2}.rating input:focus-visible{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);transform:translateY(-.125em)}.rating input:active:focus{animation:none;transform:translateY(-.125em)}.rating-half :where(input:not(.rating-hidden)){width:.75rem}@keyframes rating-pop{0%{transform:translateY(-.125em)}40%{transform:translateY(-.125em)}100%{transform:translateY(0)}}.select{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding-inline-end:2.5rem}.select-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.select{background-image:linear-gradient(45deg,transparent 50%,currentColor 50%),linear-gradient(135deg,currentColor 50%,transparent 50%);background-position:calc(100% - 20px) calc(1px + 50%),calc(100% - 16.1px) calc(1px + 50%);background-size:4px 4px,4px 4px;background-repeat:no-repeat}.select:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.select-ghost{--tw-bg-opacity:0.05}.select-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.select-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.select-primary:focus{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.select-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.select-secondary:focus{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.select-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.select-accent:focus{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.select-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.select-info:focus{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.select-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.select-success:focus{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.select-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.select-warning:focus{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.select-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.select-error:focus{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.select-disabled,.select:disabled,.select[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.select-disabled::placeholder,.select:disabled::placeholder,.select[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.select-multiple,.select[multiple],.select[size].select:not([size="1"]){background-image:none;padding-right:1rem}[dir=rtl] .select{background-position:calc(0% + 12px) calc(1px + 50%),calc(0% + 16px) calc(1px + 50%)}.skeleton{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));will-change:background-position;animation:skeleton 1.8s ease-in-out infinite;background-image:linear-gradient(105deg,transparent 0,transparent 40%,var(--fallback-b1,oklch(var(--b1)/1)) 50%,transparent 60%,transparent 100%);background-size:200% auto;background-repeat:no-repeat;background-position-x:-50%}@media (prefers-reduced-motion){.skeleton{animation-duration:15s}}@keyframes skeleton{from{background-position:150%}to{background-position:-50%}}.stack{place-items:center;align-items:flex-end}.stack>*{width:100%;opacity:.6}.stack>:nth-child(2){opacity:.8}.stack>:nth-child(1){opacity:1}.stats{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}:where(.stats)>:not([hidden])~:not([hidden]){--tw-divide-x-reverse:0;border-right-width:calc(1px * var(--tw-divide-x-reverse));border-left-width:calc(1px * calc(1 - var(--tw-divide-x-reverse)));--tw-divide-y-reverse:0;border-top-width:calc(0px * calc(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(0px * var(--tw-divide-y-reverse))}:where(.stats){overflow-x:auto}[dir=rtl] .stats>:not([hidden])~:not([hidden]){--tw-divide-x-reverse:1}.stat{column-gap:1rem;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.1;padding-left:1.5rem;padding-right:1.5rem;padding-top:1rem;padding-bottom:1rem}.stat-title{color:var(--fallback-bc,oklch(var(--bc)/.6))}.stat-value{font-size:2.25rem;line-height:2.5rem;font-weight:800}.stat-desc{font-size:.75rem;line-height:1rem;color:var(--fallback-bc,oklch(var(--bc)/.6))}.stat-actions{margin-top:1rem}.steps .step{grid-template-rows:40px 1fr;grid-template-columns:auto;min-width:4rem}.steps .step:before{top:0;grid-column-start:1;grid-row-start:1;height:.5rem;width:100%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));content:"";margin-inline-start:-100%}.steps .step:after{content:counter(step);counter-increment:step;z-index:1;position:relative;grid-column-start:1;grid-row-start:1;display:grid;height:2rem;width:2rem;place-items:center;place-self:center;border-radius:9999px;--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.steps .step:first-child:before{content:none}.steps .step[data-content]:after{content:attr(data-content)}.steps .step-neutral+.step-neutral:before,.steps .step-neutral:after{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.steps .step-primary+.step-primary:before,.steps .step-primary:after{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.steps .step-secondary+.step-secondary:before,.steps .step-secondary:after{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.steps .step-accent+.step-accent:before,.steps .step-accent:after{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.steps .step-info+.step-info:before{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.steps .step-info:after{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.steps .step-success+.step-success:before{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.steps .step-success:after{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.steps .step-warning+.step-warning:before{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.steps .step-warning:after{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.steps .step-error+.step-error:before{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.steps .step-error:after{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.swap{cursor:pointer}.swap>*{transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity}.swap-rotate .swap-indeterminate,.swap-rotate .swap-on,.swap-rotate input:indeterminate~.swap-on{--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-active:where(.swap-rotate) .swap-off,.swap-rotate input:checked~.swap-off,.swap-rotate input:indeterminate~.swap-off{--tw-rotate:-45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-active:where(.swap-rotate) .swap-on,.swap-rotate input:checked~.swap-on,.swap-rotate input:indeterminate~.swap-indeterminate{--tw-rotate:0deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-flip{transform-style:preserve-3d;perspective:16em}.swap-flip .swap-indeterminate,.swap-flip .swap-on,.swap-flip input:indeterminate~.swap-on{transform:rotateY(180deg);backface-visibility:hidden;opacity:1}.swap-active:where(.swap-flip) .swap-off,.swap-flip input:checked~.swap-off,.swap-flip input:indeterminate~.swap-off{transform:rotateY(-180deg);backface-visibility:hidden;opacity:1}.swap-active:where(.swap-flip) .swap-on,.swap-flip input:checked~.swap-on,.swap-flip input:indeterminate~.swap-indeterminate{transform:rotateY(0)}.tabs-lifted>.tab:focus-visible{border-end-end-radius:0;border-end-start-radius:0}.tab{--tw-text-opacity:0.5}@media(hover:hover){.tab:hover{--tw-text-opacity:1}}.tab{--tab-color:var(--fallback-bc,oklch(var(--bc)/1));--tab-bg:var(--fallback-b1,oklch(var(--b1)/1));--tab-border-color:var(--fallback-b3,oklch(var(--b3)/1));color:var(--tab-color);padding-inline-start:var(--tab-padding,1rem);padding-inline-end:var(--tab-padding,1rem)}.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tab:is(input:checked){border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:1;--tw-text-opacity:1}.tab:focus{outline:2px solid transparent;outline-offset:2px}.tab:focus-visible{outline:2px solid currentColor;outline-offset:-5px}.tab-disabled,.tab[disabled]{cursor:not-allowed;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.tab[disabled],.tab[disabled]:hover{cursor:not-allowed;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.tabs-bordered>.tab{border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2;border-style:solid;border-bottom-width:calc(var(--tab-border,1px) + 1px)}.tabs-lifted>.tab{border:var(--tab-border,1px) solid transparent;border-width:0 0 var(--tab-border,1px) 0;border-start-start-radius:var(--tab-radius,.5rem);border-start-end-radius:var(--tab-radius,.5rem);border-bottom-color:var(--tab-border-color);padding-inline-start:var(--tab-padding,1rem);padding-inline-end:var(--tab-padding,1rem);padding-top:var(--tab-border,1px)}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tabs-lifted>.tab:is(input:checked){background-color:var(--tab-bg);border-width:var(--tab-border,1px) var(--tab-border,1px) 0 var(--tab-border,1px);border-inline-start-color:var(--tab-border-color);border-inline-end-color:var(--tab-border-color);border-top-color:var(--tab-border-color);padding-inline-start:calc(var(--tab-padding,1rem) - var(--tab-border,1px));padding-inline-end:calc(var(--tab-padding,1rem) - var(--tab-border,1px));padding-bottom:var(--tab-border,1px);padding-top:0}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):before,.tabs-lifted>.tab:is(input:checked):before{z-index:1;content:"";display:block;position:absolute;width:calc(100% + var(--tab-radius,.5rem) * 2);height:var(--tab-radius,.5rem);bottom:0;background-size:var(--tab-radius,.5rem);background-position:top left,top right;background-repeat:no-repeat;--tab-grad:calc(69% - var(--tab-border, 1px));--radius-start:radial-gradient(
+        circle at top left,
+        transparent var(--tab-grad),
+        var(--tab-border-color) calc(var(--tab-grad) + 0.25px),
+        var(--tab-border-color) calc(var(--tab-grad) + var(--tab-border, 1px)),
+        var(--tab-bg) calc(var(--tab-grad) + var(--tab-border, 1px) + 0.25px)
+      );--radius-end:radial-gradient(
+        circle at top right,
+        transparent var(--tab-grad),
+        var(--tab-border-color) calc(var(--tab-grad) + 0.25px),
+        var(--tab-border-color) calc(var(--tab-grad) + var(--tab-border, 1px)),
+        var(--tab-bg) calc(var(--tab-grad) + var(--tab-border, 1px) + 0.25px)
+      );background-image:var(--radius-start),var(--radius-end)}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):first-child:before,.tabs-lifted>.tab:is(input:checked):first-child:before{background-image:var(--radius-end);background-position:top right}[dir=rtl] .tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):first-child:before,[dir=rtl] .tabs-lifted>.tab:is(input:checked):first-child:before{background-image:var(--radius-start);background-position:top left}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):last-child:before,.tabs-lifted>.tab:is(input:checked):last-child:before{background-image:var(--radius-start);background-position:top left}[dir=rtl] .tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):last-child:before,[dir=rtl] .tabs-lifted>.tab:is(input:checked):last-child:before{background-image:var(--radius-end);background-position:top right}.tabs-lifted>.tab:is(input:checked)+.tabs-lifted .tab:is(input:checked):before,.tabs-lifted>:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled])+.tabs-lifted :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):before{background-image:var(--radius-end);background-position:top right}.tabs-boxed{border-radius:var(--rounded-btn,.5rem);--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding:.25rem}.tabs-boxed .tab{border-radius:var(--rounded-btn,.5rem)}.tabs-boxed :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tabs-boxed :is(input:checked){--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@media(hover:hover){.tabs-boxed :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):hover,.tabs-boxed :is(input:checked):hover{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}}.table{border-radius:var(--rounded-box,1rem);text-align:left;font-size:.875rem;line-height:1.25rem}.table:where([dir=rtl],[dir=rtl]*){text-align:right}.table :where(th,td){padding-left:1rem;padding-right:1rem;padding-top:.75rem;padding-bottom:.75rem;vertical-align:middle}.table tr.active,.table tr.active:nth-child(even),.table-zebra tbody tr:nth-child(even){--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}@media(hover:hover){.table tr.hover:hover,.table tr.hover:nth-child(even):hover{--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}}.table-zebra tr.active,.table-zebra tr.active:nth-child(even),.table-zebra-zebra tbody tr:nth-child(even){--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@media(hover:hover){.table-zebra tr.hover:hover,.table-zebra tr.hover:nth-child(even):hover{--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}}.table :where(theadtr,tbodytr:not(:last-child),tbodytr:first-child:last-child){border-bottom-width:1px;--tw-border-opacity:1;border-bottom-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.table :where(thead,tfoot){white-space:nowrap;font-size:.75rem;line-height:1rem;font-weight:700;color:var(--fallback-bc,oklch(var(--bc)/.6))}.table :where(tfoot){border-top-width:1px;--tw-border-opacity:1;border-top-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.textarea{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.textarea-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea-ghost{--tw-bg-opacity:0.05}.textarea-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.textarea-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.textarea-primary:focus{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.textarea-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.textarea-secondary:focus{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.textarea-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.textarea-accent:focus{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.textarea-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.textarea-info:focus{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.textarea-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.textarea-success:focus{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.textarea-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.textarea-warning:focus{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.textarea-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.textarea-error:focus{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.textarea-disabled,.textarea:disabled,.textarea[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.textarea-disabled::placeholder,.textarea:disabled::placeholder,.textarea[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.timeline hr{height:.25rem}:where(.timelinehr){--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}:where(.timeline:has(.timeline-middle)hr):first-child{border-start-end-radius:var(--rounded-badge,1.9rem);border-end-end-radius:var(--rounded-badge,1.9rem);border-start-start-radius:0;border-end-start-radius:0}:where(.timeline:has(.timeline-middle)hr):last-child{border-start-start-radius:var(--rounded-badge,1.9rem);border-end-start-radius:var(--rounded-badge,1.9rem);border-start-end-radius:0;border-end-end-radius:0}:where(.timeline:not(:has(.timeline-middle)):first-childhr:last-child){border-start-start-radius:var(--rounded-badge,1.9rem);border-end-start-radius:var(--rounded-badge,1.9rem);border-start-end-radius:0;border-end-end-radius:0}:where(.timeline:not(:has(.timeline-middle)):last-childhr:first-child){border-start-end-radius:var(--rounded-badge,1.9rem);border-end-end-radius:var(--rounded-badge,1.9rem);border-start-start-radius:0;border-end-start-radius:0}.timeline-box{border-radius:var(--rounded-box,1rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.toast{gap:.5rem;padding:1rem}.toast>*{animation:toast-pop .25s ease-out}@keyframes toast-pop{0%{transform:scale(.9);opacity:0}100%{transform:scale(1);opacity:1}}.toggle{--tglbg:var(--fallback-b1,oklch(var(--b1)/1));--handleoffset:1.5rem;--handleoffsetcalculator:calc(var(--handleoffset) * -1);--togglehandleborder:0 0;height:1.5rem;width:3rem;cursor:pointer;appearance:none;border-radius:var(--rounded-badge,1.9rem);border-width:1px;border-color:currentColor;background-color:currentColor;color:var(--fallback-bc,oklch(var(--bc)/.5));transition:background,box-shadow var(--animation-input,.2s) ease-out;box-shadow:var(--handleoffsetcalculator) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset,var(--togglehandleborder)}[dir=rtl] .toggle{--handleoffsetcalculator:calc(var(--handleoffset) * 1)}.toggle:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.toggle:hover{background-color:currentColor}.toggle:checked,.toggle[aria-checked=true]{background-image:none;--handleoffsetcalculator:var(--handleoffset);--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}[dir=rtl] .toggle:checked,[dir=rtl] .toggle[aria-checked=true]{--handleoffsetcalculator:calc(var(--handleoffset) * -1)}.toggle:indeterminate{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:calc(var(--handleoffset)/ 2) 0 0 2px var(--tglbg) inset,calc(var(--handleoffset)/ -2) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset}[dir=rtl] .toggle:indeterminate{box-shadow:calc(var(--handleoffset)/ 2) 0 0 2px var(--tglbg) inset,calc(var(--handleoffset)/ -2) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset}.toggle-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.toggle-primary:checked,.toggle-primary[aria-checked=true]{border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.toggle-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.toggle-secondary:checked,.toggle-secondary[aria-checked=true]{border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.toggle-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.toggle-accent:checked,.toggle-accent[aria-checked=true]{border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.toggle-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.toggle-success:checked,.toggle-success[aria-checked=true]{border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.toggle-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.toggle-warning:checked,.toggle-warning[aria-checked=true]{border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.toggle-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.toggle-info:checked,.toggle-info[aria-checked=true]{border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.toggle-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.toggle-error:checked,.toggle-error[aria-checked=true]{border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.toggle:disabled{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));background-color:transparent;opacity:.3;--togglehandleborder:0 0 0 3px var(--fallback-bc,oklch(var(--bc)/1)) inset,var(--handleoffsetcalculator) 0 0 3px var(--fallback-bc,oklch(var(--bc)/1)) inset}.toggle-mark{display:none}:root .prose{--tw-prose-body:var(--fallback-bc,oklch(var(--bc)/0.8));--tw-prose-headings:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-lead:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-links:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-bold:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-counters:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-bullets:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-hr:var(--fallback-bc,oklch(var(--bc)/0.2));--tw-prose-quotes:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-quote-borders:var(--fallback-bc,oklch(var(--bc)/0.2));--tw-prose-captions:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-code:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-pre-code:var(--fallback-nc,oklch(var(--nc)/1));--tw-prose-pre-bg:var(--fallback-n,oklch(var(--n)/1));--tw-prose-th-borders:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-td-borders:var(--fallback-bc,oklch(var(--bc)/0.2))}.prose :where(code):not(:where([class~=not-prose]*,pre*)){padding:1px 8px;border-radius:var(--rounded-badge);font-weight:initial;background-color:var(--fallback-bc,oklch(var(--bc)/.1))}@supports not (color:oklch(0% 0 0)){.prose :where(code):not(:where([class~=not-prose]*,pre*)){background-color:var(--fallback-b3,oklch(var(--b3)/1))}}.prose :where(code):not(:where([class~=not-prose],[class~=not-prose]*))::after,.prose :where(code):not(:where([class~=not-prose],[class~=not-prose]*))::before{display:none}.prose pre code{border-radius:0;padding:0}.prose :where(tbodytr,thead):not(:where([class~=not-prose]*)){border-bottom-color:var(--fallback-bc,oklch(var(--bc)/.2))}:root{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}@media (prefers-color-scheme:dark){:root{color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}}[data-theme=light]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}:root:has(input.theme-controller[value=light]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}[data-theme=dark]{color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}:root:has(input.theme-controller[value=dark]:checked){color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}[data-theme=cupcake]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:15.2344% 0.017892 200.026556;--sc:15.787% 0.020249 356.29965;--ac:15.8762% 0.029206 78.618794;--nc:84.7148% 0.013247 313.189598;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--p:76.172% 0.089459 200.026556;--s:78.9351% 0.101246 356.29965;--a:79.3811% 0.146032 78.618794;--n:23.5742% 0.066235 313.189598;--b1:97.7882% 0.00418 56.375637;--b2:93.9822% 0.007638 61.449292;--b3:91.5861% 0.006811 53.440502;--bc:23.5742% 0.066235 313.189598;--rounded-btn:1.9rem;--tab-border:2px;--tab-radius:0.7rem}:root:has(input.theme-controller[value=cupcake]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:15.2344% 0.017892 200.026556;--sc:15.787% 0.020249 356.29965;--ac:15.8762% 0.029206 78.618794;--nc:84.7148% 0.013247 313.189598;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--p:76.172% 0.089459 200.026556;--s:78.9351% 0.101246 356.29965;--a:79.3811% 0.146032 78.618794;--n:23.5742% 0.066235 313.189598;--b1:97.7882% 0.00418 56.375637;--b2:93.9822% 0.007638 61.449292;--b3:91.5861% 0.006811 53.440502;--bc:23.5742% 0.066235 313.189598;--rounded-btn:1.9rem;--tab-border:2px;--tab-radius:0.7rem}[data-theme=bumblebee]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--ac:16.254% 0.0314 56.52;--nc:82.55% 0.015 281.99;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:89.51% 0.2132 96.61;--pc:38.92% 0.046 96.61;--s:80.39% 0.194 70.76;--sc:39.38% 0.068 70.76;--a:81.27% 0.157 56.52;--n:12.75% 0.075 281.99;--b1:100% 0 0}:root:has(input.theme-controller[value=bumblebee]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--ac:16.254% 0.0314 56.52;--nc:82.55% 0.015 281.99;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:89.51% 0.2132 96.61;--pc:38.92% 0.046 96.61;--s:80.39% 0.194 70.76;--sc:39.38% 0.068 70.76;--a:81.27% 0.157 56.52;--n:12.75% 0.075 281.99;--b1:100% 0 0}[data-theme=emerald]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:76.6626% 0.135433 153.450024;--pc:33.3872% 0.040618 162.240129;--s:61.3028% 0.202368 261.294233;--sc:100% 0 0;--a:72.7725% 0.149783 33.200363;--ac:0% 0 0;--n:35.5192% 0.032071 262.988584;--nc:98.4625% 0.001706 247.838921;--b1:100% 0 0;--bc:35.5192% 0.032071 262.988584;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=emerald]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:76.6626% 0.135433 153.450024;--pc:33.3872% 0.040618 162.240129;--s:61.3028% 0.202368 261.294233;--sc:100% 0 0;--a:72.7725% 0.149783 33.200363;--ac:0% 0 0;--n:35.5192% 0.032071 262.988584;--nc:98.4625% 0.001706 247.838921;--b1:100% 0 0;--bc:35.5192% 0.032071 262.988584;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=corporate]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:12.078% 0.0456 269.1;--sc:13.0739% 0.010951 256.688055;--ac:15.3934% 0.022799 163.57888;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--border-btn:1px;--tab-border:1px;--p:60.39% 0.228 269.1;--s:65.3694% 0.054756 256.688055;--a:76.9669% 0.113994 163.57888;--n:22.3899% 0.031305 278.07229;--nc:95.8796% 0.008588 247.915135;--b1:100% 0 0;--bc:22.3899% 0.031305 278.07229;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem;--tab-radius:0.25rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=corporate]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:12.078% 0.0456 269.1;--sc:13.0739% 0.010951 256.688055;--ac:15.3934% 0.022799 163.57888;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--border-btn:1px;--tab-border:1px;--p:60.39% 0.228 269.1;--s:65.3694% 0.054756 256.688055;--a:76.9669% 0.113994 163.57888;--n:22.3899% 0.031305 278.07229;--nc:95.8796% 0.008588 247.915135;--b1:100% 0 0;--bc:22.3899% 0.031305 278.07229;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem;--tab-radius:0.25rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=synthwave]{color-scheme:dark;--b2:20.2941% 0.076211 287.835609;--b3:18.7665% 0.070475 287.835609;--pc:14.4421% 0.031903 342.009383;--sc:15.6543% 0.02362 227.382405;--ac:17.608% 0.0412 93.72;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:72.2105% 0.159514 342.009383;--s:78.2714% 0.118101 227.382405;--a:88.04% 0.206 93.72;--n:25.5554% 0.103537 286.507967;--nc:97.9365% 0.00819 301.358346;--b1:21.8216% 0.081948 287.835609;--bc:97.9365% 0.00819 301.358346;--in:76.5197% 0.12273 231.831603;--inc:23.5017% 0.096418 290.329844;--su:86.0572% 0.115038 178.624677;--suc:23.5017% 0.096418 290.329844;--wa:85.531% 0.122117 93.722227;--wac:23.5017% 0.096418 290.329844;--er:73.7005% 0.121339 32.639257;--erc:23.5017% 0.096418 290.329844}:root:has(input.theme-controller[value=synthwave]:checked){color-scheme:dark;--b2:20.2941% 0.076211 287.835609;--b3:18.7665% 0.070475 287.835609;--pc:14.4421% 0.031903 342.009383;--sc:15.6543% 0.02362 227.382405;--ac:17.608% 0.0412 93.72;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:72.2105% 0.159514 342.009383;--s:78.2714% 0.118101 227.382405;--a:88.04% 0.206 93.72;--n:25.5554% 0.103537 286.507967;--nc:97.9365% 0.00819 301.358346;--b1:21.8216% 0.081948 287.835609;--bc:97.9365% 0.00819 301.358346;--in:76.5197% 0.12273 231.831603;--inc:23.5017% 0.096418 290.329844;--su:86.0572% 0.115038 178.624677;--suc:23.5017% 0.096418 290.329844;--wa:85.531% 0.122117 93.722227;--wac:23.5017% 0.096418 290.329844;--er:73.7005% 0.121339 32.639257;--erc:23.5017% 0.096418 290.329844}[data-theme=retro]{color-scheme:light;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:76.8664% 0.104092 22.664655;--pc:26.5104% 0.006243 0.522862;--s:80.7415% 0.052534 159.094608;--sc:26.5104% 0.006243 0.522862;--a:70.3919% 0.125455 52.953428;--ac:26.5104% 0.006243 0.522862;--n:28.4181% 0.009519 355.534017;--nc:92.5604% 0.025113 89.217311;--b1:91.6374% 0.034554 90.51575;--b2:88.2722% 0.049418 91.774344;--b3:84.133% 0.065952 90.856665;--bc:26.5104% 0.006243 0.522862;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33;--rounded-box:0.4rem;--rounded-btn:0.4rem;--rounded-badge:0.4rem;--tab-radius:0.4rem}:root:has(input.theme-controller[value=retro]:checked){color-scheme:light;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:76.8664% 0.104092 22.664655;--pc:26.5104% 0.006243 0.522862;--s:80.7415% 0.052534 159.094608;--sc:26.5104% 0.006243 0.522862;--a:70.3919% 0.125455 52.953428;--ac:26.5104% 0.006243 0.522862;--n:28.4181% 0.009519 355.534017;--nc:92.5604% 0.025113 89.217311;--b1:91.6374% 0.034554 90.51575;--b2:88.2722% 0.049418 91.774344;--b3:84.133% 0.065952 90.856665;--bc:26.5104% 0.006243 0.522862;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33;--rounded-box:0.4rem;--rounded-btn:0.4rem;--rounded-badge:0.4rem;--tab-radius:0.4rem}[data-theme=cyberpunk]{color-scheme:light;--b2:87.8943% 0.16647 104.32;--b3:81.2786% 0.15394 104.32;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:18.902% 0.0358 104.32;--pc:14.844% 0.0418 6.35;--sc:16.666% 0.0368 204.72;--ac:14.372% 0.04352 310.43;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p:74.22% 0.209 6.35;--s:83.33% 0.184 204.72;--a:71.86% 0.2176 310.43;--n:23.04% 0.065 269.31;--nc:94.51% 0.179 104.32;--b1:94.51% 0.179 104.32;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--tab-radius:0}:root:has(input.theme-controller[value=cyberpunk]:checked){color-scheme:light;--b2:87.8943% 0.16647 104.32;--b3:81.2786% 0.15394 104.32;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:18.902% 0.0358 104.32;--pc:14.844% 0.0418 6.35;--sc:16.666% 0.0368 204.72;--ac:14.372% 0.04352 310.43;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p:74.22% 0.209 6.35;--s:83.33% 0.184 204.72;--a:71.86% 0.2176 310.43;--n:23.04% 0.065 269.31;--nc:94.51% 0.179 104.32;--b1:94.51% 0.179 104.32;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--tab-radius:0}[data-theme=valentine]{color-scheme:light;--b2:88.0567% 0.024834 337.06289;--b3:81.4288% 0.022964 337.06289;--pc:13.7239% 0.030755 15.066527;--sc:14.3942% 0.029258 293.189609;--ac:14.2537% 0.014961 197.828857;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.614% 0.0414 27.33;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:68.6197% 0.153774 15.066527;--s:71.971% 0.14629 293.189609;--a:71.2685% 0.074804 197.828857;--n:54.6053% 0.143342 358.004839;--nc:90.2701% 0.037202 336.955191;--b1:94.6846% 0.026703 337.06289;--bc:37.3085% 0.081131 4.606426;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.07% 0.207 27.33;--rounded-btn:1.9rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=valentine]:checked){color-scheme:light;--b2:88.0567% 0.024834 337.06289;--b3:81.4288% 0.022964 337.06289;--pc:13.7239% 0.030755 15.066527;--sc:14.3942% 0.029258 293.189609;--ac:14.2537% 0.014961 197.828857;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.614% 0.0414 27.33;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:68.6197% 0.153774 15.066527;--s:71.971% 0.14629 293.189609;--a:71.2685% 0.074804 197.828857;--n:54.6053% 0.143342 358.004839;--nc:90.2701% 0.037202 336.955191;--b1:94.6846% 0.026703 337.06289;--bc:37.3085% 0.081131 4.606426;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.07% 0.207 27.33;--rounded-btn:1.9rem;--tab-radius:0.7rem}[data-theme=halloween]{color-scheme:dark;--b2:23.0416% 0 0;--b3:21.3072% 0 0;--bc:84.9552% 0 0;--sc:89.196% 0.0496 305.03;--nc:84.8742% 0.009322 65.681484;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:77.48% 0.204 60.62;--pc:19.6935% 0.004671 196.779412;--s:45.98% 0.248 305.03;--a:64.8% 0.223 136.073479;--ac:0% 0 0;--n:24.371% 0.046608 65.681484;--b1:24.7759% 0 0;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33}:root:has(input.theme-controller[value=halloween]:checked){color-scheme:dark;--b2:23.0416% 0 0;--b3:21.3072% 0 0;--bc:84.9552% 0 0;--sc:89.196% 0.0496 305.03;--nc:84.8742% 0.009322 65.681484;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:77.48% 0.204 60.62;--pc:19.6935% 0.004671 196.779412;--s:45.98% 0.248 305.03;--a:64.8% 0.223 136.073479;--ac:0% 0 0;--n:24.371% 0.046608 65.681484;--b1:24.7759% 0 0;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33}[data-theme=garden]{color-scheme:light;--b2:86.4453% 0.002011 17.197414;--b3:79.9386% 0.00186 17.197414;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--sc:89.699% 0.022197 355.095988;--ac:11.2547% 0.010859 154.390187;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:62.45% 0.278 3.83636;--pc:100% 0 0;--s:48.4952% 0.110985 355.095988;--a:56.2735% 0.054297 154.390187;--n:24.1559% 0.049362 89.070594;--nc:92.9519% 0.002163 17.197414;--b1:92.9519% 0.002163 17.197414;--bc:16.9617% 0.001664 17.32068}:root:has(input.theme-controller[value=garden]:checked){color-scheme:light;--b2:86.4453% 0.002011 17.197414;--b3:79.9386% 0.00186 17.197414;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--sc:89.699% 0.022197 355.095988;--ac:11.2547% 0.010859 154.390187;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:62.45% 0.278 3.83636;--pc:100% 0 0;--s:48.4952% 0.110985 355.095988;--a:56.2735% 0.054297 154.390187;--n:24.1559% 0.049362 89.070594;--nc:92.9519% 0.002163 17.197414;--b1:92.9519% 0.002163 17.197414;--bc:16.9617% 0.001664 17.32068}[data-theme=forest]{color-scheme:dark;--b2:17.522% 0.007709 17.911578;--b3:16.2032% 0.007129 17.911578;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:83.7682% 0.001658 17.911578;--sc:13.9553% 0.027077 168.327128;--ac:14.1257% 0.02389 185.713193;--nc:86.1397% 0.007806 171.364646;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:68.6283% 0.185567 148.958922;--pc:0% 0 0;--s:69.7764% 0.135385 168.327128;--a:70.6285% 0.119451 185.713193;--n:30.6985% 0.039032 171.364646;--b1:18.8409% 0.00829 17.911578;--rounded-btn:1.9rem}:root:has(input.theme-controller[value=forest]:checked){color-scheme:dark;--b2:17.522% 0.007709 17.911578;--b3:16.2032% 0.007129 17.911578;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:83.7682% 0.001658 17.911578;--sc:13.9553% 0.027077 168.327128;--ac:14.1257% 0.02389 185.713193;--nc:86.1397% 0.007806 171.364646;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:68.6283% 0.185567 148.958922;--pc:0% 0 0;--s:69.7764% 0.135385 168.327128;--a:70.6285% 0.119451 185.713193;--n:30.6985% 0.039032 171.364646;--b1:18.8409% 0.00829 17.911578;--rounded-btn:1.9rem}[data-theme=aqua]{color-scheme:dark;--b2:45.3464% 0.118611 261.181672;--b3:41.9333% 0.109683 261.181672;--bc:89.7519% 0.025508 261.181672;--sc:12.1365% 0.02175 309.782946;--ac:18.6854% 0.020445 94.555431;--nc:12.2124% 0.023402 243.760661;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.79% 0.038 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:85.6617% 0.14498 198.6458;--pc:40.1249% 0.068266 197.603872;--s:60.6827% 0.108752 309.782946;--a:93.4269% 0.102225 94.555431;--n:61.0622% 0.117009 243.760661;--b1:48.7596% 0.127539 261.181672;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.95% 0.19 27.33}:root:has(input.theme-controller[value=aqua]:checked){color-scheme:dark;--b2:45.3464% 0.118611 261.181672;--b3:41.9333% 0.109683 261.181672;--bc:89.7519% 0.025508 261.181672;--sc:12.1365% 0.02175 309.782946;--ac:18.6854% 0.020445 94.555431;--nc:12.2124% 0.023402 243.760661;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.79% 0.038 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:85.6617% 0.14498 198.6458;--pc:40.1249% 0.068266 197.603872;--s:60.6827% 0.108752 309.782946;--a:93.4269% 0.102225 94.555431;--n:61.0622% 0.117009 243.760661;--b1:48.7596% 0.127539 261.181672;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.95% 0.19 27.33}[data-theme=lofi]{color-scheme:light;--inc:15.908% 0.0206 205.9;--suc:18.026% 0.0306 164.14;--wac:17.674% 0.027 79.94;--erc:15.732% 0.03 28.47;--border-btn:1px;--tab-border:1px;--p:15.9066% 0 0;--pc:100% 0 0;--s:21.455% 0.001566 17.278957;--sc:100% 0 0;--a:26.8618% 0 0;--ac:100% 0 0;--n:0% 0 0;--nc:100% 0 0;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.268% 0.001082 17.17934;--bc:0% 0 0;--in:79.54% 0.103 205.9;--su:90.13% 0.153 164.14;--wa:88.37% 0.135 79.94;--er:78.66% 0.15 28.47;--rounded-box:0.25rem;--rounded-btn:0.125rem;--rounded-badge:0.125rem;--tab-radius:0.125rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=lofi]:checked){color-scheme:light;--inc:15.908% 0.0206 205.9;--suc:18.026% 0.0306 164.14;--wac:17.674% 0.027 79.94;--erc:15.732% 0.03 28.47;--border-btn:1px;--tab-border:1px;--p:15.9066% 0 0;--pc:100% 0 0;--s:21.455% 0.001566 17.278957;--sc:100% 0 0;--a:26.8618% 0 0;--ac:100% 0 0;--n:0% 0 0;--nc:100% 0 0;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.268% 0.001082 17.17934;--bc:0% 0 0;--in:79.54% 0.103 205.9;--su:90.13% 0.153 164.14;--wa:88.37% 0.135 79.94;--er:78.66% 0.15 28.47;--rounded-box:0.25rem;--rounded-btn:0.125rem;--rounded-badge:0.125rem;--tab-radius:0.125rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=pastel]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--pc:16.6166% 0.006979 316.8737;--sc:17.6153% 0.009839 8.688364;--ac:17.8419% 0.012056 170.923263;--nc:14.2681% 0.014702 228.183906;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:83.0828% 0.034896 316.8737;--s:88.0763% 0.049197 8.688364;--a:89.2096% 0.06028 170.923263;--n:71.3406% 0.07351 228.183906;--b1:100% 0 0;--b2:98.4625% 0.001706 247.838921;--b3:87.1681% 0.009339 258.338227;--rounded-btn:1.9rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=pastel]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--pc:16.6166% 0.006979 316.8737;--sc:17.6153% 0.009839 8.688364;--ac:17.8419% 0.012056 170.923263;--nc:14.2681% 0.014702 228.183906;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:83.0828% 0.034896 316.8737;--s:88.0763% 0.049197 8.688364;--a:89.2096% 0.06028 170.923263;--n:71.3406% 0.07351 228.183906;--b1:100% 0 0;--b2:98.4625% 0.001706 247.838921;--b3:87.1681% 0.009339 258.338227;--rounded-btn:1.9rem;--tab-radius:0.7rem}[data-theme=fantasy]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:87.49% 0.0378 325.02;--sc:90.784% 0.0324 241.36;--ac:15.196% 0.0408 56.72;--nc:85.5616% 0.005919 256.847952;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:37.45% 0.189 325.02;--s:53.92% 0.162 241.36;--a:75.98% 0.204 56.72;--n:27.8078% 0.029596 256.847952;--b1:100% 0 0;--bc:27.8078% 0.029596 256.847952}:root:has(input.theme-controller[value=fantasy]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:87.49% 0.0378 325.02;--sc:90.784% 0.0324 241.36;--ac:15.196% 0.0408 56.72;--nc:85.5616% 0.005919 256.847952;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:37.45% 0.189 325.02;--s:53.92% 0.162 241.36;--a:75.98% 0.204 56.72;--n:27.8078% 0.029596 256.847952;--b1:100% 0 0;--bc:27.8078% 0.029596 256.847952}[data-theme=wireframe]{color-scheme:light;--bc:20% 0 0;--pc:15.6521% 0 0;--sc:15.6521% 0 0;--ac:15.6521% 0 0;--nc:18.8014% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:14.1626% 0.019994 108.702381;--erc:12.5591% 0.051537 29.233885;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:Chalkboard,comic sans ms,sans-serif;--p:78.2604% 0 0;--s:78.2604% 0 0;--a:78.2604% 0 0;--n:94.007% 0 0;--b1:100% 0 0;--b2:94.9119% 0 0;--b3:89.7547% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:70.8131% 0.099969 108.702381;--er:62.7955% 0.257683 29.233885;--rounded-box:0.2rem;--rounded-btn:0.2rem;--rounded-badge:0.2rem;--tab-radius:0.2rem}:root:has(input.theme-controller[value=wireframe]:checked){color-scheme:light;--bc:20% 0 0;--pc:15.6521% 0 0;--sc:15.6521% 0 0;--ac:15.6521% 0 0;--nc:18.8014% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:14.1626% 0.019994 108.702381;--erc:12.5591% 0.051537 29.233885;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:Chalkboard,comic sans ms,sans-serif;--p:78.2604% 0 0;--s:78.2604% 0 0;--a:78.2604% 0 0;--n:94.007% 0 0;--b1:100% 0 0;--b2:94.9119% 0 0;--b3:89.7547% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:70.8131% 0.099969 108.702381;--er:62.7955% 0.257683 29.233885;--rounded-box:0.2rem;--rounded-btn:0.2rem;--rounded-badge:0.2rem;--tab-radius:0.2rem}[data-theme=black]{color-scheme:dark;--pc:86.736% 0 0;--sc:86.736% 0 0;--ac:86.736% 0 0;--nc:86.736% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:19.3597% 0.042201 109.769232;--erc:12.5591% 0.051537 29.233885;--border-btn:1px;--tab-border:1px;--p:33.6799% 0 0;--s:33.6799% 0 0;--a:33.6799% 0 0;--b1:0% 0 0;--b2:19.1251% 0 0;--b3:26.8618% 0 0;--bc:87.6096% 0 0;--n:33.6799% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:96.7983% 0.211006 109.769232;--er:62.7955% 0.257683 29.233885;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--animation-btn:0;--animation-input:0;--btn-focus-scale:1;--tab-radius:0}:root:has(input.theme-controller[value=black]:checked){color-scheme:dark;--pc:86.736% 0 0;--sc:86.736% 0 0;--ac:86.736% 0 0;--nc:86.736% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:19.3597% 0.042201 109.769232;--erc:12.5591% 0.051537 29.233885;--border-btn:1px;--tab-border:1px;--p:33.6799% 0 0;--s:33.6799% 0 0;--a:33.6799% 0 0;--b1:0% 0 0;--b2:19.1251% 0 0;--b3:26.8618% 0 0;--bc:87.6096% 0 0;--n:33.6799% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:96.7983% 0.211006 109.769232;--er:62.7955% 0.257683 29.233885;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--animation-btn:0;--animation-input:0;--btn-focus-scale:1;--tab-radius:0}[data-theme=luxury]{color-scheme:dark;--pc:20% 0 0;--sc:85.5163% 0.012821 261.069149;--ac:87.3349% 0.010348 338.82597;--inc:15.8122% 0.024356 237.133883;--suc:15.6239% 0.038579 132.154381;--wac:17.2255% 0.027305 102.89115;--erc:14.3506% 0.035271 22.568916;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:100% 0 0;--s:27.5815% 0.064106 261.069149;--a:36.6744% 0.051741 338.82597;--n:24.27% 0.057015 59.825019;--nc:93.2033% 0.089631 90.861683;--b1:14.0765% 0.004386 285.822869;--b2:20.2191% 0.004211 308.22937;--b3:29.8961% 0.003818 308.318612;--bc:75.6879% 0.123666 76.890484;--in:79.0612% 0.121778 237.133883;--su:78.1197% 0.192894 132.154381;--wa:86.1274% 0.136524 102.89115;--er:71.7531% 0.176357 22.568916}:root:has(input.theme-controller[value=luxury]:checked){color-scheme:dark;--pc:20% 0 0;--sc:85.5163% 0.012821 261.069149;--ac:87.3349% 0.010348 338.82597;--inc:15.8122% 0.024356 237.133883;--suc:15.6239% 0.038579 132.154381;--wac:17.2255% 0.027305 102.89115;--erc:14.3506% 0.035271 22.568916;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:100% 0 0;--s:27.5815% 0.064106 261.069149;--a:36.6744% 0.051741 338.82597;--n:24.27% 0.057015 59.825019;--nc:93.2033% 0.089631 90.861683;--b1:14.0765% 0.004386 285.822869;--b2:20.2191% 0.004211 308.22937;--b3:29.8961% 0.003818 308.318612;--bc:75.6879% 0.123666 76.890484;--in:79.0612% 0.121778 237.133883;--su:78.1197% 0.192894 132.154381;--wa:86.1274% 0.136524 102.89115;--er:71.7531% 0.176357 22.568916}[data-theme=dracula]{color-scheme:dark;--b2:26.8053% 0.020556 277.508664;--b3:24.7877% 0.019009 277.508664;--pc:15.0922% 0.036614 346.812432;--sc:14.8405% 0.029709 301.883095;--ac:16.6785% 0.024826 66.558491;--nc:87.8891% 0.006515 275.524078;--inc:17.6526% 0.018676 212.846491;--suc:17.4199% 0.043903 148.024881;--wac:19.1068% 0.026849 112.757109;--erc:13.6441% 0.041266 24.430965;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.4611% 0.18307 346.812432;--s:74.2023% 0.148546 301.883095;--a:83.3927% 0.124132 66.558491;--n:39.4456% 0.032576 275.524078;--b1:28.8229% 0.022103 277.508664;--bc:97.7477% 0.007913 106.545019;--in:88.263% 0.09338 212.846491;--su:87.0995% 0.219516 148.024881;--wa:95.5338% 0.134246 112.757109;--er:68.2204% 0.206328 24.430965}:root:has(input.theme-controller[value=dracula]:checked){color-scheme:dark;--b2:26.8053% 0.020556 277.508664;--b3:24.7877% 0.019009 277.508664;--pc:15.0922% 0.036614 346.812432;--sc:14.8405% 0.029709 301.883095;--ac:16.6785% 0.024826 66.558491;--nc:87.8891% 0.006515 275.524078;--inc:17.6526% 0.018676 212.846491;--suc:17.4199% 0.043903 148.024881;--wac:19.1068% 0.026849 112.757109;--erc:13.6441% 0.041266 24.430965;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.4611% 0.18307 346.812432;--s:74.2023% 0.148546 301.883095;--a:83.3927% 0.124132 66.558491;--n:39.4456% 0.032576 275.524078;--b1:28.8229% 0.022103 277.508664;--bc:97.7477% 0.007913 106.545019;--in:88.263% 0.09338 212.846491;--su:87.0995% 0.219516 148.024881;--wa:95.5338% 0.134246 112.757109;--er:68.2204% 0.206328 24.430965}[data-theme=cmyk]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--bc:20% 0 0;--pc:14.3544% 0.02666 239.443325;--sc:12.8953% 0.040552 359.339283;--ac:18.8458% 0.037948 105.306968;--nc:84.3557% 0 0;--inc:13.6952% 0.0189 217.284104;--suc:89.3898% 0.032505 321.406278;--wac:14.2473% 0.031969 52.023412;--erc:12.4027% 0.041677 28.717543;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.7722% 0.133298 239.443325;--s:64.4766% 0.202758 359.339283;--a:94.2289% 0.189741 105.306968;--n:21.7787% 0 0;--b1:100% 0 0;--in:68.4759% 0.094499 217.284104;--su:46.949% 0.162524 321.406278;--wa:71.2364% 0.159843 52.023412;--er:62.0133% 0.208385 28.717543}:root:has(input.theme-controller[value=cmyk]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--bc:20% 0 0;--pc:14.3544% 0.02666 239.443325;--sc:12.8953% 0.040552 359.339283;--ac:18.8458% 0.037948 105.306968;--nc:84.3557% 0 0;--inc:13.6952% 0.0189 217.284104;--suc:89.3898% 0.032505 321.406278;--wac:14.2473% 0.031969 52.023412;--erc:12.4027% 0.041677 28.717543;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.7722% 0.133298 239.443325;--s:64.4766% 0.202758 359.339283;--a:94.2289% 0.189741 105.306968;--n:21.7787% 0 0;--b1:100% 0 0;--in:68.4759% 0.094499 217.284104;--su:46.949% 0.162524 321.406278;--wa:71.2364% 0.159843 52.023412;--er:62.0133% 0.208385 28.717543}[data-theme=autumn]{color-scheme:light;--b2:89.1077% 0 0;--b3:82.4006% 0 0;--bc:19.1629% 0 0;--pc:88.1446% 0.032232 17.530175;--sc:12.3353% 0.033821 23.865865;--ac:14.6851% 0.018999 60.729616;--nc:90.8734% 0.007475 51.902819;--inc:13.8449% 0.019596 207.284192;--suc:12.199% 0.016032 174.616213;--wac:14.0163% 0.032982 56.844303;--erc:90.614% 0.0482 24.16;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:40.7232% 0.16116 17.530175;--s:61.6763% 0.169105 23.865865;--a:73.4253% 0.094994 60.729616;--n:54.3672% 0.037374 51.902819;--b1:95.8147% 0 0;--in:69.2245% 0.097979 207.284192;--su:60.9951% 0.080159 174.616213;--wa:70.0817% 0.164909 56.844303;--er:53.07% 0.241 24.16}:root:has(input.theme-controller[value=autumn]:checked){color-scheme:light;--b2:89.1077% 0 0;--b3:82.4006% 0 0;--bc:19.1629% 0 0;--pc:88.1446% 0.032232 17.530175;--sc:12.3353% 0.033821 23.865865;--ac:14.6851% 0.018999 60.729616;--nc:90.8734% 0.007475 51.902819;--inc:13.8449% 0.019596 207.284192;--suc:12.199% 0.016032 174.616213;--wac:14.0163% 0.032982 56.844303;--erc:90.614% 0.0482 24.16;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:40.7232% 0.16116 17.530175;--s:61.6763% 0.169105 23.865865;--a:73.4253% 0.094994 60.729616;--n:54.3672% 0.037374 51.902819;--b1:95.8147% 0 0;--in:69.2245% 0.097979 207.284192;--su:60.9951% 0.080159 174.616213;--wa:70.0817% 0.164909 56.844303;--er:53.07% 0.241 24.16}[data-theme=business]{color-scheme:dark;--b2:22.6487% 0 0;--b3:20.944% 0 0;--bc:84.8707% 0 0;--pc:88.3407% 0.019811 251.473931;--sc:12.8185% 0.005481 229.389418;--ac:13.4542% 0.033545 35.791525;--nc:85.4882% 0.00265 253.041249;--inc:12.5233% 0.028702 240.033697;--suc:14.0454% 0.018919 156.59611;--wac:15.4965% 0.023141 81.519177;--erc:90.3221% 0.029356 29.674507;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:41.7036% 0.099057 251.473931;--s:64.0924% 0.027405 229.389418;--a:67.271% 0.167726 35.791525;--n:27.441% 0.01325 253.041249;--b1:24.3535% 0 0;--in:62.6163% 0.143511 240.033697;--su:70.2268% 0.094594 156.59611;--wa:77.4824% 0.115704 81.519177;--er:51.6105% 0.14678 29.674507;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem}:root:has(input.theme-controller[value=business]:checked){color-scheme:dark;--b2:22.6487% 0 0;--b3:20.944% 0 0;--bc:84.8707% 0 0;--pc:88.3407% 0.019811 251.473931;--sc:12.8185% 0.005481 229.389418;--ac:13.4542% 0.033545 35.791525;--nc:85.4882% 0.00265 253.041249;--inc:12.5233% 0.028702 240.033697;--suc:14.0454% 0.018919 156.59611;--wac:15.4965% 0.023141 81.519177;--erc:90.3221% 0.029356 29.674507;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:41.7036% 0.099057 251.473931;--s:64.0924% 0.027405 229.389418;--a:67.271% 0.167726 35.791525;--n:27.441% 0.01325 253.041249;--b1:24.3535% 0 0;--in:62.6163% 0.143511 240.033697;--su:70.2268% 0.094594 156.59611;--wa:77.4824% 0.115704 81.519177;--er:51.6105% 0.14678 29.674507;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem}[data-theme=acid]{color-scheme:light;--b2:91.6146% 0 0;--b3:84.7189% 0 0;--bc:19.7021% 0 0;--pc:14.38% 0.0714 330.759573;--sc:14.674% 0.0448 48.250878;--ac:18.556% 0.0528 122.962951;--nc:84.262% 0.0256 278.68;--inc:12.144% 0.0454 252.05;--suc:17.144% 0.0532 158.53;--wac:18.202% 0.0424 100.5;--erc:12.968% 0.0586 29.349188;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:71.9% 0.357 330.759573;--s:73.37% 0.224 48.250878;--a:92.78% 0.264 122.962951;--n:21.31% 0.128 278.68;--b1:98.5104% 0 0;--in:60.72% 0.227 252.05;--su:85.72% 0.266 158.53;--wa:91.01% 0.212 100.5;--er:64.84% 0.293 29.349188;--rounded-box:1.25rem;--rounded-btn:1rem;--rounded-badge:1rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=acid]:checked){color-scheme:light;--b2:91.6146% 0 0;--b3:84.7189% 0 0;--bc:19.7021% 0 0;--pc:14.38% 0.0714 330.759573;--sc:14.674% 0.0448 48.250878;--ac:18.556% 0.0528 122.962951;--nc:84.262% 0.0256 278.68;--inc:12.144% 0.0454 252.05;--suc:17.144% 0.0532 158.53;--wac:18.202% 0.0424 100.5;--erc:12.968% 0.0586 29.349188;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:71.9% 0.357 330.759573;--s:73.37% 0.224 48.250878;--a:92.78% 0.264 122.962951;--n:21.31% 0.128 278.68;--b1:98.5104% 0 0;--in:60.72% 0.227 252.05;--su:85.72% 0.266 158.53;--wa:91.01% 0.212 100.5;--er:64.84% 0.293 29.349188;--rounded-box:1.25rem;--rounded-btn:1rem;--rounded-badge:1rem;--tab-radius:0.7rem}[data-theme=lemonade]{color-scheme:light;--b2:91.8003% 0.0186 123.72;--b3:84.8906% 0.0172 123.72;--bc:19.742% 0.004 123.72;--pc:11.784% 0.0398 134.6;--sc:15.55% 0.0392 111.09;--ac:17.078% 0.0402 100.73;--nc:86.196% 0.015 108.6;--inc:17.238% 0.0094 224.14;--suc:17.238% 0.0094 157.85;--wac:17.238% 0.0094 102.15;--erc:17.238% 0.0094 25.85;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:58.92% 0.199 134.6;--s:77.75% 0.196 111.09;--a:85.39% 0.201 100.73;--n:30.98% 0.075 108.6;--b1:98.71% 0.02 123.72;--in:86.19% 0.047 224.14;--su:86.19% 0.047 157.85;--wa:86.19% 0.047 102.15;--er:86.19% 0.047 25.85}:root:has(input.theme-controller[value=lemonade]:checked){color-scheme:light;--b2:91.8003% 0.0186 123.72;--b3:84.8906% 0.0172 123.72;--bc:19.742% 0.004 123.72;--pc:11.784% 0.0398 134.6;--sc:15.55% 0.0392 111.09;--ac:17.078% 0.0402 100.73;--nc:86.196% 0.015 108.6;--inc:17.238% 0.0094 224.14;--suc:17.238% 0.0094 157.85;--wac:17.238% 0.0094 102.15;--erc:17.238% 0.0094 25.85;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:58.92% 0.199 134.6;--s:77.75% 0.196 111.09;--a:85.39% 0.201 100.73;--n:30.98% 0.075 108.6;--b1:98.71% 0.02 123.72;--in:86.19% 0.047 224.14;--su:86.19% 0.047 157.85;--wa:86.19% 0.047 102.15;--er:86.19% 0.047 25.85}[data-theme=night]{color-scheme:dark;--b2:19.3144% 0.037037 265.754874;--b3:17.8606% 0.034249 265.754874;--bc:84.1536% 0.007965 265.754874;--pc:15.0703% 0.027798 232.66148;--sc:13.6023% 0.031661 276.934902;--ac:14.4721% 0.035244 350.048739;--nc:85.5899% 0.00737 260.030984;--suc:15.6904% 0.026506 181.911977;--wac:16.6486% 0.027912 82.95003;--erc:14.3572% 0.034051 13.11834;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.3513% 0.138989 232.66148;--s:68.0113% 0.158303 276.934902;--a:72.3603% 0.176218 350.048739;--n:27.9495% 0.036848 260.030984;--b1:20.7682% 0.039824 265.754874;--in:68.4553% 0.148062 237.25135;--inc:0% 0 0;--su:78.452% 0.132529 181.911977;--wa:83.2428% 0.139558 82.95003;--er:71.7858% 0.170255 13.11834}:root:has(input.theme-controller[value=night]:checked){color-scheme:dark;--b2:19.3144% 0.037037 265.754874;--b3:17.8606% 0.034249 265.754874;--bc:84.1536% 0.007965 265.754874;--pc:15.0703% 0.027798 232.66148;--sc:13.6023% 0.031661 276.934902;--ac:14.4721% 0.035244 350.048739;--nc:85.5899% 0.00737 260.030984;--suc:15.6904% 0.026506 181.911977;--wac:16.6486% 0.027912 82.95003;--erc:14.3572% 0.034051 13.11834;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.3513% 0.138989 232.66148;--s:68.0113% 0.158303 276.934902;--a:72.3603% 0.176218 350.048739;--n:27.9495% 0.036848 260.030984;--b1:20.7682% 0.039824 265.754874;--in:68.4553% 0.148062 237.25135;--inc:0% 0 0;--su:78.452% 0.132529 181.911977;--wa:83.2428% 0.139558 82.95003;--er:71.7858% 0.170255 13.11834}[data-theme=coffee]{color-scheme:dark;--b2:20.1585% 0.021457 329.708637;--b3:18.6412% 0.019842 329.708637;--pc:14.3993% 0.024765 62.756393;--sc:86.893% 0.00597 199.19444;--ac:88.5243% 0.014881 224.389184;--nc:83.3022% 0.003149 326.261446;--inc:15.898% 0.012774 184.558367;--suc:14.9445% 0.014491 131.116276;--wac:17.6301% 0.028162 87.722413;--erc:15.4637% 0.025644 31.871922;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.9967% 0.123825 62.756393;--s:34.465% 0.029849 199.19444;--a:42.6213% 0.074405 224.389184;--n:16.5109% 0.015743 326.261446;--b1:21.6758% 0.023072 329.708637;--bc:72.3547% 0.092794 79.129387;--in:79.4902% 0.063869 184.558367;--su:74.7224% 0.072456 131.116276;--wa:88.1503% 0.140812 87.722413;--er:77.3187% 0.12822 31.871922}:root:has(input.theme-controller[value=coffee]:checked){color-scheme:dark;--b2:20.1585% 0.021457 329.708637;--b3:18.6412% 0.019842 329.708637;--pc:14.3993% 0.024765 62.756393;--sc:86.893% 0.00597 199.19444;--ac:88.5243% 0.014881 224.389184;--nc:83.3022% 0.003149 326.261446;--inc:15.898% 0.012774 184.558367;--suc:14.9445% 0.014491 131.116276;--wac:17.6301% 0.028162 87.722413;--erc:15.4637% 0.025644 31.871922;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.9967% 0.123825 62.756393;--s:34.465% 0.029849 199.19444;--a:42.6213% 0.074405 224.389184;--n:16.5109% 0.015743 326.261446;--b1:21.6758% 0.023072 329.708637;--bc:72.3547% 0.092794 79.129387;--in:79.4902% 0.063869 184.558367;--su:74.7224% 0.072456 131.116276;--wa:88.1503% 0.140812 87.722413;--er:77.3187% 0.12822 31.871922}[data-theme=winter]{color-scheme:light;--pc:91.372% 0.051 257.57;--sc:88.5103% 0.03222 282.339433;--ac:11.988% 0.038303 335.171434;--nc:83.9233% 0.012704 257.651965;--inc:17.6255% 0.017178 214.515264;--suc:16.0988% 0.015404 197.823719;--wac:17.8345% 0.009167 71.47031;--erc:14.6185% 0.022037 20.076293;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:56.86% 0.255 257.57;--s:42.5516% 0.161098 282.339433;--a:59.9398% 0.191515 335.171434;--n:19.6166% 0.063518 257.651965;--b1:100% 0 0;--b2:97.4663% 0.011947 259.822565;--b3:93.2686% 0.016223 262.751375;--bc:41.8869% 0.053885 255.824911;--in:88.1275% 0.085888 214.515264;--su:80.4941% 0.077019 197.823719;--wa:89.1725% 0.045833 71.47031;--er:73.0926% 0.110185 20.076293}:root:has(input.theme-controller[value=winter]:checked){color-scheme:light;--pc:91.372% 0.051 257.57;--sc:88.5103% 0.03222 282.339433;--ac:11.988% 0.038303 335.171434;--nc:83.9233% 0.012704 257.651965;--inc:17.6255% 0.017178 214.515264;--suc:16.0988% 0.015404 197.823719;--wac:17.8345% 0.009167 71.47031;--erc:14.6185% 0.022037 20.076293;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:56.86% 0.255 257.57;--s:42.5516% 0.161098 282.339433;--a:59.9398% 0.191515 335.171434;--n:19.6166% 0.063518 257.651965;--b1:100% 0 0;--b2:97.4663% 0.011947 259.822565;--b3:93.2686% 0.016223 262.751375;--bc:41.8869% 0.053885 255.824911;--in:88.1275% 0.085888 214.515264;--su:80.4941% 0.077019 197.823719;--wa:89.1725% 0.045833 71.47031;--er:73.0926% 0.110185 20.076293}[data-theme=dim]{color-scheme:dark;--pc:17.2267% 0.028331 139.549991;--sc:14.6752% 0.033181 35.353059;--ac:14.8459% 0.026728 311.37924;--inc:17.2157% 0.028409 206.182959;--suc:17.2343% 0.028437 166.534048;--wac:17.2327% 0.028447 94.818679;--erc:16.4838% 0.019914 33.756357;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:86.1335% 0.141656 139.549991;--s:73.3759% 0.165904 35.353059;--a:74.2296% 0.133641 311.37924;--n:24.7311% 0.020483 264.094728;--nc:82.9011% 0.031335 222.959324;--b1:30.8577% 0.023243 264.149498;--b2:28.0368% 0.01983 264.182074;--b3:26.3469% 0.018403 262.177739;--bc:82.9011% 0.031335 222.959324;--in:86.0785% 0.142046 206.182959;--su:86.1717% 0.142187 166.534048;--wa:86.1634% 0.142236 94.818679;--er:82.4189% 0.09957 33.756357}:root:has(input.theme-controller[value=dim]:checked){color-scheme:dark;--pc:17.2267% 0.028331 139.549991;--sc:14.6752% 0.033181 35.353059;--ac:14.8459% 0.026728 311.37924;--inc:17.2157% 0.028409 206.182959;--suc:17.2343% 0.028437 166.534048;--wac:17.2327% 0.028447 94.818679;--erc:16.4838% 0.019914 33.756357;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:86.1335% 0.141656 139.549991;--s:73.3759% 0.165904 35.353059;--a:74.2296% 0.133641 311.37924;--n:24.7311% 0.020483 264.094728;--nc:82.9011% 0.031335 222.959324;--b1:30.8577% 0.023243 264.149498;--b2:28.0368% 0.01983 264.182074;--b3:26.3469% 0.018403 262.177739;--bc:82.9011% 0.031335 222.959324;--in:86.0785% 0.142046 206.182959;--su:86.1717% 0.142187 166.534048;--wa:86.1634% 0.142236 94.818679;--er:82.4189% 0.09957 33.756357}[data-theme=nord]{color-scheme:light;--pc:11.8872% 0.015449 254.027774;--sc:13.9303% 0.011822 248.687186;--ac:15.4929% 0.01245 217.469017;--inc:13.8414% 0.012499 332.664922;--suc:15.3654% 0.01498 131.063061;--wac:17.0972% 0.017847 84.093335;--erc:12.122% 0.024119 15.341883;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:59.4359% 0.077246 254.027774;--s:69.6516% 0.059108 248.687186;--a:77.4643% 0.062249 217.469017;--n:45.229% 0.035214 264.1312;--nc:89.9258% 0.016374 262.749256;--b1:95.1276% 0.007445 260.731539;--b2:93.2996% 0.010389 261.788485;--b3:89.9258% 0.016374 262.749256;--bc:32.4374% 0.022945 264.182036;--in:69.2072% 0.062496 332.664922;--su:76.827% 0.074899 131.063061;--wa:85.4862% 0.089234 84.093335;--er:60.61% 0.120594 15.341883;--rounded-box:0.4rem;--rounded-btn:0.2rem;--rounded-badge:0.4rem;--tab-radius:0.2rem}:root:has(input.theme-controller[value=nord]:checked){color-scheme:light;--pc:11.8872% 0.015449 254.027774;--sc:13.9303% 0.011822 248.687186;--ac:15.4929% 0.01245 217.469017;--inc:13.8414% 0.012499 332.664922;--suc:15.3654% 0.01498 131.063061;--wac:17.0972% 0.017847 84.093335;--erc:12.122% 0.024119 15.341883;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:59.4359% 0.077246 254.027774;--s:69.6516% 0.059108 248.687186;--a:77.4643% 0.062249 217.469017;--n:45.229% 0.035214 264.1312;--nc:89.9258% 0.016374 262.749256;--b1:95.1276% 0.007445 260.731539;--b2:93.2996% 0.010389 261.788485;--b3:89.9258% 0.016374 262.749256;--bc:32.4374% 0.022945 264.182036;--in:69.2072% 0.062496 332.664922;--su:76.827% 0.074899 131.063061;--wa:85.4862% 0.089234 84.093335;--er:60.61% 0.120594 15.341883;--rounded-box:0.4rem;--rounded-btn:0.2rem;--rounded-badge:0.4rem;--tab-radius:0.2rem}[data-theme=sunset]{color-scheme:dark;--pc:14.9408% 0.031656 39.94703;--sc:14.5075% 0.035531 2.72034;--ac:14.2589% 0.033336 299.844533;--inc:17.1119% 0.017054 206.015183;--suc:17.1122% 0.017172 144.77874;--wac:17.1139% 0.016961 74.427797;--erc:17.1023% 0.015778 16.886379;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:74.7039% 0.158278 39.94703;--s:72.5375% 0.177654 2.72034;--a:71.2947% 0.166678 299.844533;--n:26% 0.019 237.69;--nc:70% 0.019 237.69;--b1:22% 0.019 237.69;--b2:20% 0.019 237.69;--b3:18% 0.019 237.69;--bc:77.3835% 0.043586 245.096534;--in:85.5596% 0.085271 206.015183;--su:85.5609% 0.08586 144.77874;--wa:85.5695% 0.084806 74.427797;--er:85.5116% 0.07889 16.886379;--rounded-box:1.2rem;--rounded-btn:0.8rem;--rounded-badge:0.4rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=sunset]:checked){color-scheme:dark;--pc:14.9408% 0.031656 39.94703;--sc:14.5075% 0.035531 2.72034;--ac:14.2589% 0.033336 299.844533;--inc:17.1119% 0.017054 206.015183;--suc:17.1122% 0.017172 144.77874;--wac:17.1139% 0.016961 74.427797;--erc:17.1023% 0.015778 16.886379;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:74.7039% 0.158278 39.94703;--s:72.5375% 0.177654 2.72034;--a:71.2947% 0.166678 299.844533;--n:26% 0.019 237.69;--nc:70% 0.019 237.69;--b1:22% 0.019 237.69;--b2:20% 0.019 237.69;--b3:18% 0.019 237.69;--bc:77.3835% 0.043586 245.096534;--in:85.5596% 0.085271 206.015183;--su:85.5609% 0.08586 144.77874;--wa:85.5695% 0.084806 74.427797;--er:85.5116% 0.07889 16.886379;--rounded-box:1.2rem;--rounded-btn:0.8rem;--rounded-badge:0.4rem;--tab-radius:0.7rem}
diff --git a/examples/server/public/deps_markdown-it.js b/examples/server/public/deps_markdown-it.js
new file mode 100644
index 000000000..1be0cebe6
--- /dev/null
+++ b/examples/server/public/deps_markdown-it.js
@@ -0,0 +1,8442 @@
+/*! markdown-it 13.0.2 https://github.com/markdown-it/markdown-it @license MIT */
+(function(global, factory) {
+  typeof exports === "object" && typeof module !== "undefined" ? module.exports = factory() : typeof define === "function" && define.amd ? define(factory) : (global = typeof globalThis !== "undefined" ? globalThis : global || self, 
+  global.markdownit = factory());
+})(this, (function() {
+  "use strict";
+  function createCommonjsModule(fn, basedir, module) {
+    return module = {
+      path: basedir,
+      exports: {},
+      require: function(path, base) {
+        return commonjsRequire(path, base === undefined || base === null ? module.path : base);
+      }
+    }, fn(module, module.exports), module.exports;
+  }
+  function getAugmentedNamespace(n) {
+    if (n.__esModule) return n;
+    var a = Object.defineProperty({}, "__esModule", {
+      value: true
+    });
+    Object.keys(n).forEach((function(k) {
+      var d = Object.getOwnPropertyDescriptor(n, k);
+      Object.defineProperty(a, k, d.get ? d : {
+        enumerable: true,
+        get: function() {
+          return n[k];
+        }
+      });
+    }));
+    return a;
+  }
+  function commonjsRequire() {
+    throw new Error("Dynamic requires are not currently supported by @rollup/plugin-commonjs");
+  }
+  var require$$0 = {
+    Aacute: "\xc1",
+    aacute: "\xe1",
+    Abreve: "\u0102",
+    abreve: "\u0103",
+    ac: "\u223e",
+    acd: "\u223f",
+    acE: "\u223e\u0333",
+    Acirc: "\xc2",
+    acirc: "\xe2",
+    acute: "\xb4",
+    Acy: "\u0410",
+    acy: "\u0430",
+    AElig: "\xc6",
+    aelig: "\xe6",
+    af: "\u2061",
+    Afr: "\ud835\udd04",
+    afr: "\ud835\udd1e",
+    Agrave: "\xc0",
+    agrave: "\xe0",
+    alefsym: "\u2135",
+    aleph: "\u2135",
+    Alpha: "\u0391",
+    alpha: "\u03b1",
+    Amacr: "\u0100",
+    amacr: "\u0101",
+    amalg: "\u2a3f",
+    amp: "&",
+    AMP: "&",
+    andand: "\u2a55",
+    And: "\u2a53",
+    and: "\u2227",
+    andd: "\u2a5c",
+    andslope: "\u2a58",
+    andv: "\u2a5a",
+    ang: "\u2220",
+    ange: "\u29a4",
+    angle: "\u2220",
+    angmsdaa: "\u29a8",
+    angmsdab: "\u29a9",
+    angmsdac: "\u29aa",
+    angmsdad: "\u29ab",
+    angmsdae: "\u29ac",
+    angmsdaf: "\u29ad",
+    angmsdag: "\u29ae",
+    angmsdah: "\u29af",
+    angmsd: "\u2221",
+    angrt: "\u221f",
+    angrtvb: "\u22be",
+    angrtvbd: "\u299d",
+    angsph: "\u2222",
+    angst: "\xc5",
+    angzarr: "\u237c",
+    Aogon: "\u0104",
+    aogon: "\u0105",
+    Aopf: "\ud835\udd38",
+    aopf: "\ud835\udd52",
+    apacir: "\u2a6f",
+    ap: "\u2248",
+    apE: "\u2a70",
+    ape: "\u224a",
+    apid: "\u224b",
+    apos: "'",
+    ApplyFunction: "\u2061",
+    approx: "\u2248",
+    approxeq: "\u224a",
+    Aring: "\xc5",
+    aring: "\xe5",
+    Ascr: "\ud835\udc9c",
+    ascr: "\ud835\udcb6",
+    Assign: "\u2254",
+    ast: "*",
+    asymp: "\u2248",
+    asympeq: "\u224d",
+    Atilde: "\xc3",
+    atilde: "\xe3",
+    Auml: "\xc4",
+    auml: "\xe4",
+    awconint: "\u2233",
+    awint: "\u2a11",
+    backcong: "\u224c",
+    backepsilon: "\u03f6",
+    backprime: "\u2035",
+    backsim: "\u223d",
+    backsimeq: "\u22cd",
+    Backslash: "\u2216",
+    Barv: "\u2ae7",
+    barvee: "\u22bd",
+    barwed: "\u2305",
+    Barwed: "\u2306",
+    barwedge: "\u2305",
+    bbrk: "\u23b5",
+    bbrktbrk: "\u23b6",
+    bcong: "\u224c",
+    Bcy: "\u0411",
+    bcy: "\u0431",
+    bdquo: "\u201e",
+    becaus: "\u2235",
+    because: "\u2235",
+    Because: "\u2235",
+    bemptyv: "\u29b0",
+    bepsi: "\u03f6",
+    bernou: "\u212c",
+    Bernoullis: "\u212c",
+    Beta: "\u0392",
+    beta: "\u03b2",
+    beth: "\u2136",
+    between: "\u226c",
+    Bfr: "\ud835\udd05",
+    bfr: "\ud835\udd1f",
+    bigcap: "\u22c2",
+    bigcirc: "\u25ef",
+    bigcup: "\u22c3",
+    bigodot: "\u2a00",
+    bigoplus: "\u2a01",
+    bigotimes: "\u2a02",
+    bigsqcup: "\u2a06",
+    bigstar: "\u2605",
+    bigtriangledown: "\u25bd",
+    bigtriangleup: "\u25b3",
+    biguplus: "\u2a04",
+    bigvee: "\u22c1",
+    bigwedge: "\u22c0",
+    bkarow: "\u290d",
+    blacklozenge: "\u29eb",
+    blacksquare: "\u25aa",
+    blacktriangle: "\u25b4",
+    blacktriangledown: "\u25be",
+    blacktriangleleft: "\u25c2",
+    blacktriangleright: "\u25b8",
+    blank: "\u2423",
+    blk12: "\u2592",
+    blk14: "\u2591",
+    blk34: "\u2593",
+    block: "\u2588",
+    bne: "=\u20e5",
+    bnequiv: "\u2261\u20e5",
+    bNot: "\u2aed",
+    bnot: "\u2310",
+    Bopf: "\ud835\udd39",
+    bopf: "\ud835\udd53",
+    bot: "\u22a5",
+    bottom: "\u22a5",
+    bowtie: "\u22c8",
+    boxbox: "\u29c9",
+    boxdl: "\u2510",
+    boxdL: "\u2555",
+    boxDl: "\u2556",
+    boxDL: "\u2557",
+    boxdr: "\u250c",
+    boxdR: "\u2552",
+    boxDr: "\u2553",
+    boxDR: "\u2554",
+    boxh: "\u2500",
+    boxH: "\u2550",
+    boxhd: "\u252c",
+    boxHd: "\u2564",
+    boxhD: "\u2565",
+    boxHD: "\u2566",
+    boxhu: "\u2534",
+    boxHu: "\u2567",
+    boxhU: "\u2568",
+    boxHU: "\u2569",
+    boxminus: "\u229f",
+    boxplus: "\u229e",
+    boxtimes: "\u22a0",
+    boxul: "\u2518",
+    boxuL: "\u255b",
+    boxUl: "\u255c",
+    boxUL: "\u255d",
+    boxur: "\u2514",
+    boxuR: "\u2558",
+    boxUr: "\u2559",
+    boxUR: "\u255a",
+    boxv: "\u2502",
+    boxV: "\u2551",
+    boxvh: "\u253c",
+    boxvH: "\u256a",
+    boxVh: "\u256b",
+    boxVH: "\u256c",
+    boxvl: "\u2524",
+    boxvL: "\u2561",
+    boxVl: "\u2562",
+    boxVL: "\u2563",
+    boxvr: "\u251c",
+    boxvR: "\u255e",
+    boxVr: "\u255f",
+    boxVR: "\u2560",
+    bprime: "\u2035",
+    breve: "\u02d8",
+    Breve: "\u02d8",
+    brvbar: "\xa6",
+    bscr: "\ud835\udcb7",
+    Bscr: "\u212c",
+    bsemi: "\u204f",
+    bsim: "\u223d",
+    bsime: "\u22cd",
+    bsolb: "\u29c5",
+    bsol: "\\",
+    bsolhsub: "\u27c8",
+    bull: "\u2022",
+    bullet: "\u2022",
+    bump: "\u224e",
+    bumpE: "\u2aae",
+    bumpe: "\u224f",
+    Bumpeq: "\u224e",
+    bumpeq: "\u224f",
+    Cacute: "\u0106",
+    cacute: "\u0107",
+    capand: "\u2a44",
+    capbrcup: "\u2a49",
+    capcap: "\u2a4b",
+    cap: "\u2229",
+    Cap: "\u22d2",
+    capcup: "\u2a47",
+    capdot: "\u2a40",
+    CapitalDifferentialD: "\u2145",
+    caps: "\u2229\ufe00",
+    caret: "\u2041",
+    caron: "\u02c7",
+    Cayleys: "\u212d",
+    ccaps: "\u2a4d",
+    Ccaron: "\u010c",
+    ccaron: "\u010d",
+    Ccedil: "\xc7",
+    ccedil: "\xe7",
+    Ccirc: "\u0108",
+    ccirc: "\u0109",
+    Cconint: "\u2230",
+    ccups: "\u2a4c",
+    ccupssm: "\u2a50",
+    Cdot: "\u010a",
+    cdot: "\u010b",
+    cedil: "\xb8",
+    Cedilla: "\xb8",
+    cemptyv: "\u29b2",
+    cent: "\xa2",
+    centerdot: "\xb7",
+    CenterDot: "\xb7",
+    cfr: "\ud835\udd20",
+    Cfr: "\u212d",
+    CHcy: "\u0427",
+    chcy: "\u0447",
+    check: "\u2713",
+    checkmark: "\u2713",
+    Chi: "\u03a7",
+    chi: "\u03c7",
+    circ: "\u02c6",
+    circeq: "\u2257",
+    circlearrowleft: "\u21ba",
+    circlearrowright: "\u21bb",
+    circledast: "\u229b",
+    circledcirc: "\u229a",
+    circleddash: "\u229d",
+    CircleDot: "\u2299",
+    circledR: "\xae",
+    circledS: "\u24c8",
+    CircleMinus: "\u2296",
+    CirclePlus: "\u2295",
+    CircleTimes: "\u2297",
+    cir: "\u25cb",
+    cirE: "\u29c3",
+    cire: "\u2257",
+    cirfnint: "\u2a10",
+    cirmid: "\u2aef",
+    cirscir: "\u29c2",
+    ClockwiseContourIntegral: "\u2232",
+    CloseCurlyDoubleQuote: "\u201d",
+    CloseCurlyQuote: "\u2019",
+    clubs: "\u2663",
+    clubsuit: "\u2663",
+    colon: ":",
+    Colon: "\u2237",
+    Colone: "\u2a74",
+    colone: "\u2254",
+    coloneq: "\u2254",
+    comma: ",",
+    commat: "@",
+    comp: "\u2201",
+    compfn: "\u2218",
+    complement: "\u2201",
+    complexes: "\u2102",
+    cong: "\u2245",
+    congdot: "\u2a6d",
+    Congruent: "\u2261",
+    conint: "\u222e",
+    Conint: "\u222f",
+    ContourIntegral: "\u222e",
+    copf: "\ud835\udd54",
+    Copf: "\u2102",
+    coprod: "\u2210",
+    Coproduct: "\u2210",
+    copy: "\xa9",
+    COPY: "\xa9",
+    copysr: "\u2117",
+    CounterClockwiseContourIntegral: "\u2233",
+    crarr: "\u21b5",
+    cross: "\u2717",
+    Cross: "\u2a2f",
+    Cscr: "\ud835\udc9e",
+    cscr: "\ud835\udcb8",
+    csub: "\u2acf",
+    csube: "\u2ad1",
+    csup: "\u2ad0",
+    csupe: "\u2ad2",
+    ctdot: "\u22ef",
+    cudarrl: "\u2938",
+    cudarrr: "\u2935",
+    cuepr: "\u22de",
+    cuesc: "\u22df",
+    cularr: "\u21b6",
+    cularrp: "\u293d",
+    cupbrcap: "\u2a48",
+    cupcap: "\u2a46",
+    CupCap: "\u224d",
+    cup: "\u222a",
+    Cup: "\u22d3",
+    cupcup: "\u2a4a",
+    cupdot: "\u228d",
+    cupor: "\u2a45",
+    cups: "\u222a\ufe00",
+    curarr: "\u21b7",
+    curarrm: "\u293c",
+    curlyeqprec: "\u22de",
+    curlyeqsucc: "\u22df",
+    curlyvee: "\u22ce",
+    curlywedge: "\u22cf",
+    curren: "\xa4",
+    curvearrowleft: "\u21b6",
+    curvearrowright: "\u21b7",
+    cuvee: "\u22ce",
+    cuwed: "\u22cf",
+    cwconint: "\u2232",
+    cwint: "\u2231",
+    cylcty: "\u232d",
+    dagger: "\u2020",
+    Dagger: "\u2021",
+    daleth: "\u2138",
+    darr: "\u2193",
+    Darr: "\u21a1",
+    dArr: "\u21d3",
+    dash: "\u2010",
+    Dashv: "\u2ae4",
+    dashv: "\u22a3",
+    dbkarow: "\u290f",
+    dblac: "\u02dd",
+    Dcaron: "\u010e",
+    dcaron: "\u010f",
+    Dcy: "\u0414",
+    dcy: "\u0434",
+    ddagger: "\u2021",
+    ddarr: "\u21ca",
+    DD: "\u2145",
+    dd: "\u2146",
+    DDotrahd: "\u2911",
+    ddotseq: "\u2a77",
+    deg: "\xb0",
+    Del: "\u2207",
+    Delta: "\u0394",
+    delta: "\u03b4",
+    demptyv: "\u29b1",
+    dfisht: "\u297f",
+    Dfr: "\ud835\udd07",
+    dfr: "\ud835\udd21",
+    dHar: "\u2965",
+    dharl: "\u21c3",
+    dharr: "\u21c2",
+    DiacriticalAcute: "\xb4",
+    DiacriticalDot: "\u02d9",
+    DiacriticalDoubleAcute: "\u02dd",
+    DiacriticalGrave: "`",
+    DiacriticalTilde: "\u02dc",
+    diam: "\u22c4",
+    diamond: "\u22c4",
+    Diamond: "\u22c4",
+    diamondsuit: "\u2666",
+    diams: "\u2666",
+    die: "\xa8",
+    DifferentialD: "\u2146",
+    digamma: "\u03dd",
+    disin: "\u22f2",
+    div: "\xf7",
+    divide: "\xf7",
+    divideontimes: "\u22c7",
+    divonx: "\u22c7",
+    DJcy: "\u0402",
+    djcy: "\u0452",
+    dlcorn: "\u231e",
+    dlcrop: "\u230d",
+    dollar: "$",
+    Dopf: "\ud835\udd3b",
+    dopf: "\ud835\udd55",
+    Dot: "\xa8",
+    dot: "\u02d9",
+    DotDot: "\u20dc",
+    doteq: "\u2250",
+    doteqdot: "\u2251",
+    DotEqual: "\u2250",
+    dotminus: "\u2238",
+    dotplus: "\u2214",
+    dotsquare: "\u22a1",
+    doublebarwedge: "\u2306",
+    DoubleContourIntegral: "\u222f",
+    DoubleDot: "\xa8",
+    DoubleDownArrow: "\u21d3",
+    DoubleLeftArrow: "\u21d0",
+    DoubleLeftRightArrow: "\u21d4",
+    DoubleLeftTee: "\u2ae4",
+    DoubleLongLeftArrow: "\u27f8",
+    DoubleLongLeftRightArrow: "\u27fa",
+    DoubleLongRightArrow: "\u27f9",
+    DoubleRightArrow: "\u21d2",
+    DoubleRightTee: "\u22a8",
+    DoubleUpArrow: "\u21d1",
+    DoubleUpDownArrow: "\u21d5",
+    DoubleVerticalBar: "\u2225",
+    DownArrowBar: "\u2913",
+    downarrow: "\u2193",
+    DownArrow: "\u2193",
+    Downarrow: "\u21d3",
+    DownArrowUpArrow: "\u21f5",
+    DownBreve: "\u0311",
+    downdownarrows: "\u21ca",
+    downharpoonleft: "\u21c3",
+    downharpoonright: "\u21c2",
+    DownLeftRightVector: "\u2950",
+    DownLeftTeeVector: "\u295e",
+    DownLeftVectorBar: "\u2956",
+    DownLeftVector: "\u21bd",
+    DownRightTeeVector: "\u295f",
+    DownRightVectorBar: "\u2957",
+    DownRightVector: "\u21c1",
+    DownTeeArrow: "\u21a7",
+    DownTee: "\u22a4",
+    drbkarow: "\u2910",
+    drcorn: "\u231f",
+    drcrop: "\u230c",
+    Dscr: "\ud835\udc9f",
+    dscr: "\ud835\udcb9",
+    DScy: "\u0405",
+    dscy: "\u0455",
+    dsol: "\u29f6",
+    Dstrok: "\u0110",
+    dstrok: "\u0111",
+    dtdot: "\u22f1",
+    dtri: "\u25bf",
+    dtrif: "\u25be",
+    duarr: "\u21f5",
+    duhar: "\u296f",
+    dwangle: "\u29a6",
+    DZcy: "\u040f",
+    dzcy: "\u045f",
+    dzigrarr: "\u27ff",
+    Eacute: "\xc9",
+    eacute: "\xe9",
+    easter: "\u2a6e",
+    Ecaron: "\u011a",
+    ecaron: "\u011b",
+    Ecirc: "\xca",
+    ecirc: "\xea",
+    ecir: "\u2256",
+    ecolon: "\u2255",
+    Ecy: "\u042d",
+    ecy: "\u044d",
+    eDDot: "\u2a77",
+    Edot: "\u0116",
+    edot: "\u0117",
+    eDot: "\u2251",
+    ee: "\u2147",
+    efDot: "\u2252",
+    Efr: "\ud835\udd08",
+    efr: "\ud835\udd22",
+    eg: "\u2a9a",
+    Egrave: "\xc8",
+    egrave: "\xe8",
+    egs: "\u2a96",
+    egsdot: "\u2a98",
+    el: "\u2a99",
+    Element: "\u2208",
+    elinters: "\u23e7",
+    ell: "\u2113",
+    els: "\u2a95",
+    elsdot: "\u2a97",
+    Emacr: "\u0112",
+    emacr: "\u0113",
+    empty: "\u2205",
+    emptyset: "\u2205",
+    EmptySmallSquare: "\u25fb",
+    emptyv: "\u2205",
+    EmptyVerySmallSquare: "\u25ab",
+    emsp13: "\u2004",
+    emsp14: "\u2005",
+    emsp: "\u2003",
+    ENG: "\u014a",
+    eng: "\u014b",
+    ensp: "\u2002",
+    Eogon: "\u0118",
+    eogon: "\u0119",
+    Eopf: "\ud835\udd3c",
+    eopf: "\ud835\udd56",
+    epar: "\u22d5",
+    eparsl: "\u29e3",
+    eplus: "\u2a71",
+    epsi: "\u03b5",
+    Epsilon: "\u0395",
+    epsilon: "\u03b5",
+    epsiv: "\u03f5",
+    eqcirc: "\u2256",
+    eqcolon: "\u2255",
+    eqsim: "\u2242",
+    eqslantgtr: "\u2a96",
+    eqslantless: "\u2a95",
+    Equal: "\u2a75",
+    equals: "=",
+    EqualTilde: "\u2242",
+    equest: "\u225f",
+    Equilibrium: "\u21cc",
+    equiv: "\u2261",
+    equivDD: "\u2a78",
+    eqvparsl: "\u29e5",
+    erarr: "\u2971",
+    erDot: "\u2253",
+    escr: "\u212f",
+    Escr: "\u2130",
+    esdot: "\u2250",
+    Esim: "\u2a73",
+    esim: "\u2242",
+    Eta: "\u0397",
+    eta: "\u03b7",
+    ETH: "\xd0",
+    eth: "\xf0",
+    Euml: "\xcb",
+    euml: "\xeb",
+    euro: "\u20ac",
+    excl: "!",
+    exist: "\u2203",
+    Exists: "\u2203",
+    expectation: "\u2130",
+    exponentiale: "\u2147",
+    ExponentialE: "\u2147",
+    fallingdotseq: "\u2252",
+    Fcy: "\u0424",
+    fcy: "\u0444",
+    female: "\u2640",
+    ffilig: "\ufb03",
+    fflig: "\ufb00",
+    ffllig: "\ufb04",
+    Ffr: "\ud835\udd09",
+    ffr: "\ud835\udd23",
+    filig: "\ufb01",
+    FilledSmallSquare: "\u25fc",
+    FilledVerySmallSquare: "\u25aa",
+    fjlig: "fj",
+    flat: "\u266d",
+    fllig: "\ufb02",
+    fltns: "\u25b1",
+    fnof: "\u0192",
+    Fopf: "\ud835\udd3d",
+    fopf: "\ud835\udd57",
+    forall: "\u2200",
+    ForAll: "\u2200",
+    fork: "\u22d4",
+    forkv: "\u2ad9",
+    Fouriertrf: "\u2131",
+    fpartint: "\u2a0d",
+    frac12: "\xbd",
+    frac13: "\u2153",
+    frac14: "\xbc",
+    frac15: "\u2155",
+    frac16: "\u2159",
+    frac18: "\u215b",
+    frac23: "\u2154",
+    frac25: "\u2156",
+    frac34: "\xbe",
+    frac35: "\u2157",
+    frac38: "\u215c",
+    frac45: "\u2158",
+    frac56: "\u215a",
+    frac58: "\u215d",
+    frac78: "\u215e",
+    frasl: "\u2044",
+    frown: "\u2322",
+    fscr: "\ud835\udcbb",
+    Fscr: "\u2131",
+    gacute: "\u01f5",
+    Gamma: "\u0393",
+    gamma: "\u03b3",
+    Gammad: "\u03dc",
+    gammad: "\u03dd",
+    gap: "\u2a86",
+    Gbreve: "\u011e",
+    gbreve: "\u011f",
+    Gcedil: "\u0122",
+    Gcirc: "\u011c",
+    gcirc: "\u011d",
+    Gcy: "\u0413",
+    gcy: "\u0433",
+    Gdot: "\u0120",
+    gdot: "\u0121",
+    ge: "\u2265",
+    gE: "\u2267",
+    gEl: "\u2a8c",
+    gel: "\u22db",
+    geq: "\u2265",
+    geqq: "\u2267",
+    geqslant: "\u2a7e",
+    gescc: "\u2aa9",
+    ges: "\u2a7e",
+    gesdot: "\u2a80",
+    gesdoto: "\u2a82",
+    gesdotol: "\u2a84",
+    gesl: "\u22db\ufe00",
+    gesles: "\u2a94",
+    Gfr: "\ud835\udd0a",
+    gfr: "\ud835\udd24",
+    gg: "\u226b",
+    Gg: "\u22d9",
+    ggg: "\u22d9",
+    gimel: "\u2137",
+    GJcy: "\u0403",
+    gjcy: "\u0453",
+    gla: "\u2aa5",
+    gl: "\u2277",
+    glE: "\u2a92",
+    glj: "\u2aa4",
+    gnap: "\u2a8a",
+    gnapprox: "\u2a8a",
+    gne: "\u2a88",
+    gnE: "\u2269",
+    gneq: "\u2a88",
+    gneqq: "\u2269",
+    gnsim: "\u22e7",
+    Gopf: "\ud835\udd3e",
+    gopf: "\ud835\udd58",
+    grave: "`",
+    GreaterEqual: "\u2265",
+    GreaterEqualLess: "\u22db",
+    GreaterFullEqual: "\u2267",
+    GreaterGreater: "\u2aa2",
+    GreaterLess: "\u2277",
+    GreaterSlantEqual: "\u2a7e",
+    GreaterTilde: "\u2273",
+    Gscr: "\ud835\udca2",
+    gscr: "\u210a",
+    gsim: "\u2273",
+    gsime: "\u2a8e",
+    gsiml: "\u2a90",
+    gtcc: "\u2aa7",
+    gtcir: "\u2a7a",
+    gt: ">",
+    GT: ">",
+    Gt: "\u226b",
+    gtdot: "\u22d7",
+    gtlPar: "\u2995",
+    gtquest: "\u2a7c",
+    gtrapprox: "\u2a86",
+    gtrarr: "\u2978",
+    gtrdot: "\u22d7",
+    gtreqless: "\u22db",
+    gtreqqless: "\u2a8c",
+    gtrless: "\u2277",
+    gtrsim: "\u2273",
+    gvertneqq: "\u2269\ufe00",
+    gvnE: "\u2269\ufe00",
+    Hacek: "\u02c7",
+    hairsp: "\u200a",
+    half: "\xbd",
+    hamilt: "\u210b",
+    HARDcy: "\u042a",
+    hardcy: "\u044a",
+    harrcir: "\u2948",
+    harr: "\u2194",
+    hArr: "\u21d4",
+    harrw: "\u21ad",
+    Hat: "^",
+    hbar: "\u210f",
+    Hcirc: "\u0124",
+    hcirc: "\u0125",
+    hearts: "\u2665",
+    heartsuit: "\u2665",
+    hellip: "\u2026",
+    hercon: "\u22b9",
+    hfr: "\ud835\udd25",
+    Hfr: "\u210c",
+    HilbertSpace: "\u210b",
+    hksearow: "\u2925",
+    hkswarow: "\u2926",
+    hoarr: "\u21ff",
+    homtht: "\u223b",
+    hookleftarrow: "\u21a9",
+    hookrightarrow: "\u21aa",
+    hopf: "\ud835\udd59",
+    Hopf: "\u210d",
+    horbar: "\u2015",
+    HorizontalLine: "\u2500",
+    hscr: "\ud835\udcbd",
+    Hscr: "\u210b",
+    hslash: "\u210f",
+    Hstrok: "\u0126",
+    hstrok: "\u0127",
+    HumpDownHump: "\u224e",
+    HumpEqual: "\u224f",
+    hybull: "\u2043",
+    hyphen: "\u2010",
+    Iacute: "\xcd",
+    iacute: "\xed",
+    ic: "\u2063",
+    Icirc: "\xce",
+    icirc: "\xee",
+    Icy: "\u0418",
+    icy: "\u0438",
+    Idot: "\u0130",
+    IEcy: "\u0415",
+    iecy: "\u0435",
+    iexcl: "\xa1",
+    iff: "\u21d4",
+    ifr: "\ud835\udd26",
+    Ifr: "\u2111",
+    Igrave: "\xcc",
+    igrave: "\xec",
+    ii: "\u2148",
+    iiiint: "\u2a0c",
+    iiint: "\u222d",
+    iinfin: "\u29dc",
+    iiota: "\u2129",
+    IJlig: "\u0132",
+    ijlig: "\u0133",
+    Imacr: "\u012a",
+    imacr: "\u012b",
+    image: "\u2111",
+    ImaginaryI: "\u2148",
+    imagline: "\u2110",
+    imagpart: "\u2111",
+    imath: "\u0131",
+    Im: "\u2111",
+    imof: "\u22b7",
+    imped: "\u01b5",
+    Implies: "\u21d2",
+    incare: "\u2105",
+    in: "\u2208",
+    infin: "\u221e",
+    infintie: "\u29dd",
+    inodot: "\u0131",
+    intcal: "\u22ba",
+    int: "\u222b",
+    Int: "\u222c",
+    integers: "\u2124",
+    Integral: "\u222b",
+    intercal: "\u22ba",
+    Intersection: "\u22c2",
+    intlarhk: "\u2a17",
+    intprod: "\u2a3c",
+    InvisibleComma: "\u2063",
+    InvisibleTimes: "\u2062",
+    IOcy: "\u0401",
+    iocy: "\u0451",
+    Iogon: "\u012e",
+    iogon: "\u012f",
+    Iopf: "\ud835\udd40",
+    iopf: "\ud835\udd5a",
+    Iota: "\u0399",
+    iota: "\u03b9",
+    iprod: "\u2a3c",
+    iquest: "\xbf",
+    iscr: "\ud835\udcbe",
+    Iscr: "\u2110",
+    isin: "\u2208",
+    isindot: "\u22f5",
+    isinE: "\u22f9",
+    isins: "\u22f4",
+    isinsv: "\u22f3",
+    isinv: "\u2208",
+    it: "\u2062",
+    Itilde: "\u0128",
+    itilde: "\u0129",
+    Iukcy: "\u0406",
+    iukcy: "\u0456",
+    Iuml: "\xcf",
+    iuml: "\xef",
+    Jcirc: "\u0134",
+    jcirc: "\u0135",
+    Jcy: "\u0419",
+    jcy: "\u0439",
+    Jfr: "\ud835\udd0d",
+    jfr: "\ud835\udd27",
+    jmath: "\u0237",
+    Jopf: "\ud835\udd41",
+    jopf: "\ud835\udd5b",
+    Jscr: "\ud835\udca5",
+    jscr: "\ud835\udcbf",
+    Jsercy: "\u0408",
+    jsercy: "\u0458",
+    Jukcy: "\u0404",
+    jukcy: "\u0454",
+    Kappa: "\u039a",
+    kappa: "\u03ba",
+    kappav: "\u03f0",
+    Kcedil: "\u0136",
+    kcedil: "\u0137",
+    Kcy: "\u041a",
+    kcy: "\u043a",
+    Kfr: "\ud835\udd0e",
+    kfr: "\ud835\udd28",
+    kgreen: "\u0138",
+    KHcy: "\u0425",
+    khcy: "\u0445",
+    KJcy: "\u040c",
+    kjcy: "\u045c",
+    Kopf: "\ud835\udd42",
+    kopf: "\ud835\udd5c",
+    Kscr: "\ud835\udca6",
+    kscr: "\ud835\udcc0",
+    lAarr: "\u21da",
+    Lacute: "\u0139",
+    lacute: "\u013a",
+    laemptyv: "\u29b4",
+    lagran: "\u2112",
+    Lambda: "\u039b",
+    lambda: "\u03bb",
+    lang: "\u27e8",
+    Lang: "\u27ea",
+    langd: "\u2991",
+    langle: "\u27e8",
+    lap: "\u2a85",
+    Laplacetrf: "\u2112",
+    laquo: "\xab",
+    larrb: "\u21e4",
+    larrbfs: "\u291f",
+    larr: "\u2190",
+    Larr: "\u219e",
+    lArr: "\u21d0",
+    larrfs: "\u291d",
+    larrhk: "\u21a9",
+    larrlp: "\u21ab",
+    larrpl: "\u2939",
+    larrsim: "\u2973",
+    larrtl: "\u21a2",
+    latail: "\u2919",
+    lAtail: "\u291b",
+    lat: "\u2aab",
+    late: "\u2aad",
+    lates: "\u2aad\ufe00",
+    lbarr: "\u290c",
+    lBarr: "\u290e",
+    lbbrk: "\u2772",
+    lbrace: "{",
+    lbrack: "[",
+    lbrke: "\u298b",
+    lbrksld: "\u298f",
+    lbrkslu: "\u298d",
+    Lcaron: "\u013d",
+    lcaron: "\u013e",
+    Lcedil: "\u013b",
+    lcedil: "\u013c",
+    lceil: "\u2308",
+    lcub: "{",
+    Lcy: "\u041b",
+    lcy: "\u043b",
+    ldca: "\u2936",
+    ldquo: "\u201c",
+    ldquor: "\u201e",
+    ldrdhar: "\u2967",
+    ldrushar: "\u294b",
+    ldsh: "\u21b2",
+    le: "\u2264",
+    lE: "\u2266",
+    LeftAngleBracket: "\u27e8",
+    LeftArrowBar: "\u21e4",
+    leftarrow: "\u2190",
+    LeftArrow: "\u2190",
+    Leftarrow: "\u21d0",
+    LeftArrowRightArrow: "\u21c6",
+    leftarrowtail: "\u21a2",
+    LeftCeiling: "\u2308",
+    LeftDoubleBracket: "\u27e6",
+    LeftDownTeeVector: "\u2961",
+    LeftDownVectorBar: "\u2959",
+    LeftDownVector: "\u21c3",
+    LeftFloor: "\u230a",
+    leftharpoondown: "\u21bd",
+    leftharpoonup: "\u21bc",
+    leftleftarrows: "\u21c7",
+    leftrightarrow: "\u2194",
+    LeftRightArrow: "\u2194",
+    Leftrightarrow: "\u21d4",
+    leftrightarrows: "\u21c6",
+    leftrightharpoons: "\u21cb",
+    leftrightsquigarrow: "\u21ad",
+    LeftRightVector: "\u294e",
+    LeftTeeArrow: "\u21a4",
+    LeftTee: "\u22a3",
+    LeftTeeVector: "\u295a",
+    leftthreetimes: "\u22cb",
+    LeftTriangleBar: "\u29cf",
+    LeftTriangle: "\u22b2",
+    LeftTriangleEqual: "\u22b4",
+    LeftUpDownVector: "\u2951",
+    LeftUpTeeVector: "\u2960",
+    LeftUpVectorBar: "\u2958",
+    LeftUpVector: "\u21bf",
+    LeftVectorBar: "\u2952",
+    LeftVector: "\u21bc",
+    lEg: "\u2a8b",
+    leg: "\u22da",
+    leq: "\u2264",
+    leqq: "\u2266",
+    leqslant: "\u2a7d",
+    lescc: "\u2aa8",
+    les: "\u2a7d",
+    lesdot: "\u2a7f",
+    lesdoto: "\u2a81",
+    lesdotor: "\u2a83",
+    lesg: "\u22da\ufe00",
+    lesges: "\u2a93",
+    lessapprox: "\u2a85",
+    lessdot: "\u22d6",
+    lesseqgtr: "\u22da",
+    lesseqqgtr: "\u2a8b",
+    LessEqualGreater: "\u22da",
+    LessFullEqual: "\u2266",
+    LessGreater: "\u2276",
+    lessgtr: "\u2276",
+    LessLess: "\u2aa1",
+    lesssim: "\u2272",
+    LessSlantEqual: "\u2a7d",
+    LessTilde: "\u2272",
+    lfisht: "\u297c",
+    lfloor: "\u230a",
+    Lfr: "\ud835\udd0f",
+    lfr: "\ud835\udd29",
+    lg: "\u2276",
+    lgE: "\u2a91",
+    lHar: "\u2962",
+    lhard: "\u21bd",
+    lharu: "\u21bc",
+    lharul: "\u296a",
+    lhblk: "\u2584",
+    LJcy: "\u0409",
+    ljcy: "\u0459",
+    llarr: "\u21c7",
+    ll: "\u226a",
+    Ll: "\u22d8",
+    llcorner: "\u231e",
+    Lleftarrow: "\u21da",
+    llhard: "\u296b",
+    lltri: "\u25fa",
+    Lmidot: "\u013f",
+    lmidot: "\u0140",
+    lmoustache: "\u23b0",
+    lmoust: "\u23b0",
+    lnap: "\u2a89",
+    lnapprox: "\u2a89",
+    lne: "\u2a87",
+    lnE: "\u2268",
+    lneq: "\u2a87",
+    lneqq: "\u2268",
+    lnsim: "\u22e6",
+    loang: "\u27ec",
+    loarr: "\u21fd",
+    lobrk: "\u27e6",
+    longleftarrow: "\u27f5",
+    LongLeftArrow: "\u27f5",
+    Longleftarrow: "\u27f8",
+    longleftrightarrow: "\u27f7",
+    LongLeftRightArrow: "\u27f7",
+    Longleftrightarrow: "\u27fa",
+    longmapsto: "\u27fc",
+    longrightarrow: "\u27f6",
+    LongRightArrow: "\u27f6",
+    Longrightarrow: "\u27f9",
+    looparrowleft: "\u21ab",
+    looparrowright: "\u21ac",
+    lopar: "\u2985",
+    Lopf: "\ud835\udd43",
+    lopf: "\ud835\udd5d",
+    loplus: "\u2a2d",
+    lotimes: "\u2a34",
+    lowast: "\u2217",
+    lowbar: "_",
+    LowerLeftArrow: "\u2199",
+    LowerRightArrow: "\u2198",
+    loz: "\u25ca",
+    lozenge: "\u25ca",
+    lozf: "\u29eb",
+    lpar: "(",
+    lparlt: "\u2993",
+    lrarr: "\u21c6",
+    lrcorner: "\u231f",
+    lrhar: "\u21cb",
+    lrhard: "\u296d",
+    lrm: "\u200e",
+    lrtri: "\u22bf",
+    lsaquo: "\u2039",
+    lscr: "\ud835\udcc1",
+    Lscr: "\u2112",
+    lsh: "\u21b0",
+    Lsh: "\u21b0",
+    lsim: "\u2272",
+    lsime: "\u2a8d",
+    lsimg: "\u2a8f",
+    lsqb: "[",
+    lsquo: "\u2018",
+    lsquor: "\u201a",
+    Lstrok: "\u0141",
+    lstrok: "\u0142",
+    ltcc: "\u2aa6",
+    ltcir: "\u2a79",
+    lt: "<",
+    LT: "<",
+    Lt: "\u226a",
+    ltdot: "\u22d6",
+    lthree: "\u22cb",
+    ltimes: "\u22c9",
+    ltlarr: "\u2976",
+    ltquest: "\u2a7b",
+    ltri: "\u25c3",
+    ltrie: "\u22b4",
+    ltrif: "\u25c2",
+    ltrPar: "\u2996",
+    lurdshar: "\u294a",
+    luruhar: "\u2966",
+    lvertneqq: "\u2268\ufe00",
+    lvnE: "\u2268\ufe00",
+    macr: "\xaf",
+    male: "\u2642",
+    malt: "\u2720",
+    maltese: "\u2720",
+    Map: "\u2905",
+    map: "\u21a6",
+    mapsto: "\u21a6",
+    mapstodown: "\u21a7",
+    mapstoleft: "\u21a4",
+    mapstoup: "\u21a5",
+    marker: "\u25ae",
+    mcomma: "\u2a29",
+    Mcy: "\u041c",
+    mcy: "\u043c",
+    mdash: "\u2014",
+    mDDot: "\u223a",
+    measuredangle: "\u2221",
+    MediumSpace: "\u205f",
+    Mellintrf: "\u2133",
+    Mfr: "\ud835\udd10",
+    mfr: "\ud835\udd2a",
+    mho: "\u2127",
+    micro: "\xb5",
+    midast: "*",
+    midcir: "\u2af0",
+    mid: "\u2223",
+    middot: "\xb7",
+    minusb: "\u229f",
+    minus: "\u2212",
+    minusd: "\u2238",
+    minusdu: "\u2a2a",
+    MinusPlus: "\u2213",
+    mlcp: "\u2adb",
+    mldr: "\u2026",
+    mnplus: "\u2213",
+    models: "\u22a7",
+    Mopf: "\ud835\udd44",
+    mopf: "\ud835\udd5e",
+    mp: "\u2213",
+    mscr: "\ud835\udcc2",
+    Mscr: "\u2133",
+    mstpos: "\u223e",
+    Mu: "\u039c",
+    mu: "\u03bc",
+    multimap: "\u22b8",
+    mumap: "\u22b8",
+    nabla: "\u2207",
+    Nacute: "\u0143",
+    nacute: "\u0144",
+    nang: "\u2220\u20d2",
+    nap: "\u2249",
+    napE: "\u2a70\u0338",
+    napid: "\u224b\u0338",
+    napos: "\u0149",
+    napprox: "\u2249",
+    natural: "\u266e",
+    naturals: "\u2115",
+    natur: "\u266e",
+    nbsp: "\xa0",
+    nbump: "\u224e\u0338",
+    nbumpe: "\u224f\u0338",
+    ncap: "\u2a43",
+    Ncaron: "\u0147",
+    ncaron: "\u0148",
+    Ncedil: "\u0145",
+    ncedil: "\u0146",
+    ncong: "\u2247",
+    ncongdot: "\u2a6d\u0338",
+    ncup: "\u2a42",
+    Ncy: "\u041d",
+    ncy: "\u043d",
+    ndash: "\u2013",
+    nearhk: "\u2924",
+    nearr: "\u2197",
+    neArr: "\u21d7",
+    nearrow: "\u2197",
+    ne: "\u2260",
+    nedot: "\u2250\u0338",
+    NegativeMediumSpace: "\u200b",
+    NegativeThickSpace: "\u200b",
+    NegativeThinSpace: "\u200b",
+    NegativeVeryThinSpace: "\u200b",
+    nequiv: "\u2262",
+    nesear: "\u2928",
+    nesim: "\u2242\u0338",
+    NestedGreaterGreater: "\u226b",
+    NestedLessLess: "\u226a",
+    NewLine: "\n",
+    nexist: "\u2204",
+    nexists: "\u2204",
+    Nfr: "\ud835\udd11",
+    nfr: "\ud835\udd2b",
+    ngE: "\u2267\u0338",
+    nge: "\u2271",
+    ngeq: "\u2271",
+    ngeqq: "\u2267\u0338",
+    ngeqslant: "\u2a7e\u0338",
+    nges: "\u2a7e\u0338",
+    nGg: "\u22d9\u0338",
+    ngsim: "\u2275",
+    nGt: "\u226b\u20d2",
+    ngt: "\u226f",
+    ngtr: "\u226f",
+    nGtv: "\u226b\u0338",
+    nharr: "\u21ae",
+    nhArr: "\u21ce",
+    nhpar: "\u2af2",
+    ni: "\u220b",
+    nis: "\u22fc",
+    nisd: "\u22fa",
+    niv: "\u220b",
+    NJcy: "\u040a",
+    njcy: "\u045a",
+    nlarr: "\u219a",
+    nlArr: "\u21cd",
+    nldr: "\u2025",
+    nlE: "\u2266\u0338",
+    nle: "\u2270",
+    nleftarrow: "\u219a",
+    nLeftarrow: "\u21cd",
+    nleftrightarrow: "\u21ae",
+    nLeftrightarrow: "\u21ce",
+    nleq: "\u2270",
+    nleqq: "\u2266\u0338",
+    nleqslant: "\u2a7d\u0338",
+    nles: "\u2a7d\u0338",
+    nless: "\u226e",
+    nLl: "\u22d8\u0338",
+    nlsim: "\u2274",
+    nLt: "\u226a\u20d2",
+    nlt: "\u226e",
+    nltri: "\u22ea",
+    nltrie: "\u22ec",
+    nLtv: "\u226a\u0338",
+    nmid: "\u2224",
+    NoBreak: "\u2060",
+    NonBreakingSpace: "\xa0",
+    nopf: "\ud835\udd5f",
+    Nopf: "\u2115",
+    Not: "\u2aec",
+    not: "\xac",
+    NotCongruent: "\u2262",
+    NotCupCap: "\u226d",
+    NotDoubleVerticalBar: "\u2226",
+    NotElement: "\u2209",
+    NotEqual: "\u2260",
+    NotEqualTilde: "\u2242\u0338",
+    NotExists: "\u2204",
+    NotGreater: "\u226f",
+    NotGreaterEqual: "\u2271",
+    NotGreaterFullEqual: "\u2267\u0338",
+    NotGreaterGreater: "\u226b\u0338",
+    NotGreaterLess: "\u2279",
+    NotGreaterSlantEqual: "\u2a7e\u0338",
+    NotGreaterTilde: "\u2275",
+    NotHumpDownHump: "\u224e\u0338",
+    NotHumpEqual: "\u224f\u0338",
+    notin: "\u2209",
+    notindot: "\u22f5\u0338",
+    notinE: "\u22f9\u0338",
+    notinva: "\u2209",
+    notinvb: "\u22f7",
+    notinvc: "\u22f6",
+    NotLeftTriangleBar: "\u29cf\u0338",
+    NotLeftTriangle: "\u22ea",
+    NotLeftTriangleEqual: "\u22ec",
+    NotLess: "\u226e",
+    NotLessEqual: "\u2270",
+    NotLessGreater: "\u2278",
+    NotLessLess: "\u226a\u0338",
+    NotLessSlantEqual: "\u2a7d\u0338",
+    NotLessTilde: "\u2274",
+    NotNestedGreaterGreater: "\u2aa2\u0338",
+    NotNestedLessLess: "\u2aa1\u0338",
+    notni: "\u220c",
+    notniva: "\u220c",
+    notnivb: "\u22fe",
+    notnivc: "\u22fd",
+    NotPrecedes: "\u2280",
+    NotPrecedesEqual: "\u2aaf\u0338",
+    NotPrecedesSlantEqual: "\u22e0",
+    NotReverseElement: "\u220c",
+    NotRightTriangleBar: "\u29d0\u0338",
+    NotRightTriangle: "\u22eb",
+    NotRightTriangleEqual: "\u22ed",
+    NotSquareSubset: "\u228f\u0338",
+    NotSquareSubsetEqual: "\u22e2",
+    NotSquareSuperset: "\u2290\u0338",
+    NotSquareSupersetEqual: "\u22e3",
+    NotSubset: "\u2282\u20d2",
+    NotSubsetEqual: "\u2288",
+    NotSucceeds: "\u2281",
+    NotSucceedsEqual: "\u2ab0\u0338",
+    NotSucceedsSlantEqual: "\u22e1",
+    NotSucceedsTilde: "\u227f\u0338",
+    NotSuperset: "\u2283\u20d2",
+    NotSupersetEqual: "\u2289",
+    NotTilde: "\u2241",
+    NotTildeEqual: "\u2244",
+    NotTildeFullEqual: "\u2247",
+    NotTildeTilde: "\u2249",
+    NotVerticalBar: "\u2224",
+    nparallel: "\u2226",
+    npar: "\u2226",
+    nparsl: "\u2afd\u20e5",
+    npart: "\u2202\u0338",
+    npolint: "\u2a14",
+    npr: "\u2280",
+    nprcue: "\u22e0",
+    nprec: "\u2280",
+    npreceq: "\u2aaf\u0338",
+    npre: "\u2aaf\u0338",
+    nrarrc: "\u2933\u0338",
+    nrarr: "\u219b",
+    nrArr: "\u21cf",
+    nrarrw: "\u219d\u0338",
+    nrightarrow: "\u219b",
+    nRightarrow: "\u21cf",
+    nrtri: "\u22eb",
+    nrtrie: "\u22ed",
+    nsc: "\u2281",
+    nsccue: "\u22e1",
+    nsce: "\u2ab0\u0338",
+    Nscr: "\ud835\udca9",
+    nscr: "\ud835\udcc3",
+    nshortmid: "\u2224",
+    nshortparallel: "\u2226",
+    nsim: "\u2241",
+    nsime: "\u2244",
+    nsimeq: "\u2244",
+    nsmid: "\u2224",
+    nspar: "\u2226",
+    nsqsube: "\u22e2",
+    nsqsupe: "\u22e3",
+    nsub: "\u2284",
+    nsubE: "\u2ac5\u0338",
+    nsube: "\u2288",
+    nsubset: "\u2282\u20d2",
+    nsubseteq: "\u2288",
+    nsubseteqq: "\u2ac5\u0338",
+    nsucc: "\u2281",
+    nsucceq: "\u2ab0\u0338",
+    nsup: "\u2285",
+    nsupE: "\u2ac6\u0338",
+    nsupe: "\u2289",
+    nsupset: "\u2283\u20d2",
+    nsupseteq: "\u2289",
+    nsupseteqq: "\u2ac6\u0338",
+    ntgl: "\u2279",
+    Ntilde: "\xd1",
+    ntilde: "\xf1",
+    ntlg: "\u2278",
+    ntriangleleft: "\u22ea",
+    ntrianglelefteq: "\u22ec",
+    ntriangleright: "\u22eb",
+    ntrianglerighteq: "\u22ed",
+    Nu: "\u039d",
+    nu: "\u03bd",
+    num: "#",
+    numero: "\u2116",
+    numsp: "\u2007",
+    nvap: "\u224d\u20d2",
+    nvdash: "\u22ac",
+    nvDash: "\u22ad",
+    nVdash: "\u22ae",
+    nVDash: "\u22af",
+    nvge: "\u2265\u20d2",
+    nvgt: ">\u20d2",
+    nvHarr: "\u2904",
+    nvinfin: "\u29de",
+    nvlArr: "\u2902",
+    nvle: "\u2264\u20d2",
+    nvlt: "<\u20d2",
+    nvltrie: "\u22b4\u20d2",
+    nvrArr: "\u2903",
+    nvrtrie: "\u22b5\u20d2",
+    nvsim: "\u223c\u20d2",
+    nwarhk: "\u2923",
+    nwarr: "\u2196",
+    nwArr: "\u21d6",
+    nwarrow: "\u2196",
+    nwnear: "\u2927",
+    Oacute: "\xd3",
+    oacute: "\xf3",
+    oast: "\u229b",
+    Ocirc: "\xd4",
+    ocirc: "\xf4",
+    ocir: "\u229a",
+    Ocy: "\u041e",
+    ocy: "\u043e",
+    odash: "\u229d",
+    Odblac: "\u0150",
+    odblac: "\u0151",
+    odiv: "\u2a38",
+    odot: "\u2299",
+    odsold: "\u29bc",
+    OElig: "\u0152",
+    oelig: "\u0153",
+    ofcir: "\u29bf",
+    Ofr: "\ud835\udd12",
+    ofr: "\ud835\udd2c",
+    ogon: "\u02db",
+    Ograve: "\xd2",
+    ograve: "\xf2",
+    ogt: "\u29c1",
+    ohbar: "\u29b5",
+    ohm: "\u03a9",
+    oint: "\u222e",
+    olarr: "\u21ba",
+    olcir: "\u29be",
+    olcross: "\u29bb",
+    oline: "\u203e",
+    olt: "\u29c0",
+    Omacr: "\u014c",
+    omacr: "\u014d",
+    Omega: "\u03a9",
+    omega: "\u03c9",
+    Omicron: "\u039f",
+    omicron: "\u03bf",
+    omid: "\u29b6",
+    ominus: "\u2296",
+    Oopf: "\ud835\udd46",
+    oopf: "\ud835\udd60",
+    opar: "\u29b7",
+    OpenCurlyDoubleQuote: "\u201c",
+    OpenCurlyQuote: "\u2018",
+    operp: "\u29b9",
+    oplus: "\u2295",
+    orarr: "\u21bb",
+    Or: "\u2a54",
+    or: "\u2228",
+    ord: "\u2a5d",
+    order: "\u2134",
+    orderof: "\u2134",
+    ordf: "\xaa",
+    ordm: "\xba",
+    origof: "\u22b6",
+    oror: "\u2a56",
+    orslope: "\u2a57",
+    orv: "\u2a5b",
+    oS: "\u24c8",
+    Oscr: "\ud835\udcaa",
+    oscr: "\u2134",
+    Oslash: "\xd8",
+    oslash: "\xf8",
+    osol: "\u2298",
+    Otilde: "\xd5",
+    otilde: "\xf5",
+    otimesas: "\u2a36",
+    Otimes: "\u2a37",
+    otimes: "\u2297",
+    Ouml: "\xd6",
+    ouml: "\xf6",
+    ovbar: "\u233d",
+    OverBar: "\u203e",
+    OverBrace: "\u23de",
+    OverBracket: "\u23b4",
+    OverParenthesis: "\u23dc",
+    para: "\xb6",
+    parallel: "\u2225",
+    par: "\u2225",
+    parsim: "\u2af3",
+    parsl: "\u2afd",
+    part: "\u2202",
+    PartialD: "\u2202",
+    Pcy: "\u041f",
+    pcy: "\u043f",
+    percnt: "%",
+    period: ".",
+    permil: "\u2030",
+    perp: "\u22a5",
+    pertenk: "\u2031",
+    Pfr: "\ud835\udd13",
+    pfr: "\ud835\udd2d",
+    Phi: "\u03a6",
+    phi: "\u03c6",
+    phiv: "\u03d5",
+    phmmat: "\u2133",
+    phone: "\u260e",
+    Pi: "\u03a0",
+    pi: "\u03c0",
+    pitchfork: "\u22d4",
+    piv: "\u03d6",
+    planck: "\u210f",
+    planckh: "\u210e",
+    plankv: "\u210f",
+    plusacir: "\u2a23",
+    plusb: "\u229e",
+    pluscir: "\u2a22",
+    plus: "+",
+    plusdo: "\u2214",
+    plusdu: "\u2a25",
+    pluse: "\u2a72",
+    PlusMinus: "\xb1",
+    plusmn: "\xb1",
+    plussim: "\u2a26",
+    plustwo: "\u2a27",
+    pm: "\xb1",
+    Poincareplane: "\u210c",
+    pointint: "\u2a15",
+    popf: "\ud835\udd61",
+    Popf: "\u2119",
+    pound: "\xa3",
+    prap: "\u2ab7",
+    Pr: "\u2abb",
+    pr: "\u227a",
+    prcue: "\u227c",
+    precapprox: "\u2ab7",
+    prec: "\u227a",
+    preccurlyeq: "\u227c",
+    Precedes: "\u227a",
+    PrecedesEqual: "\u2aaf",
+    PrecedesSlantEqual: "\u227c",
+    PrecedesTilde: "\u227e",
+    preceq: "\u2aaf",
+    precnapprox: "\u2ab9",
+    precneqq: "\u2ab5",
+    precnsim: "\u22e8",
+    pre: "\u2aaf",
+    prE: "\u2ab3",
+    precsim: "\u227e",
+    prime: "\u2032",
+    Prime: "\u2033",
+    primes: "\u2119",
+    prnap: "\u2ab9",
+    prnE: "\u2ab5",
+    prnsim: "\u22e8",
+    prod: "\u220f",
+    Product: "\u220f",
+    profalar: "\u232e",
+    profline: "\u2312",
+    profsurf: "\u2313",
+    prop: "\u221d",
+    Proportional: "\u221d",
+    Proportion: "\u2237",
+    propto: "\u221d",
+    prsim: "\u227e",
+    prurel: "\u22b0",
+    Pscr: "\ud835\udcab",
+    pscr: "\ud835\udcc5",
+    Psi: "\u03a8",
+    psi: "\u03c8",
+    puncsp: "\u2008",
+    Qfr: "\ud835\udd14",
+    qfr: "\ud835\udd2e",
+    qint: "\u2a0c",
+    qopf: "\ud835\udd62",
+    Qopf: "\u211a",
+    qprime: "\u2057",
+    Qscr: "\ud835\udcac",
+    qscr: "\ud835\udcc6",
+    quaternions: "\u210d",
+    quatint: "\u2a16",
+    quest: "?",
+    questeq: "\u225f",
+    quot: '"',
+    QUOT: '"',
+    rAarr: "\u21db",
+    race: "\u223d\u0331",
+    Racute: "\u0154",
+    racute: "\u0155",
+    radic: "\u221a",
+    raemptyv: "\u29b3",
+    rang: "\u27e9",
+    Rang: "\u27eb",
+    rangd: "\u2992",
+    range: "\u29a5",
+    rangle: "\u27e9",
+    raquo: "\xbb",
+    rarrap: "\u2975",
+    rarrb: "\u21e5",
+    rarrbfs: "\u2920",
+    rarrc: "\u2933",
+    rarr: "\u2192",
+    Rarr: "\u21a0",
+    rArr: "\u21d2",
+    rarrfs: "\u291e",
+    rarrhk: "\u21aa",
+    rarrlp: "\u21ac",
+    rarrpl: "\u2945",
+    rarrsim: "\u2974",
+    Rarrtl: "\u2916",
+    rarrtl: "\u21a3",
+    rarrw: "\u219d",
+    ratail: "\u291a",
+    rAtail: "\u291c",
+    ratio: "\u2236",
+    rationals: "\u211a",
+    rbarr: "\u290d",
+    rBarr: "\u290f",
+    RBarr: "\u2910",
+    rbbrk: "\u2773",
+    rbrace: "}",
+    rbrack: "]",
+    rbrke: "\u298c",
+    rbrksld: "\u298e",
+    rbrkslu: "\u2990",
+    Rcaron: "\u0158",
+    rcaron: "\u0159",
+    Rcedil: "\u0156",
+    rcedil: "\u0157",
+    rceil: "\u2309",
+    rcub: "}",
+    Rcy: "\u0420",
+    rcy: "\u0440",
+    rdca: "\u2937",
+    rdldhar: "\u2969",
+    rdquo: "\u201d",
+    rdquor: "\u201d",
+    rdsh: "\u21b3",
+    real: "\u211c",
+    realine: "\u211b",
+    realpart: "\u211c",
+    reals: "\u211d",
+    Re: "\u211c",
+    rect: "\u25ad",
+    reg: "\xae",
+    REG: "\xae",
+    ReverseElement: "\u220b",
+    ReverseEquilibrium: "\u21cb",
+    ReverseUpEquilibrium: "\u296f",
+    rfisht: "\u297d",
+    rfloor: "\u230b",
+    rfr: "\ud835\udd2f",
+    Rfr: "\u211c",
+    rHar: "\u2964",
+    rhard: "\u21c1",
+    rharu: "\u21c0",
+    rharul: "\u296c",
+    Rho: "\u03a1",
+    rho: "\u03c1",
+    rhov: "\u03f1",
+    RightAngleBracket: "\u27e9",
+    RightArrowBar: "\u21e5",
+    rightarrow: "\u2192",
+    RightArrow: "\u2192",
+    Rightarrow: "\u21d2",
+    RightArrowLeftArrow: "\u21c4",
+    rightarrowtail: "\u21a3",
+    RightCeiling: "\u2309",
+    RightDoubleBracket: "\u27e7",
+    RightDownTeeVector: "\u295d",
+    RightDownVectorBar: "\u2955",
+    RightDownVector: "\u21c2",
+    RightFloor: "\u230b",
+    rightharpoondown: "\u21c1",
+    rightharpoonup: "\u21c0",
+    rightleftarrows: "\u21c4",
+    rightleftharpoons: "\u21cc",
+    rightrightarrows: "\u21c9",
+    rightsquigarrow: "\u219d",
+    RightTeeArrow: "\u21a6",
+    RightTee: "\u22a2",
+    RightTeeVector: "\u295b",
+    rightthreetimes: "\u22cc",
+    RightTriangleBar: "\u29d0",
+    RightTriangle: "\u22b3",
+    RightTriangleEqual: "\u22b5",
+    RightUpDownVector: "\u294f",
+    RightUpTeeVector: "\u295c",
+    RightUpVectorBar: "\u2954",
+    RightUpVector: "\u21be",
+    RightVectorBar: "\u2953",
+    RightVector: "\u21c0",
+    ring: "\u02da",
+    risingdotseq: "\u2253",
+    rlarr: "\u21c4",
+    rlhar: "\u21cc",
+    rlm: "\u200f",
+    rmoustache: "\u23b1",
+    rmoust: "\u23b1",
+    rnmid: "\u2aee",
+    roang: "\u27ed",
+    roarr: "\u21fe",
+    robrk: "\u27e7",
+    ropar: "\u2986",
+    ropf: "\ud835\udd63",
+    Ropf: "\u211d",
+    roplus: "\u2a2e",
+    rotimes: "\u2a35",
+    RoundImplies: "\u2970",
+    rpar: ")",
+    rpargt: "\u2994",
+    rppolint: "\u2a12",
+    rrarr: "\u21c9",
+    Rrightarrow: "\u21db",
+    rsaquo: "\u203a",
+    rscr: "\ud835\udcc7",
+    Rscr: "\u211b",
+    rsh: "\u21b1",
+    Rsh: "\u21b1",
+    rsqb: "]",
+    rsquo: "\u2019",
+    rsquor: "\u2019",
+    rthree: "\u22cc",
+    rtimes: "\u22ca",
+    rtri: "\u25b9",
+    rtrie: "\u22b5",
+    rtrif: "\u25b8",
+    rtriltri: "\u29ce",
+    RuleDelayed: "\u29f4",
+    ruluhar: "\u2968",
+    rx: "\u211e",
+    Sacute: "\u015a",
+    sacute: "\u015b",
+    sbquo: "\u201a",
+    scap: "\u2ab8",
+    Scaron: "\u0160",
+    scaron: "\u0161",
+    Sc: "\u2abc",
+    sc: "\u227b",
+    sccue: "\u227d",
+    sce: "\u2ab0",
+    scE: "\u2ab4",
+    Scedil: "\u015e",
+    scedil: "\u015f",
+    Scirc: "\u015c",
+    scirc: "\u015d",
+    scnap: "\u2aba",
+    scnE: "\u2ab6",
+    scnsim: "\u22e9",
+    scpolint: "\u2a13",
+    scsim: "\u227f",
+    Scy: "\u0421",
+    scy: "\u0441",
+    sdotb: "\u22a1",
+    sdot: "\u22c5",
+    sdote: "\u2a66",
+    searhk: "\u2925",
+    searr: "\u2198",
+    seArr: "\u21d8",
+    searrow: "\u2198",
+    sect: "\xa7",
+    semi: ";",
+    seswar: "\u2929",
+    setminus: "\u2216",
+    setmn: "\u2216",
+    sext: "\u2736",
+    Sfr: "\ud835\udd16",
+    sfr: "\ud835\udd30",
+    sfrown: "\u2322",
+    sharp: "\u266f",
+    SHCHcy: "\u0429",
+    shchcy: "\u0449",
+    SHcy: "\u0428",
+    shcy: "\u0448",
+    ShortDownArrow: "\u2193",
+    ShortLeftArrow: "\u2190",
+    shortmid: "\u2223",
+    shortparallel: "\u2225",
+    ShortRightArrow: "\u2192",
+    ShortUpArrow: "\u2191",
+    shy: "\xad",
+    Sigma: "\u03a3",
+    sigma: "\u03c3",
+    sigmaf: "\u03c2",
+    sigmav: "\u03c2",
+    sim: "\u223c",
+    simdot: "\u2a6a",
+    sime: "\u2243",
+    simeq: "\u2243",
+    simg: "\u2a9e",
+    simgE: "\u2aa0",
+    siml: "\u2a9d",
+    simlE: "\u2a9f",
+    simne: "\u2246",
+    simplus: "\u2a24",
+    simrarr: "\u2972",
+    slarr: "\u2190",
+    SmallCircle: "\u2218",
+    smallsetminus: "\u2216",
+    smashp: "\u2a33",
+    smeparsl: "\u29e4",
+    smid: "\u2223",
+    smile: "\u2323",
+    smt: "\u2aaa",
+    smte: "\u2aac",
+    smtes: "\u2aac\ufe00",
+    SOFTcy: "\u042c",
+    softcy: "\u044c",
+    solbar: "\u233f",
+    solb: "\u29c4",
+    sol: "/",
+    Sopf: "\ud835\udd4a",
+    sopf: "\ud835\udd64",
+    spades: "\u2660",
+    spadesuit: "\u2660",
+    spar: "\u2225",
+    sqcap: "\u2293",
+    sqcaps: "\u2293\ufe00",
+    sqcup: "\u2294",
+    sqcups: "\u2294\ufe00",
+    Sqrt: "\u221a",
+    sqsub: "\u228f",
+    sqsube: "\u2291",
+    sqsubset: "\u228f",
+    sqsubseteq: "\u2291",
+    sqsup: "\u2290",
+    sqsupe: "\u2292",
+    sqsupset: "\u2290",
+    sqsupseteq: "\u2292",
+    square: "\u25a1",
+    Square: "\u25a1",
+    SquareIntersection: "\u2293",
+    SquareSubset: "\u228f",
+    SquareSubsetEqual: "\u2291",
+    SquareSuperset: "\u2290",
+    SquareSupersetEqual: "\u2292",
+    SquareUnion: "\u2294",
+    squarf: "\u25aa",
+    squ: "\u25a1",
+    squf: "\u25aa",
+    srarr: "\u2192",
+    Sscr: "\ud835\udcae",
+    sscr: "\ud835\udcc8",
+    ssetmn: "\u2216",
+    ssmile: "\u2323",
+    sstarf: "\u22c6",
+    Star: "\u22c6",
+    star: "\u2606",
+    starf: "\u2605",
+    straightepsilon: "\u03f5",
+    straightphi: "\u03d5",
+    strns: "\xaf",
+    sub: "\u2282",
+    Sub: "\u22d0",
+    subdot: "\u2abd",
+    subE: "\u2ac5",
+    sube: "\u2286",
+    subedot: "\u2ac3",
+    submult: "\u2ac1",
+    subnE: "\u2acb",
+    subne: "\u228a",
+    subplus: "\u2abf",
+    subrarr: "\u2979",
+    subset: "\u2282",
+    Subset: "\u22d0",
+    subseteq: "\u2286",
+    subseteqq: "\u2ac5",
+    SubsetEqual: "\u2286",
+    subsetneq: "\u228a",
+    subsetneqq: "\u2acb",
+    subsim: "\u2ac7",
+    subsub: "\u2ad5",
+    subsup: "\u2ad3",
+    succapprox: "\u2ab8",
+    succ: "\u227b",
+    succcurlyeq: "\u227d",
+    Succeeds: "\u227b",
+    SucceedsEqual: "\u2ab0",
+    SucceedsSlantEqual: "\u227d",
+    SucceedsTilde: "\u227f",
+    succeq: "\u2ab0",
+    succnapprox: "\u2aba",
+    succneqq: "\u2ab6",
+    succnsim: "\u22e9",
+    succsim: "\u227f",
+    SuchThat: "\u220b",
+    sum: "\u2211",
+    Sum: "\u2211",
+    sung: "\u266a",
+    sup1: "\xb9",
+    sup2: "\xb2",
+    sup3: "\xb3",
+    sup: "\u2283",
+    Sup: "\u22d1",
+    supdot: "\u2abe",
+    supdsub: "\u2ad8",
+    supE: "\u2ac6",
+    supe: "\u2287",
+    supedot: "\u2ac4",
+    Superset: "\u2283",
+    SupersetEqual: "\u2287",
+    suphsol: "\u27c9",
+    suphsub: "\u2ad7",
+    suplarr: "\u297b",
+    supmult: "\u2ac2",
+    supnE: "\u2acc",
+    supne: "\u228b",
+    supplus: "\u2ac0",
+    supset: "\u2283",
+    Supset: "\u22d1",
+    supseteq: "\u2287",
+    supseteqq: "\u2ac6",
+    supsetneq: "\u228b",
+    supsetneqq: "\u2acc",
+    supsim: "\u2ac8",
+    supsub: "\u2ad4",
+    supsup: "\u2ad6",
+    swarhk: "\u2926",
+    swarr: "\u2199",
+    swArr: "\u21d9",
+    swarrow: "\u2199",
+    swnwar: "\u292a",
+    szlig: "\xdf",
+    Tab: "\t",
+    target: "\u2316",
+    Tau: "\u03a4",
+    tau: "\u03c4",
+    tbrk: "\u23b4",
+    Tcaron: "\u0164",
+    tcaron: "\u0165",
+    Tcedil: "\u0162",
+    tcedil: "\u0163",
+    Tcy: "\u0422",
+    tcy: "\u0442",
+    tdot: "\u20db",
+    telrec: "\u2315",
+    Tfr: "\ud835\udd17",
+    tfr: "\ud835\udd31",
+    there4: "\u2234",
+    therefore: "\u2234",
+    Therefore: "\u2234",
+    Theta: "\u0398",
+    theta: "\u03b8",
+    thetasym: "\u03d1",
+    thetav: "\u03d1",
+    thickapprox: "\u2248",
+    thicksim: "\u223c",
+    ThickSpace: "\u205f\u200a",
+    ThinSpace: "\u2009",
+    thinsp: "\u2009",
+    thkap: "\u2248",
+    thksim: "\u223c",
+    THORN: "\xde",
+    thorn: "\xfe",
+    tilde: "\u02dc",
+    Tilde: "\u223c",
+    TildeEqual: "\u2243",
+    TildeFullEqual: "\u2245",
+    TildeTilde: "\u2248",
+    timesbar: "\u2a31",
+    timesb: "\u22a0",
+    times: "\xd7",
+    timesd: "\u2a30",
+    tint: "\u222d",
+    toea: "\u2928",
+    topbot: "\u2336",
+    topcir: "\u2af1",
+    top: "\u22a4",
+    Topf: "\ud835\udd4b",
+    topf: "\ud835\udd65",
+    topfork: "\u2ada",
+    tosa: "\u2929",
+    tprime: "\u2034",
+    trade: "\u2122",
+    TRADE: "\u2122",
+    triangle: "\u25b5",
+    triangledown: "\u25bf",
+    triangleleft: "\u25c3",
+    trianglelefteq: "\u22b4",
+    triangleq: "\u225c",
+    triangleright: "\u25b9",
+    trianglerighteq: "\u22b5",
+    tridot: "\u25ec",
+    trie: "\u225c",
+    triminus: "\u2a3a",
+    TripleDot: "\u20db",
+    triplus: "\u2a39",
+    trisb: "\u29cd",
+    tritime: "\u2a3b",
+    trpezium: "\u23e2",
+    Tscr: "\ud835\udcaf",
+    tscr: "\ud835\udcc9",
+    TScy: "\u0426",
+    tscy: "\u0446",
+    TSHcy: "\u040b",
+    tshcy: "\u045b",
+    Tstrok: "\u0166",
+    tstrok: "\u0167",
+    twixt: "\u226c",
+    twoheadleftarrow: "\u219e",
+    twoheadrightarrow: "\u21a0",
+    Uacute: "\xda",
+    uacute: "\xfa",
+    uarr: "\u2191",
+    Uarr: "\u219f",
+    uArr: "\u21d1",
+    Uarrocir: "\u2949",
+    Ubrcy: "\u040e",
+    ubrcy: "\u045e",
+    Ubreve: "\u016c",
+    ubreve: "\u016d",
+    Ucirc: "\xdb",
+    ucirc: "\xfb",
+    Ucy: "\u0423",
+    ucy: "\u0443",
+    udarr: "\u21c5",
+    Udblac: "\u0170",
+    udblac: "\u0171",
+    udhar: "\u296e",
+    ufisht: "\u297e",
+    Ufr: "\ud835\udd18",
+    ufr: "\ud835\udd32",
+    Ugrave: "\xd9",
+    ugrave: "\xf9",
+    uHar: "\u2963",
+    uharl: "\u21bf",
+    uharr: "\u21be",
+    uhblk: "\u2580",
+    ulcorn: "\u231c",
+    ulcorner: "\u231c",
+    ulcrop: "\u230f",
+    ultri: "\u25f8",
+    Umacr: "\u016a",
+    umacr: "\u016b",
+    uml: "\xa8",
+    UnderBar: "_",
+    UnderBrace: "\u23df",
+    UnderBracket: "\u23b5",
+    UnderParenthesis: "\u23dd",
+    Union: "\u22c3",
+    UnionPlus: "\u228e",
+    Uogon: "\u0172",
+    uogon: "\u0173",
+    Uopf: "\ud835\udd4c",
+    uopf: "\ud835\udd66",
+    UpArrowBar: "\u2912",
+    uparrow: "\u2191",
+    UpArrow: "\u2191",
+    Uparrow: "\u21d1",
+    UpArrowDownArrow: "\u21c5",
+    updownarrow: "\u2195",
+    UpDownArrow: "\u2195",
+    Updownarrow: "\u21d5",
+    UpEquilibrium: "\u296e",
+    upharpoonleft: "\u21bf",
+    upharpoonright: "\u21be",
+    uplus: "\u228e",
+    UpperLeftArrow: "\u2196",
+    UpperRightArrow: "\u2197",
+    upsi: "\u03c5",
+    Upsi: "\u03d2",
+    upsih: "\u03d2",
+    Upsilon: "\u03a5",
+    upsilon: "\u03c5",
+    UpTeeArrow: "\u21a5",
+    UpTee: "\u22a5",
+    upuparrows: "\u21c8",
+    urcorn: "\u231d",
+    urcorner: "\u231d",
+    urcrop: "\u230e",
+    Uring: "\u016e",
+    uring: "\u016f",
+    urtri: "\u25f9",
+    Uscr: "\ud835\udcb0",
+    uscr: "\ud835\udcca",
+    utdot: "\u22f0",
+    Utilde: "\u0168",
+    utilde: "\u0169",
+    utri: "\u25b5",
+    utrif: "\u25b4",
+    uuarr: "\u21c8",
+    Uuml: "\xdc",
+    uuml: "\xfc",
+    uwangle: "\u29a7",
+    vangrt: "\u299c",
+    varepsilon: "\u03f5",
+    varkappa: "\u03f0",
+    varnothing: "\u2205",
+    varphi: "\u03d5",
+    varpi: "\u03d6",
+    varpropto: "\u221d",
+    varr: "\u2195",
+    vArr: "\u21d5",
+    varrho: "\u03f1",
+    varsigma: "\u03c2",
+    varsubsetneq: "\u228a\ufe00",
+    varsubsetneqq: "\u2acb\ufe00",
+    varsupsetneq: "\u228b\ufe00",
+    varsupsetneqq: "\u2acc\ufe00",
+    vartheta: "\u03d1",
+    vartriangleleft: "\u22b2",
+    vartriangleright: "\u22b3",
+    vBar: "\u2ae8",
+    Vbar: "\u2aeb",
+    vBarv: "\u2ae9",
+    Vcy: "\u0412",
+    vcy: "\u0432",
+    vdash: "\u22a2",
+    vDash: "\u22a8",
+    Vdash: "\u22a9",
+    VDash: "\u22ab",
+    Vdashl: "\u2ae6",
+    veebar: "\u22bb",
+    vee: "\u2228",
+    Vee: "\u22c1",
+    veeeq: "\u225a",
+    vellip: "\u22ee",
+    verbar: "|",
+    Verbar: "\u2016",
+    vert: "|",
+    Vert: "\u2016",
+    VerticalBar: "\u2223",
+    VerticalLine: "|",
+    VerticalSeparator: "\u2758",
+    VerticalTilde: "\u2240",
+    VeryThinSpace: "\u200a",
+    Vfr: "\ud835\udd19",
+    vfr: "\ud835\udd33",
+    vltri: "\u22b2",
+    vnsub: "\u2282\u20d2",
+    vnsup: "\u2283\u20d2",
+    Vopf: "\ud835\udd4d",
+    vopf: "\ud835\udd67",
+    vprop: "\u221d",
+    vrtri: "\u22b3",
+    Vscr: "\ud835\udcb1",
+    vscr: "\ud835\udccb",
+    vsubnE: "\u2acb\ufe00",
+    vsubne: "\u228a\ufe00",
+    vsupnE: "\u2acc\ufe00",
+    vsupne: "\u228b\ufe00",
+    Vvdash: "\u22aa",
+    vzigzag: "\u299a",
+    Wcirc: "\u0174",
+    wcirc: "\u0175",
+    wedbar: "\u2a5f",
+    wedge: "\u2227",
+    Wedge: "\u22c0",
+    wedgeq: "\u2259",
+    weierp: "\u2118",
+    Wfr: "\ud835\udd1a",
+    wfr: "\ud835\udd34",
+    Wopf: "\ud835\udd4e",
+    wopf: "\ud835\udd68",
+    wp: "\u2118",
+    wr: "\u2240",
+    wreath: "\u2240",
+    Wscr: "\ud835\udcb2",
+    wscr: "\ud835\udccc",
+    xcap: "\u22c2",
+    xcirc: "\u25ef",
+    xcup: "\u22c3",
+    xdtri: "\u25bd",
+    Xfr: "\ud835\udd1b",
+    xfr: "\ud835\udd35",
+    xharr: "\u27f7",
+    xhArr: "\u27fa",
+    Xi: "\u039e",
+    xi: "\u03be",
+    xlarr: "\u27f5",
+    xlArr: "\u27f8",
+    xmap: "\u27fc",
+    xnis: "\u22fb",
+    xodot: "\u2a00",
+    Xopf: "\ud835\udd4f",
+    xopf: "\ud835\udd69",
+    xoplus: "\u2a01",
+    xotime: "\u2a02",
+    xrarr: "\u27f6",
+    xrArr: "\u27f9",
+    Xscr: "\ud835\udcb3",
+    xscr: "\ud835\udccd",
+    xsqcup: "\u2a06",
+    xuplus: "\u2a04",
+    xutri: "\u25b3",
+    xvee: "\u22c1",
+    xwedge: "\u22c0",
+    Yacute: "\xdd",
+    yacute: "\xfd",
+    YAcy: "\u042f",
+    yacy: "\u044f",
+    Ycirc: "\u0176",
+    ycirc: "\u0177",
+    Ycy: "\u042b",
+    ycy: "\u044b",
+    yen: "\xa5",
+    Yfr: "\ud835\udd1c",
+    yfr: "\ud835\udd36",
+    YIcy: "\u0407",
+    yicy: "\u0457",
+    Yopf: "\ud835\udd50",
+    yopf: "\ud835\udd6a",
+    Yscr: "\ud835\udcb4",
+    yscr: "\ud835\udcce",
+    YUcy: "\u042e",
+    yucy: "\u044e",
+    yuml: "\xff",
+    Yuml: "\u0178",
+    Zacute: "\u0179",
+    zacute: "\u017a",
+    Zcaron: "\u017d",
+    zcaron: "\u017e",
+    Zcy: "\u0417",
+    zcy: "\u0437",
+    Zdot: "\u017b",
+    zdot: "\u017c",
+    zeetrf: "\u2128",
+    ZeroWidthSpace: "\u200b",
+    Zeta: "\u0396",
+    zeta: "\u03b6",
+    zfr: "\ud835\udd37",
+    Zfr: "\u2128",
+    ZHcy: "\u0416",
+    zhcy: "\u0436",
+    zigrarr: "\u21dd",
+    zopf: "\ud835\udd6b",
+    Zopf: "\u2124",
+    Zscr: "\ud835\udcb5",
+    zscr: "\ud835\udccf",
+    zwj: "\u200d",
+    zwnj: "\u200c"
+  };
+  /*eslint quotes:0*/  var entities = require$$0;
+  var regex$4 = /[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]/;
+  var encodeCache = {};
+  // Create a lookup array where anything but characters in `chars` string
+  // and alphanumeric chars is percent-encoded.
+  
+    function getEncodeCache(exclude) {
+    var i, ch, cache = encodeCache[exclude];
+    if (cache) {
+      return cache;
+    }
+    cache = encodeCache[exclude] = [];
+    for (i = 0; i < 128; i++) {
+      ch = String.fromCharCode(i);
+      if (/^[0-9a-z]$/i.test(ch)) {
+        // always allow unencoded alphanumeric characters
+        cache.push(ch);
+      } else {
+        cache.push("%" + ("0" + i.toString(16).toUpperCase()).slice(-2));
+      }
+    }
+    for (i = 0; i < exclude.length; i++) {
+      cache[exclude.charCodeAt(i)] = exclude[i];
+    }
+    return cache;
+  }
+  // Encode unsafe characters with percent-encoding, skipping already
+  // encoded sequences.
+  
+  //  - string       - string to encode
+  //  - exclude      - list of characters to ignore (in addition to a-zA-Z0-9)
+  //  - keepEscaped  - don't encode '%' in a correct escape sequence (default: true)
+  
+    function encode$2(string, exclude, keepEscaped) {
+    var i, l, code, nextCode, cache, result = "";
+    if (typeof exclude !== "string") {
+      // encode(string, keepEscaped)
+      keepEscaped = exclude;
+      exclude = encode$2.defaultChars;
+    }
+    if (typeof keepEscaped === "undefined") {
+      keepEscaped = true;
+    }
+    cache = getEncodeCache(exclude);
+    for (i = 0, l = string.length; i < l; i++) {
+      code = string.charCodeAt(i);
+      if (keepEscaped && code === 37 /* % */ && i + 2 < l) {
+        if (/^[0-9a-f]{2}$/i.test(string.slice(i + 1, i + 3))) {
+          result += string.slice(i, i + 3);
+          i += 2;
+          continue;
+        }
+      }
+      if (code < 128) {
+        result += cache[code];
+        continue;
+      }
+      if (code >= 55296 && code <= 57343) {
+        if (code >= 55296 && code <= 56319 && i + 1 < l) {
+          nextCode = string.charCodeAt(i + 1);
+          if (nextCode >= 56320 && nextCode <= 57343) {
+            result += encodeURIComponent(string[i] + string[i + 1]);
+            i++;
+            continue;
+          }
+        }
+        result += "%EF%BF%BD";
+        continue;
+      }
+      result += encodeURIComponent(string[i]);
+    }
+    return result;
+  }
+  encode$2.defaultChars = ";/?:@&=+$,-_.!~*'()#";
+  encode$2.componentChars = "-_.!~*'()";
+  var encode_1 = encode$2;
+  /* eslint-disable no-bitwise */  var decodeCache = {};
+  function getDecodeCache(exclude) {
+    var i, ch, cache = decodeCache[exclude];
+    if (cache) {
+      return cache;
+    }
+    cache = decodeCache[exclude] = [];
+    for (i = 0; i < 128; i++) {
+      ch = String.fromCharCode(i);
+      cache.push(ch);
+    }
+    for (i = 0; i < exclude.length; i++) {
+      ch = exclude.charCodeAt(i);
+      cache[ch] = "%" + ("0" + ch.toString(16).toUpperCase()).slice(-2);
+    }
+    return cache;
+  }
+  // Decode percent-encoded string.
+  
+    function decode$2(string, exclude) {
+    var cache;
+    if (typeof exclude !== "string") {
+      exclude = decode$2.defaultChars;
+    }
+    cache = getDecodeCache(exclude);
+    return string.replace(/(%[a-f0-9]{2})+/gi, (function(seq) {
+      var i, l, b1, b2, b3, b4, chr, result = "";
+      for (i = 0, l = seq.length; i < l; i += 3) {
+        b1 = parseInt(seq.slice(i + 1, i + 3), 16);
+        if (b1 < 128) {
+          result += cache[b1];
+          continue;
+        }
+        if ((b1 & 224) === 192 && i + 3 < l) {
+          // 110xxxxx 10xxxxxx
+          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
+          if ((b2 & 192) === 128) {
+            chr = b1 << 6 & 1984 | b2 & 63;
+            if (chr < 128) {
+              result += "\ufffd\ufffd";
+            } else {
+              result += String.fromCharCode(chr);
+            }
+            i += 3;
+            continue;
+          }
+        }
+        if ((b1 & 240) === 224 && i + 6 < l) {
+          // 1110xxxx 10xxxxxx 10xxxxxx
+          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
+          b3 = parseInt(seq.slice(i + 7, i + 9), 16);
+          if ((b2 & 192) === 128 && (b3 & 192) === 128) {
+            chr = b1 << 12 & 61440 | b2 << 6 & 4032 | b3 & 63;
+            if (chr < 2048 || chr >= 55296 && chr <= 57343) {
+              result += "\ufffd\ufffd\ufffd";
+            } else {
+              result += String.fromCharCode(chr);
+            }
+            i += 6;
+            continue;
+          }
+        }
+        if ((b1 & 248) === 240 && i + 9 < l) {
+          // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
+          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
+          b3 = parseInt(seq.slice(i + 7, i + 9), 16);
+          b4 = parseInt(seq.slice(i + 10, i + 12), 16);
+          if ((b2 & 192) === 128 && (b3 & 192) === 128 && (b4 & 192) === 128) {
+            chr = b1 << 18 & 1835008 | b2 << 12 & 258048 | b3 << 6 & 4032 | b4 & 63;
+            if (chr < 65536 || chr > 1114111) {
+              result += "\ufffd\ufffd\ufffd\ufffd";
+            } else {
+              chr -= 65536;
+              result += String.fromCharCode(55296 + (chr >> 10), 56320 + (chr & 1023));
+            }
+            i += 9;
+            continue;
+          }
+        }
+        result += "\ufffd";
+      }
+      return result;
+    }));
+  }
+  decode$2.defaultChars = ";/?:@&=+$,#";
+  decode$2.componentChars = "";
+  var decode_1 = decode$2;
+  var format$1 = function format(url) {
+    var result = "";
+    result += url.protocol || "";
+    result += url.slashes ? "//" : "";
+    result += url.auth ? url.auth + "@" : "";
+    if (url.hostname && url.hostname.indexOf(":") !== -1) {
+      // ipv6 address
+      result += "[" + url.hostname + "]";
+    } else {
+      result += url.hostname || "";
+    }
+    result += url.port ? ":" + url.port : "";
+    result += url.pathname || "";
+    result += url.search || "";
+    result += url.hash || "";
+    return result;
+  };
+  // Copyright Joyent, Inc. and other Node contributors.
+  
+  // Changes from joyent/node:
+  
+  // 1. No leading slash in paths,
+  //    e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
+  
+  // 2. Backslashes are not replaced with slashes,
+  //    so `http:\\example.org\` is treated like a relative path
+  
+  // 3. Trailing colon is treated like a part of the path,
+  //    i.e. in `http://example.org:foo` pathname is `:foo`
+  
+  // 4. Nothing is URL-encoded in the resulting object,
+  //    (in joyent/node some chars in auth and paths are encoded)
+  
+  // 5. `url.parse()` does not have `parseQueryString` argument
+  
+  // 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
+  //    which can be constructed using other parts of the url.
+  
+    function Url() {
+    this.protocol = null;
+    this.slashes = null;
+    this.auth = null;
+    this.port = null;
+    this.hostname = null;
+    this.hash = null;
+    this.search = null;
+    this.pathname = null;
+  }
+  // Reference: RFC 3986, RFC 1808, RFC 2396
+  // define these here so at least they only have to be
+  // compiled once on the first module load.
+    var protocolPattern = /^([a-z0-9.+-]+:)/i, portPattern = /:[0-9]*$/, 
+  // Special case for a simple path URL
+  simplePathPattern = /^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/, 
+  // RFC 2396: characters reserved for delimiting URLs.
+  // We actually just auto-escape these.
+  delims = [ "<", ">", '"', "`", " ", "\r", "\n", "\t" ], 
+  // RFC 2396: characters not allowed for various reasons.
+  unwise = [ "{", "}", "|", "\\", "^", "`" ].concat(delims), 
+  // Allowed by RFCs, but cause of XSS attacks.  Always escape these.
+  autoEscape = [ "'" ].concat(unwise), 
+  // Characters that are never ever allowed in a hostname.
+  // Note that any invalid chars are also handled, but these
+  // are the ones that are *expected* to be seen, so we fast-path
+  // them.
+  nonHostChars = [ "%", "/", "?", ";", "#" ].concat(autoEscape), hostEndingChars = [ "/", "?", "#" ], hostnameMaxLen = 255, hostnamePartPattern = /^[+a-z0-9A-Z_-]{0,63}$/, hostnamePartStart = /^([+a-z0-9A-Z_-]{0,63})(.*)$/, 
+  // protocols that can allow "unsafe" and "unwise" chars.
+  /* eslint-disable no-script-url */
+  // protocols that never have a hostname.
+  hostlessProtocol = {
+    javascript: true,
+    "javascript:": true
+  }, 
+  // protocols that always contain a // bit.
+  slashedProtocol = {
+    http: true,
+    https: true,
+    ftp: true,
+    gopher: true,
+    file: true,
+    "http:": true,
+    "https:": true,
+    "ftp:": true,
+    "gopher:": true,
+    "file:": true
+  };
+  /* eslint-enable no-script-url */  function urlParse(url, slashesDenoteHost) {
+    if (url && url instanceof Url) {
+      return url;
+    }
+    var u = new Url;
+    u.parse(url, slashesDenoteHost);
+    return u;
+  }
+  Url.prototype.parse = function(url, slashesDenoteHost) {
+    var i, l, lowerProto, hec, slashes, rest = url;
+    // trim before proceeding.
+    // This is to support parse stuff like "  http://foo.com  \n"
+        rest = rest.trim();
+    if (!slashesDenoteHost && url.split("#").length === 1) {
+      // Try fast path regexp
+      var simplePath = simplePathPattern.exec(rest);
+      if (simplePath) {
+        this.pathname = simplePath[1];
+        if (simplePath[2]) {
+          this.search = simplePath[2];
+        }
+        return this;
+      }
+    }
+    var proto = protocolPattern.exec(rest);
+    if (proto) {
+      proto = proto[0];
+      lowerProto = proto.toLowerCase();
+      this.protocol = proto;
+      rest = rest.substr(proto.length);
+    }
+    // figure out if it's got a host
+    // user@server is *always* interpreted as a hostname, and url
+    // resolution will treat //foo/bar as host=foo,path=bar because that's
+    // how the browser resolves relative URLs.
+        if (slashesDenoteHost || proto || rest.match(/^\/\/[^@\/]+@[^@\/]+/)) {
+      slashes = rest.substr(0, 2) === "//";
+      if (slashes && !(proto && hostlessProtocol[proto])) {
+        rest = rest.substr(2);
+        this.slashes = true;
+      }
+    }
+    if (!hostlessProtocol[proto] && (slashes || proto && !slashedProtocol[proto])) {
+      // there's a hostname.
+      // the first instance of /, ?, ;, or # ends the host.
+      // If there is an @ in the hostname, then non-host chars *are* allowed
+      // to the left of the last @ sign, unless some host-ending character
+      // comes *before* the @-sign.
+      // URLs are obnoxious.
+      // ex:
+      // http://a@b@c/ => user:a@b host:c
+      // http://a@b?@c => user:a host:c path:/?@c
+      // v0.12 TODO(isaacs): This is not quite how Chrome does things.
+      // Review our test case against browsers more comprehensively.
+      // find the first instance of any hostEndingChars
+      var hostEnd = -1;
+      for (i = 0; i < hostEndingChars.length; i++) {
+        hec = rest.indexOf(hostEndingChars[i]);
+        if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) {
+          hostEnd = hec;
+        }
+      }
+      // at this point, either we have an explicit point where the
+      // auth portion cannot go past, or the last @ char is the decider.
+            var auth, atSign;
+      if (hostEnd === -1) {
+        // atSign can be anywhere.
+        atSign = rest.lastIndexOf("@");
+      } else {
+        // atSign must be in auth portion.
+        // http://a@b/c@d => host:b auth:a path:/c@d
+        atSign = rest.lastIndexOf("@", hostEnd);
+      }
+      // Now we have a portion which is definitely the auth.
+      // Pull that off.
+            if (atSign !== -1) {
+        auth = rest.slice(0, atSign);
+        rest = rest.slice(atSign + 1);
+        this.auth = auth;
+      }
+      // the host is the remaining to the left of the first non-host char
+            hostEnd = -1;
+      for (i = 0; i < nonHostChars.length; i++) {
+        hec = rest.indexOf(nonHostChars[i]);
+        if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) {
+          hostEnd = hec;
+        }
+      }
+      // if we still have not hit it, then the entire thing is a host.
+            if (hostEnd === -1) {
+        hostEnd = rest.length;
+      }
+      if (rest[hostEnd - 1] === ":") {
+        hostEnd--;
+      }
+      var host = rest.slice(0, hostEnd);
+      rest = rest.slice(hostEnd);
+      // pull out port.
+            this.parseHost(host);
+      // we've indicated that there is a hostname,
+      // so even if it's empty, it has to be present.
+            this.hostname = this.hostname || "";
+      // if hostname begins with [ and ends with ]
+      // assume that it's an IPv6 address.
+            var ipv6Hostname = this.hostname[0] === "[" && this.hostname[this.hostname.length - 1] === "]";
+      // validate a little.
+            if (!ipv6Hostname) {
+        var hostparts = this.hostname.split(/\./);
+        for (i = 0, l = hostparts.length; i < l; i++) {
+          var part = hostparts[i];
+          if (!part) {
+            continue;
+          }
+          if (!part.match(hostnamePartPattern)) {
+            var newpart = "";
+            for (var j = 0, k = part.length; j < k; j++) {
+              if (part.charCodeAt(j) > 127) {
+                // we replace non-ASCII char with a temporary placeholder
+                // we need this to make sure size of hostname is not
+                // broken by replacing non-ASCII by nothing
+                newpart += "x";
+              } else {
+                newpart += part[j];
+              }
+            }
+            // we test again with ASCII char only
+                        if (!newpart.match(hostnamePartPattern)) {
+              var validParts = hostparts.slice(0, i);
+              var notHost = hostparts.slice(i + 1);
+              var bit = part.match(hostnamePartStart);
+              if (bit) {
+                validParts.push(bit[1]);
+                notHost.unshift(bit[2]);
+              }
+              if (notHost.length) {
+                rest = notHost.join(".") + rest;
+              }
+              this.hostname = validParts.join(".");
+              break;
+            }
+          }
+        }
+      }
+      if (this.hostname.length > hostnameMaxLen) {
+        this.hostname = "";
+      }
+      // strip [ and ] from the hostname
+      // the host field still retains them, though
+            if (ipv6Hostname) {
+        this.hostname = this.hostname.substr(1, this.hostname.length - 2);
+      }
+    }
+    // chop off from the tail first.
+        var hash = rest.indexOf("#");
+    if (hash !== -1) {
+      // got a fragment string.
+      this.hash = rest.substr(hash);
+      rest = rest.slice(0, hash);
+    }
+    var qm = rest.indexOf("?");
+    if (qm !== -1) {
+      this.search = rest.substr(qm);
+      rest = rest.slice(0, qm);
+    }
+    if (rest) {
+      this.pathname = rest;
+    }
+    if (slashedProtocol[lowerProto] && this.hostname && !this.pathname) {
+      this.pathname = "";
+    }
+    return this;
+  };
+  Url.prototype.parseHost = function(host) {
+    var port = portPattern.exec(host);
+    if (port) {
+      port = port[0];
+      if (port !== ":") {
+        this.port = port.substr(1);
+      }
+      host = host.substr(0, host.length - port.length);
+    }
+    if (host) {
+      this.hostname = host;
+    }
+  };
+  var parse$1 = urlParse;
+  var encode$1 = encode_1;
+  var decode$1 = decode_1;
+  var format = format$1;
+  var parse = parse$1;
+  var mdurl = {
+    encode: encode$1,
+    decode: decode$1,
+    format: format,
+    parse: parse
+  };
+  var regex$3 = /[\0-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/;
+  var regex$2 = /[\0-\x1F\x7F-\x9F]/;
+  var regex$1 = /[\xAD\u0600-\u0605\u061C\u06DD\u070F\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F]/;
+  var regex = /[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/;
+  var Any = regex$3;
+  var Cc = regex$2;
+  var Cf = regex$1;
+  var P = regex$4;
+  var Z = regex;
+  var uc_micro = {
+    Any: Any,
+    Cc: Cc,
+    Cf: Cf,
+    P: P,
+    Z: Z
+  };
+  var utils = createCommonjsModule((function(module, exports) {
+    function _class(obj) {
+      return Object.prototype.toString.call(obj);
+    }
+    function isString(obj) {
+      return _class(obj) === "[object String]";
+    }
+    var _hasOwnProperty = Object.prototype.hasOwnProperty;
+    function has(object, key) {
+      return _hasOwnProperty.call(object, key);
+    }
+    // Merge objects
+    
+        function assign(obj /*from1, from2, from3, ...*/) {
+      var sources = Array.prototype.slice.call(arguments, 1);
+      sources.forEach((function(source) {
+        if (!source) {
+          return;
+        }
+        if (typeof source !== "object") {
+          throw new TypeError(source + "must be object");
+        }
+        Object.keys(source).forEach((function(key) {
+          obj[key] = source[key];
+        }));
+      }));
+      return obj;
+    }
+    // Remove element from array and put another array at those position.
+    // Useful for some operations with tokens
+        function arrayReplaceAt(src, pos, newElements) {
+      return [].concat(src.slice(0, pos), newElements, src.slice(pos + 1));
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        function isValidEntityCode(c) {
+      /*eslint no-bitwise:0*/
+      // broken sequence
+      if (c >= 55296 && c <= 57343) {
+        return false;
+      }
+      // never used
+            if (c >= 64976 && c <= 65007) {
+        return false;
+      }
+      if ((c & 65535) === 65535 || (c & 65535) === 65534) {
+        return false;
+      }
+      // control codes
+            if (c >= 0 && c <= 8) {
+        return false;
+      }
+      if (c === 11) {
+        return false;
+      }
+      if (c >= 14 && c <= 31) {
+        return false;
+      }
+      if (c >= 127 && c <= 159) {
+        return false;
+      }
+      // out of range
+            if (c > 1114111) {
+        return false;
+      }
+      return true;
+    }
+    function fromCodePoint(c) {
+      /*eslint no-bitwise:0*/
+      if (c > 65535) {
+        c -= 65536;
+        var surrogate1 = 55296 + (c >> 10), surrogate2 = 56320 + (c & 1023);
+        return String.fromCharCode(surrogate1, surrogate2);
+      }
+      return String.fromCharCode(c);
+    }
+    var UNESCAPE_MD_RE = /\\([!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])/g;
+    var ENTITY_RE = /&([a-z#][a-z0-9]{1,31});/gi;
+    var UNESCAPE_ALL_RE = new RegExp(UNESCAPE_MD_RE.source + "|" + ENTITY_RE.source, "gi");
+    var DIGITAL_ENTITY_TEST_RE = /^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i;
+    function replaceEntityPattern(match, name) {
+      var code;
+      if (has(entities, name)) {
+        return entities[name];
+      }
+      if (name.charCodeAt(0) === 35 /* # */ && DIGITAL_ENTITY_TEST_RE.test(name)) {
+        code = name[1].toLowerCase() === "x" ? parseInt(name.slice(2), 16) : parseInt(name.slice(1), 10);
+        if (isValidEntityCode(code)) {
+          return fromCodePoint(code);
+        }
+      }
+      return match;
+    }
+    /*function replaceEntities(str) {
+	  if (str.indexOf('&') < 0) { return str; }
+
+	  return str.replace(ENTITY_RE, replaceEntityPattern);
+	}*/    function unescapeMd(str) {
+      if (str.indexOf("\\") < 0) {
+        return str;
+      }
+      return str.replace(UNESCAPE_MD_RE, "$1");
+    }
+    function unescapeAll(str) {
+      if (str.indexOf("\\") < 0 && str.indexOf("&") < 0) {
+        return str;
+      }
+      return str.replace(UNESCAPE_ALL_RE, (function(match, escaped, entity) {
+        if (escaped) {
+          return escaped;
+        }
+        return replaceEntityPattern(match, entity);
+      }));
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        var HTML_ESCAPE_TEST_RE = /[&<>"]/;
+    var HTML_ESCAPE_REPLACE_RE = /[&<>"]/g;
+    var HTML_REPLACEMENTS = {
+      "&": "&amp;",
+      "<": "&lt;",
+      ">": "&gt;",
+      '"': "&quot;"
+    };
+    function replaceUnsafeChar(ch) {
+      return HTML_REPLACEMENTS[ch];
+    }
+    function escapeHtml(str) {
+      if (HTML_ESCAPE_TEST_RE.test(str)) {
+        return str.replace(HTML_ESCAPE_REPLACE_RE, replaceUnsafeChar);
+      }
+      return str;
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        var REGEXP_ESCAPE_RE = /[.?*+^$[\]\\(){}|-]/g;
+    function escapeRE(str) {
+      return str.replace(REGEXP_ESCAPE_RE, "\\$&");
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        function isSpace(code) {
+      switch (code) {
+       case 9:
+       case 32:
+        return true;
+      }
+      return false;
+    }
+    // Zs (unicode class) || [\t\f\v\r\n]
+        function isWhiteSpace(code) {
+      if (code >= 8192 && code <= 8202) {
+        return true;
+      }
+      switch (code) {
+       case 9:
+ // \t
+               case 10:
+ // \n
+               case 11:
+ // \v
+               case 12:
+ // \f
+               case 13:
+ // \r
+               case 32:
+       case 160:
+       case 5760:
+       case 8239:
+       case 8287:
+       case 12288:
+        return true;
+      }
+      return false;
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+    /*eslint-disable max-len*/
+    // Currently without astral characters support.
+        function isPunctChar(ch) {
+      return regex$4.test(ch);
+    }
+    // Markdown ASCII punctuation characters.
+    
+    // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~
+    // http://spec.commonmark.org/0.15/#ascii-punctuation-character
+    
+    // Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
+    
+        function isMdAsciiPunct(ch) {
+      switch (ch) {
+       case 33 /* ! */ :
+       case 34 /* " */ :
+       case 35 /* # */ :
+       case 36 /* $ */ :
+       case 37 /* % */ :
+       case 38 /* & */ :
+       case 39 /* ' */ :
+       case 40 /* ( */ :
+       case 41 /* ) */ :
+       case 42 /* * */ :
+       case 43 /* + */ :
+       case 44 /* , */ :
+       case 45 /* - */ :
+       case 46 /* . */ :
+       case 47 /* / */ :
+       case 58 /* : */ :
+       case 59 /* ; */ :
+       case 60 /* < */ :
+       case 61 /* = */ :
+       case 62 /* > */ :
+       case 63 /* ? */ :
+       case 64 /* @ */ :
+       case 91 /* [ */ :
+       case 92 /* \ */ :
+       case 93 /* ] */ :
+       case 94 /* ^ */ :
+       case 95 /* _ */ :
+       case 96 /* ` */ :
+       case 123 /* { */ :
+       case 124 /* | */ :
+       case 125 /* } */ :
+       case 126 /* ~ */ :
+        return true;
+
+       default:
+        return false;
+      }
+    }
+    // Hepler to unify [reference labels].
+    
+        function normalizeReference(str) {
+      // Trim and collapse whitespace
+      str = str.trim().replace(/\s+/g, " ");
+      // In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
+      // fixed in v12 (couldn't find any details).
+      
+      // So treat this one as a special case
+      // (remove this when node v10 is no longer supported).
+      
+            if ("\u1e9e".toLowerCase() === "\u1e7e") {
+        str = str.replace(/\u1e9e/g, "\xdf");
+      }
+      // .toLowerCase().toUpperCase() should get rid of all differences
+      // between letter variants.
+      
+      // Simple .toLowerCase() doesn't normalize 125 code points correctly,
+      // and .toUpperCase doesn't normalize 6 of them (list of exceptions:
+      // İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
+      // uppercased versions).
+      
+      // Here's an example showing how it happens. Lets take greek letter omega:
+      // uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
+      
+      // Unicode entries:
+      // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8;
+      // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
+      // 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
+      // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
+      
+      // Case-insensitive comparison should treat all of them as equivalent.
+      
+      // But .toLowerCase() doesn't change ϑ (it's already lowercase),
+      // and .toUpperCase() doesn't change ϴ (already uppercase).
+      
+      // Applying first lower then upper case normalizes any character:
+      // '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
+      
+      // Note: this is equivalent to unicode case folding; unicode normalization
+      // is a different step that is not required here.
+      
+      // Final result should be uppercased, because it's later stored in an object
+      // (this avoid a conflict with Object.prototype members,
+      // most notably, `__proto__`)
+      
+            return str.toLowerCase().toUpperCase();
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+    // Re-export libraries commonly used in both markdown-it and its plugins,
+    // so plugins won't have to depend on them explicitly, which reduces their
+    // bundled size (e.g. a browser build).
+    
+        exports.lib = {};
+    exports.lib.mdurl = mdurl;
+    exports.lib.ucmicro = uc_micro;
+    exports.assign = assign;
+    exports.isString = isString;
+    exports.has = has;
+    exports.unescapeMd = unescapeMd;
+    exports.unescapeAll = unescapeAll;
+    exports.isValidEntityCode = isValidEntityCode;
+    exports.fromCodePoint = fromCodePoint;
+    // exports.replaceEntities     = replaceEntities;
+        exports.escapeHtml = escapeHtml;
+    exports.arrayReplaceAt = arrayReplaceAt;
+    exports.isSpace = isSpace;
+    exports.isWhiteSpace = isWhiteSpace;
+    exports.isMdAsciiPunct = isMdAsciiPunct;
+    exports.isPunctChar = isPunctChar;
+    exports.escapeRE = escapeRE;
+    exports.normalizeReference = normalizeReference;
+  }));
+  // Parse link label
+    var parse_link_label = function parseLinkLabel(state, start, disableNested) {
+    var level, found, marker, prevPos, labelEnd = -1, max = state.posMax, oldPos = state.pos;
+    state.pos = start + 1;
+    level = 1;
+    while (state.pos < max) {
+      marker = state.src.charCodeAt(state.pos);
+      if (marker === 93 /* ] */) {
+        level--;
+        if (level === 0) {
+          found = true;
+          break;
+        }
+      }
+      prevPos = state.pos;
+      state.md.inline.skipToken(state);
+      if (marker === 91 /* [ */) {
+        if (prevPos === state.pos - 1) {
+          // increase level if we find text `[`, which is not a part of any token
+          level++;
+        } else if (disableNested) {
+          state.pos = oldPos;
+          return -1;
+        }
+      }
+    }
+    if (found) {
+      labelEnd = state.pos;
+    }
+    // restore old state
+        state.pos = oldPos;
+    return labelEnd;
+  };
+  var unescapeAll$2 = utils.unescapeAll;
+  var parse_link_destination = function parseLinkDestination(str, start, max) {
+    var code, level, pos = start, result = {
+      ok: false,
+      pos: 0,
+      lines: 0,
+      str: ""
+    };
+    if (str.charCodeAt(pos) === 60 /* < */) {
+      pos++;
+      while (pos < max) {
+        code = str.charCodeAt(pos);
+        if (code === 10 /* \n */) {
+          return result;
+        }
+        if (code === 60 /* < */) {
+          return result;
+        }
+        if (code === 62 /* > */) {
+          result.pos = pos + 1;
+          result.str = unescapeAll$2(str.slice(start + 1, pos));
+          result.ok = true;
+          return result;
+        }
+        if (code === 92 /* \ */ && pos + 1 < max) {
+          pos += 2;
+          continue;
+        }
+        pos++;
+      }
+      // no closing '>'
+            return result;
+    }
+    // this should be ... } else { ... branch
+        level = 0;
+    while (pos < max) {
+      code = str.charCodeAt(pos);
+      if (code === 32) {
+        break;
+      }
+      // ascii control characters
+            if (code < 32 || code === 127) {
+        break;
+      }
+      if (code === 92 /* \ */ && pos + 1 < max) {
+        if (str.charCodeAt(pos + 1) === 32) {
+          break;
+        }
+        pos += 2;
+        continue;
+      }
+      if (code === 40 /* ( */) {
+        level++;
+        if (level > 32) {
+          return result;
+        }
+      }
+      if (code === 41 /* ) */) {
+        if (level === 0) {
+          break;
+        }
+        level--;
+      }
+      pos++;
+    }
+    if (start === pos) {
+      return result;
+    }
+    if (level !== 0) {
+      return result;
+    }
+    result.str = unescapeAll$2(str.slice(start, pos));
+    result.pos = pos;
+    result.ok = true;
+    return result;
+  };
+  var unescapeAll$1 = utils.unescapeAll;
+  var parse_link_title = function parseLinkTitle(str, start, max) {
+    var code, marker, lines = 0, pos = start, result = {
+      ok: false,
+      pos: 0,
+      lines: 0,
+      str: ""
+    };
+    if (pos >= max) {
+      return result;
+    }
+    marker = str.charCodeAt(pos);
+    if (marker !== 34 /* " */ && marker !== 39 /* ' */ && marker !== 40 /* ( */) {
+      return result;
+    }
+    pos++;
+    // if opening marker is "(", switch it to closing marker ")"
+        if (marker === 40) {
+      marker = 41;
+    }
+    while (pos < max) {
+      code = str.charCodeAt(pos);
+      if (code === marker) {
+        result.pos = pos + 1;
+        result.lines = lines;
+        result.str = unescapeAll$1(str.slice(start + 1, pos));
+        result.ok = true;
+        return result;
+      } else if (code === 40 /* ( */ && marker === 41 /* ) */) {
+        return result;
+      } else if (code === 10) {
+        lines++;
+      } else if (code === 92 /* \ */ && pos + 1 < max) {
+        pos++;
+        if (str.charCodeAt(pos) === 10) {
+          lines++;
+        }
+      }
+      pos++;
+    }
+    return result;
+  };
+  var parseLinkLabel = parse_link_label;
+  var parseLinkDestination = parse_link_destination;
+  var parseLinkTitle = parse_link_title;
+  var helpers = {
+    parseLinkLabel: parseLinkLabel,
+    parseLinkDestination: parseLinkDestination,
+    parseLinkTitle: parseLinkTitle
+  };
+  var assign$1 = utils.assign;
+  var unescapeAll = utils.unescapeAll;
+  var escapeHtml = utils.escapeHtml;
+  ////////////////////////////////////////////////////////////////////////////////
+    var default_rules = {};
+  default_rules.code_inline = function(tokens, idx, options, env, slf) {
+    var token = tokens[idx];
+    return "<code" + slf.renderAttrs(token) + ">" + escapeHtml(token.content) + "</code>";
+  };
+  default_rules.code_block = function(tokens, idx, options, env, slf) {
+    var token = tokens[idx];
+    return "<pre" + slf.renderAttrs(token) + "><code>" + escapeHtml(tokens[idx].content) + "</code></pre>\n";
+  };
+  default_rules.fence = function(tokens, idx, options, env, slf) {
+    var token = tokens[idx], info = token.info ? unescapeAll(token.info).trim() : "", langName = "", langAttrs = "", highlighted, i, arr, tmpAttrs, tmpToken;
+    if (info) {
+      arr = info.split(/(\s+)/g);
+      langName = arr[0];
+      langAttrs = arr.slice(2).join("");
+    }
+    if (options.highlight) {
+      highlighted = options.highlight(token.content, langName, langAttrs) || escapeHtml(token.content);
+    } else {
+      highlighted = escapeHtml(token.content);
+    }
+    if (highlighted.indexOf("<pre") === 0) {
+      return highlighted + "\n";
+    }
+    // If language exists, inject class gently, without modifying original token.
+    // May be, one day we will add .deepClone() for token and simplify this part, but
+    // now we prefer to keep things local.
+        if (info) {
+      i = token.attrIndex("class");
+      tmpAttrs = token.attrs ? token.attrs.slice() : [];
+      if (i < 0) {
+        tmpAttrs.push([ "class", options.langPrefix + langName ]);
+      } else {
+        tmpAttrs[i] = tmpAttrs[i].slice();
+        tmpAttrs[i][1] += " " + options.langPrefix + langName;
+      }
+      // Fake token just to render attributes
+            tmpToken = {
+        attrs: tmpAttrs
+      };
+      return "<pre><code" + slf.renderAttrs(tmpToken) + ">" + highlighted + "</code></pre>\n";
+    }
+    return "<pre><code" + slf.renderAttrs(token) + ">" + highlighted + "</code></pre>\n";
+  };
+  default_rules.image = function(tokens, idx, options, env, slf) {
+    var token = tokens[idx];
+    // "alt" attr MUST be set, even if empty. Because it's mandatory and
+    // should be placed on proper position for tests.
+    
+    // Replace content with actual value
+        token.attrs[token.attrIndex("alt")][1] = slf.renderInlineAsText(token.children, options, env);
+    return slf.renderToken(tokens, idx, options);
+  };
+  default_rules.hardbreak = function(tokens, idx, options /*, env */) {
+    return options.xhtmlOut ? "\n" : "\n";
+  };
+  default_rules.softbreak = function(tokens, idx, options /*, env */) {
+    return options.breaks ? options.xhtmlOut ? "\n" : "\n" : "\n";
+  };
+  default_rules.text = function(tokens, idx /*, options, env */) {
+    return escapeHtml(tokens[idx].content);
+  };
+  default_rules.html_block = function(tokens, idx /*, options, env */) {
+    return tokens[idx].content;
+  };
+  default_rules.html_inline = function(tokens, idx /*, options, env */) {
+    return tokens[idx].content;
+  };
+  /**
+	 * new Renderer()
+	 *
+	 * Creates new [[Renderer]] instance and fill [[Renderer#rules]] with defaults.
+	 **/  function Renderer() {
+    /**
+	   * Renderer#rules -> Object
+	   *
+	   * Contains render rules for tokens. Can be updated and extended.
+	   *
+	   * ##### Example
+	   *
+	   * ```javascript
+	   * var md = require('markdown-it')();
+	   *
+	   * md.renderer.rules.strong_open  = function () { return '<b>'; };
+	   * md.renderer.rules.strong_close = function () { return '</b>'; };
+	   *
+	   * var result = md.renderInline(...);
+	   * ```
+	   *
+	   * Each rule is called as independent static function with fixed signature:
+	   *
+	   * ```javascript
+	   * function my_token_render(tokens, idx, options, env, renderer) {
+	   *   // ...
+	   *   return renderedHTML;
+	   * }
+	   * ```
+	   *
+	   * See [source code](https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js)
+	   * for more details and examples.
+	   **/
+    this.rules = assign$1({}, default_rules);
+  }
+  /**
+	 * Renderer.renderAttrs(token) -> String
+	 *
+	 * Render token attributes to string.
+	 **/  Renderer.prototype.renderAttrs = function renderAttrs(token) {
+    var i, l, result;
+    if (!token.attrs) {
+      return "";
+    }
+    result = "";
+    for (i = 0, l = token.attrs.length; i < l; i++) {
+      result += " " + escapeHtml(token.attrs[i][0]) + '="' + escapeHtml(token.attrs[i][1]) + '"';
+    }
+    return result;
+  };
+  /**
+	 * Renderer.renderToken(tokens, idx, options) -> String
+	 * - tokens (Array): list of tokens
+	 * - idx (Numbed): token index to render
+	 * - options (Object): params of parser instance
+	 *
+	 * Default token renderer. Can be overriden by custom function
+	 * in [[Renderer#rules]].
+	 **/  Renderer.prototype.renderToken = function renderToken(tokens, idx, options) {
+    var nextToken, result = "", needLf = false, token = tokens[idx];
+    // Tight list paragraphs
+        if (token.hidden) {
+      return "";
+    }
+    // Insert a newline between hidden paragraph and subsequent opening
+    // block-level tag.
+    
+    // For example, here we should insert a newline before blockquote:
+    //  - a
+    //    >
+    
+        if (token.block && token.nesting !== -1 && idx && tokens[idx - 1].hidden) {
+      result += "\n";
+    }
+    // Add token name, e.g. `<img`
+        result += (token.nesting === -1 ? "</" : "<") + token.tag;
+    // Encode attributes, e.g. `<img src="foo"`
+        result += this.renderAttrs(token);
+    // Add a slash for self-closing tags, e.g. `<img src="foo" /`
+        if (token.nesting === 0 && options.xhtmlOut) {
+      result += " /";
+    }
+    // Check if we need to add a newline after this tag
+        if (token.block) {
+      needLf = true;
+      if (token.nesting === 1) {
+        if (idx + 1 < tokens.length) {
+          nextToken = tokens[idx + 1];
+          if (nextToken.type === "inline" || nextToken.hidden) {
+            // Block-level tag containing an inline tag.
+            needLf = false;
+          } else if (nextToken.nesting === -1 && nextToken.tag === token.tag) {
+            // Opening tag + closing tag of the same type. E.g. `<li></li>`.
+            needLf = false;
+          }
+        }
+      }
+    }
+    result += needLf ? ">\n" : ">";
+    return result;
+  };
+  /**
+	 * Renderer.renderInline(tokens, options, env) -> String
+	 * - tokens (Array): list on block tokens to render
+	 * - options (Object): params of parser instance
+	 * - env (Object): additional data from parsed input (references, for example)
+	 *
+	 * The same as [[Renderer.render]], but for single token of `inline` type.
+	 **/  Renderer.prototype.renderInline = function(tokens, options, env) {
+    var type, result = "", rules = this.rules;
+    for (var i = 0, len = tokens.length; i < len; i++) {
+      type = tokens[i].type;
+      if (typeof rules[type] !== "undefined") {
+        result += rules[type](tokens, i, options, env, this);
+      } else {
+        result += this.renderToken(tokens, i, options);
+      }
+    }
+    return result;
+  };
+  /** internal
+	 * Renderer.renderInlineAsText(tokens, options, env) -> String
+	 * - tokens (Array): list on block tokens to render
+	 * - options (Object): params of parser instance
+	 * - env (Object): additional data from parsed input (references, for example)
+	 *
+	 * Special kludge for image `alt` attributes to conform CommonMark spec.
+	 * Don't try to use it! Spec requires to show `alt` content with stripped markup,
+	 * instead of simple escaping.
+	 **/  Renderer.prototype.renderInlineAsText = function(tokens, options, env) {
+    var result = "";
+    for (var i = 0, len = tokens.length; i < len; i++) {
+      if (tokens[i].type === "text") {
+        result += tokens[i].content;
+      } else if (tokens[i].type === "image") {
+        result += this.renderInlineAsText(tokens[i].children, options, env);
+      } else if (tokens[i].type === "softbreak") {
+        result += "\n";
+      }
+    }
+    return result;
+  };
+  /**
+	 * Renderer.render(tokens, options, env) -> String
+	 * - tokens (Array): list on block tokens to render
+	 * - options (Object): params of parser instance
+	 * - env (Object): additional data from parsed input (references, for example)
+	 *
+	 * Takes token stream and generates HTML. Probably, you will never need to call
+	 * this method directly.
+	 **/  Renderer.prototype.render = function(tokens, options, env) {
+    var i, len, type, result = "", rules = this.rules;
+    for (i = 0, len = tokens.length; i < len; i++) {
+      type = tokens[i].type;
+      if (type === "inline") {
+        result += this.renderInline(tokens[i].children, options, env);
+      } else if (typeof rules[type] !== "undefined") {
+        result += rules[type](tokens, i, options, env, this);
+      } else {
+        result += this.renderToken(tokens, i, options, env);
+      }
+    }
+    return result;
+  };
+  var renderer = Renderer;
+  /**
+	 * class Ruler
+	 *
+	 * Helper class, used by [[MarkdownIt#core]], [[MarkdownIt#block]] and
+	 * [[MarkdownIt#inline]] to manage sequences of functions (rules):
+	 *
+	 * - keep rules in defined order
+	 * - assign the name to each rule
+	 * - enable/disable rules
+	 * - add/replace rules
+	 * - allow assign rules to additional named chains (in the same)
+	 * - cacheing lists of active rules
+	 *
+	 * You will not need use this class directly until write plugins. For simple
+	 * rules control use [[MarkdownIt.disable]], [[MarkdownIt.enable]] and
+	 * [[MarkdownIt.use]].
+	 **/
+  /**
+	 * new Ruler()
+	 **/  function Ruler() {
+    // List of added rules. Each element is:
+    // {
+    //   name: XXX,
+    //   enabled: Boolean,
+    //   fn: Function(),
+    //   alt: [ name2, name3 ]
+    // }
+    this.__rules__ = [];
+    // Cached rule chains.
+    
+    // First level - chain name, '' for default.
+    // Second level - diginal anchor for fast filtering by charcodes.
+    
+        this.__cache__ = null;
+  }
+  ////////////////////////////////////////////////////////////////////////////////
+  // Helper methods, should not be used directly
+  // Find rule index by name
+  
+    Ruler.prototype.__find__ = function(name) {
+    for (var i = 0; i < this.__rules__.length; i++) {
+      if (this.__rules__[i].name === name) {
+        return i;
+      }
+    }
+    return -1;
+  };
+  // Build rules lookup cache
+  
+    Ruler.prototype.__compile__ = function() {
+    var self = this;
+    var chains = [ "" ];
+    // collect unique names
+        self.__rules__.forEach((function(rule) {
+      if (!rule.enabled) {
+        return;
+      }
+      rule.alt.forEach((function(altName) {
+        if (chains.indexOf(altName) < 0) {
+          chains.push(altName);
+        }
+      }));
+    }));
+    self.__cache__ = {};
+    chains.forEach((function(chain) {
+      self.__cache__[chain] = [];
+      self.__rules__.forEach((function(rule) {
+        if (!rule.enabled) {
+          return;
+        }
+        if (chain && rule.alt.indexOf(chain) < 0) {
+          return;
+        }
+        self.__cache__[chain].push(rule.fn);
+      }));
+    }));
+  };
+  /**
+	 * Ruler.at(name, fn [, options])
+	 * - name (String): rule name to replace.
+	 * - fn (Function): new rule function.
+	 * - options (Object): new rule options (not mandatory).
+	 *
+	 * Replace rule by name with new function & options. Throws error if name not
+	 * found.
+	 *
+	 * ##### Options:
+	 *
+	 * - __alt__ - array with names of "alternate" chains.
+	 *
+	 * ##### Example
+	 *
+	 * Replace existing typographer replacement rule with new one:
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')();
+	 *
+	 * md.core.ruler.at('replacements', function replace(state) {
+	 *   //...
+	 * });
+	 * ```
+	 **/  Ruler.prototype.at = function(name, fn, options) {
+    var index = this.__find__(name);
+    var opt = options || {};
+    if (index === -1) {
+      throw new Error("Parser rule not found: " + name);
+    }
+    this.__rules__[index].fn = fn;
+    this.__rules__[index].alt = opt.alt || [];
+    this.__cache__ = null;
+  };
+  /**
+	 * Ruler.before(beforeName, ruleName, fn [, options])
+	 * - beforeName (String): new rule will be added before this one.
+	 * - ruleName (String): name of added rule.
+	 * - fn (Function): rule function.
+	 * - options (Object): rule options (not mandatory).
+	 *
+	 * Add new rule to chain before one with given name. See also
+	 * [[Ruler.after]], [[Ruler.push]].
+	 *
+	 * ##### Options:
+	 *
+	 * - __alt__ - array with names of "alternate" chains.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')();
+	 *
+	 * md.block.ruler.before('paragraph', 'my_rule', function replace(state) {
+	 *   //...
+	 * });
+	 * ```
+	 **/  Ruler.prototype.before = function(beforeName, ruleName, fn, options) {
+    var index = this.__find__(beforeName);
+    var opt = options || {};
+    if (index === -1) {
+      throw new Error("Parser rule not found: " + beforeName);
+    }
+    this.__rules__.splice(index, 0, {
+      name: ruleName,
+      enabled: true,
+      fn: fn,
+      alt: opt.alt || []
+    });
+    this.__cache__ = null;
+  };
+  /**
+	 * Ruler.after(afterName, ruleName, fn [, options])
+	 * - afterName (String): new rule will be added after this one.
+	 * - ruleName (String): name of added rule.
+	 * - fn (Function): rule function.
+	 * - options (Object): rule options (not mandatory).
+	 *
+	 * Add new rule to chain after one with given name. See also
+	 * [[Ruler.before]], [[Ruler.push]].
+	 *
+	 * ##### Options:
+	 *
+	 * - __alt__ - array with names of "alternate" chains.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')();
+	 *
+	 * md.inline.ruler.after('text', 'my_rule', function replace(state) {
+	 *   //...
+	 * });
+	 * ```
+	 **/  Ruler.prototype.after = function(afterName, ruleName, fn, options) {
+    var index = this.__find__(afterName);
+    var opt = options || {};
+    if (index === -1) {
+      throw new Error("Parser rule not found: " + afterName);
+    }
+    this.__rules__.splice(index + 1, 0, {
+      name: ruleName,
+      enabled: true,
+      fn: fn,
+      alt: opt.alt || []
+    });
+    this.__cache__ = null;
+  };
+  /**
+	 * Ruler.push(ruleName, fn [, options])
+	 * - ruleName (String): name of added rule.
+	 * - fn (Function): rule function.
+	 * - options (Object): rule options (not mandatory).
+	 *
+	 * Push new rule to the end of chain. See also
+	 * [[Ruler.before]], [[Ruler.after]].
+	 *
+	 * ##### Options:
+	 *
+	 * - __alt__ - array with names of "alternate" chains.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')();
+	 *
+	 * md.core.ruler.push('my_rule', function replace(state) {
+	 *   //...
+	 * });
+	 * ```
+	 **/  Ruler.prototype.push = function(ruleName, fn, options) {
+    var opt = options || {};
+    this.__rules__.push({
+      name: ruleName,
+      enabled: true,
+      fn: fn,
+      alt: opt.alt || []
+    });
+    this.__cache__ = null;
+  };
+  /**
+	 * Ruler.enable(list [, ignoreInvalid]) -> Array
+	 * - list (String|Array): list of rule names to enable.
+	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
+	 *
+	 * Enable rules with given names. If any rule name not found - throw Error.
+	 * Errors can be disabled by second param.
+	 *
+	 * Returns list of found rule names (if no exception happened).
+	 *
+	 * See also [[Ruler.disable]], [[Ruler.enableOnly]].
+	 **/  Ruler.prototype.enable = function(list, ignoreInvalid) {
+    if (!Array.isArray(list)) {
+      list = [ list ];
+    }
+    var result = [];
+    // Search by name and enable
+        list.forEach((function(name) {
+      var idx = this.__find__(name);
+      if (idx < 0) {
+        if (ignoreInvalid) {
+          return;
+        }
+        throw new Error("Rules manager: invalid rule name " + name);
+      }
+      this.__rules__[idx].enabled = true;
+      result.push(name);
+    }), this);
+    this.__cache__ = null;
+    return result;
+  };
+  /**
+	 * Ruler.enableOnly(list [, ignoreInvalid])
+	 * - list (String|Array): list of rule names to enable (whitelist).
+	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
+	 *
+	 * Enable rules with given names, and disable everything else. If any rule name
+	 * not found - throw Error. Errors can be disabled by second param.
+	 *
+	 * See also [[Ruler.disable]], [[Ruler.enable]].
+	 **/  Ruler.prototype.enableOnly = function(list, ignoreInvalid) {
+    if (!Array.isArray(list)) {
+      list = [ list ];
+    }
+    this.__rules__.forEach((function(rule) {
+      rule.enabled = false;
+    }));
+    this.enable(list, ignoreInvalid);
+  };
+  /**
+	 * Ruler.disable(list [, ignoreInvalid]) -> Array
+	 * - list (String|Array): list of rule names to disable.
+	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
+	 *
+	 * Disable rules with given names. If any rule name not found - throw Error.
+	 * Errors can be disabled by second param.
+	 *
+	 * Returns list of found rule names (if no exception happened).
+	 *
+	 * See also [[Ruler.enable]], [[Ruler.enableOnly]].
+	 **/  Ruler.prototype.disable = function(list, ignoreInvalid) {
+    if (!Array.isArray(list)) {
+      list = [ list ];
+    }
+    var result = [];
+    // Search by name and disable
+        list.forEach((function(name) {
+      var idx = this.__find__(name);
+      if (idx < 0) {
+        if (ignoreInvalid) {
+          return;
+        }
+        throw new Error("Rules manager: invalid rule name " + name);
+      }
+      this.__rules__[idx].enabled = false;
+      result.push(name);
+    }), this);
+    this.__cache__ = null;
+    return result;
+  };
+  /**
+	 * Ruler.getRules(chainName) -> Array
+	 *
+	 * Return array of active functions (rules) for given chain name. It analyzes
+	 * rules configuration, compiles caches if not exists and returns result.
+	 *
+	 * Default chain name is `''` (empty string). It can't be skipped. That's
+	 * done intentionally, to keep signature monomorphic for high speed.
+	 **/  Ruler.prototype.getRules = function(chainName) {
+    if (this.__cache__ === null) {
+      this.__compile__();
+    }
+    // Chain can be empty, if rules disabled. But we still have to return Array.
+        return this.__cache__[chainName] || [];
+  };
+  var ruler = Ruler;
+  // Normalize input string
+  // https://spec.commonmark.org/0.29/#line-ending
+    var NEWLINES_RE = /\r\n?|\n/g;
+  var NULL_RE = /\0/g;
+  var normalize = function normalize(state) {
+    var str;
+    // Normalize newlines
+        str = state.src.replace(NEWLINES_RE, "\n");
+    // Replace NULL characters
+        str = str.replace(NULL_RE, "\ufffd");
+    state.src = str;
+  };
+  var block = function block(state) {
+    var token;
+    if (state.inlineMode) {
+      token = new state.Token("inline", "", 0);
+      token.content = state.src;
+      token.map = [ 0, 1 ];
+      token.children = [];
+      state.tokens.push(token);
+    } else {
+      state.md.block.parse(state.src, state.md, state.env, state.tokens);
+    }
+  };
+  var inline = function inline(state) {
+    var tokens = state.tokens, tok, i, l;
+    // Parse inlines
+        for (i = 0, l = tokens.length; i < l; i++) {
+      tok = tokens[i];
+      if (tok.type === "inline") {
+        state.md.inline.parse(tok.content, state.md, state.env, tok.children);
+      }
+    }
+  };
+  var arrayReplaceAt = utils.arrayReplaceAt;
+  function isLinkOpen$1(str) {
+    return /^<a[>\s]/i.test(str);
+  }
+  function isLinkClose$1(str) {
+    return /^<\/a\s*>/i.test(str);
+  }
+  var linkify$1 = function linkify(state) {
+    var i, j, l, tokens, token, currentToken, nodes, ln, text, pos, lastPos, level, htmlLinkLevel, url, fullUrl, urlText, blockTokens = state.tokens, links;
+    if (!state.md.options.linkify) {
+      return;
+    }
+    for (j = 0, l = blockTokens.length; j < l; j++) {
+      if (blockTokens[j].type !== "inline" || !state.md.linkify.pretest(blockTokens[j].content)) {
+        continue;
+      }
+      tokens = blockTokens[j].children;
+      htmlLinkLevel = 0;
+      // We scan from the end, to keep position when new tags added.
+      // Use reversed logic in links start/end match
+            for (i = tokens.length - 1; i >= 0; i--) {
+        currentToken = tokens[i];
+        // Skip content of markdown links
+                if (currentToken.type === "link_close") {
+          i--;
+          while (tokens[i].level !== currentToken.level && tokens[i].type !== "link_open") {
+            i--;
+          }
+          continue;
+        }
+        // Skip content of html tag links
+                if (currentToken.type === "html_inline") {
+          if (isLinkOpen$1(currentToken.content) && htmlLinkLevel > 0) {
+            htmlLinkLevel--;
+          }
+          if (isLinkClose$1(currentToken.content)) {
+            htmlLinkLevel++;
+          }
+        }
+        if (htmlLinkLevel > 0) {
+          continue;
+        }
+        if (currentToken.type === "text" && state.md.linkify.test(currentToken.content)) {
+          text = currentToken.content;
+          links = state.md.linkify.match(text);
+          // Now split string to nodes
+                    nodes = [];
+          level = currentToken.level;
+          lastPos = 0;
+          // forbid escape sequence at the start of the string,
+          // this avoids http\://example.com/ from being linkified as
+          // http:<a href="//example.com/">//example.com/</a>
+                    if (links.length > 0 && links[0].index === 0 && i > 0 && tokens[i - 1].type === "text_special") {
+            links = links.slice(1);
+          }
+          for (ln = 0; ln < links.length; ln++) {
+            url = links[ln].url;
+            fullUrl = state.md.normalizeLink(url);
+            if (!state.md.validateLink(fullUrl)) {
+              continue;
+            }
+            urlText = links[ln].text;
+            // Linkifier might send raw hostnames like "example.com", where url
+            // starts with domain name. So we prepend http:// in those cases,
+            // and remove it afterwards.
+            
+                        if (!links[ln].schema) {
+              urlText = state.md.normalizeLinkText("http://" + urlText).replace(/^http:\/\//, "");
+            } else if (links[ln].schema === "mailto:" && !/^mailto:/i.test(urlText)) {
+              urlText = state.md.normalizeLinkText("mailto:" + urlText).replace(/^mailto:/, "");
+            } else {
+              urlText = state.md.normalizeLinkText(urlText);
+            }
+            pos = links[ln].index;
+            if (pos > lastPos) {
+              token = new state.Token("text", "", 0);
+              token.content = text.slice(lastPos, pos);
+              token.level = level;
+              nodes.push(token);
+            }
+            token = new state.Token("link_open", "a", 1);
+            token.attrs = [ [ "href", fullUrl ] ];
+            token.level = level++;
+            token.markup = "linkify";
+            token.info = "auto";
+            nodes.push(token);
+            token = new state.Token("text", "", 0);
+            token.content = urlText;
+            token.level = level;
+            nodes.push(token);
+            token = new state.Token("link_close", "a", -1);
+            token.level = --level;
+            token.markup = "linkify";
+            token.info = "auto";
+            nodes.push(token);
+            lastPos = links[ln].lastIndex;
+          }
+          if (lastPos < text.length) {
+            token = new state.Token("text", "", 0);
+            token.content = text.slice(lastPos);
+            token.level = level;
+            nodes.push(token);
+          }
+          // replace current node
+                    blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes);
+        }
+      }
+    }
+  };
+  // Simple typographic replacements
+  // TODO:
+  // - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
+  // - multiplications 2 x 4 -> 2 × 4
+    var RARE_RE = /\+-|\.\.|\?\?\?\?|!!!!|,,|--/;
+  // Workaround for phantomjs - need regex without /g flag,
+  // or root check will fail every second time
+    var SCOPED_ABBR_TEST_RE = /\((c|tm|r)\)/i;
+  var SCOPED_ABBR_RE = /\((c|tm|r)\)/gi;
+  var SCOPED_ABBR = {
+    c: "\xa9",
+    r: "\xae",
+    tm: "\u2122"
+  };
+  function replaceFn(match, name) {
+    return SCOPED_ABBR[name.toLowerCase()];
+  }
+  function replace_scoped(inlineTokens) {
+    var i, token, inside_autolink = 0;
+    for (i = inlineTokens.length - 1; i >= 0; i--) {
+      token = inlineTokens[i];
+      if (token.type === "text" && !inside_autolink) {
+        token.content = token.content.replace(SCOPED_ABBR_RE, replaceFn);
+      }
+      if (token.type === "link_open" && token.info === "auto") {
+        inside_autolink--;
+      }
+      if (token.type === "link_close" && token.info === "auto") {
+        inside_autolink++;
+      }
+    }
+  }
+  function replace_rare(inlineTokens) {
+    var i, token, inside_autolink = 0;
+    for (i = inlineTokens.length - 1; i >= 0; i--) {
+      token = inlineTokens[i];
+      if (token.type === "text" && !inside_autolink) {
+        if (RARE_RE.test(token.content)) {
+          token.content = token.content.replace(/\+-/g, "\xb1").replace(/\.{2,}/g, "\u2026").replace(/([?!])\u2026/g, "$1..").replace(/([?!]){4,}/g, "$1$1$1").replace(/,{2,}/g, ",").replace(/(^|[^-])---(?=[^-]|$)/gm, "$1\u2014").replace(/(^|\s)--(?=\s|$)/gm, "$1\u2013").replace(/(^|[^-\s])--(?=[^-\s]|$)/gm, "$1\u2013");
+        }
+      }
+      if (token.type === "link_open" && token.info === "auto") {
+        inside_autolink--;
+      }
+      if (token.type === "link_close" && token.info === "auto") {
+        inside_autolink++;
+      }
+    }
+  }
+  var replacements = function replace(state) {
+    var blkIdx;
+    if (!state.md.options.typographer) {
+      return;
+    }
+    for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) {
+      if (state.tokens[blkIdx].type !== "inline") {
+        continue;
+      }
+      if (SCOPED_ABBR_TEST_RE.test(state.tokens[blkIdx].content)) {
+        replace_scoped(state.tokens[blkIdx].children);
+      }
+      if (RARE_RE.test(state.tokens[blkIdx].content)) {
+        replace_rare(state.tokens[blkIdx].children);
+      }
+    }
+  };
+  var isWhiteSpace$1 = utils.isWhiteSpace;
+  var isPunctChar$1 = utils.isPunctChar;
+  var isMdAsciiPunct$1 = utils.isMdAsciiPunct;
+  var QUOTE_TEST_RE = /['"]/;
+  var QUOTE_RE = /['"]/g;
+  var APOSTROPHE = "\u2019";
+ /* ’ */  function replaceAt(str, index, ch) {
+    return str.slice(0, index) + ch + str.slice(index + 1);
+  }
+  function process_inlines(tokens, state) {
+    var i, token, text, t, pos, max, thisLevel, item, lastChar, nextChar, isLastPunctChar, isNextPunctChar, isLastWhiteSpace, isNextWhiteSpace, canOpen, canClose, j, isSingle, stack, openQuote, closeQuote;
+    stack = [];
+    for (i = 0; i < tokens.length; i++) {
+      token = tokens[i];
+      thisLevel = tokens[i].level;
+      for (j = stack.length - 1; j >= 0; j--) {
+        if (stack[j].level <= thisLevel) {
+          break;
+        }
+      }
+      stack.length = j + 1;
+      if (token.type !== "text") {
+        continue;
+      }
+      text = token.content;
+      pos = 0;
+      max = text.length;
+      /*eslint no-labels:0,block-scoped-var:0*/      OUTER: while (pos < max) {
+        QUOTE_RE.lastIndex = pos;
+        t = QUOTE_RE.exec(text);
+        if (!t) {
+          break;
+        }
+        canOpen = canClose = true;
+        pos = t.index + 1;
+        isSingle = t[0] === "'";
+        // Find previous character,
+        // default to space if it's the beginning of the line
+        
+                lastChar = 32;
+        if (t.index - 1 >= 0) {
+          lastChar = text.charCodeAt(t.index - 1);
+        } else {
+          for (j = i - 1; j >= 0; j--) {
+            if (tokens[j].type === "softbreak" || tokens[j].type === "hardbreak") break;
+ // lastChar defaults to 0x20
+                        if (!tokens[j].content) continue;
+ // should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                        lastChar = tokens[j].content.charCodeAt(tokens[j].content.length - 1);
+            break;
+          }
+        }
+        // Find next character,
+        // default to space if it's the end of the line
+        
+                nextChar = 32;
+        if (pos < max) {
+          nextChar = text.charCodeAt(pos);
+        } else {
+          for (j = i + 1; j < tokens.length; j++) {
+            if (tokens[j].type === "softbreak" || tokens[j].type === "hardbreak") break;
+ // nextChar defaults to 0x20
+                        if (!tokens[j].content) continue;
+ // should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                        nextChar = tokens[j].content.charCodeAt(0);
+            break;
+          }
+        }
+        isLastPunctChar = isMdAsciiPunct$1(lastChar) || isPunctChar$1(String.fromCharCode(lastChar));
+        isNextPunctChar = isMdAsciiPunct$1(nextChar) || isPunctChar$1(String.fromCharCode(nextChar));
+        isLastWhiteSpace = isWhiteSpace$1(lastChar);
+        isNextWhiteSpace = isWhiteSpace$1(nextChar);
+        if (isNextWhiteSpace) {
+          canOpen = false;
+        } else if (isNextPunctChar) {
+          if (!(isLastWhiteSpace || isLastPunctChar)) {
+            canOpen = false;
+          }
+        }
+        if (isLastWhiteSpace) {
+          canClose = false;
+        } else if (isLastPunctChar) {
+          if (!(isNextWhiteSpace || isNextPunctChar)) {
+            canClose = false;
+          }
+        }
+        if (nextChar === 34 /* " */ && t[0] === '"') {
+          if (lastChar >= 48 /* 0 */ && lastChar <= 57 /* 9 */) {
+            // special case: 1"" - count first quote as an inch
+            canClose = canOpen = false;
+          }
+        }
+        if (canOpen && canClose) {
+          // Replace quotes in the middle of punctuation sequence, but not
+          // in the middle of the words, i.e.:
+          // 1. foo " bar " baz - not replaced
+          // 2. foo-"-bar-"-baz - replaced
+          // 3. foo"bar"baz     - not replaced
+          canOpen = isLastPunctChar;
+          canClose = isNextPunctChar;
+        }
+        if (!canOpen && !canClose) {
+          // middle of word
+          if (isSingle) {
+            token.content = replaceAt(token.content, t.index, APOSTROPHE);
+          }
+          continue;
+        }
+        if (canClose) {
+          // this could be a closing quote, rewind the stack to get a match
+          for (j = stack.length - 1; j >= 0; j--) {
+            item = stack[j];
+            if (stack[j].level < thisLevel) {
+              break;
+            }
+            if (item.single === isSingle && stack[j].level === thisLevel) {
+              item = stack[j];
+              if (isSingle) {
+                openQuote = state.md.options.quotes[2];
+                closeQuote = state.md.options.quotes[3];
+              } else {
+                openQuote = state.md.options.quotes[0];
+                closeQuote = state.md.options.quotes[1];
+              }
+              // replace token.content *before* tokens[item.token].content,
+              // because, if they are pointing at the same token, replaceAt
+              // could mess up indices when quote length != 1
+                            token.content = replaceAt(token.content, t.index, closeQuote);
+              tokens[item.token].content = replaceAt(tokens[item.token].content, item.pos, openQuote);
+              pos += closeQuote.length - 1;
+              if (item.token === i) {
+                pos += openQuote.length - 1;
+              }
+              text = token.content;
+              max = text.length;
+              stack.length = j;
+              continue OUTER;
+            }
+          }
+        }
+        if (canOpen) {
+          stack.push({
+            token: i,
+            pos: t.index,
+            single: isSingle,
+            level: thisLevel
+          });
+        } else if (canClose && isSingle) {
+          token.content = replaceAt(token.content, t.index, APOSTROPHE);
+        }
+      }
+    }
+  }
+  var smartquotes = function smartquotes(state) {
+    /*eslint max-depth:0*/
+    var blkIdx;
+    if (!state.md.options.typographer) {
+      return;
+    }
+    for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) {
+      if (state.tokens[blkIdx].type !== "inline" || !QUOTE_TEST_RE.test(state.tokens[blkIdx].content)) {
+        continue;
+      }
+      process_inlines(state.tokens[blkIdx].children, state);
+    }
+  };
+  // Join raw text tokens with the rest of the text
+    var text_join = function text_join(state) {
+    var j, l, tokens, curr, max, last, blockTokens = state.tokens;
+    for (j = 0, l = blockTokens.length; j < l; j++) {
+      if (blockTokens[j].type !== "inline") continue;
+      tokens = blockTokens[j].children;
+      max = tokens.length;
+      for (curr = 0; curr < max; curr++) {
+        if (tokens[curr].type === "text_special") {
+          tokens[curr].type = "text";
+        }
+      }
+      for (curr = last = 0; curr < max; curr++) {
+        if (tokens[curr].type === "text" && curr + 1 < max && tokens[curr + 1].type === "text") {
+          // collapse two adjacent text nodes
+          tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content;
+        } else {
+          if (curr !== last) {
+            tokens[last] = tokens[curr];
+          }
+          last++;
+        }
+      }
+      if (curr !== last) {
+        tokens.length = last;
+      }
+    }
+  };
+  // Token class
+  /**
+	 * class Token
+	 **/
+  /**
+	 * new Token(type, tag, nesting)
+	 *
+	 * Create new token and fill passed properties.
+	 **/  function Token(type, tag, nesting) {
+    /**
+	   * Token#type -> String
+	   *
+	   * Type of the token (string, e.g. "paragraph_open")
+	   **/
+    this.type = type;
+    /**
+	   * Token#tag -> String
+	   *
+	   * html tag name, e.g. "p"
+	   **/    this.tag = tag;
+    /**
+	   * Token#attrs -> Array
+	   *
+	   * Html attributes. Format: `[ [ name1, value1 ], [ name2, value2 ] ]`
+	   **/    this.attrs = null;
+    /**
+	   * Token#map -> Array
+	   *
+	   * Source map info. Format: `[ line_begin, line_end ]`
+	   **/    this.map = null;
+    /**
+	   * Token#nesting -> Number
+	   *
+	   * Level change (number in {-1, 0, 1} set), where:
+	   *
+	   * -  `1` means the tag is opening
+	   * -  `0` means the tag is self-closing
+	   * - `-1` means the tag is closing
+	   **/    this.nesting = nesting;
+    /**
+	   * Token#level -> Number
+	   *
+	   * nesting level, the same as `state.level`
+	   **/    this.level = 0;
+    /**
+	   * Token#children -> Array
+	   *
+	   * An array of child nodes (inline and img tokens)
+	   **/    this.children = null;
+    /**
+	   * Token#content -> String
+	   *
+	   * In a case of self-closing tag (code, html, fence, etc.),
+	   * it has contents of this tag.
+	   **/    this.content = "";
+    /**
+	   * Token#markup -> String
+	   *
+	   * '*' or '_' for emphasis, fence string for fence, etc.
+	   **/    this.markup = "";
+    /**
+	   * Token#info -> String
+	   *
+	   * Additional information:
+	   *
+	   * - Info string for "fence" tokens
+	   * - The value "auto" for autolink "link_open" and "link_close" tokens
+	   * - The string value of the item marker for ordered-list "list_item_open" tokens
+	   **/    this.info = "";
+    /**
+	   * Token#meta -> Object
+	   *
+	   * A place for plugins to store an arbitrary data
+	   **/    this.meta = null;
+    /**
+	   * Token#block -> Boolean
+	   *
+	   * True for block-level tokens, false for inline tokens.
+	   * Used in renderer to calculate line breaks
+	   **/    this.block = false;
+    /**
+	   * Token#hidden -> Boolean
+	   *
+	   * If it's true, ignore this element when rendering. Used for tight lists
+	   * to hide paragraphs.
+	   **/    this.hidden = false;
+  }
+  /**
+	 * Token.attrIndex(name) -> Number
+	 *
+	 * Search attribute index by name.
+	 **/  Token.prototype.attrIndex = function attrIndex(name) {
+    var attrs, i, len;
+    if (!this.attrs) {
+      return -1;
+    }
+    attrs = this.attrs;
+    for (i = 0, len = attrs.length; i < len; i++) {
+      if (attrs[i][0] === name) {
+        return i;
+      }
+    }
+    return -1;
+  };
+  /**
+	 * Token.attrPush(attrData)
+	 *
+	 * Add `[ name, value ]` attribute to list. Init attrs if necessary
+	 **/  Token.prototype.attrPush = function attrPush(attrData) {
+    if (this.attrs) {
+      this.attrs.push(attrData);
+    } else {
+      this.attrs = [ attrData ];
+    }
+  };
+  /**
+	 * Token.attrSet(name, value)
+	 *
+	 * Set `name` attribute to `value`. Override old value if exists.
+	 **/  Token.prototype.attrSet = function attrSet(name, value) {
+    var idx = this.attrIndex(name), attrData = [ name, value ];
+    if (idx < 0) {
+      this.attrPush(attrData);
+    } else {
+      this.attrs[idx] = attrData;
+    }
+  };
+  /**
+	 * Token.attrGet(name)
+	 *
+	 * Get the value of attribute `name`, or null if it does not exist.
+	 **/  Token.prototype.attrGet = function attrGet(name) {
+    var idx = this.attrIndex(name), value = null;
+    if (idx >= 0) {
+      value = this.attrs[idx][1];
+    }
+    return value;
+  };
+  /**
+	 * Token.attrJoin(name, value)
+	 *
+	 * Join value to existing attribute via space. Or create new attribute if not
+	 * exists. Useful to operate with token classes.
+	 **/  Token.prototype.attrJoin = function attrJoin(name, value) {
+    var idx = this.attrIndex(name);
+    if (idx < 0) {
+      this.attrPush([ name, value ]);
+    } else {
+      this.attrs[idx][1] = this.attrs[idx][1] + " " + value;
+    }
+  };
+  var token = Token;
+  function StateCore(src, md, env) {
+    this.src = src;
+    this.env = env;
+    this.tokens = [];
+    this.inlineMode = false;
+    this.md = md;
+ // link to parser instance
+    }
+  // re-export Token class to use in core rules
+    StateCore.prototype.Token = token;
+  var state_core = StateCore;
+  var _rules$2 = [ [ "normalize", normalize ], [ "block", block ], [ "inline", inline ], [ "linkify", linkify$1 ], [ "replacements", replacements ], [ "smartquotes", smartquotes ], 
+  // `text_join` finds `text_special` tokens (for escape sequences)
+  // and joins them with the rest of the text
+  [ "text_join", text_join ] ];
+  /**
+	 * new Core()
+	 **/  function Core() {
+    /**
+	   * Core#ruler -> Ruler
+	   *
+	   * [[Ruler]] instance. Keep configuration of core rules.
+	   **/
+    this.ruler = new ruler;
+    for (var i = 0; i < _rules$2.length; i++) {
+      this.ruler.push(_rules$2[i][0], _rules$2[i][1]);
+    }
+  }
+  /**
+	 * Core.process(state)
+	 *
+	 * Executes core chain rules.
+	 **/  Core.prototype.process = function(state) {
+    var i, l, rules;
+    rules = this.ruler.getRules("");
+    for (i = 0, l = rules.length; i < l; i++) {
+      rules[i](state);
+    }
+  };
+  Core.prototype.State = state_core;
+  var parser_core = Core;
+  var isSpace$a = utils.isSpace;
+  function getLine(state, line) {
+    var pos = state.bMarks[line] + state.tShift[line], max = state.eMarks[line];
+    return state.src.slice(pos, max);
+  }
+  function escapedSplit(str) {
+    var result = [], pos = 0, max = str.length, ch, isEscaped = false, lastPos = 0, current = "";
+    ch = str.charCodeAt(pos);
+    while (pos < max) {
+      if (ch === 124 /* | */) {
+        if (!isEscaped) {
+          // pipe separating cells, '|'
+          result.push(current + str.substring(lastPos, pos));
+          current = "";
+          lastPos = pos + 1;
+        } else {
+          // escaped pipe, '\|'
+          current += str.substring(lastPos, pos - 1);
+          lastPos = pos;
+        }
+      }
+      isEscaped = ch === 92 /* \ */;
+      pos++;
+      ch = str.charCodeAt(pos);
+    }
+    result.push(current + str.substring(lastPos));
+    return result;
+  }
+  var table = function table(state, startLine, endLine, silent) {
+    var ch, lineText, pos, i, l, nextLine, columns, columnCount, token, aligns, t, tableLines, tbodyLines, oldParentType, terminate, terminatorRules, firstCh, secondCh;
+    // should have at least two lines
+        if (startLine + 2 > endLine) {
+      return false;
+    }
+    nextLine = startLine + 1;
+    if (state.sCount[nextLine] < state.blkIndent) {
+      return false;
+    }
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[nextLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    // first character of the second line should be '|', '-', ':',
+    // and no other characters are allowed but spaces;
+    // basically, this is the equivalent of /^[-:|][-:|\s]*$/ regexp
+        pos = state.bMarks[nextLine] + state.tShift[nextLine];
+    if (pos >= state.eMarks[nextLine]) {
+      return false;
+    }
+    firstCh = state.src.charCodeAt(pos++);
+    if (firstCh !== 124 /* | */ && firstCh !== 45 /* - */ && firstCh !== 58 /* : */) {
+      return false;
+    }
+    if (pos >= state.eMarks[nextLine]) {
+      return false;
+    }
+    secondCh = state.src.charCodeAt(pos++);
+    if (secondCh !== 124 /* | */ && secondCh !== 45 /* - */ && secondCh !== 58 /* : */ && !isSpace$a(secondCh)) {
+      return false;
+    }
+    // if first character is '-', then second character must not be a space
+    // (due to parsing ambiguity with list)
+        if (firstCh === 45 /* - */ && isSpace$a(secondCh)) {
+      return false;
+    }
+    while (pos < state.eMarks[nextLine]) {
+      ch = state.src.charCodeAt(pos);
+      if (ch !== 124 /* | */ && ch !== 45 /* - */ && ch !== 58 /* : */ && !isSpace$a(ch)) {
+        return false;
+      }
+      pos++;
+    }
+    lineText = getLine(state, startLine + 1);
+    columns = lineText.split("|");
+    aligns = [];
+    for (i = 0; i < columns.length; i++) {
+      t = columns[i].trim();
+      if (!t) {
+        // allow empty columns before and after table, but not in between columns;
+        // e.g. allow ` |---| `, disallow ` ---||--- `
+        if (i === 0 || i === columns.length - 1) {
+          continue;
+        } else {
+          return false;
+        }
+      }
+      if (!/^:?-+:?$/.test(t)) {
+        return false;
+      }
+      if (t.charCodeAt(t.length - 1) === 58 /* : */) {
+        aligns.push(t.charCodeAt(0) === 58 /* : */ ? "center" : "right");
+      } else if (t.charCodeAt(0) === 58 /* : */) {
+        aligns.push("left");
+      } else {
+        aligns.push("");
+      }
+    }
+    lineText = getLine(state, startLine).trim();
+    if (lineText.indexOf("|") === -1) {
+      return false;
+    }
+    if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    columns = escapedSplit(lineText);
+    if (columns.length && columns[0] === "") columns.shift();
+    if (columns.length && columns[columns.length - 1] === "") columns.pop();
+    // header row will define an amount of columns in the entire table,
+    // and align row should be exactly the same (the rest of the rows can differ)
+        columnCount = columns.length;
+    if (columnCount === 0 || columnCount !== aligns.length) {
+      return false;
+    }
+    if (silent) {
+      return true;
+    }
+    oldParentType = state.parentType;
+    state.parentType = "table";
+    // use 'blockquote' lists for termination because it's
+    // the most similar to tables
+        terminatorRules = state.md.block.ruler.getRules("blockquote");
+    token = state.push("table_open", "table", 1);
+    token.map = tableLines = [ startLine, 0 ];
+    token = state.push("thead_open", "thead", 1);
+    token.map = [ startLine, startLine + 1 ];
+    token = state.push("tr_open", "tr", 1);
+    token.map = [ startLine, startLine + 1 ];
+    for (i = 0; i < columns.length; i++) {
+      token = state.push("th_open", "th", 1);
+      if (aligns[i]) {
+        token.attrs = [ [ "style", "text-align:" + aligns[i] ] ];
+      }
+      token = state.push("inline", "", 0);
+      token.content = columns[i].trim();
+      token.children = [];
+      token = state.push("th_close", "th", -1);
+    }
+    token = state.push("tr_close", "tr", -1);
+    token = state.push("thead_close", "thead", -1);
+    for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
+      if (state.sCount[nextLine] < state.blkIndent) {
+        break;
+      }
+      terminate = false;
+      for (i = 0, l = terminatorRules.length; i < l; i++) {
+        if (terminatorRules[i](state, nextLine, endLine, true)) {
+          terminate = true;
+          break;
+        }
+      }
+      if (terminate) {
+        break;
+      }
+      lineText = getLine(state, nextLine).trim();
+      if (!lineText) {
+        break;
+      }
+      if (state.sCount[nextLine] - state.blkIndent >= 4) {
+        break;
+      }
+      columns = escapedSplit(lineText);
+      if (columns.length && columns[0] === "") columns.shift();
+      if (columns.length && columns[columns.length - 1] === "") columns.pop();
+      if (nextLine === startLine + 2) {
+        token = state.push("tbody_open", "tbody", 1);
+        token.map = tbodyLines = [ startLine + 2, 0 ];
+      }
+      token = state.push("tr_open", "tr", 1);
+      token.map = [ nextLine, nextLine + 1 ];
+      for (i = 0; i < columnCount; i++) {
+        token = state.push("td_open", "td", 1);
+        if (aligns[i]) {
+          token.attrs = [ [ "style", "text-align:" + aligns[i] ] ];
+        }
+        token = state.push("inline", "", 0);
+        token.content = columns[i] ? columns[i].trim() : "";
+        token.children = [];
+        token = state.push("td_close", "td", -1);
+      }
+      token = state.push("tr_close", "tr", -1);
+    }
+    if (tbodyLines) {
+      token = state.push("tbody_close", "tbody", -1);
+      tbodyLines[1] = nextLine;
+    }
+    token = state.push("table_close", "table", -1);
+    tableLines[1] = nextLine;
+    state.parentType = oldParentType;
+    state.line = nextLine;
+    return true;
+  };
+  // Code block (4 spaces padded)
+    var code = function code(state, startLine, endLine /*, silent*/) {
+    var nextLine, last, token;
+    if (state.sCount[startLine] - state.blkIndent < 4) {
+      return false;
+    }
+    last = nextLine = startLine + 1;
+    while (nextLine < endLine) {
+      if (state.isEmpty(nextLine)) {
+        nextLine++;
+        continue;
+      }
+      if (state.sCount[nextLine] - state.blkIndent >= 4) {
+        nextLine++;
+        last = nextLine;
+        continue;
+      }
+      break;
+    }
+    state.line = last;
+    token = state.push("code_block", "code", 0);
+    token.content = state.getLines(startLine, last, 4 + state.blkIndent, false) + "\n";
+    token.map = [ startLine, state.line ];
+    return true;
+  };
+  // fences (``` lang, ~~~ lang)
+    var fence = function fence(state, startLine, endLine, silent) {
+    var marker, len, params, nextLine, mem, token, markup, haveEndMarker = false, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    if (pos + 3 > max) {
+      return false;
+    }
+    marker = state.src.charCodeAt(pos);
+    if (marker !== 126 /* ~ */ && marker !== 96 /* ` */) {
+      return false;
+    }
+    // scan marker length
+        mem = pos;
+    pos = state.skipChars(pos, marker);
+    len = pos - mem;
+    if (len < 3) {
+      return false;
+    }
+    markup = state.src.slice(mem, pos);
+    params = state.src.slice(pos, max);
+    if (marker === 96 /* ` */) {
+      if (params.indexOf(String.fromCharCode(marker)) >= 0) {
+        return false;
+      }
+    }
+    // Since start is found, we can report success here in validation mode
+        if (silent) {
+      return true;
+    }
+    // search end of block
+        nextLine = startLine;
+    for (;;) {
+      nextLine++;
+      if (nextLine >= endLine) {
+        // unclosed block should be autoclosed by end of document.
+        // also block seems to be autoclosed by end of parent
+        break;
+      }
+      pos = mem = state.bMarks[nextLine] + state.tShift[nextLine];
+      max = state.eMarks[nextLine];
+      if (pos < max && state.sCount[nextLine] < state.blkIndent) {
+        // non-empty line with negative indent should stop the list:
+        // - ```
+        //  test
+        break;
+      }
+      if (state.src.charCodeAt(pos) !== marker) {
+        continue;
+      }
+      if (state.sCount[nextLine] - state.blkIndent >= 4) {
+        // closing fence should be indented less than 4 spaces
+        continue;
+      }
+      pos = state.skipChars(pos, marker);
+      // closing code fence must be at least as long as the opening one
+            if (pos - mem < len) {
+        continue;
+      }
+      // make sure tail has spaces only
+            pos = state.skipSpaces(pos);
+      if (pos < max) {
+        continue;
+      }
+      haveEndMarker = true;
+      // found!
+            break;
+    }
+    // If a fence has heading spaces, they should be removed from its inner block
+        len = state.sCount[startLine];
+    state.line = nextLine + (haveEndMarker ? 1 : 0);
+    token = state.push("fence", "code", 0);
+    token.info = params;
+    token.content = state.getLines(startLine + 1, nextLine, len, true);
+    token.markup = markup;
+    token.map = [ startLine, state.line ];
+    return true;
+  };
+  var isSpace$9 = utils.isSpace;
+  var blockquote = function blockquote(state, startLine, endLine, silent) {
+    var adjustTab, ch, i, initial, l, lastLineEmpty, lines, nextLine, offset, oldBMarks, oldBSCount, oldIndent, oldParentType, oldSCount, oldTShift, spaceAfterMarker, terminate, terminatorRules, token, isOutdented, oldLineMax = state.lineMax, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    // check the block quote marker
+        if (state.src.charCodeAt(pos) !== 62 /* > */) {
+      return false;
+    }
+    // we know that it's going to be a valid blockquote,
+    // so no point trying to find the end of it in silent mode
+        if (silent) {
+      return true;
+    }
+    oldBMarks = [];
+    oldBSCount = [];
+    oldSCount = [];
+    oldTShift = [];
+    terminatorRules = state.md.block.ruler.getRules("blockquote");
+    oldParentType = state.parentType;
+    state.parentType = "blockquote";
+    // Search the end of the block
+    
+    // Block ends with either:
+    //  1. an empty line outside:
+    //     ```
+    //     > test
+    
+    //     ```
+    //  2. an empty line inside:
+    //     ```
+    //     >
+    //     test
+    //     ```
+    //  3. another tag:
+    //     ```
+    //     > test
+    //      - - -
+    //     ```
+        for (nextLine = startLine; nextLine < endLine; nextLine++) {
+      // check if it's outdented, i.e. it's inside list item and indented
+      // less than said list item:
+      // ```
+      // 1. anything
+      //    > current blockquote
+      // 2. checking this line
+      // ```
+      isOutdented = state.sCount[nextLine] < state.blkIndent;
+      pos = state.bMarks[nextLine] + state.tShift[nextLine];
+      max = state.eMarks[nextLine];
+      if (pos >= max) {
+        // Case 1: line is not inside the blockquote, and this line is empty.
+        break;
+      }
+      if (state.src.charCodeAt(pos++) === 62 /* > */ && !isOutdented) {
+        // This line is inside the blockquote.
+        // set offset past spaces and ">"
+        initial = state.sCount[nextLine] + 1;
+        // skip one optional space after '>'
+                if (state.src.charCodeAt(pos) === 32 /* space */) {
+          // ' >   test '
+          //     ^ -- position start of line here:
+          pos++;
+          initial++;
+          adjustTab = false;
+          spaceAfterMarker = true;
+        } else if (state.src.charCodeAt(pos) === 9 /* tab */) {
+          spaceAfterMarker = true;
+          if ((state.bsCount[nextLine] + initial) % 4 === 3) {
+            // '  >\t  test '
+            //       ^ -- position start of line here (tab has width===1)
+            pos++;
+            initial++;
+            adjustTab = false;
+          } else {
+            // ' >\t  test '
+            //    ^ -- position start of line here + shift bsCount slightly
+            //         to make extra space appear
+            adjustTab = true;
+          }
+        } else {
+          spaceAfterMarker = false;
+        }
+        offset = initial;
+        oldBMarks.push(state.bMarks[nextLine]);
+        state.bMarks[nextLine] = pos;
+        while (pos < max) {
+          ch = state.src.charCodeAt(pos);
+          if (isSpace$9(ch)) {
+            if (ch === 9) {
+              offset += 4 - (offset + state.bsCount[nextLine] + (adjustTab ? 1 : 0)) % 4;
+            } else {
+              offset++;
+            }
+          } else {
+            break;
+          }
+          pos++;
+        }
+        lastLineEmpty = pos >= max;
+        oldBSCount.push(state.bsCount[nextLine]);
+        state.bsCount[nextLine] = state.sCount[nextLine] + 1 + (spaceAfterMarker ? 1 : 0);
+        oldSCount.push(state.sCount[nextLine]);
+        state.sCount[nextLine] = offset - initial;
+        oldTShift.push(state.tShift[nextLine]);
+        state.tShift[nextLine] = pos - state.bMarks[nextLine];
+        continue;
+      }
+      // Case 2: line is not inside the blockquote, and the last line was empty.
+            if (lastLineEmpty) {
+        break;
+      }
+      // Case 3: another tag found.
+            terminate = false;
+      for (i = 0, l = terminatorRules.length; i < l; i++) {
+        if (terminatorRules[i](state, nextLine, endLine, true)) {
+          terminate = true;
+          break;
+        }
+      }
+      if (terminate) {
+        // Quirk to enforce "hard termination mode" for paragraphs;
+        // normally if you call `tokenize(state, startLine, nextLine)`,
+        // paragraphs will look below nextLine for paragraph continuation,
+        // but if blockquote is terminated by another tag, they shouldn't
+        state.lineMax = nextLine;
+        if (state.blkIndent !== 0) {
+          // state.blkIndent was non-zero, we now set it to zero,
+          // so we need to re-calculate all offsets to appear as
+          // if indent wasn't changed
+          oldBMarks.push(state.bMarks[nextLine]);
+          oldBSCount.push(state.bsCount[nextLine]);
+          oldTShift.push(state.tShift[nextLine]);
+          oldSCount.push(state.sCount[nextLine]);
+          state.sCount[nextLine] -= state.blkIndent;
+        }
+        break;
+      }
+      oldBMarks.push(state.bMarks[nextLine]);
+      oldBSCount.push(state.bsCount[nextLine]);
+      oldTShift.push(state.tShift[nextLine]);
+      oldSCount.push(state.sCount[nextLine]);
+      // A negative indentation means that this is a paragraph continuation
+      
+            state.sCount[nextLine] = -1;
+    }
+    oldIndent = state.blkIndent;
+    state.blkIndent = 0;
+    token = state.push("blockquote_open", "blockquote", 1);
+    token.markup = ">";
+    token.map = lines = [ startLine, 0 ];
+    state.md.block.tokenize(state, startLine, nextLine);
+    token = state.push("blockquote_close", "blockquote", -1);
+    token.markup = ">";
+    state.lineMax = oldLineMax;
+    state.parentType = oldParentType;
+    lines[1] = state.line;
+    // Restore original tShift; this might not be necessary since the parser
+    // has already been here, but just to make sure we can do that.
+        for (i = 0; i < oldTShift.length; i++) {
+      state.bMarks[i + startLine] = oldBMarks[i];
+      state.tShift[i + startLine] = oldTShift[i];
+      state.sCount[i + startLine] = oldSCount[i];
+      state.bsCount[i + startLine] = oldBSCount[i];
+    }
+    state.blkIndent = oldIndent;
+    return true;
+  };
+  var isSpace$8 = utils.isSpace;
+  var hr = function hr(state, startLine, endLine, silent) {
+    var marker, cnt, ch, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    marker = state.src.charCodeAt(pos++);
+    // Check hr marker
+        if (marker !== 42 /* * */ && marker !== 45 /* - */ && marker !== 95 /* _ */) {
+      return false;
+    }
+    // markers can be mixed with spaces, but there should be at least 3 of them
+        cnt = 1;
+    while (pos < max) {
+      ch = state.src.charCodeAt(pos++);
+      if (ch !== marker && !isSpace$8(ch)) {
+        return false;
+      }
+      if (ch === marker) {
+        cnt++;
+      }
+    }
+    if (cnt < 3) {
+      return false;
+    }
+    if (silent) {
+      return true;
+    }
+    state.line = startLine + 1;
+    token = state.push("hr", "hr", 0);
+    token.map = [ startLine, state.line ];
+    token.markup = Array(cnt + 1).join(String.fromCharCode(marker));
+    return true;
+  };
+  var isSpace$7 = utils.isSpace;
+  // Search `[-+*][\n ]`, returns next pos after marker on success
+  // or -1 on fail.
+    function skipBulletListMarker(state, startLine) {
+    var marker, pos, max, ch;
+    pos = state.bMarks[startLine] + state.tShift[startLine];
+    max = state.eMarks[startLine];
+    marker = state.src.charCodeAt(pos++);
+    // Check bullet
+        if (marker !== 42 /* * */ && marker !== 45 /* - */ && marker !== 43 /* + */) {
+      return -1;
+    }
+    if (pos < max) {
+      ch = state.src.charCodeAt(pos);
+      if (!isSpace$7(ch)) {
+        // " -test " - is not a list item
+        return -1;
+      }
+    }
+    return pos;
+  }
+  // Search `\d+[.)][\n ]`, returns next pos after marker on success
+  // or -1 on fail.
+    function skipOrderedListMarker(state, startLine) {
+    var ch, start = state.bMarks[startLine] + state.tShift[startLine], pos = start, max = state.eMarks[startLine];
+    // List marker should have at least 2 chars (digit + dot)
+        if (pos + 1 >= max) {
+      return -1;
+    }
+    ch = state.src.charCodeAt(pos++);
+    if (ch < 48 /* 0 */ || ch > 57 /* 9 */) {
+      return -1;
+    }
+    for (;;) {
+      // EOL -> fail
+      if (pos >= max) {
+        return -1;
+      }
+      ch = state.src.charCodeAt(pos++);
+      if (ch >= 48 /* 0 */ && ch <= 57 /* 9 */) {
+        // List marker should have no more than 9 digits
+        // (prevents integer overflow in browsers)
+        if (pos - start >= 10) {
+          return -1;
+        }
+        continue;
+      }
+      // found valid marker
+            if (ch === 41 /* ) */ || ch === 46 /* . */) {
+        break;
+      }
+      return -1;
+    }
+    if (pos < max) {
+      ch = state.src.charCodeAt(pos);
+      if (!isSpace$7(ch)) {
+        // " 1.test " - is not a list item
+        return -1;
+      }
+    }
+    return pos;
+  }
+  function markTightParagraphs(state, idx) {
+    var i, l, level = state.level + 2;
+    for (i = idx + 2, l = state.tokens.length - 2; i < l; i++) {
+      if (state.tokens[i].level === level && state.tokens[i].type === "paragraph_open") {
+        state.tokens[i + 2].hidden = true;
+        state.tokens[i].hidden = true;
+        i += 2;
+      }
+    }
+  }
+  var list = function list(state, startLine, endLine, silent) {
+    var ch, contentStart, i, indent, indentAfterMarker, initial, isOrdered, itemLines, l, listLines, listTokIdx, markerCharCode, markerValue, max, offset, oldListIndent, oldParentType, oldSCount, oldTShift, oldTight, pos, posAfterMarker, prevEmptyEnd, start, terminate, terminatorRules, token, nextLine = startLine, isTerminatingParagraph = false, tight = true;
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[nextLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    // Special case:
+    //  - item 1
+    //   - item 2
+    //    - item 3
+    //     - item 4
+    //      - this one is a paragraph continuation
+        if (state.listIndent >= 0 && state.sCount[nextLine] - state.listIndent >= 4 && state.sCount[nextLine] < state.blkIndent) {
+      return false;
+    }
+    // limit conditions when list can interrupt
+    // a paragraph (validation mode only)
+        if (silent && state.parentType === "paragraph") {
+      // Next list item should still terminate previous list item;
+      // This code can fail if plugins use blkIndent as well as lists,
+      // but I hope the spec gets fixed long before that happens.
+      if (state.sCount[nextLine] >= state.blkIndent) {
+        isTerminatingParagraph = true;
+      }
+    }
+    // Detect list type and position after marker
+        if ((posAfterMarker = skipOrderedListMarker(state, nextLine)) >= 0) {
+      isOrdered = true;
+      start = state.bMarks[nextLine] + state.tShift[nextLine];
+      markerValue = Number(state.src.slice(start, posAfterMarker - 1));
+      // If we're starting a new ordered list right after
+      // a paragraph, it should start with 1.
+            if (isTerminatingParagraph && markerValue !== 1) return false;
+    } else if ((posAfterMarker = skipBulletListMarker(state, nextLine)) >= 0) {
+      isOrdered = false;
+    } else {
+      return false;
+    }
+    // If we're starting a new unordered list right after
+    // a paragraph, first line should not be empty.
+        if (isTerminatingParagraph) {
+      if (state.skipSpaces(posAfterMarker) >= state.eMarks[nextLine]) return false;
+    }
+    // For validation mode we can terminate immediately
+        if (silent) {
+      return true;
+    }
+    // We should terminate list on style change. Remember first one to compare.
+        markerCharCode = state.src.charCodeAt(posAfterMarker - 1);
+    // Start list
+        listTokIdx = state.tokens.length;
+    if (isOrdered) {
+      token = state.push("ordered_list_open", "ol", 1);
+      if (markerValue !== 1) {
+        token.attrs = [ [ "start", markerValue ] ];
+      }
+    } else {
+      token = state.push("bullet_list_open", "ul", 1);
+    }
+    token.map = listLines = [ nextLine, 0 ];
+    token.markup = String.fromCharCode(markerCharCode);
+    
+    // Iterate list items
+    
+        prevEmptyEnd = false;
+    terminatorRules = state.md.block.ruler.getRules("list");
+    oldParentType = state.parentType;
+    state.parentType = "list";
+    while (nextLine < endLine) {
+      pos = posAfterMarker;
+      max = state.eMarks[nextLine];
+      initial = offset = state.sCount[nextLine] + posAfterMarker - (state.bMarks[nextLine] + state.tShift[nextLine]);
+      while (pos < max) {
+        ch = state.src.charCodeAt(pos);
+        if (ch === 9) {
+          offset += 4 - (offset + state.bsCount[nextLine]) % 4;
+        } else if (ch === 32) {
+          offset++;
+        } else {
+          break;
+        }
+        pos++;
+      }
+      contentStart = pos;
+      if (contentStart >= max) {
+        // trimming space in "-    \n  3" case, indent is 1 here
+        indentAfterMarker = 1;
+      } else {
+        indentAfterMarker = offset - initial;
+      }
+      // If we have more than 4 spaces, the indent is 1
+      // (the rest is just indented code block)
+            if (indentAfterMarker > 4) {
+        indentAfterMarker = 1;
+      }
+      // "  -  test"
+      //  ^^^^^ - calculating total length of this thing
+            indent = initial + indentAfterMarker;
+      // Run subparser & write tokens
+            token = state.push("list_item_open", "li", 1);
+      token.markup = String.fromCharCode(markerCharCode);
+      token.map = itemLines = [ nextLine, 0 ];
+      if (isOrdered) {
+        token.info = state.src.slice(start, posAfterMarker - 1);
+      }
+      // change current state, then restore it after parser subcall
+            oldTight = state.tight;
+      oldTShift = state.tShift[nextLine];
+      oldSCount = state.sCount[nextLine];
+      //  - example list
+      // ^ listIndent position will be here
+      //   ^ blkIndent position will be here
+      
+            oldListIndent = state.listIndent;
+      state.listIndent = state.blkIndent;
+      state.blkIndent = indent;
+      state.tight = true;
+      state.tShift[nextLine] = contentStart - state.bMarks[nextLine];
+      state.sCount[nextLine] = offset;
+      if (contentStart >= max && state.isEmpty(nextLine + 1)) {
+        // workaround for this case
+        // (list item is empty, list terminates before "foo"):
+        // ~~~~~~~~
+        //   -
+        //     foo
+        // ~~~~~~~~
+        state.line = Math.min(state.line + 2, endLine);
+      } else {
+        state.md.block.tokenize(state, nextLine, endLine, true);
+      }
+      // If any of list item is tight, mark list as tight
+            if (!state.tight || prevEmptyEnd) {
+        tight = false;
+      }
+      // Item become loose if finish with empty line,
+      // but we should filter last element, because it means list finish
+            prevEmptyEnd = state.line - nextLine > 1 && state.isEmpty(state.line - 1);
+      state.blkIndent = state.listIndent;
+      state.listIndent = oldListIndent;
+      state.tShift[nextLine] = oldTShift;
+      state.sCount[nextLine] = oldSCount;
+      state.tight = oldTight;
+      token = state.push("list_item_close", "li", -1);
+      token.markup = String.fromCharCode(markerCharCode);
+      nextLine = state.line;
+      itemLines[1] = nextLine;
+      if (nextLine >= endLine) {
+        break;
+      }
+      
+      // Try to check if list is terminated or continued.
+      
+            if (state.sCount[nextLine] < state.blkIndent) {
+        break;
+      }
+      // if it's indented more than 3 spaces, it should be a code block
+            if (state.sCount[nextLine] - state.blkIndent >= 4) {
+        break;
+      }
+      // fail if terminating block found
+            terminate = false;
+      for (i = 0, l = terminatorRules.length; i < l; i++) {
+        if (terminatorRules[i](state, nextLine, endLine, true)) {
+          terminate = true;
+          break;
+        }
+      }
+      if (terminate) {
+        break;
+      }
+      // fail if list has another type
+            if (isOrdered) {
+        posAfterMarker = skipOrderedListMarker(state, nextLine);
+        if (posAfterMarker < 0) {
+          break;
+        }
+        start = state.bMarks[nextLine] + state.tShift[nextLine];
+      } else {
+        posAfterMarker = skipBulletListMarker(state, nextLine);
+        if (posAfterMarker < 0) {
+          break;
+        }
+      }
+      if (markerCharCode !== state.src.charCodeAt(posAfterMarker - 1)) {
+        break;
+      }
+    }
+    // Finalize list
+        if (isOrdered) {
+      token = state.push("ordered_list_close", "ol", -1);
+    } else {
+      token = state.push("bullet_list_close", "ul", -1);
+    }
+    token.markup = String.fromCharCode(markerCharCode);
+    listLines[1] = nextLine;
+    state.line = nextLine;
+    state.parentType = oldParentType;
+    // mark paragraphs tight if needed
+        if (tight) {
+      markTightParagraphs(state, listTokIdx);
+    }
+    return true;
+  };
+  var normalizeReference$2 = utils.normalizeReference;
+  var isSpace$6 = utils.isSpace;
+  var reference = function reference(state, startLine, _endLine, silent) {
+    var ch, destEndPos, destEndLineNo, endLine, href, i, l, label, labelEnd, oldParentType, res, start, str, terminate, terminatorRules, title, lines = 0, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine], nextLine = startLine + 1;
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    if (state.src.charCodeAt(pos) !== 91 /* [ */) {
+      return false;
+    }
+    // Simple check to quickly interrupt scan on [link](url) at the start of line.
+    // Can be useful on practice: https://github.com/markdown-it/markdown-it/issues/54
+        while (++pos < max) {
+      if (state.src.charCodeAt(pos) === 93 /* ] */ && state.src.charCodeAt(pos - 1) !== 92 /* \ */) {
+        if (pos + 1 === max) {
+          return false;
+        }
+        if (state.src.charCodeAt(pos + 1) !== 58 /* : */) {
+          return false;
+        }
+        break;
+      }
+    }
+    endLine = state.lineMax;
+    // jump line-by-line until empty one or EOF
+        terminatorRules = state.md.block.ruler.getRules("reference");
+    oldParentType = state.parentType;
+    state.parentType = "reference";
+    for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
+      // this would be a code block normally, but after paragraph
+      // it's considered a lazy continuation regardless of what's there
+      if (state.sCount[nextLine] - state.blkIndent > 3) {
+        continue;
+      }
+      // quirk for blockquotes, this line should already be checked by that rule
+            if (state.sCount[nextLine] < 0) {
+        continue;
+      }
+      // Some tags can terminate paragraph without empty line.
+            terminate = false;
+      for (i = 0, l = terminatorRules.length; i < l; i++) {
+        if (terminatorRules[i](state, nextLine, endLine, true)) {
+          terminate = true;
+          break;
+        }
+      }
+      if (terminate) {
+        break;
+      }
+    }
+    str = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
+    max = str.length;
+    for (pos = 1; pos < max; pos++) {
+      ch = str.charCodeAt(pos);
+      if (ch === 91 /* [ */) {
+        return false;
+      } else if (ch === 93 /* ] */) {
+        labelEnd = pos;
+        break;
+      } else if (ch === 10 /* \n */) {
+        lines++;
+      } else if (ch === 92 /* \ */) {
+        pos++;
+        if (pos < max && str.charCodeAt(pos) === 10) {
+          lines++;
+        }
+      }
+    }
+    if (labelEnd < 0 || str.charCodeAt(labelEnd + 1) !== 58 /* : */) {
+      return false;
+    }
+    // [label]:   destination   'title'
+    //         ^^^ skip optional whitespace here
+        for (pos = labelEnd + 2; pos < max; pos++) {
+      ch = str.charCodeAt(pos);
+      if (ch === 10) {
+        lines++;
+      } else if (isSpace$6(ch)) ; else {
+        break;
+      }
+    }
+    // [label]:   destination   'title'
+    //            ^^^^^^^^^^^ parse this
+        res = state.md.helpers.parseLinkDestination(str, pos, max);
+    if (!res.ok) {
+      return false;
+    }
+    href = state.md.normalizeLink(res.str);
+    if (!state.md.validateLink(href)) {
+      return false;
+    }
+    pos = res.pos;
+    lines += res.lines;
+    // save cursor state, we could require to rollback later
+        destEndPos = pos;
+    destEndLineNo = lines;
+    // [label]:   destination   'title'
+    //                       ^^^ skipping those spaces
+        start = pos;
+    for (;pos < max; pos++) {
+      ch = str.charCodeAt(pos);
+      if (ch === 10) {
+        lines++;
+      } else if (isSpace$6(ch)) ; else {
+        break;
+      }
+    }
+    // [label]:   destination   'title'
+    //                          ^^^^^^^ parse this
+        res = state.md.helpers.parseLinkTitle(str, pos, max);
+    if (pos < max && start !== pos && res.ok) {
+      title = res.str;
+      pos = res.pos;
+      lines += res.lines;
+    } else {
+      title = "";
+      pos = destEndPos;
+      lines = destEndLineNo;
+    }
+    // skip trailing spaces until the rest of the line
+        while (pos < max) {
+      ch = str.charCodeAt(pos);
+      if (!isSpace$6(ch)) {
+        break;
+      }
+      pos++;
+    }
+    if (pos < max && str.charCodeAt(pos) !== 10) {
+      if (title) {
+        // garbage at the end of the line after title,
+        // but it could still be a valid reference if we roll back
+        title = "";
+        pos = destEndPos;
+        lines = destEndLineNo;
+        while (pos < max) {
+          ch = str.charCodeAt(pos);
+          if (!isSpace$6(ch)) {
+            break;
+          }
+          pos++;
+        }
+      }
+    }
+    if (pos < max && str.charCodeAt(pos) !== 10) {
+      // garbage at the end of the line
+      return false;
+    }
+    label = normalizeReference$2(str.slice(1, labelEnd));
+    if (!label) {
+      // CommonMark 0.20 disallows empty labels
+      return false;
+    }
+    // Reference can not terminate anything. This check is for safety only.
+    /*istanbul ignore if*/    if (silent) {
+      return true;
+    }
+    if (typeof state.env.references === "undefined") {
+      state.env.references = {};
+    }
+    if (typeof state.env.references[label] === "undefined") {
+      state.env.references[label] = {
+        title: title,
+        href: href
+      };
+    }
+    state.parentType = oldParentType;
+    state.line = startLine + lines + 1;
+    return true;
+  };
+  // List of valid html blocks names, accorting to commonmark spec
+    var html_blocks = [ "address", "article", "aside", "base", "basefont", "blockquote", "body", "caption", "center", "col", "colgroup", "dd", "details", "dialog", "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html", "iframe", "legend", "li", "link", "main", "menu", "menuitem", "nav", "noframes", "ol", "optgroup", "option", "p", "param", "section", "source", "summary", "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track", "ul" ];
+  // Regexps to match html elements
+    var attr_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*";
+  var unquoted = "[^\"'=<>`\\x00-\\x20]+";
+  var single_quoted = "'[^']*'";
+  var double_quoted = '"[^"]*"';
+  var attr_value = "(?:" + unquoted + "|" + single_quoted + "|" + double_quoted + ")";
+  var attribute = "(?:\\s+" + attr_name + "(?:\\s*=\\s*" + attr_value + ")?)";
+  var open_tag = "<[A-Za-z][A-Za-z0-9\\-]*" + attribute + "*\\s*\\/?>";
+  var close_tag = "<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>";
+  var comment = "\x3c!----\x3e|\x3c!--(?:-?[^>-])(?:-?[^-])*--\x3e";
+  var processing = "<[?][\\s\\S]*?[?]>";
+  var declaration = "<![A-Z]+\\s+[^>]*>";
+  var cdata = "<!\\[CDATA\\[[\\s\\S]*?\\]\\]>";
+  var HTML_TAG_RE$1 = new RegExp("^(?:" + open_tag + "|" + close_tag + "|" + comment + "|" + processing + "|" + declaration + "|" + cdata + ")");
+  var HTML_OPEN_CLOSE_TAG_RE$1 = new RegExp("^(?:" + open_tag + "|" + close_tag + ")");
+  var HTML_TAG_RE_1 = HTML_TAG_RE$1;
+  var HTML_OPEN_CLOSE_TAG_RE_1 = HTML_OPEN_CLOSE_TAG_RE$1;
+  var html_re = {
+    HTML_TAG_RE: HTML_TAG_RE_1,
+    HTML_OPEN_CLOSE_TAG_RE: HTML_OPEN_CLOSE_TAG_RE_1
+  };
+  var HTML_OPEN_CLOSE_TAG_RE = html_re.HTML_OPEN_CLOSE_TAG_RE;
+  // An array of opening and corresponding closing sequences for html tags,
+  // last argument defines whether it can terminate a paragraph or not
+  
+    var HTML_SEQUENCES = [ [ /^<(script|pre|style|textarea)(?=(\s|>|$))/i, /<\/(script|pre|style|textarea)>/i, true ], [ /^<!--/, /-->/, true ], [ /^<\?/, /\?>/, true ], [ /^<![A-Z]/, />/, true ], [ /^<!\[CDATA\[/, /\]\]>/, true ], [ new RegExp("^</?(" + html_blocks.join("|") + ")(?=(\\s|/?>|$))", "i"), /^$/, true ], [ new RegExp(HTML_OPEN_CLOSE_TAG_RE.source + "\\s*$"), /^$/, false ] ];
+  var html_block = function html_block(state, startLine, endLine, silent) {
+    var i, nextLine, token, lineText, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    if (!state.md.options.html) {
+      return false;
+    }
+    if (state.src.charCodeAt(pos) !== 60 /* < */) {
+      return false;
+    }
+    lineText = state.src.slice(pos, max);
+    for (i = 0; i < HTML_SEQUENCES.length; i++) {
+      if (HTML_SEQUENCES[i][0].test(lineText)) {
+        break;
+      }
+    }
+    if (i === HTML_SEQUENCES.length) {
+      return false;
+    }
+    if (silent) {
+      // true if this sequence can be a terminator, false otherwise
+      return HTML_SEQUENCES[i][2];
+    }
+    nextLine = startLine + 1;
+    // If we are here - we detected HTML block.
+    // Let's roll down till block end.
+        if (!HTML_SEQUENCES[i][1].test(lineText)) {
+      for (;nextLine < endLine; nextLine++) {
+        if (state.sCount[nextLine] < state.blkIndent) {
+          break;
+        }
+        pos = state.bMarks[nextLine] + state.tShift[nextLine];
+        max = state.eMarks[nextLine];
+        lineText = state.src.slice(pos, max);
+        if (HTML_SEQUENCES[i][1].test(lineText)) {
+          if (lineText.length !== 0) {
+            nextLine++;
+          }
+          break;
+        }
+      }
+    }
+    state.line = nextLine;
+    token = state.push("html_block", "", 0);
+    token.map = [ startLine, nextLine ];
+    token.content = state.getLines(startLine, nextLine, state.blkIndent, true);
+    return true;
+  };
+  var isSpace$5 = utils.isSpace;
+  var heading = function heading(state, startLine, endLine, silent) {
+    var ch, level, tmp, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    ch = state.src.charCodeAt(pos);
+    if (ch !== 35 /* # */ || pos >= max) {
+      return false;
+    }
+    // count heading level
+        level = 1;
+    ch = state.src.charCodeAt(++pos);
+    while (ch === 35 /* # */ && pos < max && level <= 6) {
+      level++;
+      ch = state.src.charCodeAt(++pos);
+    }
+    if (level > 6 || pos < max && !isSpace$5(ch)) {
+      return false;
+    }
+    if (silent) {
+      return true;
+    }
+    // Let's cut tails like '    ###  ' from the end of string
+        max = state.skipSpacesBack(max, pos);
+    tmp = state.skipCharsBack(max, 35, pos);
+ // #
+        if (tmp > pos && isSpace$5(state.src.charCodeAt(tmp - 1))) {
+      max = tmp;
+    }
+    state.line = startLine + 1;
+    token = state.push("heading_open", "h" + String(level), 1);
+    token.markup = "########".slice(0, level);
+    token.map = [ startLine, state.line ];
+    token = state.push("inline", "", 0);
+    token.content = state.src.slice(pos, max).trim();
+    token.map = [ startLine, state.line ];
+    token.children = [];
+    token = state.push("heading_close", "h" + String(level), -1);
+    token.markup = "########".slice(0, level);
+    return true;
+  };
+  // lheading (---, ===)
+    var lheading = function lheading(state, startLine, endLine /*, silent*/) {
+    var content, terminate, i, l, token, pos, max, level, marker, nextLine = startLine + 1, oldParentType, terminatorRules = state.md.block.ruler.getRules("paragraph");
+    // if it's indented more than 3 spaces, it should be a code block
+        if (state.sCount[startLine] - state.blkIndent >= 4) {
+      return false;
+    }
+    oldParentType = state.parentType;
+    state.parentType = "paragraph";
+ // use paragraph to match terminatorRules
+    // jump line-by-line until empty one or EOF
+        for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
+      // this would be a code block normally, but after paragraph
+      // it's considered a lazy continuation regardless of what's there
+      if (state.sCount[nextLine] - state.blkIndent > 3) {
+        continue;
+      }
+      
+      // Check for underline in setext header
+      
+            if (state.sCount[nextLine] >= state.blkIndent) {
+        pos = state.bMarks[nextLine] + state.tShift[nextLine];
+        max = state.eMarks[nextLine];
+        if (pos < max) {
+          marker = state.src.charCodeAt(pos);
+          if (marker === 45 /* - */ || marker === 61 /* = */) {
+            pos = state.skipChars(pos, marker);
+            pos = state.skipSpaces(pos);
+            if (pos >= max) {
+              level = marker === 61 /* = */ ? 1 : 2;
+              break;
+            }
+          }
+        }
+      }
+      // quirk for blockquotes, this line should already be checked by that rule
+            if (state.sCount[nextLine] < 0) {
+        continue;
+      }
+      // Some tags can terminate paragraph without empty line.
+            terminate = false;
+      for (i = 0, l = terminatorRules.length; i < l; i++) {
+        if (terminatorRules[i](state, nextLine, endLine, true)) {
+          terminate = true;
+          break;
+        }
+      }
+      if (terminate) {
+        break;
+      }
+    }
+    if (!level) {
+      // Didn't find valid underline
+      return false;
+    }
+    content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
+    state.line = nextLine + 1;
+    token = state.push("heading_open", "h" + String(level), 1);
+    token.markup = String.fromCharCode(marker);
+    token.map = [ startLine, state.line ];
+    token = state.push("inline", "", 0);
+    token.content = content;
+    token.map = [ startLine, state.line - 1 ];
+    token.children = [];
+    token = state.push("heading_close", "h" + String(level), -1);
+    token.markup = String.fromCharCode(marker);
+    state.parentType = oldParentType;
+    return true;
+  };
+  // Paragraph
+    var paragraph = function paragraph(state, startLine, endLine) {
+    var content, terminate, i, l, token, oldParentType, nextLine = startLine + 1, terminatorRules = state.md.block.ruler.getRules("paragraph");
+    oldParentType = state.parentType;
+    state.parentType = "paragraph";
+    // jump line-by-line until empty one or EOF
+        for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
+      // this would be a code block normally, but after paragraph
+      // it's considered a lazy continuation regardless of what's there
+      if (state.sCount[nextLine] - state.blkIndent > 3) {
+        continue;
+      }
+      // quirk for blockquotes, this line should already be checked by that rule
+            if (state.sCount[nextLine] < 0) {
+        continue;
+      }
+      // Some tags can terminate paragraph without empty line.
+            terminate = false;
+      for (i = 0, l = terminatorRules.length; i < l; i++) {
+        if (terminatorRules[i](state, nextLine, endLine, true)) {
+          terminate = true;
+          break;
+        }
+      }
+      if (terminate) {
+        break;
+      }
+    }
+    content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
+    state.line = nextLine;
+    token = state.push("paragraph_open", "p", 1);
+    token.map = [ startLine, state.line ];
+    token = state.push("inline", "", 0);
+    token.content = content;
+    token.map = [ startLine, state.line ];
+    token.children = [];
+    token = state.push("paragraph_close", "p", -1);
+    state.parentType = oldParentType;
+    return true;
+  };
+  var isSpace$4 = utils.isSpace;
+  function StateBlock(src, md, env, tokens) {
+    var ch, s, start, pos, len, indent, offset, indent_found;
+    this.src = src;
+    // link to parser instance
+        this.md = md;
+    this.env = env;
+    
+    // Internal state vartiables
+    
+        this.tokens = tokens;
+    this.bMarks = [];
+ // line begin offsets for fast jumps
+        this.eMarks = [];
+ // line end offsets for fast jumps
+        this.tShift = [];
+ // offsets of the first non-space characters (tabs not expanded)
+        this.sCount = [];
+ // indents for each line (tabs expanded)
+    // An amount of virtual spaces (tabs expanded) between beginning
+    // of each line (bMarks) and real beginning of that line.
+    
+    // It exists only as a hack because blockquotes override bMarks
+    // losing information in the process.
+    
+    // It's used only when expanding tabs, you can think about it as
+    // an initial tab length, e.g. bsCount=21 applied to string `\t123`
+    // means first tab should be expanded to 4-21%4 === 3 spaces.
+    
+        this.bsCount = [];
+    // block parser variables
+        this.blkIndent = 0;
+ // required block content indent (for example, if we are
+    // inside a list, it would be positioned after list marker)
+        this.line = 0;
+ // line index in src
+        this.lineMax = 0;
+ // lines count
+        this.tight = false;
+ // loose/tight mode for lists
+        this.ddIndent = -1;
+ // indent of the current dd block (-1 if there isn't any)
+        this.listIndent = -1;
+ // indent of the current list block (-1 if there isn't any)
+    // can be 'blockquote', 'list', 'root', 'paragraph' or 'reference'
+    // used in lists to determine if they interrupt a paragraph
+        this.parentType = "root";
+    this.level = 0;
+    // renderer
+        this.result = "";
+    // Create caches
+    // Generate markers.
+        s = this.src;
+    indent_found = false;
+    for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) {
+      ch = s.charCodeAt(pos);
+      if (!indent_found) {
+        if (isSpace$4(ch)) {
+          indent++;
+          if (ch === 9) {
+            offset += 4 - offset % 4;
+          } else {
+            offset++;
+          }
+          continue;
+        } else {
+          indent_found = true;
+        }
+      }
+      if (ch === 10 || pos === len - 1) {
+        if (ch !== 10) {
+          pos++;
+        }
+        this.bMarks.push(start);
+        this.eMarks.push(pos);
+        this.tShift.push(indent);
+        this.sCount.push(offset);
+        this.bsCount.push(0);
+        indent_found = false;
+        indent = 0;
+        offset = 0;
+        start = pos + 1;
+      }
+    }
+    // Push fake entry to simplify cache bounds checks
+        this.bMarks.push(s.length);
+    this.eMarks.push(s.length);
+    this.tShift.push(0);
+    this.sCount.push(0);
+    this.bsCount.push(0);
+    this.lineMax = this.bMarks.length - 1;
+ // don't count last fake line
+    }
+  // Push new token to "stream".
+  
+    StateBlock.prototype.push = function(type, tag, nesting) {
+    var token$1 = new token(type, tag, nesting);
+    token$1.block = true;
+    if (nesting < 0) this.level--;
+ // closing tag
+        token$1.level = this.level;
+    if (nesting > 0) this.level++;
+ // opening tag
+        this.tokens.push(token$1);
+    return token$1;
+  };
+  StateBlock.prototype.isEmpty = function isEmpty(line) {
+    return this.bMarks[line] + this.tShift[line] >= this.eMarks[line];
+  };
+  StateBlock.prototype.skipEmptyLines = function skipEmptyLines(from) {
+    for (var max = this.lineMax; from < max; from++) {
+      if (this.bMarks[from] + this.tShift[from] < this.eMarks[from]) {
+        break;
+      }
+    }
+    return from;
+  };
+  // Skip spaces from given position.
+    StateBlock.prototype.skipSpaces = function skipSpaces(pos) {
+    var ch;
+    for (var max = this.src.length; pos < max; pos++) {
+      ch = this.src.charCodeAt(pos);
+      if (!isSpace$4(ch)) {
+        break;
+      }
+    }
+    return pos;
+  };
+  // Skip spaces from given position in reverse.
+    StateBlock.prototype.skipSpacesBack = function skipSpacesBack(pos, min) {
+    if (pos <= min) {
+      return pos;
+    }
+    while (pos > min) {
+      if (!isSpace$4(this.src.charCodeAt(--pos))) {
+        return pos + 1;
+      }
+    }
+    return pos;
+  };
+  // Skip char codes from given position
+    StateBlock.prototype.skipChars = function skipChars(pos, code) {
+    for (var max = this.src.length; pos < max; pos++) {
+      if (this.src.charCodeAt(pos) !== code) {
+        break;
+      }
+    }
+    return pos;
+  };
+  // Skip char codes reverse from given position - 1
+    StateBlock.prototype.skipCharsBack = function skipCharsBack(pos, code, min) {
+    if (pos <= min) {
+      return pos;
+    }
+    while (pos > min) {
+      if (code !== this.src.charCodeAt(--pos)) {
+        return pos + 1;
+      }
+    }
+    return pos;
+  };
+  // cut lines range from source.
+    StateBlock.prototype.getLines = function getLines(begin, end, indent, keepLastLF) {
+    var i, lineIndent, ch, first, last, queue, lineStart, line = begin;
+    if (begin >= end) {
+      return "";
+    }
+    queue = new Array(end - begin);
+    for (i = 0; line < end; line++, i++) {
+      lineIndent = 0;
+      lineStart = first = this.bMarks[line];
+      if (line + 1 < end || keepLastLF) {
+        // No need for bounds check because we have fake entry on tail.
+        last = this.eMarks[line] + 1;
+      } else {
+        last = this.eMarks[line];
+      }
+      while (first < last && lineIndent < indent) {
+        ch = this.src.charCodeAt(first);
+        if (isSpace$4(ch)) {
+          if (ch === 9) {
+            lineIndent += 4 - (lineIndent + this.bsCount[line]) % 4;
+          } else {
+            lineIndent++;
+          }
+        } else if (first - lineStart < this.tShift[line]) {
+          // patched tShift masked characters to look like spaces (blockquotes, list markers)
+          lineIndent++;
+        } else {
+          break;
+        }
+        first++;
+      }
+      if (lineIndent > indent) {
+        // partially expanding tabs in code blocks, e.g '\t\tfoobar'
+        // with indent=2 becomes '  \tfoobar'
+        queue[i] = new Array(lineIndent - indent + 1).join(" ") + this.src.slice(first, last);
+      } else {
+        queue[i] = this.src.slice(first, last);
+      }
+    }
+    return queue.join("");
+  };
+  // re-export Token class to use in block rules
+    StateBlock.prototype.Token = token;
+  var state_block = StateBlock;
+  var _rules$1 = [ 
+  // First 2 params - rule name & source. Secondary array - list of rules,
+  // which can be terminated by this one.
+  [ "table", table, [ "paragraph", "reference" ] ], [ "code", code ], [ "fence", fence, [ "paragraph", "reference", "blockquote", "list" ] ], [ "blockquote", blockquote, [ "paragraph", "reference", "blockquote", "list" ] ], [ "hr", hr, [ "paragraph", "reference", "blockquote", "list" ] ], [ "list", list, [ "paragraph", "reference", "blockquote" ] ], [ "reference", reference ], [ "html_block", html_block, [ "paragraph", "reference", "blockquote" ] ], [ "heading", heading, [ "paragraph", "reference", "blockquote" ] ], [ "lheading", lheading ], [ "paragraph", paragraph ] ];
+  /**
+	 * new ParserBlock()
+	 **/  function ParserBlock() {
+    /**
+	   * ParserBlock#ruler -> Ruler
+	   *
+	   * [[Ruler]] instance. Keep configuration of block rules.
+	   **/
+    this.ruler = new ruler;
+    for (var i = 0; i < _rules$1.length; i++) {
+      this.ruler.push(_rules$1[i][0], _rules$1[i][1], {
+        alt: (_rules$1[i][2] || []).slice()
+      });
+    }
+  }
+  // Generate tokens for input range
+  
+    ParserBlock.prototype.tokenize = function(state, startLine, endLine) {
+    var ok, i, prevLine, rules = this.ruler.getRules(""), len = rules.length, line = startLine, hasEmptyLines = false, maxNesting = state.md.options.maxNesting;
+    while (line < endLine) {
+      state.line = line = state.skipEmptyLines(line);
+      if (line >= endLine) {
+        break;
+      }
+      // Termination condition for nested calls.
+      // Nested calls currently used for blockquotes & lists
+            if (state.sCount[line] < state.blkIndent) {
+        break;
+      }
+      // If nesting level exceeded - skip tail to the end. That's not ordinary
+      // situation and we should not care about content.
+            if (state.level >= maxNesting) {
+        state.line = endLine;
+        break;
+      }
+      // Try all possible rules.
+      // On success, rule should:
+      
+      // - update `state.line`
+      // - update `state.tokens`
+      // - return true
+            prevLine = state.line;
+      for (i = 0; i < len; i++) {
+        ok = rules[i](state, line, endLine, false);
+        if (ok) {
+          if (prevLine >= state.line) {
+            throw new Error("block rule didn't increment state.line");
+          }
+          break;
+        }
+      }
+      // this can only happen if user disables paragraph rule
+            if (!ok) throw new Error("none of the block rules matched");
+      // set state.tight if we had an empty line before current tag
+      // i.e. latest empty line should not count
+            state.tight = !hasEmptyLines;
+      // paragraph might "eat" one newline after it in nested lists
+            if (state.isEmpty(state.line - 1)) {
+        hasEmptyLines = true;
+      }
+      line = state.line;
+      if (line < endLine && state.isEmpty(line)) {
+        hasEmptyLines = true;
+        line++;
+        state.line = line;
+      }
+    }
+  };
+  /**
+	 * ParserBlock.parse(str, md, env, outTokens)
+	 *
+	 * Process input string and push block tokens into `outTokens`
+	 **/  ParserBlock.prototype.parse = function(src, md, env, outTokens) {
+    var state;
+    if (!src) {
+      return;
+    }
+    state = new this.State(src, md, env, outTokens);
+    this.tokenize(state, state.line, state.lineMax);
+  };
+  ParserBlock.prototype.State = state_block;
+  var parser_block = ParserBlock;
+  // Skip text characters for text token, place those to pending buffer
+  // Rule to skip pure text
+  // '{}$%@~+=:' reserved for extentions
+  // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~
+  // !!!! Don't confuse with "Markdown ASCII Punctuation" chars
+  // http://spec.commonmark.org/0.15/#ascii-punctuation-character
+    function isTerminatorChar(ch) {
+    switch (ch) {
+     case 10 /* \n */ :
+     case 33 /* ! */ :
+     case 35 /* # */ :
+     case 36 /* $ */ :
+     case 37 /* % */ :
+     case 38 /* & */ :
+     case 42 /* * */ :
+     case 43 /* + */ :
+     case 45 /* - */ :
+     case 58 /* : */ :
+     case 60 /* < */ :
+     case 61 /* = */ :
+     case 62 /* > */ :
+     case 64 /* @ */ :
+     case 91 /* [ */ :
+     case 92 /* \ */ :
+     case 93 /* ] */ :
+     case 94 /* ^ */ :
+     case 95 /* _ */ :
+     case 96 /* ` */ :
+     case 123 /* { */ :
+     case 125 /* } */ :
+     case 126 /* ~ */ :
+      return true;
+
+     default:
+      return false;
+    }
+  }
+  var text = function text(state, silent) {
+    var pos = state.pos;
+    while (pos < state.posMax && !isTerminatorChar(state.src.charCodeAt(pos))) {
+      pos++;
+    }
+    if (pos === state.pos) {
+      return false;
+    }
+    if (!silent) {
+      state.pending += state.src.slice(state.pos, pos);
+    }
+    state.pos = pos;
+    return true;
+  };
+  // Process links like https://example.org/
+  // RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+    var SCHEME_RE = /(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$/i;
+  var linkify = function linkify(state, silent) {
+    var pos, max, match, proto, link, url, fullUrl, token;
+    if (!state.md.options.linkify) return false;
+    if (state.linkLevel > 0) return false;
+    pos = state.pos;
+    max = state.posMax;
+    if (pos + 3 > max) return false;
+    if (state.src.charCodeAt(pos) !== 58 /* : */) return false;
+    if (state.src.charCodeAt(pos + 1) !== 47 /* / */) return false;
+    if (state.src.charCodeAt(pos + 2) !== 47 /* / */) return false;
+    match = state.pending.match(SCHEME_RE);
+    if (!match) return false;
+    proto = match[1];
+    link = state.md.linkify.matchAtStart(state.src.slice(pos - proto.length));
+    if (!link) return false;
+    url = link.url;
+    // invalid link, but still detected by linkify somehow;
+    // need to check to prevent infinite loop below
+        if (url.length <= proto.length) return false;
+    // disallow '*' at the end of the link (conflicts with emphasis)
+        url = url.replace(/\*+$/, "");
+    fullUrl = state.md.normalizeLink(url);
+    if (!state.md.validateLink(fullUrl)) return false;
+    if (!silent) {
+      state.pending = state.pending.slice(0, -proto.length);
+      token = state.push("link_open", "a", 1);
+      token.attrs = [ [ "href", fullUrl ] ];
+      token.markup = "linkify";
+      token.info = "auto";
+      token = state.push("text", "", 0);
+      token.content = state.md.normalizeLinkText(url);
+      token = state.push("link_close", "a", -1);
+      token.markup = "linkify";
+      token.info = "auto";
+    }
+    state.pos += url.length - proto.length;
+    return true;
+  };
+  var isSpace$3 = utils.isSpace;
+  var newline = function newline(state, silent) {
+    var pmax, max, ws, pos = state.pos;
+    if (state.src.charCodeAt(pos) !== 10 /* \n */) {
+      return false;
+    }
+    pmax = state.pending.length - 1;
+    max = state.posMax;
+    // '  \n' -> hardbreak
+    // Lookup in pending chars is bad practice! Don't copy to other rules!
+    // Pending string is stored in concat mode, indexed lookups will cause
+    // convertion to flat mode.
+        if (!silent) {
+      if (pmax >= 0 && state.pending.charCodeAt(pmax) === 32) {
+        if (pmax >= 1 && state.pending.charCodeAt(pmax - 1) === 32) {
+          // Find whitespaces tail of pending chars.
+          ws = pmax - 1;
+          while (ws >= 1 && state.pending.charCodeAt(ws - 1) === 32) ws--;
+          state.pending = state.pending.slice(0, ws);
+          state.push("hardbreak", "br", 0);
+        } else {
+          state.pending = state.pending.slice(0, -1);
+          state.push("softbreak", "br", 0);
+        }
+      } else {
+        state.push("softbreak", "br", 0);
+      }
+    }
+    pos++;
+    // skip heading spaces for next line
+        while (pos < max && isSpace$3(state.src.charCodeAt(pos))) {
+      pos++;
+    }
+    state.pos = pos;
+    return true;
+  };
+  var isSpace$2 = utils.isSpace;
+  var ESCAPED = [];
+  for (var i = 0; i < 256; i++) {
+    ESCAPED.push(0);
+  }
+  "\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-".split("").forEach((function(ch) {
+    ESCAPED[ch.charCodeAt(0)] = 1;
+  }));
+  var _escape = function escape(state, silent) {
+    var ch1, ch2, origStr, escapedStr, token, pos = state.pos, max = state.posMax;
+    if (state.src.charCodeAt(pos) !== 92 /* \ */) return false;
+    pos++;
+    // '\' at the end of the inline block
+        if (pos >= max) return false;
+    ch1 = state.src.charCodeAt(pos);
+    if (ch1 === 10) {
+      if (!silent) {
+        state.push("hardbreak", "br", 0);
+      }
+      pos++;
+      // skip leading whitespaces from next line
+            while (pos < max) {
+        ch1 = state.src.charCodeAt(pos);
+        if (!isSpace$2(ch1)) break;
+        pos++;
+      }
+      state.pos = pos;
+      return true;
+    }
+    escapedStr = state.src[pos];
+    if (ch1 >= 55296 && ch1 <= 56319 && pos + 1 < max) {
+      ch2 = state.src.charCodeAt(pos + 1);
+      if (ch2 >= 56320 && ch2 <= 57343) {
+        escapedStr += state.src[pos + 1];
+        pos++;
+      }
+    }
+    origStr = "\\" + escapedStr;
+    if (!silent) {
+      token = state.push("text_special", "", 0);
+      if (ch1 < 256 && ESCAPED[ch1] !== 0) {
+        token.content = escapedStr;
+      } else {
+        token.content = origStr;
+      }
+      token.markup = origStr;
+      token.info = "escape";
+    }
+    state.pos = pos + 1;
+    return true;
+  };
+  // Parse backticks
+    var backticks = function backtick(state, silent) {
+    var start, max, marker, token, matchStart, matchEnd, openerLength, closerLength, pos = state.pos, ch = state.src.charCodeAt(pos);
+    if (ch !== 96 /* ` */) {
+      return false;
+    }
+    start = pos;
+    pos++;
+    max = state.posMax;
+    // scan marker length
+        while (pos < max && state.src.charCodeAt(pos) === 96 /* ` */) {
+      pos++;
+    }
+    marker = state.src.slice(start, pos);
+    openerLength = marker.length;
+    if (state.backticksScanned && (state.backticks[openerLength] || 0) <= start) {
+      if (!silent) state.pending += marker;
+      state.pos += openerLength;
+      return true;
+    }
+    matchEnd = pos;
+    // Nothing found in the cache, scan until the end of the line (or until marker is found)
+        while ((matchStart = state.src.indexOf("`", matchEnd)) !== -1) {
+      matchEnd = matchStart + 1;
+      // scan marker length
+            while (matchEnd < max && state.src.charCodeAt(matchEnd) === 96 /* ` */) {
+        matchEnd++;
+      }
+      closerLength = matchEnd - matchStart;
+      if (closerLength === openerLength) {
+        // Found matching closer length.
+        if (!silent) {
+          token = state.push("code_inline", "code", 0);
+          token.markup = marker;
+          token.content = state.src.slice(pos, matchStart).replace(/\n/g, " ").replace(/^ (.+) $/, "$1");
+        }
+        state.pos = matchEnd;
+        return true;
+      }
+      // Some different length found, put it in cache as upper limit of where closer can be found
+            state.backticks[closerLength] = matchStart;
+    }
+    // Scanned through the end, didn't find anything
+        state.backticksScanned = true;
+    if (!silent) state.pending += marker;
+    state.pos += openerLength;
+    return true;
+  };
+  // ~~strike through~~
+  // Insert each marker as a separate text token, and add it to delimiter list
+  
+    var tokenize$1 = function strikethrough(state, silent) {
+    var i, scanned, token, len, ch, start = state.pos, marker = state.src.charCodeAt(start);
+    if (silent) {
+      return false;
+    }
+    if (marker !== 126 /* ~ */) {
+      return false;
+    }
+    scanned = state.scanDelims(state.pos, true);
+    len = scanned.length;
+    ch = String.fromCharCode(marker);
+    if (len < 2) {
+      return false;
+    }
+    if (len % 2) {
+      token = state.push("text", "", 0);
+      token.content = ch;
+      len--;
+    }
+    for (i = 0; i < len; i += 2) {
+      token = state.push("text", "", 0);
+      token.content = ch + ch;
+      state.delimiters.push({
+        marker: marker,
+        length: 0,
+        // disable "rule of 3" length checks meant for emphasis
+        token: state.tokens.length - 1,
+        end: -1,
+        open: scanned.can_open,
+        close: scanned.can_close
+      });
+    }
+    state.pos += scanned.length;
+    return true;
+  };
+  function postProcess$1(state, delimiters) {
+    var i, j, startDelim, endDelim, token, loneMarkers = [], max = delimiters.length;
+    for (i = 0; i < max; i++) {
+      startDelim = delimiters[i];
+      if (startDelim.marker !== 126 /* ~ */) {
+        continue;
+      }
+      if (startDelim.end === -1) {
+        continue;
+      }
+      endDelim = delimiters[startDelim.end];
+      token = state.tokens[startDelim.token];
+      token.type = "s_open";
+      token.tag = "s";
+      token.nesting = 1;
+      token.markup = "~~";
+      token.content = "";
+      token = state.tokens[endDelim.token];
+      token.type = "s_close";
+      token.tag = "s";
+      token.nesting = -1;
+      token.markup = "~~";
+      token.content = "";
+      if (state.tokens[endDelim.token - 1].type === "text" && state.tokens[endDelim.token - 1].content === "~") {
+        loneMarkers.push(endDelim.token - 1);
+      }
+    }
+    // If a marker sequence has an odd number of characters, it's splitted
+    // like this: `~~~~~` -> `~` + `~~` + `~~`, leaving one marker at the
+    // start of the sequence.
+    
+    // So, we have to move all those markers after subsequent s_close tags.
+    
+        while (loneMarkers.length) {
+      i = loneMarkers.pop();
+      j = i + 1;
+      while (j < state.tokens.length && state.tokens[j].type === "s_close") {
+        j++;
+      }
+      j--;
+      if (i !== j) {
+        token = state.tokens[j];
+        state.tokens[j] = state.tokens[i];
+        state.tokens[i] = token;
+      }
+    }
+  }
+  // Walk through delimiter list and replace text tokens with tags
+  
+    var postProcess_1$1 = function strikethrough(state) {
+    var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length;
+    postProcess$1(state, state.delimiters);
+    for (curr = 0; curr < max; curr++) {
+      if (tokens_meta[curr] && tokens_meta[curr].delimiters) {
+        postProcess$1(state, tokens_meta[curr].delimiters);
+      }
+    }
+  };
+  var strikethrough = {
+    tokenize: tokenize$1,
+    postProcess: postProcess_1$1
+  };
+  // Process *this* and _that_
+  // Insert each marker as a separate text token, and add it to delimiter list
+  
+    var tokenize = function emphasis(state, silent) {
+    var i, scanned, token, start = state.pos, marker = state.src.charCodeAt(start);
+    if (silent) {
+      return false;
+    }
+    if (marker !== 95 /* _ */ && marker !== 42 /* * */) {
+      return false;
+    }
+    scanned = state.scanDelims(state.pos, marker === 42);
+    for (i = 0; i < scanned.length; i++) {
+      token = state.push("text", "", 0);
+      token.content = String.fromCharCode(marker);
+      state.delimiters.push({
+        // Char code of the starting marker (number).
+        marker: marker,
+        // Total length of these series of delimiters.
+        length: scanned.length,
+        // A position of the token this delimiter corresponds to.
+        token: state.tokens.length - 1,
+        // If this delimiter is matched as a valid opener, `end` will be
+        // equal to its position, otherwise it's `-1`.
+        end: -1,
+        // Boolean flags that determine if this delimiter could open or close
+        // an emphasis.
+        open: scanned.can_open,
+        close: scanned.can_close
+      });
+    }
+    state.pos += scanned.length;
+    return true;
+  };
+  function postProcess(state, delimiters) {
+    var i, startDelim, endDelim, token, ch, isStrong, max = delimiters.length;
+    for (i = max - 1; i >= 0; i--) {
+      startDelim = delimiters[i];
+      if (startDelim.marker !== 95 /* _ */ && startDelim.marker !== 42 /* * */) {
+        continue;
+      }
+      // Process only opening markers
+            if (startDelim.end === -1) {
+        continue;
+      }
+      endDelim = delimiters[startDelim.end];
+      // If the previous delimiter has the same marker and is adjacent to this one,
+      // merge those into one strong delimiter.
+      
+      // `<em><em>whatever</em></em>` -> `<strong>whatever</strong>`
+      
+            isStrong = i > 0 && delimiters[i - 1].end === startDelim.end + 1 && 
+      // check that first two markers match and adjacent
+      delimiters[i - 1].marker === startDelim.marker && delimiters[i - 1].token === startDelim.token - 1 && 
+      // check that last two markers are adjacent (we can safely assume they match)
+      delimiters[startDelim.end + 1].token === endDelim.token + 1;
+      ch = String.fromCharCode(startDelim.marker);
+      token = state.tokens[startDelim.token];
+      token.type = isStrong ? "strong_open" : "em_open";
+      token.tag = isStrong ? "strong" : "em";
+      token.nesting = 1;
+      token.markup = isStrong ? ch + ch : ch;
+      token.content = "";
+      token = state.tokens[endDelim.token];
+      token.type = isStrong ? "strong_close" : "em_close";
+      token.tag = isStrong ? "strong" : "em";
+      token.nesting = -1;
+      token.markup = isStrong ? ch + ch : ch;
+      token.content = "";
+      if (isStrong) {
+        state.tokens[delimiters[i - 1].token].content = "";
+        state.tokens[delimiters[startDelim.end + 1].token].content = "";
+        i--;
+      }
+    }
+  }
+  // Walk through delimiter list and replace text tokens with tags
+  
+    var postProcess_1 = function emphasis(state) {
+    var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length;
+    postProcess(state, state.delimiters);
+    for (curr = 0; curr < max; curr++) {
+      if (tokens_meta[curr] && tokens_meta[curr].delimiters) {
+        postProcess(state, tokens_meta[curr].delimiters);
+      }
+    }
+  };
+  var emphasis = {
+    tokenize: tokenize,
+    postProcess: postProcess_1
+  };
+  var normalizeReference$1 = utils.normalizeReference;
+  var isSpace$1 = utils.isSpace;
+  var link = function link(state, silent) {
+    var attrs, code, label, labelEnd, labelStart, pos, res, ref, token, href = "", title = "", oldPos = state.pos, max = state.posMax, start = state.pos, parseReference = true;
+    if (state.src.charCodeAt(state.pos) !== 91 /* [ */) {
+      return false;
+    }
+    labelStart = state.pos + 1;
+    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos, true);
+    // parser failed to find ']', so it's not a valid link
+        if (labelEnd < 0) {
+      return false;
+    }
+    pos = labelEnd + 1;
+    if (pos < max && state.src.charCodeAt(pos) === 40 /* ( */) {
+      // Inline link
+      // might have found a valid shortcut link, disable reference parsing
+      parseReference = false;
+      // [link](  <href>  "title"  )
+      //        ^^ skipping these spaces
+            pos++;
+      for (;pos < max; pos++) {
+        code = state.src.charCodeAt(pos);
+        if (!isSpace$1(code) && code !== 10) {
+          break;
+        }
+      }
+      if (pos >= max) {
+        return false;
+      }
+      // [link](  <href>  "title"  )
+      //          ^^^^^^ parsing link destination
+            start = pos;
+      res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax);
+      if (res.ok) {
+        href = state.md.normalizeLink(res.str);
+        if (state.md.validateLink(href)) {
+          pos = res.pos;
+        } else {
+          href = "";
+        }
+        // [link](  <href>  "title"  )
+        //                ^^ skipping these spaces
+                start = pos;
+        for (;pos < max; pos++) {
+          code = state.src.charCodeAt(pos);
+          if (!isSpace$1(code) && code !== 10) {
+            break;
+          }
+        }
+        // [link](  <href>  "title"  )
+        //                  ^^^^^^^ parsing link title
+                res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax);
+        if (pos < max && start !== pos && res.ok) {
+          title = res.str;
+          pos = res.pos;
+          // [link](  <href>  "title"  )
+          //                         ^^ skipping these spaces
+                    for (;pos < max; pos++) {
+            code = state.src.charCodeAt(pos);
+            if (!isSpace$1(code) && code !== 10) {
+              break;
+            }
+          }
+        }
+      }
+      if (pos >= max || state.src.charCodeAt(pos) !== 41 /* ) */) {
+        // parsing a valid shortcut link failed, fallback to reference
+        parseReference = true;
+      }
+      pos++;
+    }
+    if (parseReference) {
+      // Link reference
+      if (typeof state.env.references === "undefined") {
+        return false;
+      }
+      if (pos < max && state.src.charCodeAt(pos) === 91 /* [ */) {
+        start = pos + 1;
+        pos = state.md.helpers.parseLinkLabel(state, pos);
+        if (pos >= 0) {
+          label = state.src.slice(start, pos++);
+        } else {
+          pos = labelEnd + 1;
+        }
+      } else {
+        pos = labelEnd + 1;
+      }
+      // covers label === '' and label === undefined
+      // (collapsed reference link and shortcut reference link respectively)
+            if (!label) {
+        label = state.src.slice(labelStart, labelEnd);
+      }
+      ref = state.env.references[normalizeReference$1(label)];
+      if (!ref) {
+        state.pos = oldPos;
+        return false;
+      }
+      href = ref.href;
+      title = ref.title;
+    }
+    
+    // We found the end of the link, and know for a fact it's a valid link;
+    // so all that's left to do is to call tokenizer.
+    
+        if (!silent) {
+      state.pos = labelStart;
+      state.posMax = labelEnd;
+      token = state.push("link_open", "a", 1);
+      token.attrs = attrs = [ [ "href", href ] ];
+      if (title) {
+        attrs.push([ "title", title ]);
+      }
+      state.linkLevel++;
+      state.md.inline.tokenize(state);
+      state.linkLevel--;
+      token = state.push("link_close", "a", -1);
+    }
+    state.pos = pos;
+    state.posMax = max;
+    return true;
+  };
+  var normalizeReference = utils.normalizeReference;
+  var isSpace = utils.isSpace;
+  var image = function image(state, silent) {
+    var attrs, code, content, label, labelEnd, labelStart, pos, ref, res, title, token, tokens, start, href = "", oldPos = state.pos, max = state.posMax;
+    if (state.src.charCodeAt(state.pos) !== 33 /* ! */) {
+      return false;
+    }
+    if (state.src.charCodeAt(state.pos + 1) !== 91 /* [ */) {
+      return false;
+    }
+    labelStart = state.pos + 2;
+    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos + 1, false);
+    // parser failed to find ']', so it's not a valid link
+        if (labelEnd < 0) {
+      return false;
+    }
+    pos = labelEnd + 1;
+    if (pos < max && state.src.charCodeAt(pos) === 40 /* ( */) {
+      // Inline link
+      // [link](  <href>  "title"  )
+      //        ^^ skipping these spaces
+      pos++;
+      for (;pos < max; pos++) {
+        code = state.src.charCodeAt(pos);
+        if (!isSpace(code) && code !== 10) {
+          break;
+        }
+      }
+      if (pos >= max) {
+        return false;
+      }
+      // [link](  <href>  "title"  )
+      //          ^^^^^^ parsing link destination
+            start = pos;
+      res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax);
+      if (res.ok) {
+        href = state.md.normalizeLink(res.str);
+        if (state.md.validateLink(href)) {
+          pos = res.pos;
+        } else {
+          href = "";
+        }
+      }
+      // [link](  <href>  "title"  )
+      //                ^^ skipping these spaces
+            start = pos;
+      for (;pos < max; pos++) {
+        code = state.src.charCodeAt(pos);
+        if (!isSpace(code) && code !== 10) {
+          break;
+        }
+      }
+      // [link](  <href>  "title"  )
+      //                  ^^^^^^^ parsing link title
+            res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax);
+      if (pos < max && start !== pos && res.ok) {
+        title = res.str;
+        pos = res.pos;
+        // [link](  <href>  "title"  )
+        //                         ^^ skipping these spaces
+                for (;pos < max; pos++) {
+          code = state.src.charCodeAt(pos);
+          if (!isSpace(code) && code !== 10) {
+            break;
+          }
+        }
+      } else {
+        title = "";
+      }
+      if (pos >= max || state.src.charCodeAt(pos) !== 41 /* ) */) {
+        state.pos = oldPos;
+        return false;
+      }
+      pos++;
+    } else {
+      // Link reference
+      if (typeof state.env.references === "undefined") {
+        return false;
+      }
+      if (pos < max && state.src.charCodeAt(pos) === 91 /* [ */) {
+        start = pos + 1;
+        pos = state.md.helpers.parseLinkLabel(state, pos);
+        if (pos >= 0) {
+          label = state.src.slice(start, pos++);
+        } else {
+          pos = labelEnd + 1;
+        }
+      } else {
+        pos = labelEnd + 1;
+      }
+      // covers label === '' and label === undefined
+      // (collapsed reference link and shortcut reference link respectively)
+            if (!label) {
+        label = state.src.slice(labelStart, labelEnd);
+      }
+      ref = state.env.references[normalizeReference(label)];
+      if (!ref) {
+        state.pos = oldPos;
+        return false;
+      }
+      href = ref.href;
+      title = ref.title;
+    }
+    
+    // We found the end of the link, and know for a fact it's a valid link;
+    // so all that's left to do is to call tokenizer.
+    
+        if (!silent) {
+      content = state.src.slice(labelStart, labelEnd);
+      state.md.inline.parse(content, state.md, state.env, tokens = []);
+      token = state.push("image", "img", 0);
+      token.attrs = attrs = [ [ "src", href ], [ "alt", "" ] ];
+      token.children = tokens;
+      token.content = content;
+      if (title) {
+        attrs.push([ "title", title ]);
+      }
+    }
+    state.pos = pos;
+    state.posMax = max;
+    return true;
+  };
+  // Process autolinks '<protocol:...>'
+  /*eslint max-len:0*/  var EMAIL_RE = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$/;
+  var AUTOLINK_RE = /^([a-zA-Z][a-zA-Z0-9+.\-]{1,31}):([^<>\x00-\x20]*)$/;
+  var autolink = function autolink(state, silent) {
+    var url, fullUrl, token, ch, start, max, pos = state.pos;
+    if (state.src.charCodeAt(pos) !== 60 /* < */) {
+      return false;
+    }
+    start = state.pos;
+    max = state.posMax;
+    for (;;) {
+      if (++pos >= max) return false;
+      ch = state.src.charCodeAt(pos);
+      if (ch === 60 /* < */) return false;
+      if (ch === 62 /* > */) break;
+    }
+    url = state.src.slice(start + 1, pos);
+    if (AUTOLINK_RE.test(url)) {
+      fullUrl = state.md.normalizeLink(url);
+      if (!state.md.validateLink(fullUrl)) {
+        return false;
+      }
+      if (!silent) {
+        token = state.push("link_open", "a", 1);
+        token.attrs = [ [ "href", fullUrl ] ];
+        token.markup = "autolink";
+        token.info = "auto";
+        token = state.push("text", "", 0);
+        token.content = state.md.normalizeLinkText(url);
+        token = state.push("link_close", "a", -1);
+        token.markup = "autolink";
+        token.info = "auto";
+      }
+      state.pos += url.length + 2;
+      return true;
+    }
+    if (EMAIL_RE.test(url)) {
+      fullUrl = state.md.normalizeLink("mailto:" + url);
+      if (!state.md.validateLink(fullUrl)) {
+        return false;
+      }
+      if (!silent) {
+        token = state.push("link_open", "a", 1);
+        token.attrs = [ [ "href", fullUrl ] ];
+        token.markup = "autolink";
+        token.info = "auto";
+        token = state.push("text", "", 0);
+        token.content = state.md.normalizeLinkText(url);
+        token = state.push("link_close", "a", -1);
+        token.markup = "autolink";
+        token.info = "auto";
+      }
+      state.pos += url.length + 2;
+      return true;
+    }
+    return false;
+  };
+  var HTML_TAG_RE = html_re.HTML_TAG_RE;
+  function isLinkOpen(str) {
+    return /^<a[>\s]/i.test(str);
+  }
+  function isLinkClose(str) {
+    return /^<\/a\s*>/i.test(str);
+  }
+  function isLetter(ch) {
+    /*eslint no-bitwise:0*/
+    var lc = ch | 32;
+ // to lower case
+        return lc >= 97 /* a */ && lc <= 122 /* z */;
+  }
+  var html_inline = function html_inline(state, silent) {
+    var ch, match, max, token, pos = state.pos;
+    if (!state.md.options.html) {
+      return false;
+    }
+    // Check start
+        max = state.posMax;
+    if (state.src.charCodeAt(pos) !== 60 /* < */ || pos + 2 >= max) {
+      return false;
+    }
+    // Quick fail on second char
+        ch = state.src.charCodeAt(pos + 1);
+    if (ch !== 33 /* ! */ && ch !== 63 /* ? */ && ch !== 47 /* / */ && !isLetter(ch)) {
+      return false;
+    }
+    match = state.src.slice(pos).match(HTML_TAG_RE);
+    if (!match) {
+      return false;
+    }
+    if (!silent) {
+      token = state.push("html_inline", "", 0);
+      token.content = match[0];
+      if (isLinkOpen(token.content)) state.linkLevel++;
+      if (isLinkClose(token.content)) state.linkLevel--;
+    }
+    state.pos += match[0].length;
+    return true;
+  };
+  var has = utils.has;
+  var isValidEntityCode = utils.isValidEntityCode;
+  var fromCodePoint = utils.fromCodePoint;
+  var DIGITAL_RE = /^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));/i;
+  var NAMED_RE = /^&([a-z][a-z0-9]{1,31});/i;
+  var entity = function entity(state, silent) {
+    var ch, code, match, token, pos = state.pos, max = state.posMax;
+    if (state.src.charCodeAt(pos) !== 38 /* & */) return false;
+    if (pos + 1 >= max) return false;
+    ch = state.src.charCodeAt(pos + 1);
+    if (ch === 35 /* # */) {
+      match = state.src.slice(pos).match(DIGITAL_RE);
+      if (match) {
+        if (!silent) {
+          code = match[1][0].toLowerCase() === "x" ? parseInt(match[1].slice(1), 16) : parseInt(match[1], 10);
+          token = state.push("text_special", "", 0);
+          token.content = isValidEntityCode(code) ? fromCodePoint(code) : fromCodePoint(65533);
+          token.markup = match[0];
+          token.info = "entity";
+        }
+        state.pos += match[0].length;
+        return true;
+      }
+    } else {
+      match = state.src.slice(pos).match(NAMED_RE);
+      if (match) {
+        if (has(entities, match[1])) {
+          if (!silent) {
+            token = state.push("text_special", "", 0);
+            token.content = entities[match[1]];
+            token.markup = match[0];
+            token.info = "entity";
+          }
+          state.pos += match[0].length;
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+  // For each opening emphasis-like marker find a matching closing one
+    function processDelimiters(delimiters) {
+    var closerIdx, openerIdx, closer, opener, minOpenerIdx, newMinOpenerIdx, isOddMatch, lastJump, openersBottom = {}, max = delimiters.length;
+    if (!max) return;
+    // headerIdx is the first delimiter of the current (where closer is) delimiter run
+        var headerIdx = 0;
+    var lastTokenIdx = -2;
+ // needs any value lower than -1
+        var jumps = [];
+    for (closerIdx = 0; closerIdx < max; closerIdx++) {
+      closer = delimiters[closerIdx];
+      jumps.push(0);
+      // markers belong to same delimiter run if:
+      //  - they have adjacent tokens
+      //  - AND markers are the same
+      
+            if (delimiters[headerIdx].marker !== closer.marker || lastTokenIdx !== closer.token - 1) {
+        headerIdx = closerIdx;
+      }
+      lastTokenIdx = closer.token;
+      // Length is only used for emphasis-specific "rule of 3",
+      // if it's not defined (in strikethrough or 3rd party plugins),
+      // we can default it to 0 to disable those checks.
+      
+            closer.length = closer.length || 0;
+      if (!closer.close) continue;
+      // Previously calculated lower bounds (previous fails)
+      // for each marker, each delimiter length modulo 3,
+      // and for whether this closer can be an opener;
+      // https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460
+            if (!openersBottom.hasOwnProperty(closer.marker)) {
+        openersBottom[closer.marker] = [ -1, -1, -1, -1, -1, -1 ];
+      }
+      minOpenerIdx = openersBottom[closer.marker][(closer.open ? 3 : 0) + closer.length % 3];
+      openerIdx = headerIdx - jumps[headerIdx] - 1;
+      newMinOpenerIdx = openerIdx;
+      for (;openerIdx > minOpenerIdx; openerIdx -= jumps[openerIdx] + 1) {
+        opener = delimiters[openerIdx];
+        if (opener.marker !== closer.marker) continue;
+        if (opener.open && opener.end < 0) {
+          isOddMatch = false;
+          // from spec:
+          
+          // If one of the delimiters can both open and close emphasis, then the
+          // sum of the lengths of the delimiter runs containing the opening and
+          // closing delimiters must not be a multiple of 3 unless both lengths
+          // are multiples of 3.
+          
+                    if (opener.close || closer.open) {
+            if ((opener.length + closer.length) % 3 === 0) {
+              if (opener.length % 3 !== 0 || closer.length % 3 !== 0) {
+                isOddMatch = true;
+              }
+            }
+          }
+          if (!isOddMatch) {
+            // If previous delimiter cannot be an opener, we can safely skip
+            // the entire sequence in future checks. This is required to make
+            // sure algorithm has linear complexity (see *_*_*_*_*_... case).
+            lastJump = openerIdx > 0 && !delimiters[openerIdx - 1].open ? jumps[openerIdx - 1] + 1 : 0;
+            jumps[closerIdx] = closerIdx - openerIdx + lastJump;
+            jumps[openerIdx] = lastJump;
+            closer.open = false;
+            opener.end = closerIdx;
+            opener.close = false;
+            newMinOpenerIdx = -1;
+            // treat next token as start of run,
+            // it optimizes skips in **<...>**a**<...>** pathological case
+                        lastTokenIdx = -2;
+            break;
+          }
+        }
+      }
+      if (newMinOpenerIdx !== -1) {
+        // If match for this delimiter run failed, we want to set lower bound for
+        // future lookups. This is required to make sure algorithm has linear
+        // complexity.
+        // See details here:
+        // https://github.com/commonmark/cmark/issues/178#issuecomment-270417442
+        openersBottom[closer.marker][(closer.open ? 3 : 0) + (closer.length || 0) % 3] = newMinOpenerIdx;
+      }
+    }
+  }
+  var balance_pairs = function link_pairs(state) {
+    var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length;
+    processDelimiters(state.delimiters);
+    for (curr = 0; curr < max; curr++) {
+      if (tokens_meta[curr] && tokens_meta[curr].delimiters) {
+        processDelimiters(tokens_meta[curr].delimiters);
+      }
+    }
+  };
+  // Clean up tokens after emphasis and strikethrough postprocessing:
+    var fragments_join = function fragments_join(state) {
+    var curr, last, level = 0, tokens = state.tokens, max = state.tokens.length;
+    for (curr = last = 0; curr < max; curr++) {
+      // re-calculate levels after emphasis/strikethrough turns some text nodes
+      // into opening/closing tags
+      if (tokens[curr].nesting < 0) level--;
+ // closing tag
+            tokens[curr].level = level;
+      if (tokens[curr].nesting > 0) level++;
+ // opening tag
+            if (tokens[curr].type === "text" && curr + 1 < max && tokens[curr + 1].type === "text") {
+        // collapse two adjacent text nodes
+        tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content;
+      } else {
+        if (curr !== last) {
+          tokens[last] = tokens[curr];
+        }
+        last++;
+      }
+    }
+    if (curr !== last) {
+      tokens.length = last;
+    }
+  };
+  var isWhiteSpace = utils.isWhiteSpace;
+  var isPunctChar = utils.isPunctChar;
+  var isMdAsciiPunct = utils.isMdAsciiPunct;
+  function StateInline(src, md, env, outTokens) {
+    this.src = src;
+    this.env = env;
+    this.md = md;
+    this.tokens = outTokens;
+    this.tokens_meta = Array(outTokens.length);
+    this.pos = 0;
+    this.posMax = this.src.length;
+    this.level = 0;
+    this.pending = "";
+    this.pendingLevel = 0;
+    // Stores { start: end } pairs. Useful for backtrack
+    // optimization of pairs parse (emphasis, strikes).
+        this.cache = {};
+    // List of emphasis-like delimiters for current tag
+        this.delimiters = [];
+    // Stack of delimiter lists for upper level tags
+        this._prev_delimiters = [];
+    // backtick length => last seen position
+        this.backticks = {};
+    this.backticksScanned = false;
+    // Counter used to disable inline linkify-it execution
+    // inside <a> and markdown links
+        this.linkLevel = 0;
+  }
+  // Flush pending text
+  
+    StateInline.prototype.pushPending = function() {
+    var token$1 = new token("text", "", 0);
+    token$1.content = this.pending;
+    token$1.level = this.pendingLevel;
+    this.tokens.push(token$1);
+    this.pending = "";
+    return token$1;
+  };
+  // Push new token to "stream".
+  // If pending text exists - flush it as text token
+  
+    StateInline.prototype.push = function(type, tag, nesting) {
+    if (this.pending) {
+      this.pushPending();
+    }
+    var token$1 = new token(type, tag, nesting);
+    var token_meta = null;
+    if (nesting < 0) {
+      // closing tag
+      this.level--;
+      this.delimiters = this._prev_delimiters.pop();
+    }
+    token$1.level = this.level;
+    if (nesting > 0) {
+      // opening tag
+      this.level++;
+      this._prev_delimiters.push(this.delimiters);
+      this.delimiters = [];
+      token_meta = {
+        delimiters: this.delimiters
+      };
+    }
+    this.pendingLevel = this.level;
+    this.tokens.push(token$1);
+    this.tokens_meta.push(token_meta);
+    return token$1;
+  };
+  // Scan a sequence of emphasis-like markers, and determine whether
+  // it can start an emphasis sequence or end an emphasis sequence.
+  
+  //  - start - position to scan from (it should point at a valid marker);
+  //  - canSplitWord - determine if these markers can be found inside a word
+  
+    StateInline.prototype.scanDelims = function(start, canSplitWord) {
+    var pos = start, lastChar, nextChar, count, can_open, can_close, isLastWhiteSpace, isLastPunctChar, isNextWhiteSpace, isNextPunctChar, left_flanking = true, right_flanking = true, max = this.posMax, marker = this.src.charCodeAt(start);
+    // treat beginning of the line as a whitespace
+        lastChar = start > 0 ? this.src.charCodeAt(start - 1) : 32;
+    while (pos < max && this.src.charCodeAt(pos) === marker) {
+      pos++;
+    }
+    count = pos - start;
+    // treat end of the line as a whitespace
+        nextChar = pos < max ? this.src.charCodeAt(pos) : 32;
+    isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCharCode(lastChar));
+    isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCharCode(nextChar));
+    isLastWhiteSpace = isWhiteSpace(lastChar);
+    isNextWhiteSpace = isWhiteSpace(nextChar);
+    if (isNextWhiteSpace) {
+      left_flanking = false;
+    } else if (isNextPunctChar) {
+      if (!(isLastWhiteSpace || isLastPunctChar)) {
+        left_flanking = false;
+      }
+    }
+    if (isLastWhiteSpace) {
+      right_flanking = false;
+    } else if (isLastPunctChar) {
+      if (!(isNextWhiteSpace || isNextPunctChar)) {
+        right_flanking = false;
+      }
+    }
+    if (!canSplitWord) {
+      can_open = left_flanking && (!right_flanking || isLastPunctChar);
+      can_close = right_flanking && (!left_flanking || isNextPunctChar);
+    } else {
+      can_open = left_flanking;
+      can_close = right_flanking;
+    }
+    return {
+      can_open: can_open,
+      can_close: can_close,
+      length: count
+    };
+  };
+  // re-export Token class to use in block rules
+    StateInline.prototype.Token = token;
+  var state_inline = StateInline;
+  ////////////////////////////////////////////////////////////////////////////////
+  // Parser rules
+    var _rules = [ [ "text", text ], [ "linkify", linkify ], [ "newline", newline ], [ "escape", _escape ], [ "backticks", backticks ], [ "strikethrough", strikethrough.tokenize ], [ "emphasis", emphasis.tokenize ], [ "link", link ], [ "image", image ], [ "autolink", autolink ], [ "html_inline", html_inline ], [ "entity", entity ] ];
+  // `rule2` ruleset was created specifically for emphasis/strikethrough
+  // post-processing and may be changed in the future.
+  
+  // Don't use this for anything except pairs (plugins working with `balance_pairs`).
+  
+    var _rules2 = [ [ "balance_pairs", balance_pairs ], [ "strikethrough", strikethrough.postProcess ], [ "emphasis", emphasis.postProcess ], 
+  // rules for pairs separate '**' into its own text tokens, which may be left unused,
+  // rule below merges unused segments back with the rest of the text
+  [ "fragments_join", fragments_join ] ];
+  /**
+	 * new ParserInline()
+	 **/  function ParserInline() {
+    var i;
+    /**
+	   * ParserInline#ruler -> Ruler
+	   *
+	   * [[Ruler]] instance. Keep configuration of inline rules.
+	   **/    this.ruler = new ruler;
+    for (i = 0; i < _rules.length; i++) {
+      this.ruler.push(_rules[i][0], _rules[i][1]);
+    }
+    /**
+	   * ParserInline#ruler2 -> Ruler
+	   *
+	   * [[Ruler]] instance. Second ruler used for post-processing
+	   * (e.g. in emphasis-like rules).
+	   **/    this.ruler2 = new ruler;
+    for (i = 0; i < _rules2.length; i++) {
+      this.ruler2.push(_rules2[i][0], _rules2[i][1]);
+    }
+  }
+  // Skip single token by running all rules in validation mode;
+  // returns `true` if any rule reported success
+  
+    ParserInline.prototype.skipToken = function(state) {
+    var ok, i, pos = state.pos, rules = this.ruler.getRules(""), len = rules.length, maxNesting = state.md.options.maxNesting, cache = state.cache;
+    if (typeof cache[pos] !== "undefined") {
+      state.pos = cache[pos];
+      return;
+    }
+    if (state.level < maxNesting) {
+      for (i = 0; i < len; i++) {
+        // Increment state.level and decrement it later to limit recursion.
+        // It's harmless to do here, because no tokens are created. But ideally,
+        // we'd need a separate private state variable for this purpose.
+        state.level++;
+        ok = rules[i](state, true);
+        state.level--;
+        if (ok) {
+          if (pos >= state.pos) {
+            throw new Error("inline rule didn't increment state.pos");
+          }
+          break;
+        }
+      }
+    } else {
+      // Too much nesting, just skip until the end of the paragraph.
+      // NOTE: this will cause links to behave incorrectly in the following case,
+      //       when an amount of `[` is exactly equal to `maxNesting + 1`:
+      //       [[[[[[[[[[[[[[[[[[[[[foo]()
+      // TODO: remove this workaround when CM standard will allow nested links
+      //       (we can replace it by preventing links from being parsed in
+      //       validation mode)
+      state.pos = state.posMax;
+    }
+    if (!ok) {
+      state.pos++;
+    }
+    cache[pos] = state.pos;
+  };
+  // Generate tokens for input range
+  
+    ParserInline.prototype.tokenize = function(state) {
+    var ok, i, prevPos, rules = this.ruler.getRules(""), len = rules.length, end = state.posMax, maxNesting = state.md.options.maxNesting;
+    while (state.pos < end) {
+      // Try all possible rules.
+      // On success, rule should:
+      // - update `state.pos`
+      // - update `state.tokens`
+      // - return true
+      prevPos = state.pos;
+      if (state.level < maxNesting) {
+        for (i = 0; i < len; i++) {
+          ok = rules[i](state, false);
+          if (ok) {
+            if (prevPos >= state.pos) {
+              throw new Error("inline rule didn't increment state.pos");
+            }
+            break;
+          }
+        }
+      }
+      if (ok) {
+        if (state.pos >= end) {
+          break;
+        }
+        continue;
+      }
+      state.pending += state.src[state.pos++];
+    }
+    if (state.pending) {
+      state.pushPending();
+    }
+  };
+  /**
+	 * ParserInline.parse(str, md, env, outTokens)
+	 *
+	 * Process input string and push inline tokens into `outTokens`
+	 **/  ParserInline.prototype.parse = function(str, md, env, outTokens) {
+    var i, rules, len;
+    var state = new this.State(str, md, env, outTokens);
+    this.tokenize(state);
+    rules = this.ruler2.getRules("");
+    len = rules.length;
+    for (i = 0; i < len; i++) {
+      rules[i](state);
+    }
+  };
+  ParserInline.prototype.State = state_inline;
+  var parser_inline = ParserInline;
+  var re = function(opts) {
+    var re = {};
+    opts = opts || {};
+    // Use direct extract instead of `regenerate` to reduse browserified size
+        re.src_Any = regex$3.source;
+    re.src_Cc = regex$2.source;
+    re.src_Z = regex.source;
+    re.src_P = regex$4.source;
+    // \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation)
+        re.src_ZPCc = [ re.src_Z, re.src_P, re.src_Cc ].join("|");
+    // \p{\Z\Cc} (white spaces + control)
+        re.src_ZCc = [ re.src_Z, re.src_Cc ].join("|");
+    // Experimental. List of chars, completely prohibited in links
+    // because can separate it from other part of text
+        var text_separators = "[><\uff5c]";
+    // All possible word characters (everything without punctuation, spaces & controls)
+    // Defined via punctuation & spaces to save space
+    // Should be something like \p{\L\N\S\M} (\w but without `_`)
+        re.src_pseudo_letter = "(?:(?!" + text_separators + "|" + re.src_ZPCc + ")" + re.src_Any + ")";
+    // The same as abothe but without [0-9]
+    // var src_pseudo_letter_non_d = '(?:(?![0-9]|' + src_ZPCc + ')' + src_Any + ')';
+    ////////////////////////////////////////////////////////////////////////////////
+        re.src_ip4 = "(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";
+    // Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch.
+        re.src_auth = "(?:(?:(?!" + re.src_ZCc + "|[@/\\[\\]()]).)+@)?";
+    re.src_port = "(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?";
+    re.src_host_terminator = "(?=$|" + text_separators + "|" + re.src_ZPCc + ")" + "(?!" + (opts["---"] ? "-(?!--)|" : "-|") + "_|:\\d|\\.-|\\.(?!$|" + re.src_ZPCc + "))";
+    re.src_path = "(?:" + "[/?#]" + "(?:" + "(?!" + re.src_ZCc + "|" + text_separators + "|[()[\\]{}.,\"'?!\\-;]).|" + "\\[(?:(?!" + re.src_ZCc + "|\\]).)*\\]|" + "\\((?:(?!" + re.src_ZCc + "|[)]).)*\\)|" + "\\{(?:(?!" + re.src_ZCc + "|[}]).)*\\}|" + '\\"(?:(?!' + re.src_ZCc + '|["]).)+\\"|' + "\\'(?:(?!" + re.src_ZCc + "|[']).)+\\'|" + "\\'(?=" + re.src_pseudo_letter + "|[-])|" + // allow `I'm_king` if no pair found
+    "\\.{2,}[a-zA-Z0-9%/&]|" + // google has many dots in "google search" links (#66, #81).
+    // github has ... in commit range links,
+    // Restrict to
+    // - english
+    // - percent-encoded
+    // - parts of file path
+    // - params separator
+    // until more examples found.
+    "\\.(?!" + re.src_ZCc + "|[.]|$)|" + (opts["---"] ? "\\-(?!--(?:[^-]|$))(?:-*)|" : "\\-+|") + ",(?!" + re.src_ZCc + "|$)|" + // allow `,,,` in paths
+    ";(?!" + re.src_ZCc + "|$)|" + // allow `;` if not followed by space-like char
+    "\\!+(?!" + re.src_ZCc + "|[!]|$)|" + // allow `!!!` in paths, but not at the end
+    "\\?(?!" + re.src_ZCc + "|[?]|$)" + ")+" + "|\\/" + ")?";
+    // Allow anything in markdown spec, forbid quote (") at the first position
+    // because emails enclosed in quotes are far more common
+        re.src_email_name = '[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*';
+    re.src_xn = "xn--[a-z0-9\\-]{1,59}";
+    // More to read about domain names
+    // http://serverfault.com/questions/638260/
+        re.src_domain_root = 
+    // Allow letters & digits (http://test1)
+    "(?:" + re.src_xn + "|" + re.src_pseudo_letter + "{1,63}" + ")";
+    re.src_domain = "(?:" + re.src_xn + "|" + "(?:" + re.src_pseudo_letter + ")" + "|" + "(?:" + re.src_pseudo_letter + "(?:-|" + re.src_pseudo_letter + "){0,61}" + re.src_pseudo_letter + ")" + ")";
+    re.src_host = "(?:" + 
+    // Don't need IP check, because digits are already allowed in normal domain names
+    //   src_ip4 +
+    // '|' +
+    "(?:(?:(?:" + re.src_domain + ")\\.)*" + re.src_domain /*_root*/ + ")" + ")";
+    re.tpl_host_fuzzy = "(?:" + re.src_ip4 + "|" + "(?:(?:(?:" + re.src_domain + ")\\.)+(?:%TLDS%))" + ")";
+    re.tpl_host_no_ip_fuzzy = "(?:(?:(?:" + re.src_domain + ")\\.)+(?:%TLDS%))";
+    re.src_host_strict = re.src_host + re.src_host_terminator;
+    re.tpl_host_fuzzy_strict = re.tpl_host_fuzzy + re.src_host_terminator;
+    re.src_host_port_strict = re.src_host + re.src_port + re.src_host_terminator;
+    re.tpl_host_port_fuzzy_strict = re.tpl_host_fuzzy + re.src_port + re.src_host_terminator;
+    re.tpl_host_port_no_ip_fuzzy_strict = re.tpl_host_no_ip_fuzzy + re.src_port + re.src_host_terminator;
+    ////////////////////////////////////////////////////////////////////////////////
+    // Main rules
+    // Rude test fuzzy links by host, for quick deny
+        re.tpl_host_fuzzy_test = "localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + re.src_ZPCc + "|>|$))";
+    re.tpl_email_fuzzy = "(^|" + text_separators + '|"|\\(|' + re.src_ZCc + ")" + "(" + re.src_email_name + "@" + re.tpl_host_fuzzy_strict + ")";
+    re.tpl_link_fuzzy = 
+    // Fuzzy link can't be prepended with .:/\- and non punctuation.
+    // but can start with > (markdown blockquote)
+    "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + re.src_ZPCc + "))" + "((?![$+<=>^`|\uff5c])" + re.tpl_host_port_fuzzy_strict + re.src_path + ")";
+    re.tpl_link_no_ip_fuzzy = 
+    // Fuzzy link can't be prepended with .:/\- and non punctuation.
+    // but can start with > (markdown blockquote)
+    "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + re.src_ZPCc + "))" + "((?![$+<=>^`|\uff5c])" + re.tpl_host_port_no_ip_fuzzy_strict + re.src_path + ")";
+    return re;
+  };
+  ////////////////////////////////////////////////////////////////////////////////
+  // Helpers
+  // Merge objects
+  
+    function assign(obj /*from1, from2, from3, ...*/) {
+    var sources = Array.prototype.slice.call(arguments, 1);
+    sources.forEach((function(source) {
+      if (!source) {
+        return;
+      }
+      Object.keys(source).forEach((function(key) {
+        obj[key] = source[key];
+      }));
+    }));
+    return obj;
+  }
+  function _class(obj) {
+    return Object.prototype.toString.call(obj);
+  }
+  function isString(obj) {
+    return _class(obj) === "[object String]";
+  }
+  function isObject(obj) {
+    return _class(obj) === "[object Object]";
+  }
+  function isRegExp(obj) {
+    return _class(obj) === "[object RegExp]";
+  }
+  function isFunction(obj) {
+    return _class(obj) === "[object Function]";
+  }
+  function escapeRE(str) {
+    return str.replace(/[.?*+^$[\]\\(){}|-]/g, "\\$&");
+  }
+  ////////////////////////////////////////////////////////////////////////////////
+    var defaultOptions = {
+    fuzzyLink: true,
+    fuzzyEmail: true,
+    fuzzyIP: false
+  };
+  function isOptionsObj(obj) {
+    return Object.keys(obj || {}).reduce((function(acc, k) {
+      return acc || defaultOptions.hasOwnProperty(k);
+    }), false);
+  }
+  var defaultSchemas = {
+    "http:": {
+      validate: function(text, pos, self) {
+        var tail = text.slice(pos);
+        if (!self.re.http) {
+          // compile lazily, because "host"-containing variables can change on tlds update.
+          self.re.http = new RegExp("^\\/\\/" + self.re.src_auth + self.re.src_host_port_strict + self.re.src_path, "i");
+        }
+        if (self.re.http.test(tail)) {
+          return tail.match(self.re.http)[0].length;
+        }
+        return 0;
+      }
+    },
+    "https:": "http:",
+    "ftp:": "http:",
+    "//": {
+      validate: function(text, pos, self) {
+        var tail = text.slice(pos);
+        if (!self.re.no_http) {
+          // compile lazily, because "host"-containing variables can change on tlds update.
+          self.re.no_http = new RegExp("^" + self.re.src_auth + 
+          // Don't allow single-level domains, because of false positives like '//test'
+          // with code comments
+          "(?:localhost|(?:(?:" + self.re.src_domain + ")\\.)+" + self.re.src_domain_root + ")" + self.re.src_port + self.re.src_host_terminator + self.re.src_path, "i");
+        }
+        if (self.re.no_http.test(tail)) {
+          // should not be `://` & `///`, that protects from errors in protocol name
+          if (pos >= 3 && text[pos - 3] === ":") {
+            return 0;
+          }
+          if (pos >= 3 && text[pos - 3] === "/") {
+            return 0;
+          }
+          return tail.match(self.re.no_http)[0].length;
+        }
+        return 0;
+      }
+    },
+    "mailto:": {
+      validate: function(text, pos, self) {
+        var tail = text.slice(pos);
+        if (!self.re.mailto) {
+          self.re.mailto = new RegExp("^" + self.re.src_email_name + "@" + self.re.src_host_strict, "i");
+        }
+        if (self.re.mailto.test(tail)) {
+          return tail.match(self.re.mailto)[0].length;
+        }
+        return 0;
+      }
+    }
+  };
+  /*eslint-disable max-len*/
+  // RE pattern for 2-character tlds (autogenerated by ./support/tlds_2char_gen.js)
+    var tlds_2ch_src_re = "a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]";
+  // DON'T try to make PRs with changes. Extend TLDs with LinkifyIt.tlds() instead
+    var tlds_default = "biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|\u0440\u0444".split("|");
+  /*eslint-enable max-len*/
+  ////////////////////////////////////////////////////////////////////////////////
+    function resetScanCache(self) {
+    self.__index__ = -1;
+    self.__text_cache__ = "";
+  }
+  function createValidator(re) {
+    return function(text, pos) {
+      var tail = text.slice(pos);
+      if (re.test(tail)) {
+        return tail.match(re)[0].length;
+      }
+      return 0;
+    };
+  }
+  function createNormalizer() {
+    return function(match, self) {
+      self.normalize(match);
+    };
+  }
+  // Schemas compiler. Build regexps.
+  
+    function compile(self) {
+    // Load & clone RE patterns.
+    var re$1 = self.re = re(self.__opts__);
+    // Define dynamic patterns
+        var tlds = self.__tlds__.slice();
+    self.onCompile();
+    if (!self.__tlds_replaced__) {
+      tlds.push(tlds_2ch_src_re);
+    }
+    tlds.push(re$1.src_xn);
+    re$1.src_tlds = tlds.join("|");
+    function untpl(tpl) {
+      return tpl.replace("%TLDS%", re$1.src_tlds);
+    }
+    re$1.email_fuzzy = RegExp(untpl(re$1.tpl_email_fuzzy), "i");
+    re$1.link_fuzzy = RegExp(untpl(re$1.tpl_link_fuzzy), "i");
+    re$1.link_no_ip_fuzzy = RegExp(untpl(re$1.tpl_link_no_ip_fuzzy), "i");
+    re$1.host_fuzzy_test = RegExp(untpl(re$1.tpl_host_fuzzy_test), "i");
+    
+    // Compile each schema
+    
+        var aliases = [];
+    self.__compiled__ = {};
+ // Reset compiled data
+        function schemaError(name, val) {
+      throw new Error('(LinkifyIt) Invalid schema "' + name + '": ' + val);
+    }
+    Object.keys(self.__schemas__).forEach((function(name) {
+      var val = self.__schemas__[name];
+      // skip disabled methods
+            if (val === null) {
+        return;
+      }
+      var compiled = {
+        validate: null,
+        link: null
+      };
+      self.__compiled__[name] = compiled;
+      if (isObject(val)) {
+        if (isRegExp(val.validate)) {
+          compiled.validate = createValidator(val.validate);
+        } else if (isFunction(val.validate)) {
+          compiled.validate = val.validate;
+        } else {
+          schemaError(name, val);
+        }
+        if (isFunction(val.normalize)) {
+          compiled.normalize = val.normalize;
+        } else if (!val.normalize) {
+          compiled.normalize = createNormalizer();
+        } else {
+          schemaError(name, val);
+        }
+        return;
+      }
+      if (isString(val)) {
+        aliases.push(name);
+        return;
+      }
+      schemaError(name, val);
+    }));
+    
+    // Compile postponed aliases
+    
+        aliases.forEach((function(alias) {
+      if (!self.__compiled__[self.__schemas__[alias]]) {
+        // Silently fail on missed schemas to avoid errons on disable.
+        // schemaError(alias, self.__schemas__[alias]);
+        return;
+      }
+      self.__compiled__[alias].validate = self.__compiled__[self.__schemas__[alias]].validate;
+      self.__compiled__[alias].normalize = self.__compiled__[self.__schemas__[alias]].normalize;
+    }));
+    
+    // Fake record for guessed links
+    
+        self.__compiled__[""] = {
+      validate: null,
+      normalize: createNormalizer()
+    };
+    
+    // Build schema condition
+    
+        var slist = Object.keys(self.__compiled__).filter((function(name) {
+      // Filter disabled & fake schemas
+      return name.length > 0 && self.__compiled__[name];
+    })).map(escapeRE).join("|");
+    // (?!_) cause 1.5x slowdown
+        self.re.schema_test = RegExp("(^|(?!_)(?:[><\uff5c]|" + re$1.src_ZPCc + "))(" + slist + ")", "i");
+    self.re.schema_search = RegExp("(^|(?!_)(?:[><\uff5c]|" + re$1.src_ZPCc + "))(" + slist + ")", "ig");
+    self.re.schema_at_start = RegExp("^" + self.re.schema_search.source, "i");
+    self.re.pretest = RegExp("(" + self.re.schema_test.source + ")|(" + self.re.host_fuzzy_test.source + ")|@", "i");
+    
+    // Cleanup
+    
+        resetScanCache(self);
+  }
+  /**
+	 * class Match
+	 *
+	 * Match result. Single element of array, returned by [[LinkifyIt#match]]
+	 **/  function Match(self, shift) {
+    var start = self.__index__, end = self.__last_index__, text = self.__text_cache__.slice(start, end);
+    /**
+	   * Match#schema -> String
+	   *
+	   * Prefix (protocol) for matched string.
+	   **/    this.schema = self.__schema__.toLowerCase();
+    /**
+	   * Match#index -> Number
+	   *
+	   * First position of matched string.
+	   **/    this.index = start + shift;
+    /**
+	   * Match#lastIndex -> Number
+	   *
+	   * Next position after matched string.
+	   **/    this.lastIndex = end + shift;
+    /**
+	   * Match#raw -> String
+	   *
+	   * Matched string.
+	   **/    this.raw = text;
+    /**
+	   * Match#text -> String
+	   *
+	   * Notmalized text of matched string.
+	   **/    this.text = text;
+    /**
+	   * Match#url -> String
+	   *
+	   * Normalized url of matched string.
+	   **/    this.url = text;
+  }
+  function createMatch(self, shift) {
+    var match = new Match(self, shift);
+    self.__compiled__[match.schema].normalize(match, self);
+    return match;
+  }
+  /**
+	 * class LinkifyIt
+	 **/
+  /**
+	 * new LinkifyIt(schemas, options)
+	 * - schemas (Object): Optional. Additional schemas to validate (prefix/validator)
+	 * - options (Object): { fuzzyLink|fuzzyEmail|fuzzyIP: true|false }
+	 *
+	 * Creates new linkifier instance with optional additional schemas.
+	 * Can be called without `new` keyword for convenience.
+	 *
+	 * By default understands:
+	 *
+	 * - `http(s)://...` , `ftp://...`, `mailto:...` & `//...` links
+	 * - "fuzzy" links and emails (example.com, foo@bar.com).
+	 *
+	 * `schemas` is an object, where each key/value describes protocol/rule:
+	 *
+	 * - __key__ - link prefix (usually, protocol name with `:` at the end, `skype:`
+	 *   for example). `linkify-it` makes shure that prefix is not preceeded with
+	 *   alphanumeric char and symbols. Only whitespaces and punctuation allowed.
+	 * - __value__ - rule to check tail after link prefix
+	 *   - _String_ - just alias to existing rule
+	 *   - _Object_
+	 *     - _validate_ - validator function (should return matched length on success),
+	 *       or `RegExp`.
+	 *     - _normalize_ - optional function to normalize text & url of matched result
+	 *       (for example, for @twitter mentions).
+	 *
+	 * `options`:
+	 *
+	 * - __fuzzyLink__ - recognige URL-s without `http(s):` prefix. Default `true`.
+	 * - __fuzzyIP__ - allow IPs in fuzzy links above. Can conflict with some texts
+	 *   like version numbers. Default `false`.
+	 * - __fuzzyEmail__ - recognize emails without `mailto:` prefix.
+	 *
+	 **/  function LinkifyIt(schemas, options) {
+    if (!(this instanceof LinkifyIt)) {
+      return new LinkifyIt(schemas, options);
+    }
+    if (!options) {
+      if (isOptionsObj(schemas)) {
+        options = schemas;
+        schemas = {};
+      }
+    }
+    this.__opts__ = assign({}, defaultOptions, options);
+    // Cache last tested result. Used to skip repeating steps on next `match` call.
+        this.__index__ = -1;
+    this.__last_index__ = -1;
+ // Next scan position
+        this.__schema__ = "";
+    this.__text_cache__ = "";
+    this.__schemas__ = assign({}, defaultSchemas, schemas);
+    this.__compiled__ = {};
+    this.__tlds__ = tlds_default;
+    this.__tlds_replaced__ = false;
+    this.re = {};
+    compile(this);
+  }
+  /** chainable
+	 * LinkifyIt#add(schema, definition)
+	 * - schema (String): rule name (fixed pattern prefix)
+	 * - definition (String|RegExp|Object): schema definition
+	 *
+	 * Add new rule definition. See constructor description for details.
+	 **/  LinkifyIt.prototype.add = function add(schema, definition) {
+    this.__schemas__[schema] = definition;
+    compile(this);
+    return this;
+  };
+  /** chainable
+	 * LinkifyIt#set(options)
+	 * - options (Object): { fuzzyLink|fuzzyEmail|fuzzyIP: true|false }
+	 *
+	 * Set recognition options for links without schema.
+	 **/  LinkifyIt.prototype.set = function set(options) {
+    this.__opts__ = assign(this.__opts__, options);
+    return this;
+  };
+  /**
+	 * LinkifyIt#test(text) -> Boolean
+	 *
+	 * Searches linkifiable pattern and returns `true` on success or `false` on fail.
+	 **/  LinkifyIt.prototype.test = function test(text) {
+    // Reset scan cache
+    this.__text_cache__ = text;
+    this.__index__ = -1;
+    if (!text.length) {
+      return false;
+    }
+    var m, ml, me, len, shift, next, re, tld_pos, at_pos;
+    // try to scan for link with schema - that's the most simple rule
+        if (this.re.schema_test.test(text)) {
+      re = this.re.schema_search;
+      re.lastIndex = 0;
+      while ((m = re.exec(text)) !== null) {
+        len = this.testSchemaAt(text, m[2], re.lastIndex);
+        if (len) {
+          this.__schema__ = m[2];
+          this.__index__ = m.index + m[1].length;
+          this.__last_index__ = m.index + m[0].length + len;
+          break;
+        }
+      }
+    }
+    if (this.__opts__.fuzzyLink && this.__compiled__["http:"]) {
+      // guess schemaless links
+      tld_pos = text.search(this.re.host_fuzzy_test);
+      if (tld_pos >= 0) {
+        // if tld is located after found link - no need to check fuzzy pattern
+        if (this.__index__ < 0 || tld_pos < this.__index__) {
+          if ((ml = text.match(this.__opts__.fuzzyIP ? this.re.link_fuzzy : this.re.link_no_ip_fuzzy)) !== null) {
+            shift = ml.index + ml[1].length;
+            if (this.__index__ < 0 || shift < this.__index__) {
+              this.__schema__ = "";
+              this.__index__ = shift;
+              this.__last_index__ = ml.index + ml[0].length;
+            }
+          }
+        }
+      }
+    }
+    if (this.__opts__.fuzzyEmail && this.__compiled__["mailto:"]) {
+      // guess schemaless emails
+      at_pos = text.indexOf("@");
+      if (at_pos >= 0) {
+        // We can't skip this check, because this cases are possible:
+        // 192.168.1.1@gmail.com, my.in@example.com
+        if ((me = text.match(this.re.email_fuzzy)) !== null) {
+          shift = me.index + me[1].length;
+          next = me.index + me[0].length;
+          if (this.__index__ < 0 || shift < this.__index__ || shift === this.__index__ && next > this.__last_index__) {
+            this.__schema__ = "mailto:";
+            this.__index__ = shift;
+            this.__last_index__ = next;
+          }
+        }
+      }
+    }
+    return this.__index__ >= 0;
+  };
+  /**
+	 * LinkifyIt#pretest(text) -> Boolean
+	 *
+	 * Very quick check, that can give false positives. Returns true if link MAY BE
+	 * can exists. Can be used for speed optimization, when you need to check that
+	 * link NOT exists.
+	 **/  LinkifyIt.prototype.pretest = function pretest(text) {
+    return this.re.pretest.test(text);
+  };
+  /**
+	 * LinkifyIt#testSchemaAt(text, name, position) -> Number
+	 * - text (String): text to scan
+	 * - name (String): rule (schema) name
+	 * - position (Number): text offset to check from
+	 *
+	 * Similar to [[LinkifyIt#test]] but checks only specific protocol tail exactly
+	 * at given position. Returns length of found pattern (0 on fail).
+	 **/  LinkifyIt.prototype.testSchemaAt = function testSchemaAt(text, schema, pos) {
+    // If not supported schema check requested - terminate
+    if (!this.__compiled__[schema.toLowerCase()]) {
+      return 0;
+    }
+    return this.__compiled__[schema.toLowerCase()].validate(text, pos, this);
+  };
+  /**
+	 * LinkifyIt#match(text) -> Array|null
+	 *
+	 * Returns array of found link descriptions or `null` on fail. We strongly
+	 * recommend to use [[LinkifyIt#test]] first, for best speed.
+	 *
+	 * ##### Result match description
+	 *
+	 * - __schema__ - link schema, can be empty for fuzzy links, or `//` for
+	 *   protocol-neutral  links.
+	 * - __index__ - offset of matched text
+	 * - __lastIndex__ - index of next char after mathch end
+	 * - __raw__ - matched text
+	 * - __text__ - normalized text
+	 * - __url__ - link, generated from matched text
+	 **/  LinkifyIt.prototype.match = function match(text) {
+    var shift = 0, result = [];
+    // Try to take previous element from cache, if .test() called before
+        if (this.__index__ >= 0 && this.__text_cache__ === text) {
+      result.push(createMatch(this, shift));
+      shift = this.__last_index__;
+    }
+    // Cut head if cache was used
+        var tail = shift ? text.slice(shift) : text;
+    // Scan string until end reached
+        while (this.test(tail)) {
+      result.push(createMatch(this, shift));
+      tail = tail.slice(this.__last_index__);
+      shift += this.__last_index__;
+    }
+    if (result.length) {
+      return result;
+    }
+    return null;
+  };
+  /**
+	 * LinkifyIt#matchAtStart(text) -> Match|null
+	 *
+	 * Returns fully-formed (not fuzzy) link if it starts at the beginning
+	 * of the string, and null otherwise.
+	 **/  LinkifyIt.prototype.matchAtStart = function matchAtStart(text) {
+    // Reset scan cache
+    this.__text_cache__ = text;
+    this.__index__ = -1;
+    if (!text.length) return null;
+    var m = this.re.schema_at_start.exec(text);
+    if (!m) return null;
+    var len = this.testSchemaAt(text, m[2], m[0].length);
+    if (!len) return null;
+    this.__schema__ = m[2];
+    this.__index__ = m.index + m[1].length;
+    this.__last_index__ = m.index + m[0].length + len;
+    return createMatch(this, 0);
+  };
+  /** chainable
+	 * LinkifyIt#tlds(list [, keepOld]) -> this
+	 * - list (Array): list of tlds
+	 * - keepOld (Boolean): merge with current list if `true` (`false` by default)
+	 *
+	 * Load (or merge) new tlds list. Those are user for fuzzy links (without prefix)
+	 * to avoid false positives. By default this algorythm used:
+	 *
+	 * - hostname with any 2-letter root zones are ok.
+	 * - biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф
+	 *   are ok.
+	 * - encoded (`xn--...`) root zones are ok.
+	 *
+	 * If list is replaced, then exact match for 2-chars root zones will be checked.
+	 **/  LinkifyIt.prototype.tlds = function tlds(list, keepOld) {
+    list = Array.isArray(list) ? list : [ list ];
+    if (!keepOld) {
+      this.__tlds__ = list.slice();
+      this.__tlds_replaced__ = true;
+      compile(this);
+      return this;
+    }
+    this.__tlds__ = this.__tlds__.concat(list).sort().filter((function(el, idx, arr) {
+      return el !== arr[idx - 1];
+    })).reverse();
+    compile(this);
+    return this;
+  };
+  /**
+	 * LinkifyIt#normalize(match)
+	 *
+	 * Default normalizer (if schema does not define it's own).
+	 **/  LinkifyIt.prototype.normalize = function normalize(match) {
+    // Do minimal possible changes by default. Need to collect feedback prior
+    // to move forward https://github.com/markdown-it/linkify-it/issues/1
+    if (!match.schema) {
+      match.url = "http://" + match.url;
+    }
+    if (match.schema === "mailto:" && !/^mailto:/i.test(match.url)) {
+      match.url = "mailto:" + match.url;
+    }
+  };
+  /**
+	 * LinkifyIt#onCompile()
+	 *
+	 * Override to modify basic RegExp-s.
+	 **/  LinkifyIt.prototype.onCompile = function onCompile() {};
+  var linkifyIt = LinkifyIt;
+  /*! https://mths.be/punycode v1.4.1 by @mathias */
+  /** Highest positive signed 32-bit float value */  var maxInt = 2147483647;
+ // aka. 0x7FFFFFFF or 2^31-1
+  /** Bootstring parameters */  var base = 36;
+  var tMin = 1;
+  var tMax = 26;
+  var skew = 38;
+  var damp = 700;
+  var initialBias = 72;
+  var initialN = 128;
+ // 0x80
+    var delimiter = "-";
+ // '\x2D'
+  /** Regular expressions */  var regexPunycode = /^xn--/;
+  var regexNonASCII = /[^\x20-\x7E]/;
+ // unprintable ASCII chars + non-ASCII chars
+    var regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g;
+ // RFC 3490 separators
+  /** Error messages */  var errors = {
+    overflow: "Overflow: input needs wider integers to process",
+    "not-basic": "Illegal input >= 0x80 (not a basic code point)",
+    "invalid-input": "Invalid input"
+  };
+  /** Convenience shortcuts */  var baseMinusTMin = base - tMin;
+  var floor = Math.floor;
+  var stringFromCharCode = String.fromCharCode;
+  /*--------------------------------------------------------------------------*/
+  /**
+	 * A generic error utility function.
+	 * @private
+	 * @param {String} type The error type.
+	 * @returns {Error} Throws a `RangeError` with the applicable error message.
+	 */  function error(type) {
+    throw new RangeError(errors[type]);
+  }
+  /**
+	 * A generic `Array#map` utility function.
+	 * @private
+	 * @param {Array} array The array to iterate over.
+	 * @param {Function} callback The function that gets called for every array
+	 * item.
+	 * @returns {Array} A new array of values returned by the callback function.
+	 */  function map(array, fn) {
+    var length = array.length;
+    var result = [];
+    while (length--) {
+      result[length] = fn(array[length]);
+    }
+    return result;
+  }
+  /**
+	 * A simple `Array#map`-like wrapper to work with domain name strings or email
+	 * addresses.
+	 * @private
+	 * @param {String} domain The domain name or email address.
+	 * @param {Function} callback The function that gets called for every
+	 * character.
+	 * @returns {Array} A new string of characters returned by the callback
+	 * function.
+	 */  function mapDomain(string, fn) {
+    var parts = string.split("@");
+    var result = "";
+    if (parts.length > 1) {
+      // In email addresses, only the domain name should be punycoded. Leave
+      // the local part (i.e. everything up to `@`) intact.
+      result = parts[0] + "@";
+      string = parts[1];
+    }
+    // Avoid `split(regex)` for IE8 compatibility. See #17.
+        string = string.replace(regexSeparators, ".");
+    var labels = string.split(".");
+    var encoded = map(labels, fn).join(".");
+    return result + encoded;
+  }
+  /**
+	 * Creates an array containing the numeric code points of each Unicode
+	 * character in the string. While JavaScript uses UCS-2 internally,
+	 * this function will convert a pair of surrogate halves (each of which
+	 * UCS-2 exposes as separate characters) into a single code point,
+	 * matching UTF-16.
+	 * @see `punycode.ucs2.encode`
+	 * @see <https://mathiasbynens.be/notes/javascript-encoding>
+	 * @memberOf punycode.ucs2
+	 * @name decode
+	 * @param {String} string The Unicode input string (UCS-2).
+	 * @returns {Array} The new array of code points.
+	 */  function ucs2decode(string) {
+    var output = [], counter = 0, length = string.length, value, extra;
+    while (counter < length) {
+      value = string.charCodeAt(counter++);
+      if (value >= 55296 && value <= 56319 && counter < length) {
+        // high surrogate, and there is a next character
+        extra = string.charCodeAt(counter++);
+        if ((extra & 64512) == 56320) {
+          // low surrogate
+          output.push(((value & 1023) << 10) + (extra & 1023) + 65536);
+        } else {
+          // unmatched surrogate; only append this code unit, in case the next
+          // code unit is the high surrogate of a surrogate pair
+          output.push(value);
+          counter--;
+        }
+      } else {
+        output.push(value);
+      }
+    }
+    return output;
+  }
+  /**
+	 * Creates a string based on an array of numeric code points.
+	 * @see `punycode.ucs2.decode`
+	 * @memberOf punycode.ucs2
+	 * @name encode
+	 * @param {Array} codePoints The array of numeric code points.
+	 * @returns {String} The new Unicode string (UCS-2).
+	 */  function ucs2encode(array) {
+    return map(array, (function(value) {
+      var output = "";
+      if (value > 65535) {
+        value -= 65536;
+        output += stringFromCharCode(value >>> 10 & 1023 | 55296);
+        value = 56320 | value & 1023;
+      }
+      output += stringFromCharCode(value);
+      return output;
+    })).join("");
+  }
+  /**
+	 * Converts a basic code point into a digit/integer.
+	 * @see `digitToBasic()`
+	 * @private
+	 * @param {Number} codePoint The basic numeric code point value.
+	 * @returns {Number} The numeric value of a basic code point (for use in
+	 * representing integers) in the range `0` to `base - 1`, or `base` if
+	 * the code point does not represent a value.
+	 */  function basicToDigit(codePoint) {
+    if (codePoint - 48 < 10) {
+      return codePoint - 22;
+    }
+    if (codePoint - 65 < 26) {
+      return codePoint - 65;
+    }
+    if (codePoint - 97 < 26) {
+      return codePoint - 97;
+    }
+    return base;
+  }
+  /**
+	 * Converts a digit/integer into a basic code point.
+	 * @see `basicToDigit()`
+	 * @private
+	 * @param {Number} digit The numeric value of a basic code point.
+	 * @returns {Number} The basic code point whose value (when used for
+	 * representing integers) is `digit`, which needs to be in the range
+	 * `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
+	 * used; else, the lowercase form is used. The behavior is undefined
+	 * if `flag` is non-zero and `digit` has no uppercase form.
+	 */  function digitToBasic(digit, flag) {
+    //  0..25 map to ASCII a..z or A..Z
+    // 26..35 map to ASCII 0..9
+    return digit + 22 + 75 * (digit < 26) - ((flag != 0) << 5);
+  }
+  /**
+	 * Bias adaptation function as per section 3.4 of RFC 3492.
+	 * https://tools.ietf.org/html/rfc3492#section-3.4
+	 * @private
+	 */  function adapt(delta, numPoints, firstTime) {
+    var k = 0;
+    delta = firstTime ? floor(delta / damp) : delta >> 1;
+    delta += floor(delta / numPoints);
+    for (;delta > baseMinusTMin * tMax >> 1; k += base) {
+      delta = floor(delta / baseMinusTMin);
+    }
+    return floor(k + (baseMinusTMin + 1) * delta / (delta + skew));
+  }
+  /**
+	 * Converts a Punycode string of ASCII-only symbols to a string of Unicode
+	 * symbols.
+	 * @memberOf punycode
+	 * @param {String} input The Punycode string of ASCII-only symbols.
+	 * @returns {String} The resulting string of Unicode symbols.
+	 */  function decode(input) {
+    // Don't use UCS-2
+    var output = [], inputLength = input.length, out, i = 0, n = initialN, bias = initialBias, basic, j, index, oldi, w, k, digit, t, 
+    /** Cached calculation results */
+    baseMinusT;
+    // Handle the basic code points: let `basic` be the number of input code
+    // points before the last delimiter, or `0` if there is none, then copy
+    // the first basic code points to the output.
+        basic = input.lastIndexOf(delimiter);
+    if (basic < 0) {
+      basic = 0;
+    }
+    for (j = 0; j < basic; ++j) {
+      // if it's not a basic code point
+      if (input.charCodeAt(j) >= 128) {
+        error("not-basic");
+      }
+      output.push(input.charCodeAt(j));
+    }
+    // Main decoding loop: start just after the last delimiter if any basic code
+    // points were copied; start at the beginning otherwise.
+        for (index = basic > 0 ? basic + 1 : 0; index < inputLength; ) {
+      // `index` is the index of the next character to be consumed.
+      // Decode a generalized variable-length integer into `delta`,
+      // which gets added to `i`. The overflow checking is easier
+      // if we increase `i` as we go, then subtract off its starting
+      // value at the end to obtain `delta`.
+      for (oldi = i, w = 1, k = base; ;k += base) {
+        if (index >= inputLength) {
+          error("invalid-input");
+        }
+        digit = basicToDigit(input.charCodeAt(index++));
+        if (digit >= base || digit > floor((maxInt - i) / w)) {
+          error("overflow");
+        }
+        i += digit * w;
+        t = k <= bias ? tMin : k >= bias + tMax ? tMax : k - bias;
+        if (digit < t) {
+          break;
+        }
+        baseMinusT = base - t;
+        if (w > floor(maxInt / baseMinusT)) {
+          error("overflow");
+        }
+        w *= baseMinusT;
+      }
+      out = output.length + 1;
+      bias = adapt(i - oldi, out, oldi == 0);
+      // `i` was supposed to wrap around from `out` to `0`,
+      // incrementing `n` each time, so we'll fix that now:
+            if (floor(i / out) > maxInt - n) {
+        error("overflow");
+      }
+      n += floor(i / out);
+      i %= out;
+      // Insert `n` at position `i` of the output
+            output.splice(i++, 0, n);
+    }
+    return ucs2encode(output);
+  }
+  /**
+	 * Converts a string of Unicode symbols (e.g. a domain name label) to a
+	 * Punycode string of ASCII-only symbols.
+	 * @memberOf punycode
+	 * @param {String} input The string of Unicode symbols.
+	 * @returns {String} The resulting Punycode string of ASCII-only symbols.
+	 */  function encode(input) {
+    var n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, currentValue, output = [], 
+    /** `inputLength` will hold the number of code points in `input`. */
+    inputLength, 
+    /** Cached calculation results */
+    handledCPCountPlusOne, baseMinusT, qMinusT;
+    // Convert the input in UCS-2 to Unicode
+        input = ucs2decode(input);
+    // Cache the length
+        inputLength = input.length;
+    // Initialize the state
+        n = initialN;
+    delta = 0;
+    bias = initialBias;
+    // Handle the basic code points
+        for (j = 0; j < inputLength; ++j) {
+      currentValue = input[j];
+      if (currentValue < 128) {
+        output.push(stringFromCharCode(currentValue));
+      }
+    }
+    handledCPCount = basicLength = output.length;
+    // `handledCPCount` is the number of code points that have been handled;
+    // `basicLength` is the number of basic code points.
+    // Finish the basic string - if it is not empty - with a delimiter
+        if (basicLength) {
+      output.push(delimiter);
+    }
+    // Main encoding loop:
+        while (handledCPCount < inputLength) {
+      // All non-basic code points < n have been handled already. Find the next
+      // larger one:
+      for (m = maxInt, j = 0; j < inputLength; ++j) {
+        currentValue = input[j];
+        if (currentValue >= n && currentValue < m) {
+          m = currentValue;
+        }
+      }
+      // Increase `delta` enough to advance the decoder's <n,i> state to <m,0>,
+      // but guard against overflow
+            handledCPCountPlusOne = handledCPCount + 1;
+      if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) {
+        error("overflow");
+      }
+      delta += (m - n) * handledCPCountPlusOne;
+      n = m;
+      for (j = 0; j < inputLength; ++j) {
+        currentValue = input[j];
+        if (currentValue < n && ++delta > maxInt) {
+          error("overflow");
+        }
+        if (currentValue == n) {
+          // Represent delta as a generalized variable-length integer
+          for (q = delta, k = base; ;k += base) {
+            t = k <= bias ? tMin : k >= bias + tMax ? tMax : k - bias;
+            if (q < t) {
+              break;
+            }
+            qMinusT = q - t;
+            baseMinusT = base - t;
+            output.push(stringFromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0)));
+            q = floor(qMinusT / baseMinusT);
+          }
+          output.push(stringFromCharCode(digitToBasic(q, 0)));
+          bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength);
+          delta = 0;
+          ++handledCPCount;
+        }
+      }
+      ++delta;
+      ++n;
+    }
+    return output.join("");
+  }
+  /**
+	 * Converts a Punycode string representing a domain name or an email address
+	 * to Unicode. Only the Punycoded parts of the input will be converted, i.e.
+	 * it doesn't matter if you call it on a string that has already been
+	 * converted to Unicode.
+	 * @memberOf punycode
+	 * @param {String} input The Punycoded domain name or email address to
+	 * convert to Unicode.
+	 * @returns {String} The Unicode representation of the given Punycode
+	 * string.
+	 */  function toUnicode(input) {
+    return mapDomain(input, (function(string) {
+      return regexPunycode.test(string) ? decode(string.slice(4).toLowerCase()) : string;
+    }));
+  }
+  /**
+	 * Converts a Unicode string representing a domain name or an email address to
+	 * Punycode. Only the non-ASCII parts of the domain name will be converted,
+	 * i.e. it doesn't matter if you call it with a domain that's already in
+	 * ASCII.
+	 * @memberOf punycode
+	 * @param {String} input The domain name or email address to convert, as a
+	 * Unicode string.
+	 * @returns {String} The Punycode representation of the given domain name or
+	 * email address.
+	 */  function toASCII(input) {
+    return mapDomain(input, (function(string) {
+      return regexNonASCII.test(string) ? "xn--" + encode(string) : string;
+    }));
+  }
+  var version = "1.4.1";
+  /**
+	 * An object of methods to convert from JavaScript's internal character
+	 * representation (UCS-2) to Unicode code points, and back.
+	 * @see <https://mathiasbynens.be/notes/javascript-encoding>
+	 * @memberOf punycode
+	 * @type Object
+	 */  var ucs2 = {
+    decode: ucs2decode,
+    encode: ucs2encode
+  };
+  var punycode$1 = {
+    version: version,
+    ucs2: ucs2,
+    toASCII: toASCII,
+    toUnicode: toUnicode,
+    encode: encode,
+    decode: decode
+  };
+  var punycode$2 =  Object.freeze({
+    __proto__: null,
+    decode: decode,
+    encode: encode,
+    toUnicode: toUnicode,
+    toASCII: toASCII,
+    version: version,
+    ucs2: ucs2,
+    default: punycode$1
+  });
+  // markdown-it default options
+    var _default = {
+    options: {
+      html: false,
+      // Enable HTML tags in source
+      xhtmlOut: false,
+      // Use '/' to close single tags ()
+      breaks: false,
+      // Convert '\n' in paragraphs into 
+      langPrefix: "language-",
+      // CSS language prefix for fenced blocks
+      linkify: false,
+      // autoconvert URL-like texts to links
+      // Enable some language-neutral replacements + quotes beautification
+      typographer: false,
+      // Double + single quotes replacement pairs, when typographer enabled,
+      // and smartquotes on. Could be either a String or an Array.
+      // For example, you can use '«»„“' for Russian, '„“‚‘' for German,
+      // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
+      quotes: "\u201c\u201d\u2018\u2019",
+      /* “”‘’ */
+      // Highlighter function. Should return escaped HTML,
+      // or '' if the source string is not changed and should be escaped externaly.
+      // If result starts with <pre... internal wrapper is skipped.
+      // function (/*str, lang*/) { return ''; }
+      highlight: null,
+      maxNesting: 100
+    },
+    components: {
+      core: {},
+      block: {},
+      inline: {}
+    }
+  };
+  // "Zero" preset, with nothing enabled. Useful for manual configuring of simple
+    var zero = {
+    options: {
+      html: false,
+      // Enable HTML tags in source
+      xhtmlOut: false,
+      // Use '/' to close single tags ()
+      breaks: false,
+      // Convert '\n' in paragraphs into 
+      langPrefix: "language-",
+      // CSS language prefix for fenced blocks
+      linkify: false,
+      // autoconvert URL-like texts to links
+      // Enable some language-neutral replacements + quotes beautification
+      typographer: false,
+      // Double + single quotes replacement pairs, when typographer enabled,
+      // and smartquotes on. Could be either a String or an Array.
+      // For example, you can use '«»„“' for Russian, '„“‚‘' for German,
+      // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
+      quotes: "\u201c\u201d\u2018\u2019",
+      /* “”‘’ */
+      // Highlighter function. Should return escaped HTML,
+      // or '' if the source string is not changed and should be escaped externaly.
+      // If result starts with <pre... internal wrapper is skipped.
+      // function (/*str, lang*/) { return ''; }
+      highlight: null,
+      maxNesting: 20
+    },
+    components: {
+      core: {
+        rules: [ "normalize", "block", "inline", "text_join" ]
+      },
+      block: {
+        rules: [ "paragraph" ]
+      },
+      inline: {
+        rules: [ "text" ],
+        rules2: [ "balance_pairs", "fragments_join" ]
+      }
+    }
+  };
+  // Commonmark default options
+    var commonmark = {
+    options: {
+      html: true,
+      // Enable HTML tags in source
+      xhtmlOut: true,
+      // Use '/' to close single tags ()
+      breaks: false,
+      // Convert '\n' in paragraphs into 
+      langPrefix: "language-",
+      // CSS language prefix for fenced blocks
+      linkify: false,
+      // autoconvert URL-like texts to links
+      // Enable some language-neutral replacements + quotes beautification
+      typographer: false,
+      // Double + single quotes replacement pairs, when typographer enabled,
+      // and smartquotes on. Could be either a String or an Array.
+      // For example, you can use '«»„“' for Russian, '„“‚‘' for German,
+      // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
+      quotes: "\u201c\u201d\u2018\u2019",
+      /* “”‘’ */
+      // Highlighter function. Should return escaped HTML,
+      // or '' if the source string is not changed and should be escaped externaly.
+      // If result starts with <pre... internal wrapper is skipped.
+      // function (/*str, lang*/) { return ''; }
+      highlight: null,
+      maxNesting: 20
+    },
+    components: {
+      core: {
+        rules: [ "normalize", "block", "inline", "text_join" ]
+      },
+      block: {
+        rules: [ "blockquote", "code", "fence", "heading", "hr", "html_block", "lheading", "list", "reference", "paragraph" ]
+      },
+      inline: {
+        rules: [ "autolink", "backticks", "emphasis", "entity", "escape", "html_inline", "image", "link", "newline", "text" ],
+        rules2: [ "balance_pairs", "emphasis", "fragments_join" ]
+      }
+    }
+  };
+  var punycode =  getAugmentedNamespace(punycode$2);
+  var config = {
+    default: _default,
+    zero: zero,
+    commonmark: commonmark
+  };
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  // This validator can prohibit more than really needed to prevent XSS. It's a
+  // tradeoff to keep code simple and to be secure by default.
+  
+  // If you need different setup - override validator method as you wish. Or
+  // replace it with dummy function and use external sanitizer.
+  
+    var BAD_PROTO_RE = /^(vbscript|javascript|file|data):/;
+  var GOOD_DATA_RE = /^data:image\/(gif|png|jpeg|webp);/;
+  function validateLink(url) {
+    // url should be normalized at this point, and existing entities are decoded
+    var str = url.trim().toLowerCase();
+    return BAD_PROTO_RE.test(str) ? GOOD_DATA_RE.test(str) ? true : false : true;
+  }
+  ////////////////////////////////////////////////////////////////////////////////
+    var RECODE_HOSTNAME_FOR = [ "http:", "https:", "mailto:" ];
+  function normalizeLink(url) {
+    var parsed = mdurl.parse(url, true);
+    if (parsed.hostname) {
+      // Encode hostnames in urls like:
+      // `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
+      // We don't encode unknown schemas, because it's likely that we encode
+      // something we shouldn't (e.g. `skype:name` treated as `skype:host`)
+      if (!parsed.protocol || RECODE_HOSTNAME_FOR.indexOf(parsed.protocol) >= 0) {
+        try {
+          parsed.hostname = punycode.toASCII(parsed.hostname);
+        } catch (er) {}
+      }
+    }
+    return mdurl.encode(mdurl.format(parsed));
+  }
+  function normalizeLinkText(url) {
+    var parsed = mdurl.parse(url, true);
+    if (parsed.hostname) {
+      // Encode hostnames in urls like:
+      // `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
+      // We don't encode unknown schemas, because it's likely that we encode
+      // something we shouldn't (e.g. `skype:name` treated as `skype:host`)
+      if (!parsed.protocol || RECODE_HOSTNAME_FOR.indexOf(parsed.protocol) >= 0) {
+        try {
+          parsed.hostname = punycode.toUnicode(parsed.hostname);
+        } catch (er) {}
+      }
+    }
+    // add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
+        return mdurl.decode(mdurl.format(parsed), mdurl.decode.defaultChars + "%");
+  }
+  /**
+	 * class MarkdownIt
+	 *
+	 * Main parser/renderer class.
+	 *
+	 * ##### Usage
+	 *
+	 * ```javascript
+	 * // node.js, "classic" way:
+	 * var MarkdownIt = require('markdown-it'),
+	 *     md = new MarkdownIt();
+	 * var result = md.render('# markdown-it rulezz!');
+	 *
+	 * // node.js, the same, but with sugar:
+	 * var md = require('markdown-it')();
+	 * var result = md.render('# markdown-it rulezz!');
+	 *
+	 * // browser without AMD, added to "window" on script load
+	 * // Note, there are no dash.
+	 * var md = window.markdownit();
+	 * var result = md.render('# markdown-it rulezz!');
+	 * ```
+	 *
+	 * Single line rendering, without paragraph wrap:
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')();
+	 * var result = md.renderInline('__markdown-it__ rulezz!');
+	 * ```
+	 **/
+  /**
+	 * new MarkdownIt([presetName, options])
+	 * - presetName (String): optional, `commonmark` / `zero`
+	 * - options (Object)
+	 *
+	 * Creates parser instanse with given config. Can be called without `new`.
+	 *
+	 * ##### presetName
+	 *
+	 * MarkdownIt provides named presets as a convenience to quickly
+	 * enable/disable active syntax rules and options for common use cases.
+	 *
+	 * - ["commonmark"](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/commonmark.js) -
+	 *   configures parser to strict [CommonMark](http://commonmark.org/) mode.
+	 * - [default](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/default.js) -
+	 *   similar to GFM, used when no preset name given. Enables all available rules,
+	 *   but still without html, typographer & autolinker.
+	 * - ["zero"](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/zero.js) -
+	 *   all rules disabled. Useful to quickly setup your config via `.enable()`.
+	 *   For example, when you need only `bold` and `italic` markup and nothing else.
+	 *
+	 * ##### options:
+	 *
+	 * - __html__ - `false`. Set `true` to enable HTML tags in source. Be careful!
+	 *   That's not safe! You may need external sanitizer to protect output from XSS.
+	 *   It's better to extend features via plugins, instead of enabling HTML.
+	 * - __xhtmlOut__ - `false`. Set `true` to add '/' when closing single tags
+	 *   (``). This is needed only for full CommonMark compatibility. In real
+	 *   world you will need HTML output.
+	 * - __breaks__ - `false`. Set `true` to convert `\n` in paragraphs into ``.
+	 * - __langPrefix__ - `language-`. CSS language class prefix for fenced blocks.
+	 *   Can be useful for external highlighters.
+	 * - __linkify__ - `false`. Set `true` to autoconvert URL-like text to links.
+	 * - __typographer__  - `false`. Set `true` to enable [some language-neutral
+	 *   replacement](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/replacements.js) +
+	 *   quotes beautification (smartquotes).
+	 * - __quotes__ - `“”‘’`, String or Array. Double + single quotes replacement
+	 *   pairs, when typographer enabled and smartquotes on. For example, you can
+	 *   use `'«»„“'` for Russian, `'„“‚‘'` for German, and
+	 *   `['«\xA0', '\xA0»', '‹\xA0', '\xA0›']` for French (including nbsp).
+	 * - __highlight__ - `null`. Highlighter function for fenced code blocks.
+	 *   Highlighter `function (str, lang)` should return escaped HTML. It can also
+	 *   return empty string if the source was not changed and should be escaped
+	 *   externaly. If result starts with <pre... internal wrapper is skipped.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * // commonmark mode
+	 * var md = require('markdown-it')('commonmark');
+	 *
+	 * // default mode
+	 * var md = require('markdown-it')();
+	 *
+	 * // enable everything
+	 * var md = require('markdown-it')({
+	 *   html: true,
+	 *   linkify: true,
+	 *   typographer: true
+	 * });
+	 * ```
+	 *
+	 * ##### Syntax highlighting
+	 *
+	 * ```js
+	 * var hljs = require('highlight.js') // https://highlightjs.org/
+	 *
+	 * var md = require('markdown-it')({
+	 *   highlight: function (str, lang) {
+	 *     if (lang && hljs.getLanguage(lang)) {
+	 *       try {
+	 *         return hljs.highlight(str, { language: lang, ignoreIllegals: true }).value;
+	 *       } catch (__) {}
+	 *     }
+	 *
+	 *     return ''; // use external default escaping
+	 *   }
+	 * });
+	 * ```
+	 *
+	 * Or with full wrapper override (if you need assign class to `<pre>`):
+	 *
+	 * ```javascript
+	 * var hljs = require('highlight.js') // https://highlightjs.org/
+	 *
+	 * // Actual default values
+	 * var md = require('markdown-it')({
+	 *   highlight: function (str, lang) {
+	 *     if (lang && hljs.getLanguage(lang)) {
+	 *       try {
+	 *         return '<pre class="hljs"><code>' +
+	 *                hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
+	 *                '</code></pre>';
+	 *       } catch (__) {}
+	 *     }
+	 *
+	 *     return '<pre class="hljs"><code>' + md.utils.escapeHtml(str) + '</code></pre>';
+	 *   }
+	 * });
+	 * ```
+	 *
+	 **/  function MarkdownIt(presetName, options) {
+    if (!(this instanceof MarkdownIt)) {
+      return new MarkdownIt(presetName, options);
+    }
+    if (!options) {
+      if (!utils.isString(presetName)) {
+        options = presetName || {};
+        presetName = "default";
+      }
+    }
+    /**
+	   * MarkdownIt#inline -> ParserInline
+	   *
+	   * Instance of [[ParserInline]]. You may need it to add new rules when
+	   * writing plugins. For simple rules control use [[MarkdownIt.disable]] and
+	   * [[MarkdownIt.enable]].
+	   **/    this.inline = new parser_inline;
+    /**
+	   * MarkdownIt#block -> ParserBlock
+	   *
+	   * Instance of [[ParserBlock]]. You may need it to add new rules when
+	   * writing plugins. For simple rules control use [[MarkdownIt.disable]] and
+	   * [[MarkdownIt.enable]].
+	   **/    this.block = new parser_block;
+    /**
+	   * MarkdownIt#core -> Core
+	   *
+	   * Instance of [[Core]] chain executor. You may need it to add new rules when
+	   * writing plugins. For simple rules control use [[MarkdownIt.disable]] and
+	   * [[MarkdownIt.enable]].
+	   **/    this.core = new parser_core;
+    /**
+	   * MarkdownIt#renderer -> Renderer
+	   *
+	   * Instance of [[Renderer]]. Use it to modify output look. Or to add rendering
+	   * rules for new token types, generated by plugins.
+	   *
+	   * ##### Example
+	   *
+	   * ```javascript
+	   * var md = require('markdown-it')();
+	   *
+	   * function myToken(tokens, idx, options, env, self) {
+	   *   //...
+	   *   return result;
+	   * };
+	   *
+	   * md.renderer.rules['my_token'] = myToken
+	   * ```
+	   *
+	   * See [[Renderer]] docs and [source code](https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js).
+	   **/    this.renderer = new renderer;
+    /**
+	   * MarkdownIt#linkify -> LinkifyIt
+	   *
+	   * [linkify-it](https://github.com/markdown-it/linkify-it) instance.
+	   * Used by [linkify](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/linkify.js)
+	   * rule.
+	   **/    this.linkify = new linkifyIt;
+    /**
+	   * MarkdownIt#validateLink(url) -> Boolean
+	   *
+	   * Link validation function. CommonMark allows too much in links. By default
+	   * we disable `javascript:`, `vbscript:`, `file:` schemas, and almost all `data:...` schemas
+	   * except some embedded image types.
+	   *
+	   * You can change this behaviour:
+	   *
+	   * ```javascript
+	   * var md = require('markdown-it')();
+	   * // enable everything
+	   * md.validateLink = function () { return true; }
+	   * ```
+	   **/    this.validateLink = validateLink;
+    /**
+	   * MarkdownIt#normalizeLink(url) -> String
+	   *
+	   * Function used to encode link url to a machine-readable format,
+	   * which includes url-encoding, punycode, etc.
+	   **/    this.normalizeLink = normalizeLink;
+    /**
+	   * MarkdownIt#normalizeLinkText(url) -> String
+	   *
+	   * Function used to decode link url to a human-readable format`
+	   **/    this.normalizeLinkText = normalizeLinkText;
+    // Expose utils & helpers for easy acces from plugins
+    /**
+	   * MarkdownIt#utils -> utils
+	   *
+	   * Assorted utility functions, useful to write plugins. See details
+	   * [here](https://github.com/markdown-it/markdown-it/blob/master/lib/common/utils.js).
+	   **/    this.utils = utils;
+    /**
+	   * MarkdownIt#helpers -> helpers
+	   *
+	   * Link components parser functions, useful to write plugins. See details
+	   * [here](https://github.com/markdown-it/markdown-it/blob/master/lib/helpers).
+	   **/    this.helpers = utils.assign({}, helpers);
+    this.options = {};
+    this.configure(presetName);
+    if (options) {
+      this.set(options);
+    }
+  }
+  /** chainable
+	 * MarkdownIt.set(options)
+	 *
+	 * Set parser options (in the same format as in constructor). Probably, you
+	 * will never need it, but you can change options after constructor call.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')()
+	 *             .set({ html: true, breaks: true })
+	 *             .set({ typographer, true });
+	 * ```
+	 *
+	 * __Note:__ To achieve the best possible performance, don't modify a
+	 * `markdown-it` instance options on the fly. If you need multiple configurations
+	 * it's best to create multiple instances and initialize each with separate
+	 * config.
+	 **/  MarkdownIt.prototype.set = function(options) {
+    utils.assign(this.options, options);
+    return this;
+  };
+  /** chainable, internal
+	 * MarkdownIt.configure(presets)
+	 *
+	 * Batch load of all options and compenent settings. This is internal method,
+	 * and you probably will not need it. But if you will - see available presets
+	 * and data structure [here](https://github.com/markdown-it/markdown-it/tree/master/lib/presets)
+	 *
+	 * We strongly recommend to use presets instead of direct config loads. That
+	 * will give better compatibility with next versions.
+	 **/  MarkdownIt.prototype.configure = function(presets) {
+    var self = this, presetName;
+    if (utils.isString(presets)) {
+      presetName = presets;
+      presets = config[presetName];
+      if (!presets) {
+        throw new Error('Wrong `markdown-it` preset "' + presetName + '", check name');
+      }
+    }
+    if (!presets) {
+      throw new Error("Wrong `markdown-it` preset, can't be empty");
+    }
+    if (presets.options) {
+      self.set(presets.options);
+    }
+    if (presets.components) {
+      Object.keys(presets.components).forEach((function(name) {
+        if (presets.components[name].rules) {
+          self[name].ruler.enableOnly(presets.components[name].rules);
+        }
+        if (presets.components[name].rules2) {
+          self[name].ruler2.enableOnly(presets.components[name].rules2);
+        }
+      }));
+    }
+    return this;
+  };
+  /** chainable
+	 * MarkdownIt.enable(list, ignoreInvalid)
+	 * - list (String|Array): rule name or list of rule names to enable
+	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
+	 *
+	 * Enable list or rules. It will automatically find appropriate components,
+	 * containing rules with given names. If rule not found, and `ignoreInvalid`
+	 * not set - throws exception.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * var md = require('markdown-it')()
+	 *             .enable(['sub', 'sup'])
+	 *             .disable('smartquotes');
+	 * ```
+	 **/  MarkdownIt.prototype.enable = function(list, ignoreInvalid) {
+    var result = [];
+    if (!Array.isArray(list)) {
+      list = [ list ];
+    }
+    [ "core", "block", "inline" ].forEach((function(chain) {
+      result = result.concat(this[chain].ruler.enable(list, true));
+    }), this);
+    result = result.concat(this.inline.ruler2.enable(list, true));
+    var missed = list.filter((function(name) {
+      return result.indexOf(name) < 0;
+    }));
+    if (missed.length && !ignoreInvalid) {
+      throw new Error("MarkdownIt. Failed to enable unknown rule(s): " + missed);
+    }
+    return this;
+  };
+  /** chainable
+	 * MarkdownIt.disable(list, ignoreInvalid)
+	 * - list (String|Array): rule name or list of rule names to disable.
+	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
+	 *
+	 * The same as [[MarkdownIt.enable]], but turn specified rules off.
+	 **/  MarkdownIt.prototype.disable = function(list, ignoreInvalid) {
+    var result = [];
+    if (!Array.isArray(list)) {
+      list = [ list ];
+    }
+    [ "core", "block", "inline" ].forEach((function(chain) {
+      result = result.concat(this[chain].ruler.disable(list, true));
+    }), this);
+    result = result.concat(this.inline.ruler2.disable(list, true));
+    var missed = list.filter((function(name) {
+      return result.indexOf(name) < 0;
+    }));
+    if (missed.length && !ignoreInvalid) {
+      throw new Error("MarkdownIt. Failed to disable unknown rule(s): " + missed);
+    }
+    return this;
+  };
+  /** chainable
+	 * MarkdownIt.use(plugin, params)
+	 *
+	 * Load specified plugin with given params into current parser instance.
+	 * It's just a sugar to call `plugin(md, params)` with curring.
+	 *
+	 * ##### Example
+	 *
+	 * ```javascript
+	 * var iterator = require('markdown-it-for-inline');
+	 * var md = require('markdown-it')()
+	 *             .use(iterator, 'foo_replace', 'text', function (tokens, idx) {
+	 *               tokens[idx].content = tokens[idx].content.replace(/foo/g, 'bar');
+	 *             });
+	 * ```
+	 **/  MarkdownIt.prototype.use = function(plugin /*, params, ... */) {
+    var args = [ this ].concat(Array.prototype.slice.call(arguments, 1));
+    plugin.apply(plugin, args);
+    return this;
+  };
+  /** internal
+	 * MarkdownIt.parse(src, env) -> Array
+	 * - src (String): source string
+	 * - env (Object): environment sandbox
+	 *
+	 * Parse input string and return list of block tokens (special token type
+	 * "inline" will contain list of inline tokens). You should not call this
+	 * method directly, until you write custom renderer (for example, to produce
+	 * AST).
+	 *
+	 * `env` is used to pass data between "distributed" rules and return additional
+	 * metadata like reference info, needed for the renderer. It also can be used to
+	 * inject data in specific cases. Usually, you will be ok to pass `{}`,
+	 * and then pass updated object to renderer.
+	 **/  MarkdownIt.prototype.parse = function(src, env) {
+    if (typeof src !== "string") {
+      throw new Error("Input data should be a String");
+    }
+    var state = new this.core.State(src, this, env);
+    this.core.process(state);
+    return state.tokens;
+  };
+  /**
+	 * MarkdownIt.render(src [, env]) -> String
+	 * - src (String): source string
+	 * - env (Object): environment sandbox
+	 *
+	 * Render markdown string into html. It does all magic for you :).
+	 *
+	 * `env` can be used to inject additional metadata (`{}` by default).
+	 * But you will not need it with high probability. See also comment
+	 * in [[MarkdownIt.parse]].
+	 **/  MarkdownIt.prototype.render = function(src, env) {
+    env = env || {};
+    return this.renderer.render(this.parse(src, env), this.options, env);
+  };
+  /** internal
+	 * MarkdownIt.parseInline(src, env) -> Array
+	 * - src (String): source string
+	 * - env (Object): environment sandbox
+	 *
+	 * The same as [[MarkdownIt.parse]] but skip all block rules. It returns the
+	 * block tokens list with the single `inline` element, containing parsed inline
+	 * tokens in `children` property. Also updates `env` object.
+	 **/  MarkdownIt.prototype.parseInline = function(src, env) {
+    var state = new this.core.State(src, this, env);
+    state.inlineMode = true;
+    this.core.process(state);
+    return state.tokens;
+  };
+  /**
+	 * MarkdownIt.renderInline(src [, env]) -> String
+	 * - src (String): source string
+	 * - env (Object): environment sandbox
+	 *
+	 * Similar to [[MarkdownIt.render]] but for single paragraph content. Result
+	 * will NOT be wrapped into `` tags.
+	 **/  MarkdownIt.prototype.renderInline = function(src, env) {
+    env = env || {};
+    return this.renderer.render(this.parseInline(src, env), this.options, env);
+  };
+  var lib = MarkdownIt;
+  var markdownIt = lib;
+  return markdownIt;
+}));
+
diff --git a/examples/server/public/deps_tailwindcss.js b/examples/server/public/deps_tailwindcss.js
new file mode 100644
index 000000000..6736cb8ca
--- /dev/null
+++ b/examples/server/public/deps_tailwindcss.js
@@ -0,0 +1,82 @@
+(()=>{var Iv=Object.create;var Ui=Object.defineProperty;var Dv=Object.getOwnPropertyDescriptor;var qv=Object.getOwnPropertyNames;var $v=Object.getPrototypeOf,Lv=Object.prototype.hasOwnProperty;var cf=r=>Ui(r,"__esModule",{value:!0});var pf=r=>{if(typeof require!="undefined")return require(r);throw new Error('Dynamic require of "'+r+'" is not supported')};var R=(r,e)=>()=>(r&&(e=r(r=0)),e);var x=(r,e)=>()=>(e||r((e={exports:{}}).exports,e),e.exports),Ge=(r,e)=>{cf(r);for(var t in e)Ui(r,t,{get:e[t],enumerable:!0})},Mv=(r,e,t)=>{if(e&&typeof e=="object"||typeof e=="function")for(let i of qv(e))!Lv.call(r,i)&&i!=="default"&&Ui(r,i,{get:()=>e[i],enumerable:!(t=Dv(e,i))||t.enumerable});return r},pe=r=>Mv(cf(Ui(r!=null?Iv($v(r)):{},"default",r&&r.__esModule&&"default"in r?{get:()=>r.default,enumerable:!0}:{value:r,enumerable:!0})),r);var m,u=R(()=>{m={platform:"",env:{},versions:{node:"14.17.6"}}});var Nv,be,ft=R(()=>{u();Nv=0,be={readFileSync:r=>self[r]||"",statSync:()=>({mtimeMs:Nv++}),promises:{readFile:r=>Promise.resolve(self[r]||"")}}});var Ns=x((sP,hf)=>{u();"use strict";var df=class{constructor(e={}){if(!(e.maxSize&&e.maxSize>0))throw new TypeError("`maxSize` must be a number greater than 0");if(typeof e.maxAge=="number"&&e.maxAge===0)throw new TypeError("`maxAge` must be a number greater than 0");this.maxSize=e.maxSize,this.maxAge=e.maxAge||1/0,this.onEviction=e.onEviction,this.cache=new Map,this.oldCache=new Map,this._size=0}_emitEvictions(e){if(typeof this.onEviction=="function")for(let[t,i]of e)this.onEviction(t,i.value)}_deleteIfExpired(e,t){return typeof t.expiry=="number"&&t.expiry<=Date.now()?(typeof this.onEviction=="function"&&this.onEviction(e,t.value),this.delete(e)):!1}_getOrDeleteIfExpired(e,t){if(this._deleteIfExpired(e,t)===!1)return t.value}_getItemValue(e,t){return t.expiry?this._getOrDeleteIfExpired(e,t):t.value}_peek(e,t){let i=t.get(e);return this._getItemValue(e,i)}_set(e,t){this.cache.set(e,t),this._size++,this._size>=this.maxSize&&(this._size=0,this._emitEvictions(this.oldCache),this.oldCache=this.cache,this.cache=new Map)}_moveToRecent(e,t){this.oldCache.delete(e),this._set(e,t)}*_entriesAscending(){for(let e of this.oldCache){let[t,i]=e;this.cache.has(t)||this._deleteIfExpired(t,i)===!1&&(yield e)}for(let e of this.cache){let[t,i]=e;this._deleteIfExpired(t,i)===!1&&(yield e)}}get(e){if(this.cache.has(e)){let t=this.cache.get(e);return this._getItemValue(e,t)}if(this.oldCache.has(e)){let t=this.oldCache.get(e);if(this._deleteIfExpired(e,t)===!1)return this._moveToRecent(e,t),t.value}}set(e,t,{maxAge:i=this.maxAge===1/0?void 0:Date.now()+this.maxAge}={}){this.cache.has(e)?this.cache.set(e,{value:t,maxAge:i}):this._set(e,{value:t,expiry:i})}has(e){return this.cache.has(e)?!this._deleteIfExpired(e,this.cache.get(e)):this.oldCache.has(e)?!this._deleteIfExpired(e,this.oldCache.get(e)):!1}peek(e){if(this.cache.has(e))return this._peek(e,this.cache);if(this.oldCache.has(e))return this._peek(e,this.oldCache)}delete(e){let t=this.cache.delete(e);return t&&this._size--,this.oldCache.delete(e)||t}clear(){this.cache.clear(),this.oldCache.clear(),this._size=0}resize(e){if(!(e&&e>0))throw new TypeError("`maxSize` must be a number greater than 0");let t=[...this._entriesAscending()],i=t.length-e;i<0?(this.cache=new Map(t),this.oldCache=new Map,this._size=t.length):(i>0&&this._emitEvictions(t.slice(0,i)),this.oldCache=new Map(t.slice(i)),this.cache=new Map,this._size=0),this.maxSize=e}*keys(){for(let[e]of this)yield e}*values(){for(let[,e]of this)yield e}*[Symbol.iterator](){for(let e of this.cache){let[t,i]=e;this._deleteIfExpired(t,i)===!1&&(yield[t,i.value])}for(let e of this.oldCache){let[t,i]=e;this.cache.has(t)||this._deleteIfExpired(t,i)===!1&&(yield[t,i.value])}}*entriesDescending(){let e=[...this.cache];for(let t=e.length-1;t>=0;--t){let i=e[t],[n,a]=i;this._deleteIfExpired(n,a)===!1&&(yield[n,a.value])}e=[...this.oldCache];for(let t=e.length-1;t>=0;--t){let i=e[t],[n,a]=i;this.cache.has(n)||this._deleteIfExpired(n,a)===!1&&(yield[n,a.value])}}*entriesAscending(){for(let[e,t]of this._entriesAscending())yield[e,t.value]}get size(){if(!this._size)return this.oldCache.size;let e=0;for(let t of this.oldCache.keys())this.cache.has(t)||e++;return Math.min(this._size+e,this.maxSize)}};hf.exports=df});var mf,gf=R(()=>{u();mf=r=>r&&r._hash});function Vi(r){return mf(r,{ignoreUnknown:!0})}var yf=R(()=>{u();gf()});function xt(r){if(r=`${r}`,r==="0")return"0";if(/^[+-]?(\d+|\d*\.\d+)(e[+-]?\d+)?(%|\w+)?$/.test(r))return r.replace(/^[+-]?/,t=>t==="-"?"":"-");let e=["var","calc","min","max","clamp"];for(let t of e)if(r.includes(`${t}(`))return`calc(${r} * -1)`}var Hi=R(()=>{u()});var bf,wf=R(()=>{u();bf=["preflight","container","accessibility","pointerEvents","visibility","position","inset","isolation","zIndex","order","gridColumn","gridColumnStart","gridColumnEnd","gridRow","gridRowStart","gridRowEnd","float","clear","margin","boxSizing","lineClamp","display","aspectRatio","size","height","maxHeight","minHeight","width","minWidth","maxWidth","flex","flexShrink","flexGrow","flexBasis","tableLayout","captionSide","borderCollapse","borderSpacing","transformOrigin","translate","rotate","skew","scale","transform","animation","cursor","touchAction","userSelect","resize","scrollSnapType","scrollSnapAlign","scrollSnapStop","scrollMargin","scrollPadding","listStylePosition","listStyleType","listStyleImage","appearance","columns","breakBefore","breakInside","breakAfter","gridAutoColumns","gridAutoFlow","gridAutoRows","gridTemplateColumns","gridTemplateRows","flexDirection","flexWrap","placeContent","placeItems","alignContent","alignItems","justifyContent","justifyItems","gap","space","divideWidth","divideStyle","divideColor","divideOpacity","placeSelf","alignSelf","justifySelf","overflow","overscrollBehavior","scrollBehavior","textOverflow","hyphens","whitespace","textWrap","wordBreak","borderRadius","borderWidth","borderStyle","borderColor","borderOpacity","backgroundColor","backgroundOpacity","backgroundImage","gradientColorStops","boxDecorationBreak","backgroundSize","backgroundAttachment","backgroundClip","backgroundPosition","backgroundRepeat","backgroundOrigin","fill","stroke","strokeWidth","objectFit","objectPosition","padding","textAlign","textIndent","verticalAlign","fontFamily","fontSize","fontWeight","textTransform","fontStyle","fontVariantNumeric","lineHeight","letterSpacing","textColor","textOpacity","textDecoration","textDecorationColor","textDecorationStyle","textDecorationThickness","textUnderlineOffset","fontSmoothing","placeholderColor","placeholderOpacity","caretColor","accentColor","opacity","backgroundBlendMode","mixBlendMode","boxShadow","boxShadowColor","outlineStyle","outlineWidth","outlineOffset","outlineColor","ringWidth","ringColor","ringOpacity","ringOffsetWidth","ringOffsetColor","blur","brightness","contrast","dropShadow","grayscale","hueRotate","invert","saturate","sepia","filter","backdropBlur","backdropBrightness","backdropContrast","backdropGrayscale","backdropHueRotate","backdropInvert","backdropOpacity","backdropSaturate","backdropSepia","backdropFilter","transitionProperty","transitionDelay","transitionDuration","transitionTimingFunction","willChange","contain","content","forcedColorAdjust"]});function vf(r,e){return r===void 0?e:Array.isArray(r)?r:[...new Set(e.filter(i=>r!==!1&&r[i]!==!1).concat(Object.keys(r).filter(i=>r[i]!==!1)))]}var xf=R(()=>{u()});var kf={};Ge(kf,{default:()=>Qe});var Qe,Wi=R(()=>{u();Qe=new Proxy({},{get:()=>String})});function Bs(r,e,t){typeof m!="undefined"&&m.env.JEST_WORKER_ID||t&&Sf.has(t)||(t&&Sf.add(t),console.warn(""),e.forEach(i=>console.warn(r,"-",i)))}function Fs(r){return Qe.dim(r)}var Sf,G,Be=R(()=>{u();Wi();Sf=new Set;G={info(r,e){Bs(Qe.bold(Qe.cyan("info")),...Array.isArray(r)?[r]:[e,r])},warn(r,e){["content-problems"].includes(r)||Bs(Qe.bold(Qe.yellow("warn")),...Array.isArray(r)?[r]:[e,r])},risk(r,e){Bs(Qe.bold(Qe.magenta("risk")),...Array.isArray(r)?[r]:[e,r])}}});var Af={};Ge(Af,{default:()=>js});function qr({version:r,from:e,to:t}){G.warn(`${e}-color-renamed`,[`As of Tailwind CSS ${r}, \`${e}\` has been renamed to \`${t}\`.`,"Update your configuration file to silence this warning."])}var js,zs=R(()=>{u();Be();js={inherit:"inherit",current:"currentColor",transparent:"transparent",black:"#000",white:"#fff",slate:{50:"#f8fafc",100:"#f1f5f9",200:"#e2e8f0",300:"#cbd5e1",400:"#94a3b8",500:"#64748b",600:"#475569",700:"#334155",800:"#1e293b",900:"#0f172a",950:"#020617"},gray:{50:"#f9fafb",100:"#f3f4f6",200:"#e5e7eb",300:"#d1d5db",400:"#9ca3af",500:"#6b7280",600:"#4b5563",700:"#374151",800:"#1f2937",900:"#111827",950:"#030712"},zinc:{50:"#fafafa",100:"#f4f4f5",200:"#e4e4e7",300:"#d4d4d8",400:"#a1a1aa",500:"#71717a",600:"#52525b",700:"#3f3f46",800:"#27272a",900:"#18181b",950:"#09090b"},neutral:{50:"#fafafa",100:"#f5f5f5",200:"#e5e5e5",300:"#d4d4d4",400:"#a3a3a3",500:"#737373",600:"#525252",700:"#404040",800:"#262626",900:"#171717",950:"#0a0a0a"},stone:{50:"#fafaf9",100:"#f5f5f4",200:"#e7e5e4",300:"#d6d3d1",400:"#a8a29e",500:"#78716c",600:"#57534e",700:"#44403c",800:"#292524",900:"#1c1917",950:"#0c0a09"},red:{50:"#fef2f2",100:"#fee2e2",200:"#fecaca",300:"#fca5a5",400:"#f87171",500:"#ef4444",600:"#dc2626",700:"#b91c1c",800:"#991b1b",900:"#7f1d1d",950:"#450a0a"},orange:{50:"#fff7ed",100:"#ffedd5",200:"#fed7aa",300:"#fdba74",400:"#fb923c",500:"#f97316",600:"#ea580c",700:"#c2410c",800:"#9a3412",900:"#7c2d12",950:"#431407"},amber:{50:"#fffbeb",100:"#fef3c7",200:"#fde68a",300:"#fcd34d",400:"#fbbf24",500:"#f59e0b",600:"#d97706",700:"#b45309",800:"#92400e",900:"#78350f",950:"#451a03"},yellow:{50:"#fefce8",100:"#fef9c3",200:"#fef08a",300:"#fde047",400:"#facc15",500:"#eab308",600:"#ca8a04",700:"#a16207",800:"#854d0e",900:"#713f12",950:"#422006"},lime:{50:"#f7fee7",100:"#ecfccb",200:"#d9f99d",300:"#bef264",400:"#a3e635",500:"#84cc16",600:"#65a30d",700:"#4d7c0f",800:"#3f6212",900:"#365314",950:"#1a2e05"},green:{50:"#f0fdf4",100:"#dcfce7",200:"#bbf7d0",300:"#86efac",400:"#4ade80",500:"#22c55e",600:"#16a34a",700:"#15803d",800:"#166534",900:"#14532d",950:"#052e16"},emerald:{50:"#ecfdf5",100:"#d1fae5",200:"#a7f3d0",300:"#6ee7b7",400:"#34d399",500:"#10b981",600:"#059669",700:"#047857",800:"#065f46",900:"#064e3b",950:"#022c22"},teal:{50:"#f0fdfa",100:"#ccfbf1",200:"#99f6e4",300:"#5eead4",400:"#2dd4bf",500:"#14b8a6",600:"#0d9488",700:"#0f766e",800:"#115e59",900:"#134e4a",950:"#042f2e"},cyan:{50:"#ecfeff",100:"#cffafe",200:"#a5f3fc",300:"#67e8f9",400:"#22d3ee",500:"#06b6d4",600:"#0891b2",700:"#0e7490",800:"#155e75",900:"#164e63",950:"#083344"},sky:{50:"#f0f9ff",100:"#e0f2fe",200:"#bae6fd",300:"#7dd3fc",400:"#38bdf8",500:"#0ea5e9",600:"#0284c7",700:"#0369a1",800:"#075985",900:"#0c4a6e",950:"#082f49"},blue:{50:"#eff6ff",100:"#dbeafe",200:"#bfdbfe",300:"#93c5fd",400:"#60a5fa",500:"#3b82f6",600:"#2563eb",700:"#1d4ed8",800:"#1e40af",900:"#1e3a8a",950:"#172554"},indigo:{50:"#eef2ff",100:"#e0e7ff",200:"#c7d2fe",300:"#a5b4fc",400:"#818cf8",500:"#6366f1",600:"#4f46e5",700:"#4338ca",800:"#3730a3",900:"#312e81",950:"#1e1b4b"},violet:{50:"#f5f3ff",100:"#ede9fe",200:"#ddd6fe",300:"#c4b5fd",400:"#a78bfa",500:"#8b5cf6",600:"#7c3aed",700:"#6d28d9",800:"#5b21b6",900:"#4c1d95",950:"#2e1065"},purple:{50:"#faf5ff",100:"#f3e8ff",200:"#e9d5ff",300:"#d8b4fe",400:"#c084fc",500:"#a855f7",600:"#9333ea",700:"#7e22ce",800:"#6b21a8",900:"#581c87",950:"#3b0764"},fuchsia:{50:"#fdf4ff",100:"#fae8ff",200:"#f5d0fe",300:"#f0abfc",400:"#e879f9",500:"#d946ef",600:"#c026d3",700:"#a21caf",800:"#86198f",900:"#701a75",950:"#4a044e"},pink:{50:"#fdf2f8",100:"#fce7f3",200:"#fbcfe8",300:"#f9a8d4",400:"#f472b6",500:"#ec4899",600:"#db2777",700:"#be185d",800:"#9d174d",900:"#831843",950:"#500724"},rose:{50:"#fff1f2",100:"#ffe4e6",200:"#fecdd3",300:"#fda4af",400:"#fb7185",500:"#f43f5e",600:"#e11d48",700:"#be123c",800:"#9f1239",900:"#881337",950:"#4c0519"},get lightBlue(){return qr({version:"v2.2",from:"lightBlue",to:"sky"}),this.sky},get warmGray(){return qr({version:"v3.0",from:"warmGray",to:"stone"}),this.stone},get trueGray(){return qr({version:"v3.0",from:"trueGray",to:"neutral"}),this.neutral},get coolGray(){return qr({version:"v3.0",from:"coolGray",to:"gray"}),this.gray},get blueGray(){return qr({version:"v3.0",from:"blueGray",to:"slate"}),this.slate}}});function Us(r,...e){for(let t of e){for(let i in t)r?.hasOwnProperty?.(i)||(r[i]=t[i]);for(let i of Object.getOwnPropertySymbols(t))r?.hasOwnProperty?.(i)||(r[i]=t[i])}return r}var Cf=R(()=>{u()});function kt(r){if(Array.isArray(r))return r;let e=r.split("[").length-1,t=r.split("]").length-1;if(e!==t)throw new Error(`Path is invalid. Has unbalanced brackets: ${r}`);return r.split(/\.(?![^\[]*\])|[\[\]]/g).filter(Boolean)}var Gi=R(()=>{u()});function we(r,e){return Qi.future.includes(e)?r.future==="all"||(r?.future?.[e]??_f[e]??!1):Qi.experimental.includes(e)?r.experimental==="all"||(r?.experimental?.[e]??_f[e]??!1):!1}function Ef(r){return r.experimental==="all"?Qi.experimental:Object.keys(r?.experimental??{}).filter(e=>Qi.experimental.includes(e)&&r.experimental[e])}function Of(r){if(m.env.JEST_WORKER_ID===void 0&&Ef(r).length>0){let e=Ef(r).map(t=>Qe.yellow(t)).join(", ");G.warn("experimental-flags-enabled",[`You have enabled experimental features: ${e}`,"Experimental features in Tailwind CSS are not covered by semver, may introduce breaking changes, and can change at any time."])}}var _f,Qi,ct=R(()=>{u();Wi();Be();_f={optimizeUniversalDefaults:!1,generalizedModifiers:!0,disableColorOpacityUtilitiesByDefault:!1,relativeContentPathsByDefault:!1},Qi={future:["hoverOnlyWhenSupported","respectDefaultRingColorOpacity","disableColorOpacityUtilitiesByDefault","relativeContentPathsByDefault"],experimental:["optimizeUniversalDefaults","generalizedModifiers"]}});function Tf(r){(()=>{if(r.purge||!r.content||!Array.isArray(r.content)&&!(typeof r.content=="object"&&r.content!==null))return!1;if(Array.isArray(r.content))return r.content.every(t=>typeof t=="string"?!0:!(typeof t?.raw!="string"||t?.extension&&typeof t?.extension!="string"));if(typeof r.content=="object"&&r.content!==null){if(Object.keys(r.content).some(t=>!["files","relative","extract","transform"].includes(t)))return!1;if(Array.isArray(r.content.files)){if(!r.content.files.every(t=>typeof t=="string"?!0:!(typeof t?.raw!="string"||t?.extension&&typeof t?.extension!="string")))return!1;if(typeof r.content.extract=="object"){for(let t of Object.values(r.content.extract))if(typeof t!="function")return!1}else if(!(r.content.extract===void 0||typeof r.content.extract=="function"))return!1;if(typeof r.content.transform=="object"){for(let t of Object.values(r.content.transform))if(typeof t!="function")return!1}else if(!(r.content.transform===void 0||typeof r.content.transform=="function"))return!1;if(typeof r.content.relative!="boolean"&&typeof r.content.relative!="undefined")return!1}return!0}return!1})()||G.warn("purge-deprecation",["The `purge`/`content` options have changed in Tailwind CSS v3.0.","Update your configuration file to eliminate this warning.","https://tailwindcss.com/docs/upgrade-guide#configure-content-sources"]),r.safelist=(()=>{let{content:t,purge:i,safelist:n}=r;return Array.isArray(n)?n:Array.isArray(t?.safelist)?t.safelist:Array.isArray(i?.safelist)?i.safelist:Array.isArray(i?.options?.safelist)?i.options.safelist:[]})(),r.blocklist=(()=>{let{blocklist:t}=r;if(Array.isArray(t)){if(t.every(i=>typeof i=="string"))return t;G.warn("blocklist-invalid",["The `blocklist` option must be an array of strings.","https://tailwindcss.com/docs/content-configuration#discarding-classes"])}return[]})(),typeof r.prefix=="function"?(G.warn("prefix-function",["As of Tailwind CSS v3.0, `prefix` cannot be a function.","Update `prefix` in your configuration to be a string to eliminate this warning.","https://tailwindcss.com/docs/upgrade-guide#prefix-cannot-be-a-function"]),r.prefix=""):r.prefix=r.prefix??"",r.content={relative:(()=>{let{content:t}=r;return t?.relative?t.relative:we(r,"relativeContentPathsByDefault")})(),files:(()=>{let{content:t,purge:i}=r;return Array.isArray(i)?i:Array.isArray(i?.content)?i.content:Array.isArray(t)?t:Array.isArray(t?.content)?t.content:Array.isArray(t?.files)?t.files:[]})(),extract:(()=>{let t=(()=>r.purge?.extract?r.purge.extract:r.content?.extract?r.content.extract:r.purge?.extract?.DEFAULT?r.purge.extract.DEFAULT:r.content?.extract?.DEFAULT?r.content.extract.DEFAULT:r.purge?.options?.extractors?r.purge.options.extractors:r.content?.options?.extractors?r.content.options.extractors:{})(),i={},n=(()=>{if(r.purge?.options?.defaultExtractor)return r.purge.options.defaultExtractor;if(r.content?.options?.defaultExtractor)return r.content.options.defaultExtractor})();if(n!==void 0&&(i.DEFAULT=n),typeof t=="function")i.DEFAULT=t;else if(Array.isArray(t))for(let{extensions:a,extractor:s}of t??[])for(let o of a)i[o]=s;else typeof t=="object"&&t!==null&&Object.assign(i,t);return i})(),transform:(()=>{let t=(()=>r.purge?.transform?r.purge.transform:r.content?.transform?r.content.transform:r.purge?.transform?.DEFAULT?r.purge.transform.DEFAULT:r.content?.transform?.DEFAULT?r.content.transform.DEFAULT:{})(),i={};return typeof t=="function"?i.DEFAULT=t:typeof t=="object"&&t!==null&&Object.assign(i,t),i})()};for(let t of r.content.files)if(typeof t=="string"&&/{([^,]*?)}/g.test(t)){G.warn("invalid-glob-braces",[`The glob pattern ${Fs(t)} in your Tailwind CSS configuration is invalid.`,`Update it to ${Fs(t.replace(/{([^,]*?)}/g,"$1"))} to silence this warning.`]);break}return r}var Rf=R(()=>{u();ct();Be()});function ke(r){if(Object.prototype.toString.call(r)!=="[object Object]")return!1;let e=Object.getPrototypeOf(r);return e===null||Object.getPrototypeOf(e)===null}var Kt=R(()=>{u()});function St(r){return Array.isArray(r)?r.map(e=>St(e)):typeof r=="object"&&r!==null?Object.fromEntries(Object.entries(r).map(([e,t])=>[e,St(t)])):r}var Yi=R(()=>{u()});function jt(r){return r.replace(/\\,/g,"\\2c ")}var Ki=R(()=>{u()});var Vs,Pf=R(()=>{u();Vs={aliceblue:[240,248,255],antiquewhite:[250,235,215],aqua:[0,255,255],aquamarine:[127,255,212],azure:[240,255,255],beige:[245,245,220],bisque:[255,228,196],black:[0,0,0],blanchedalmond:[255,235,205],blue:[0,0,255],blueviolet:[138,43,226],brown:[165,42,42],burlywood:[222,184,135],cadetblue:[95,158,160],chartreuse:[127,255,0],chocolate:[210,105,30],coral:[255,127,80],cornflowerblue:[100,149,237],cornsilk:[255,248,220],crimson:[220,20,60],cyan:[0,255,255],darkblue:[0,0,139],darkcyan:[0,139,139],darkgoldenrod:[184,134,11],darkgray:[169,169,169],darkgreen:[0,100,0],darkgrey:[169,169,169],darkkhaki:[189,183,107],darkmagenta:[139,0,139],darkolivegreen:[85,107,47],darkorange:[255,140,0],darkorchid:[153,50,204],darkred:[139,0,0],darksalmon:[233,150,122],darkseagreen:[143,188,143],darkslateblue:[72,61,139],darkslategray:[47,79,79],darkslategrey:[47,79,79],darkturquoise:[0,206,209],darkviolet:[148,0,211],deeppink:[255,20,147],deepskyblue:[0,191,255],dimgray:[105,105,105],dimgrey:[105,105,105],dodgerblue:[30,144,255],firebrick:[178,34,34],floralwhite:[255,250,240],forestgreen:[34,139,34],fuchsia:[255,0,255],gainsboro:[220,220,220],ghostwhite:[248,248,255],gold:[255,215,0],goldenrod:[218,165,32],gray:[128,128,128],green:[0,128,0],greenyellow:[173,255,47],grey:[128,128,128],honeydew:[240,255,240],hotpink:[255,105,180],indianred:[205,92,92],indigo:[75,0,130],ivory:[255,255,240],khaki:[240,230,140],lavender:[230,230,250],lavenderblush:[255,240,245],lawngreen:[124,252,0],lemonchiffon:[255,250,205],lightblue:[173,216,230],lightcoral:[240,128,128],lightcyan:[224,255,255],lightgoldenrodyellow:[250,250,210],lightgray:[211,211,211],lightgreen:[144,238,144],lightgrey:[211,211,211],lightpink:[255,182,193],lightsalmon:[255,160,122],lightseagreen:[32,178,170],lightskyblue:[135,206,250],lightslategray:[119,136,153],lightslategrey:[119,136,153],lightsteelblue:[176,196,222],lightyellow:[255,255,224],lime:[0,255,0],limegreen:[50,205,50],linen:[250,240,230],magenta:[255,0,255],maroon:[128,0,0],mediumaquamarine:[102,205,170],mediumblue:[0,0,205],mediumorchid:[186,85,211],mediumpurple:[147,112,219],mediumseagreen:[60,179,113],mediumslateblue:[123,104,238],mediumspringgreen:[0,250,154],mediumturquoise:[72,209,204],mediumvioletred:[199,21,133],midnightblue:[25,25,112],mintcream:[245,255,250],mistyrose:[255,228,225],moccasin:[255,228,181],navajowhite:[255,222,173],navy:[0,0,128],oldlace:[253,245,230],olive:[128,128,0],olivedrab:[107,142,35],orange:[255,165,0],orangered:[255,69,0],orchid:[218,112,214],palegoldenrod:[238,232,170],palegreen:[152,251,152],paleturquoise:[175,238,238],palevioletred:[219,112,147],papayawhip:[255,239,213],peachpuff:[255,218,185],peru:[205,133,63],pink:[255,192,203],plum:[221,160,221],powderblue:[176,224,230],purple:[128,0,128],rebeccapurple:[102,51,153],red:[255,0,0],rosybrown:[188,143,143],royalblue:[65,105,225],saddlebrown:[139,69,19],salmon:[250,128,114],sandybrown:[244,164,96],seagreen:[46,139,87],seashell:[255,245,238],sienna:[160,82,45],silver:[192,192,192],skyblue:[135,206,235],slateblue:[106,90,205],slategray:[112,128,144],slategrey:[112,128,144],snow:[255,250,250],springgreen:[0,255,127],steelblue:[70,130,180],tan:[210,180,140],teal:[0,128,128],thistle:[216,191,216],tomato:[255,99,71],turquoise:[64,224,208],violet:[238,130,238],wheat:[245,222,179],white:[255,255,255],whitesmoke:[245,245,245],yellow:[255,255,0],yellowgreen:[154,205,50]}});function $r(r,{loose:e=!1}={}){if(typeof r!="string")return null;if(r=r.trim(),r==="transparent")return{mode:"rgb",color:["0","0","0"],alpha:"0"};if(r in Vs)return{mode:"rgb",color:Vs[r].map(a=>a.toString())};let t=r.replace(Fv,(a,s,o,l,c)=>["#",s,s,o,o,l,l,c?c+c:""].join("")).match(Bv);if(t!==null)return{mode:"rgb",color:[parseInt(t[1],16),parseInt(t[2],16),parseInt(t[3],16)].map(a=>a.toString()),alpha:t[4]?(parseInt(t[4],16)/255).toString():void 0};let i=r.match(jv)??r.match(zv);if(i===null)return null;let n=[i[2],i[3],i[4]].filter(Boolean).map(a=>a.toString());return n.length===2&&n[0].startsWith("var(")?{mode:i[1],color:[n[0]],alpha:n[1]}:!e&&n.length!==3||n.length<3&&!n.some(a=>/^var\(.*?\)$/.test(a))?null:{mode:i[1],color:n,alpha:i[5]?.toString?.()}}function Hs({mode:r,color:e,alpha:t}){let i=t!==void 0;return r==="rgba"||r==="hsla"?`${r}(${e.join(", ")}${i?`, ${t}`:""})`:`${r}(${e.join(" ")}${i?` / ${t}`:""})`}var Bv,Fv,At,Xi,If,Ct,jv,zv,Ws=R(()=>{u();Pf();Bv=/^#([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})?$/i,Fv=/^#([a-f\d])([a-f\d])([a-f\d])([a-f\d])?$/i,At=/(?:\d+|\d*\.\d+)%?/,Xi=/(?:\s*,\s*|\s+)/,If=/\s*[,/]\s*/,Ct=/var\(--(?:[^ )]*?)(?:,(?:[^ )]*?|var\(--[^ )]*?\)))?\)/,jv=new RegExp(`^(rgba?)\\(\\s*(${At.source}|${Ct.source})(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${If.source}(${At.source}|${Ct.source}))?\\s*\\)$`),zv=new RegExp(`^(hsla?)\\(\\s*((?:${At.source})(?:deg|rad|grad|turn)?|${Ct.source})(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${If.source}(${At.source}|${Ct.source}))?\\s*\\)$`)});function Ze(r,e,t){if(typeof r=="function")return r({opacityValue:e});let i=$r(r,{loose:!0});return i===null?t:Hs({...i,alpha:e})}function Ae({color:r,property:e,variable:t}){let i=[].concat(e);if(typeof r=="function")return{[t]:"1",...Object.fromEntries(i.map(a=>[a,r({opacityVariable:t,opacityValue:`var(${t})`})]))};let n=$r(r);return n===null?Object.fromEntries(i.map(a=>[a,r])):n.alpha!==void 0?Object.fromEntries(i.map(a=>[a,r])):{[t]:"1",...Object.fromEntries(i.map(a=>[a,Hs({...n,alpha:`var(${t})`})]))}}var Lr=R(()=>{u();Ws()});function ve(r,e){let t=[],i=[],n=0,a=!1;for(let s=0;s<r.length;s++){let o=r[s];t.length===0&&o===e[0]&&!a&&(e.length===1||r.slice(s,s+e.length)===e)&&(i.push(r.slice(n,s)),n=s+e.length),a=a?!1:o==="\\",o==="("||o==="["||o==="{"?t.push(o):(o===")"&&t[t.length-1]==="("||o==="]"&&t[t.length-1]==="["||o==="}"&&t[t.length-1]==="{")&&t.pop()}return i.push(r.slice(n)),i}var zt=R(()=>{u()});function Ji(r){return ve(r,",").map(t=>{let i=t.trim(),n={raw:i},a=i.split(Vv),s=new Set;for(let o of a)Df.lastIndex=0,!s.has("KEYWORD")&&Uv.has(o)?(n.keyword=o,s.add("KEYWORD")):Df.test(o)?s.has("X")?s.has("Y")?s.has("BLUR")?s.has("SPREAD")||(n.spread=o,s.add("SPREAD")):(n.blur=o,s.add("BLUR")):(n.y=o,s.add("Y")):(n.x=o,s.add("X")):n.color?(n.unknown||(n.unknown=[]),n.unknown.push(o)):n.color=o;return n.valid=n.x!==void 0&&n.y!==void 0,n})}function qf(r){return r.map(e=>e.valid?[e.keyword,e.x,e.y,e.blur,e.spread,e.color].filter(Boolean).join(" "):e.raw).join(", ")}var Uv,Vv,Df,Gs=R(()=>{u();zt();Uv=new Set(["inset","inherit","initial","revert","unset"]),Vv=/\ +(?![^(]*\))/g,Df=/^-?(\d+|\.\d+)(.*?)$/g});function Qs(r){return Hv.some(e=>new RegExp(`^${e}\\(.*\\)`).test(r))}function K(r,e=null,t=!0){let i=e&&Wv.has(e.property);return r.startsWith("--")&&!i?`var(${r})`:r.includes("url(")?r.split(/(url\(.*?\))/g).filter(Boolean).map(n=>/^url\(.*?\)$/.test(n)?n:K(n,e,!1)).join(""):(r=r.replace(/([^\\])_+/g,(n,a)=>a+" ".repeat(n.length-1)).replace(/^_/g," ").replace(/\\_/g,"_"),t&&(r=r.trim()),r=Gv(r),r)}function Ye(r){return r.includes("=")&&(r=r.replace(/(=.*)/g,(e,t)=>{if(t[1]==="'"||t[1]==='"')return t;if(t.length>2){let i=t[t.length-1];if(t[t.length-2]===" "&&(i==="i"||i==="I"||i==="s"||i==="S"))return`="${t.slice(1,-2)}" ${t[t.length-1]}`}return`="${t.slice(1)}"`})),r}function Gv(r){let e=["theme"],t=["min-content","max-content","fit-content","safe-area-inset-top","safe-area-inset-right","safe-area-inset-bottom","safe-area-inset-left","titlebar-area-x","titlebar-area-y","titlebar-area-width","titlebar-area-height","keyboard-inset-top","keyboard-inset-right","keyboard-inset-bottom","keyboard-inset-left","keyboard-inset-width","keyboard-inset-height","radial-gradient","linear-gradient","conic-gradient","repeating-radial-gradient","repeating-linear-gradient","repeating-conic-gradient","anchor-size"];return r.replace(/(calc|min|max|clamp)\(.+\)/g,i=>{let n="";function a(){let s=n.trimEnd();return s[s.length-1]}for(let s=0;s<i.length;s++){let o=function(f){return f.split("").every((d,p)=>i[s+p]===d)},l=function(f){let d=1/0;for(let h of f){let b=i.indexOf(h,s);b!==-1&&b<d&&(d=b)}let p=i.slice(s,d);return s+=p.length-1,p},c=i[s];if(o("var"))n+=l([")",","]);else if(t.some(f=>o(f))){let f=t.find(d=>o(d));n+=f,s+=f.length-1}else e.some(f=>o(f))?n+=l([")"]):o("[")?n+=l(["]"]):["+","-","*","/"].includes(c)&&!["(","+","-","*","/",","].includes(a())?n+=` ${c} `:n+=c}return n.replace(/\s+/g," ")})}function Ys(r){return r.startsWith("url(")}function Ks(r){return!isNaN(Number(r))||Qs(r)}function Mr(r){return r.endsWith("%")&&Ks(r.slice(0,-1))||Qs(r)}function Nr(r){return r==="0"||new RegExp(`^[+-]?[0-9]*.?[0-9]+(?:[eE][+-]?[0-9]+)?${Yv}$`).test(r)||Qs(r)}function $f(r){return Kv.has(r)}function Lf(r){let e=Ji(K(r));for(let t of e)if(!t.valid)return!1;return!0}function Mf(r){let e=0;return ve(r,"_").every(i=>(i=K(i),i.startsWith("var(")?!0:$r(i,{loose:!0})!==null?(e++,!0):!1))?e>0:!1}function Nf(r){let e=0;return ve(r,",").every(i=>(i=K(i),i.startsWith("var(")?!0:Ys(i)||Jv(i)||["element(","image(","cross-fade(","image-set("].some(n=>i.startsWith(n))?(e++,!0):!1))?e>0:!1}function Jv(r){r=K(r);for(let e of Xv)if(r.startsWith(`${e}(`))return!0;return!1}function Bf(r){let e=0;return ve(r,"_").every(i=>(i=K(i),i.startsWith("var(")?!0:Zv.has(i)||Nr(i)||Mr(i)?(e++,!0):!1))?e>0:!1}function Ff(r){let e=0;return ve(r,",").every(i=>(i=K(i),i.startsWith("var(")?!0:i.includes(" ")&&!/(['"])([^"']+)\1/g.test(i)||/^\d/g.test(i)?!1:(e++,!0)))?e>0:!1}function jf(r){return ex.has(r)}function zf(r){return tx.has(r)}function Uf(r){return rx.has(r)}var Hv,Wv,Qv,Yv,Kv,Xv,Zv,ex,tx,rx,Br=R(()=>{u();Ws();Gs();zt();Hv=["min","max","clamp","calc"];Wv=new Set(["scroll-timeline-name","timeline-scope","view-timeline-name","font-palette","anchor-name","anchor-scope","position-anchor","position-try-options","scroll-timeline","animation-timeline","view-timeline","position-try"]);Qv=["cm","mm","Q","in","pc","pt","px","em","ex","ch","rem","lh","rlh","vw","vh","vmin","vmax","vb","vi","svw","svh","lvw","lvh","dvw","dvh","cqw","cqh","cqi","cqb","cqmin","cqmax"],Yv=`(?:${Qv.join("|")})`;Kv=new Set(["thin","medium","thick"]);Xv=new Set(["conic-gradient","linear-gradient","radial-gradient","repeating-conic-gradient","repeating-linear-gradient","repeating-radial-gradient"]);Zv=new Set(["center","top","right","bottom","left"]);ex=new Set(["serif","sans-serif","monospace","cursive","fantasy","system-ui","ui-serif","ui-sans-serif","ui-monospace","ui-rounded","math","emoji","fangsong"]);tx=new Set(["xx-small","x-small","small","medium","large","x-large","xx-large","xxx-large"]);rx=new Set(["larger","smaller"])});function Vf(r){let e=["cover","contain"];return ve(r,",").every(t=>{let i=ve(t,"_").filter(Boolean);return i.length===1&&e.includes(i[0])?!0:i.length!==1&&i.length!==2?!1:i.every(n=>Nr(n)||Mr(n)||n==="auto")})}var Hf=R(()=>{u();Br();zt()});function Wf(r,e){r.walkClasses(t=>{t.value=e(t.value),t.raws&&t.raws.value&&(t.raws.value=jt(t.raws.value))})}function Gf(r,e){if(!_t(r))return;let t=r.slice(1,-1);if(!!e(t))return K(t)}function ix(r,e={},t){let i=e[r];if(i!==void 0)return xt(i);if(_t(r)){let n=Gf(r,t);return n===void 0?void 0:xt(n)}}function Zi(r,e={},{validate:t=()=>!0}={}){let i=e.values?.[r];return i!==void 0?i:e.supportsNegativeValues&&r.startsWith("-")?ix(r.slice(1),e.values,t):Gf(r,t)}function _t(r){return r.startsWith("[")&&r.endsWith("]")}function Qf(r){let e=r.lastIndexOf("/"),t=r.lastIndexOf("[",e),i=r.indexOf("]",e);return r[e-1]==="]"||r[e+1]==="["||t!==-1&&i!==-1&&t<e&&e<i&&(e=r.lastIndexOf("/",t)),e===-1||e===r.length-1?[r,void 0]:_t(r)&&!r.includes("]/[")?[r,void 0]:[r.slice(0,e),r.slice(e+1)]}function Xt(r){if(typeof r=="string"&&r.includes("<alpha-value>")){let e=r;return({opacityValue:t=1})=>e.replace(/<alpha-value>/g,t)}return r}function Yf(r){return K(r.slice(1,-1))}function nx(r,e={},{tailwindConfig:t={}}={}){if(e.values?.[r]!==void 0)return Xt(e.values?.[r]);let[i,n]=Qf(r);if(n!==void 0){let a=e.values?.[i]??(_t(i)?i.slice(1,-1):void 0);return a===void 0?void 0:(a=Xt(a),_t(n)?Ze(a,Yf(n)):t.theme?.opacity?.[n]===void 0?void 0:Ze(a,t.theme.opacity[n]))}return Zi(r,e,{validate:Mf})}function sx(r,e={}){return e.values?.[r]}function qe(r){return(e,t)=>Zi(e,t,{validate:r})}function ax(r,e){let t=r.indexOf(e);return t===-1?[void 0,r]:[r.slice(0,t),r.slice(t+1)]}function Js(r,e,t,i){if(t.values&&e in t.values)for(let{type:a}of r??[]){let s=Xs[a](e,t,{tailwindConfig:i});if(s!==void 0)return[s,a,null]}if(_t(e)){let a=e.slice(1,-1),[s,o]=ax(a,":");if(!/^[\w-_]+$/g.test(s))o=a;else if(s!==void 0&&!Kf.includes(s))return[];if(o.length>0&&Kf.includes(s))return[Zi(`[${o}]`,t),s,null]}let n=Zs(r,e,t,i);for(let a of n)return a;return[]}function*Zs(r,e,t,i){let n=we(i,"generalizedModifiers"),[a,s]=Qf(e);if(n&&t.modifiers!=null&&(t.modifiers==="any"||typeof t.modifiers=="object"&&(s&&_t(s)||s in t.modifiers))||(a=e,s=void 0),s!==void 0&&a===""&&(a="DEFAULT"),s!==void 0&&typeof t.modifiers=="object"){let l=t.modifiers?.[s]??null;l!==null?s=l:_t(s)&&(s=Yf(s))}for(let{type:l}of r??[]){let c=Xs[l](a,t,{tailwindConfig:i});c!==void 0&&(yield[c,l,s??null])}}var Xs,Kf,Fr=R(()=>{u();Ki();Lr();Br();Hi();Hf();ct();Xs={any:Zi,color:nx,url:qe(Ys),image:qe(Nf),length:qe(Nr),percentage:qe(Mr),position:qe(Bf),lookup:sx,"generic-name":qe(jf),"family-name":qe(Ff),number:qe(Ks),"line-width":qe($f),"absolute-size":qe(zf),"relative-size":qe(Uf),shadow:qe(Lf),size:qe(Vf)},Kf=Object.keys(Xs)});function X(r){return typeof r=="function"?r({}):r}var ea=R(()=>{u()});function Jt(r){return typeof r=="function"}function jr(r,...e){let t=e.pop();for(let i of e)for(let n in i){let a=t(r[n],i[n]);a===void 0?ke(r[n])&&ke(i[n])?r[n]=jr({},r[n],i[n],t):r[n]=i[n]:r[n]=a}return r}function ox(r,...e){return Jt(r)?r(...e):r}function lx(r){return r.reduce((e,{extend:t})=>jr(e,t,(i,n)=>i===void 0?[n]:Array.isArray(i)?[n,...i]:[n,i]),{})}function ux(r){return{...r.reduce((e,t)=>Us(e,t),{}),extend:lx(r)}}function Xf(r,e){if(Array.isArray(r)&&ke(r[0]))return r.concat(e);if(Array.isArray(e)&&ke(e[0])&&ke(r))return[r,...e];if(Array.isArray(e))return e}function fx({extend:r,...e}){return jr(e,r,(t,i)=>!Jt(t)&&!i.some(Jt)?jr({},t,...i,Xf):(n,a)=>jr({},...[t,...i].map(s=>ox(s,n,a)),Xf))}function*cx(r){let e=kt(r);if(e.length===0||(yield e,Array.isArray(r)))return;let t=/^(.*?)\s*\/\s*([^/]+)$/,i=r.match(t);if(i!==null){let[,n,a]=i,s=kt(n);s.alpha=a,yield s}}function px(r){let e=(t,i)=>{for(let n of cx(t)){let a=0,s=r;for(;s!=null&&a<n.length;)s=s[n[a++]],s=Jt(s)&&(n.alpha===void 0||a<=n.length-1)?s(e,ta):s;if(s!==void 0){if(n.alpha!==void 0){let o=Xt(s);return Ze(o,n.alpha,X(o))}return ke(s)?St(s):s}}return i};return Object.assign(e,{theme:e,...ta}),Object.keys(r).reduce((t,i)=>(t[i]=Jt(r[i])?r[i](e,ta):r[i],t),{})}function Jf(r){let e=[];return r.forEach(t=>{e=[...e,t];let i=t?.plugins??[];i.length!==0&&i.forEach(n=>{n.__isOptionsFunction&&(n=n()),e=[...e,...Jf([n?.config??{}])]})}),e}function dx(r){return[...r].reduceRight((t,i)=>Jt(i)?i({corePlugins:t}):vf(i,t),bf)}function hx(r){return[...r].reduceRight((t,i)=>[...t,...i],[])}function ra(r){let e=[...Jf(r),{prefix:"",important:!1,separator:":"}];return Tf(Us({theme:px(fx(ux(e.map(t=>t?.theme??{})))),corePlugins:dx(e.map(t=>t.corePlugins)),plugins:hx(r.map(t=>t?.plugins??[]))},...e))}var ta,Zf=R(()=>{u();Hi();wf();xf();zs();Cf();Gi();Rf();Kt();Yi();Fr();Lr();ea();ta={colors:js,negative(r){return Object.keys(r).filter(e=>r[e]!=="0").reduce((e,t)=>{let i=xt(r[t]);return i!==void 0&&(e[`-${t}`]=i),e},{})},breakpoints(r){return Object.keys(r).filter(e=>typeof r[e]=="string").reduce((e,t)=>({...e,[`screen-${t}`]:r[t]}),{})}}});var en=x((l3,ec)=>{u();ec.exports={content:[],presets:[],darkMode:"media",theme:{accentColor:({theme:r})=>({...r("colors"),auto:"auto"}),animation:{none:"none",spin:"spin 1s linear infinite",ping:"ping 1s cubic-bezier(0, 0, 0.2, 1) infinite",pulse:"pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite",bounce:"bounce 1s infinite"},aria:{busy:'busy="true"',checked:'checked="true"',disabled:'disabled="true"',expanded:'expanded="true"',hidden:'hidden="true"',pressed:'pressed="true"',readonly:'readonly="true"',required:'required="true"',selected:'selected="true"'},aspectRatio:{auto:"auto",square:"1 / 1",video:"16 / 9"},backdropBlur:({theme:r})=>r("blur"),backdropBrightness:({theme:r})=>r("brightness"),backdropContrast:({theme:r})=>r("contrast"),backdropGrayscale:({theme:r})=>r("grayscale"),backdropHueRotate:({theme:r})=>r("hueRotate"),backdropInvert:({theme:r})=>r("invert"),backdropOpacity:({theme:r})=>r("opacity"),backdropSaturate:({theme:r})=>r("saturate"),backdropSepia:({theme:r})=>r("sepia"),backgroundColor:({theme:r})=>r("colors"),backgroundImage:{none:"none","gradient-to-t":"linear-gradient(to top, var(--tw-gradient-stops))","gradient-to-tr":"linear-gradient(to top right, var(--tw-gradient-stops))","gradient-to-r":"linear-gradient(to right, var(--tw-gradient-stops))","gradient-to-br":"linear-gradient(to bottom right, var(--tw-gradient-stops))","gradient-to-b":"linear-gradient(to bottom, var(--tw-gradient-stops))","gradient-to-bl":"linear-gradient(to bottom left, var(--tw-gradient-stops))","gradient-to-l":"linear-gradient(to left, var(--tw-gradient-stops))","gradient-to-tl":"linear-gradient(to top left, var(--tw-gradient-stops))"},backgroundOpacity:({theme:r})=>r("opacity"),backgroundPosition:{bottom:"bottom",center:"center",left:"left","left-bottom":"left bottom","left-top":"left top",right:"right","right-bottom":"right bottom","right-top":"right top",top:"top"},backgroundSize:{auto:"auto",cover:"cover",contain:"contain"},blur:{0:"0",none:"",sm:"4px",DEFAULT:"8px",md:"12px",lg:"16px",xl:"24px","2xl":"40px","3xl":"64px"},borderColor:({theme:r})=>({...r("colors"),DEFAULT:r("colors.gray.200","currentColor")}),borderOpacity:({theme:r})=>r("opacity"),borderRadius:{none:"0px",sm:"0.125rem",DEFAULT:"0.25rem",md:"0.375rem",lg:"0.5rem",xl:"0.75rem","2xl":"1rem","3xl":"1.5rem",full:"9999px"},borderSpacing:({theme:r})=>({...r("spacing")}),borderWidth:{DEFAULT:"1px",0:"0px",2:"2px",4:"4px",8:"8px"},boxShadow:{sm:"0 1px 2px 0 rgb(0 0 0 / 0.05)",DEFAULT:"0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",md:"0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",lg:"0 10px 15px -3px rgb(0 0 0 / 0.1), 0 4px 6px -4px rgb(0 0 0 / 0.1)",xl:"0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1)","2xl":"0 25px 50px -12px rgb(0 0 0 / 0.25)",inner:"inset 0 2px 4px 0 rgb(0 0 0 / 0.05)",none:"none"},boxShadowColor:({theme:r})=>r("colors"),brightness:{0:"0",50:".5",75:".75",90:".9",95:".95",100:"1",105:"1.05",110:"1.1",125:"1.25",150:"1.5",200:"2"},caretColor:({theme:r})=>r("colors"),colors:({colors:r})=>({inherit:r.inherit,current:r.current,transparent:r.transparent,black:r.black,white:r.white,slate:r.slate,gray:r.gray,zinc:r.zinc,neutral:r.neutral,stone:r.stone,red:r.red,orange:r.orange,amber:r.amber,yellow:r.yellow,lime:r.lime,green:r.green,emerald:r.emerald,teal:r.teal,cyan:r.cyan,sky:r.sky,blue:r.blue,indigo:r.indigo,violet:r.violet,purple:r.purple,fuchsia:r.fuchsia,pink:r.pink,rose:r.rose}),columns:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12","3xs":"16rem","2xs":"18rem",xs:"20rem",sm:"24rem",md:"28rem",lg:"32rem",xl:"36rem","2xl":"42rem","3xl":"48rem","4xl":"56rem","5xl":"64rem","6xl":"72rem","7xl":"80rem"},container:{},content:{none:"none"},contrast:{0:"0",50:".5",75:".75",100:"1",125:"1.25",150:"1.5",200:"2"},cursor:{auto:"auto",default:"default",pointer:"pointer",wait:"wait",text:"text",move:"move",help:"help","not-allowed":"not-allowed",none:"none","context-menu":"context-menu",progress:"progress",cell:"cell",crosshair:"crosshair","vertical-text":"vertical-text",alias:"alias",copy:"copy","no-drop":"no-drop",grab:"grab",grabbing:"grabbing","all-scroll":"all-scroll","col-resize":"col-resize","row-resize":"row-resize","n-resize":"n-resize","e-resize":"e-resize","s-resize":"s-resize","w-resize":"w-resize","ne-resize":"ne-resize","nw-resize":"nw-resize","se-resize":"se-resize","sw-resize":"sw-resize","ew-resize":"ew-resize","ns-resize":"ns-resize","nesw-resize":"nesw-resize","nwse-resize":"nwse-resize","zoom-in":"zoom-in","zoom-out":"zoom-out"},divideColor:({theme:r})=>r("borderColor"),divideOpacity:({theme:r})=>r("borderOpacity"),divideWidth:({theme:r})=>r("borderWidth"),dropShadow:{sm:"0 1px 1px rgb(0 0 0 / 0.05)",DEFAULT:["0 1px 2px rgb(0 0 0 / 0.1)","0 1px 1px rgb(0 0 0 / 0.06)"],md:["0 4px 3px rgb(0 0 0 / 0.07)","0 2px 2px rgb(0 0 0 / 0.06)"],lg:["0 10px 8px rgb(0 0 0 / 0.04)","0 4px 3px rgb(0 0 0 / 0.1)"],xl:["0 20px 13px rgb(0 0 0 / 0.03)","0 8px 5px rgb(0 0 0 / 0.08)"],"2xl":"0 25px 25px rgb(0 0 0 / 0.15)",none:"0 0 #0000"},fill:({theme:r})=>({none:"none",...r("colors")}),flex:{1:"1 1 0%",auto:"1 1 auto",initial:"0 1 auto",none:"none"},flexBasis:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%"}),flexGrow:{0:"0",DEFAULT:"1"},flexShrink:{0:"0",DEFAULT:"1"},fontFamily:{sans:["ui-sans-serif","system-ui","sans-serif",'"Apple Color Emoji"','"Segoe UI Emoji"','"Segoe UI Symbol"','"Noto Color Emoji"'],serif:["ui-serif","Georgia","Cambria",'"Times New Roman"',"Times","serif"],mono:["ui-monospace","SFMono-Regular","Menlo","Monaco","Consolas",'"Liberation Mono"','"Courier New"',"monospace"]},fontSize:{xs:["0.75rem",{lineHeight:"1rem"}],sm:["0.875rem",{lineHeight:"1.25rem"}],base:["1rem",{lineHeight:"1.5rem"}],lg:["1.125rem",{lineHeight:"1.75rem"}],xl:["1.25rem",{lineHeight:"1.75rem"}],"2xl":["1.5rem",{lineHeight:"2rem"}],"3xl":["1.875rem",{lineHeight:"2.25rem"}],"4xl":["2.25rem",{lineHeight:"2.5rem"}],"5xl":["3rem",{lineHeight:"1"}],"6xl":["3.75rem",{lineHeight:"1"}],"7xl":["4.5rem",{lineHeight:"1"}],"8xl":["6rem",{lineHeight:"1"}],"9xl":["8rem",{lineHeight:"1"}]},fontWeight:{thin:"100",extralight:"200",light:"300",normal:"400",medium:"500",semibold:"600",bold:"700",extrabold:"800",black:"900"},gap:({theme:r})=>r("spacing"),gradientColorStops:({theme:r})=>r("colors"),gradientColorStopPositions:{"0%":"0%","5%":"5%","10%":"10%","15%":"15%","20%":"20%","25%":"25%","30%":"30%","35%":"35%","40%":"40%","45%":"45%","50%":"50%","55%":"55%","60%":"60%","65%":"65%","70%":"70%","75%":"75%","80%":"80%","85%":"85%","90%":"90%","95%":"95%","100%":"100%"},grayscale:{0:"0",DEFAULT:"100%"},gridAutoColumns:{auto:"auto",min:"min-content",max:"max-content",fr:"minmax(0, 1fr)"},gridAutoRows:{auto:"auto",min:"min-content",max:"max-content",fr:"minmax(0, 1fr)"},gridColumn:{auto:"auto","span-1":"span 1 / span 1","span-2":"span 2 / span 2","span-3":"span 3 / span 3","span-4":"span 4 / span 4","span-5":"span 5 / span 5","span-6":"span 6 / span 6","span-7":"span 7 / span 7","span-8":"span 8 / span 8","span-9":"span 9 / span 9","span-10":"span 10 / span 10","span-11":"span 11 / span 11","span-12":"span 12 / span 12","span-full":"1 / -1"},gridColumnEnd:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridColumnStart:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridRow:{auto:"auto","span-1":"span 1 / span 1","span-2":"span 2 / span 2","span-3":"span 3 / span 3","span-4":"span 4 / span 4","span-5":"span 5 / span 5","span-6":"span 6 / span 6","span-7":"span 7 / span 7","span-8":"span 8 / span 8","span-9":"span 9 / span 9","span-10":"span 10 / span 10","span-11":"span 11 / span 11","span-12":"span 12 / span 12","span-full":"1 / -1"},gridRowEnd:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridRowStart:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridTemplateColumns:{none:"none",subgrid:"subgrid",1:"repeat(1, minmax(0, 1fr))",2:"repeat(2, minmax(0, 1fr))",3:"repeat(3, minmax(0, 1fr))",4:"repeat(4, minmax(0, 1fr))",5:"repeat(5, minmax(0, 1fr))",6:"repeat(6, minmax(0, 1fr))",7:"repeat(7, minmax(0, 1fr))",8:"repeat(8, minmax(0, 1fr))",9:"repeat(9, minmax(0, 1fr))",10:"repeat(10, minmax(0, 1fr))",11:"repeat(11, minmax(0, 1fr))",12:"repeat(12, minmax(0, 1fr))"},gridTemplateRows:{none:"none",subgrid:"subgrid",1:"repeat(1, minmax(0, 1fr))",2:"repeat(2, minmax(0, 1fr))",3:"repeat(3, minmax(0, 1fr))",4:"repeat(4, minmax(0, 1fr))",5:"repeat(5, minmax(0, 1fr))",6:"repeat(6, minmax(0, 1fr))",7:"repeat(7, minmax(0, 1fr))",8:"repeat(8, minmax(0, 1fr))",9:"repeat(9, minmax(0, 1fr))",10:"repeat(10, minmax(0, 1fr))",11:"repeat(11, minmax(0, 1fr))",12:"repeat(12, minmax(0, 1fr))"},height:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%",full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),hueRotate:{0:"0deg",15:"15deg",30:"30deg",60:"60deg",90:"90deg",180:"180deg"},inset:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%",full:"100%"}),invert:{0:"0",DEFAULT:"100%"},keyframes:{spin:{to:{transform:"rotate(360deg)"}},ping:{"75%, 100%":{transform:"scale(2)",opacity:"0"}},pulse:{"50%":{opacity:".5"}},bounce:{"0%, 100%":{transform:"translateY(-25%)",animationTimingFunction:"cubic-bezier(0.8,0,1,1)"},"50%":{transform:"none",animationTimingFunction:"cubic-bezier(0,0,0.2,1)"}}},letterSpacing:{tighter:"-0.05em",tight:"-0.025em",normal:"0em",wide:"0.025em",wider:"0.05em",widest:"0.1em"},lineHeight:{none:"1",tight:"1.25",snug:"1.375",normal:"1.5",relaxed:"1.625",loose:"2",3:".75rem",4:"1rem",5:"1.25rem",6:"1.5rem",7:"1.75rem",8:"2rem",9:"2.25rem",10:"2.5rem"},listStyleType:{none:"none",disc:"disc",decimal:"decimal"},listStyleImage:{none:"none"},margin:({theme:r})=>({auto:"auto",...r("spacing")}),lineClamp:{1:"1",2:"2",3:"3",4:"4",5:"5",6:"6"},maxHeight:({theme:r})=>({...r("spacing"),none:"none",full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),maxWidth:({theme:r,breakpoints:e})=>({...r("spacing"),none:"none",xs:"20rem",sm:"24rem",md:"28rem",lg:"32rem",xl:"36rem","2xl":"42rem","3xl":"48rem","4xl":"56rem","5xl":"64rem","6xl":"72rem","7xl":"80rem",full:"100%",min:"min-content",max:"max-content",fit:"fit-content",prose:"65ch",...e(r("screens"))}),minHeight:({theme:r})=>({...r("spacing"),full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),minWidth:({theme:r})=>({...r("spacing"),full:"100%",min:"min-content",max:"max-content",fit:"fit-content"}),objectPosition:{bottom:"bottom",center:"center",left:"left","left-bottom":"left bottom","left-top":"left top",right:"right","right-bottom":"right bottom","right-top":"right top",top:"top"},opacity:{0:"0",5:"0.05",10:"0.1",15:"0.15",20:"0.2",25:"0.25",30:"0.3",35:"0.35",40:"0.4",45:"0.45",50:"0.5",55:"0.55",60:"0.6",65:"0.65",70:"0.7",75:"0.75",80:"0.8",85:"0.85",90:"0.9",95:"0.95",100:"1"},order:{first:"-9999",last:"9999",none:"0",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12"},outlineColor:({theme:r})=>r("colors"),outlineOffset:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},outlineWidth:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},padding:({theme:r})=>r("spacing"),placeholderColor:({theme:r})=>r("colors"),placeholderOpacity:({theme:r})=>r("opacity"),ringColor:({theme:r})=>({DEFAULT:r("colors.blue.500","#3b82f6"),...r("colors")}),ringOffsetColor:({theme:r})=>r("colors"),ringOffsetWidth:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},ringOpacity:({theme:r})=>({DEFAULT:"0.5",...r("opacity")}),ringWidth:{DEFAULT:"3px",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},rotate:{0:"0deg",1:"1deg",2:"2deg",3:"3deg",6:"6deg",12:"12deg",45:"45deg",90:"90deg",180:"180deg"},saturate:{0:"0",50:".5",100:"1",150:"1.5",200:"2"},scale:{0:"0",50:".5",75:".75",90:".9",95:".95",100:"1",105:"1.05",110:"1.1",125:"1.25",150:"1.5"},screens:{sm:"640px",md:"768px",lg:"1024px",xl:"1280px","2xl":"1536px"},scrollMargin:({theme:r})=>({...r("spacing")}),scrollPadding:({theme:r})=>r("spacing"),sepia:{0:"0",DEFAULT:"100%"},skew:{0:"0deg",1:"1deg",2:"2deg",3:"3deg",6:"6deg",12:"12deg"},space:({theme:r})=>({...r("spacing")}),spacing:{px:"1px",0:"0px",.5:"0.125rem",1:"0.25rem",1.5:"0.375rem",2:"0.5rem",2.5:"0.625rem",3:"0.75rem",3.5:"0.875rem",4:"1rem",5:"1.25rem",6:"1.5rem",7:"1.75rem",8:"2rem",9:"2.25rem",10:"2.5rem",11:"2.75rem",12:"3rem",14:"3.5rem",16:"4rem",20:"5rem",24:"6rem",28:"7rem",32:"8rem",36:"9rem",40:"10rem",44:"11rem",48:"12rem",52:"13rem",56:"14rem",60:"15rem",64:"16rem",72:"18rem",80:"20rem",96:"24rem"},stroke:({theme:r})=>({none:"none",...r("colors")}),strokeWidth:{0:"0",1:"1",2:"2"},supports:{},data:{},textColor:({theme:r})=>r("colors"),textDecorationColor:({theme:r})=>r("colors"),textDecorationThickness:{auto:"auto","from-font":"from-font",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},textIndent:({theme:r})=>({...r("spacing")}),textOpacity:({theme:r})=>r("opacity"),textUnderlineOffset:{auto:"auto",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},transformOrigin:{center:"center",top:"top","top-right":"top right",right:"right","bottom-right":"bottom right",bottom:"bottom","bottom-left":"bottom left",left:"left","top-left":"top left"},transitionDelay:{0:"0s",75:"75ms",100:"100ms",150:"150ms",200:"200ms",300:"300ms",500:"500ms",700:"700ms",1e3:"1000ms"},transitionDuration:{DEFAULT:"150ms",0:"0s",75:"75ms",100:"100ms",150:"150ms",200:"200ms",300:"300ms",500:"500ms",700:"700ms",1e3:"1000ms"},transitionProperty:{none:"none",all:"all",DEFAULT:"color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter",colors:"color, background-color, border-color, text-decoration-color, fill, stroke",opacity:"opacity",shadow:"box-shadow",transform:"transform"},transitionTimingFunction:{DEFAULT:"cubic-bezier(0.4, 0, 0.2, 1)",linear:"linear",in:"cubic-bezier(0.4, 0, 1, 1)",out:"cubic-bezier(0, 0, 0.2, 1)","in-out":"cubic-bezier(0.4, 0, 0.2, 1)"},translate:({theme:r})=>({...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%",full:"100%"}),size:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%",min:"min-content",max:"max-content",fit:"fit-content"}),width:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%",screen:"100vw",svw:"100svw",lvw:"100lvw",dvw:"100dvw",min:"min-content",max:"max-content",fit:"fit-content"}),willChange:{auto:"auto",scroll:"scroll-position",contents:"contents",transform:"transform"},zIndex:{auto:"auto",0:"0",10:"10",20:"20",30:"30",40:"40",50:"50"}},plugins:[]}});function tn(r){let e=(r?.presets??[tc.default]).slice().reverse().flatMap(n=>tn(n instanceof Function?n():n)),t={respectDefaultRingColorOpacity:{theme:{ringColor:({theme:n})=>({DEFAULT:"#3b82f67f",...n("colors")})}},disableColorOpacityUtilitiesByDefault:{corePlugins:{backgroundOpacity:!1,borderOpacity:!1,divideOpacity:!1,placeholderOpacity:!1,ringOpacity:!1,textOpacity:!1}}},i=Object.keys(t).filter(n=>we(r,n)).map(n=>t[n]);return[r,...i,...e]}var tc,rc=R(()=>{u();tc=pe(en());ct()});var ic={};Ge(ic,{default:()=>zr});function zr(...r){let[,...e]=tn(r[0]);return ra([...r,...e])}var ia=R(()=>{u();Zf();rc()});var Ur={};Ge(Ur,{default:()=>me});var me,et=R(()=>{u();me={resolve:r=>r,extname:r=>"."+r.split(".").pop()}});function rn(r){return typeof r=="object"&&r!==null}function gx(r){return Object.keys(r).length===0}function nc(r){return typeof r=="string"||r instanceof String}function na(r){return rn(r)&&r.config===void 0&&!gx(r)?null:rn(r)&&r.config!==void 0&&nc(r.config)?me.resolve(r.config):rn(r)&&r.config!==void 0&&rn(r.config)?null:nc(r)?me.resolve(r):yx()}function yx(){for(let r of mx)try{let e=me.resolve(r);return be.accessSync(e),e}catch(e){}return null}var mx,sc=R(()=>{u();ft();et();mx=["./tailwind.config.js","./tailwind.config.cjs","./tailwind.config.mjs","./tailwind.config.ts","./tailwind.config.cts","./tailwind.config.mts"]});var ac={};Ge(ac,{default:()=>sa});var sa,aa=R(()=>{u();sa={parse:r=>({href:r})}});var oa=x(()=>{u()});var nn=x((b3,uc)=>{u();"use strict";var oc=(Wi(),kf),lc=oa(),Zt=class extends Error{constructor(e,t,i,n,a,s){super(e);this.name="CssSyntaxError",this.reason=e,a&&(this.file=a),n&&(this.source=n),s&&(this.plugin=s),typeof t!="undefined"&&typeof i!="undefined"&&(typeof t=="number"?(this.line=t,this.column=i):(this.line=t.line,this.column=t.column,this.endLine=i.line,this.endColumn=i.column)),this.setMessage(),Error.captureStackTrace&&Error.captureStackTrace(this,Zt)}setMessage(){this.message=this.plugin?this.plugin+": ":"",this.message+=this.file?this.file:"<css input>",typeof this.line!="undefined"&&(this.message+=":"+this.line+":"+this.column),this.message+=": "+this.reason}showSourceCode(e){if(!this.source)return"";let t=this.source;e==null&&(e=oc.isColorSupported),lc&&e&&(t=lc(t));let i=t.split(/\r?\n/),n=Math.max(this.line-3,0),a=Math.min(this.line+2,i.length),s=String(a).length,o,l;if(e){let{bold:c,red:f,gray:d}=oc.createColors(!0);o=p=>c(f(p)),l=p=>d(p)}else o=l=c=>c;return i.slice(n,a).map((c,f)=>{let d=n+1+f,p=" "+(" "+d).slice(-s)+" | ";if(d===this.line){let h=l(p.replace(/\d/g," "))+c.slice(0,this.column-1).replace(/[^\t]/g," ");return o(">")+l(p)+c+`
+ `+h+o("^")}return" "+l(p)+c}).join(`
+`)}toString(){let e=this.showSourceCode();return e&&(e=`
+
+`+e+`
+`),this.name+": "+this.message+e}};uc.exports=Zt;Zt.default=Zt});var sn=x((w3,la)=>{u();"use strict";la.exports.isClean=Symbol("isClean");la.exports.my=Symbol("my")});var ua=x((v3,cc)=>{u();"use strict";var fc={colon:": ",indent:"    ",beforeDecl:`
+`,beforeRule:`
+`,beforeOpen:" ",beforeClose:`
+`,beforeComment:`
+`,after:`
+`,emptyBody:"",commentLeft:" ",commentRight:" ",semicolon:!1};function bx(r){return r[0].toUpperCase()+r.slice(1)}var an=class{constructor(e){this.builder=e}stringify(e,t){if(!this[e.type])throw new Error("Unknown AST node type "+e.type+". Maybe you need to change PostCSS stringifier.");this[e.type](e,t)}document(e){this.body(e)}root(e){this.body(e),e.raws.after&&this.builder(e.raws.after)}comment(e){let t=this.raw(e,"left","commentLeft"),i=this.raw(e,"right","commentRight");this.builder("/*"+t+e.text+i+"*/",e)}decl(e,t){let i=this.raw(e,"between","colon"),n=e.prop+i+this.rawValue(e,"value");e.important&&(n+=e.raws.important||" !important"),t&&(n+=";"),this.builder(n,e)}rule(e){this.block(e,this.rawValue(e,"selector")),e.raws.ownSemicolon&&this.builder(e.raws.ownSemicolon,e,"end")}atrule(e,t){let i="@"+e.name,n=e.params?this.rawValue(e,"params"):"";if(typeof e.raws.afterName!="undefined"?i+=e.raws.afterName:n&&(i+=" "),e.nodes)this.block(e,i+n);else{let a=(e.raws.between||"")+(t?";":"");this.builder(i+n+a,e)}}body(e){let t=e.nodes.length-1;for(;t>0&&e.nodes[t].type==="comment";)t-=1;let i=this.raw(e,"semicolon");for(let n=0;n<e.nodes.length;n++){let a=e.nodes[n],s=this.raw(a,"before");s&&this.builder(s),this.stringify(a,t!==n||i)}}block(e,t){let i=this.raw(e,"between","beforeOpen");this.builder(t+i+"{",e,"start");let n;e.nodes&&e.nodes.length?(this.body(e),n=this.raw(e,"after")):n=this.raw(e,"after","emptyBody"),n&&this.builder(n),this.builder("}",e,"end")}raw(e,t,i){let n;if(i||(i=t),t&&(n=e.raws[t],typeof n!="undefined"))return n;let a=e.parent;if(i==="before"&&(!a||a.type==="root"&&a.first===e||a&&a.type==="document"))return"";if(!a)return fc[i];let s=e.root();if(s.rawCache||(s.rawCache={}),typeof s.rawCache[i]!="undefined")return s.rawCache[i];if(i==="before"||i==="after")return this.beforeAfter(e,i);{let o="raw"+bx(i);this[o]?n=this[o](s,e):s.walk(l=>{if(n=l.raws[t],typeof n!="undefined")return!1})}return typeof n=="undefined"&&(n=fc[i]),s.rawCache[i]=n,n}rawSemicolon(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length&&i.last.type==="decl"&&(t=i.raws.semicolon,typeof t!="undefined"))return!1}),t}rawEmptyBody(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length===0&&(t=i.raws.after,typeof t!="undefined"))return!1}),t}rawIndent(e){if(e.raws.indent)return e.raws.indent;let t;return e.walk(i=>{let n=i.parent;if(n&&n!==e&&n.parent&&n.parent===e&&typeof i.raws.before!="undefined"){let a=i.raws.before.split(`
+`);return t=a[a.length-1],t=t.replace(/\S/g,""),!1}}),t}rawBeforeComment(e,t){let i;return e.walkComments(n=>{if(typeof n.raws.before!="undefined")return i=n.raws.before,i.includes(`
+`)&&(i=i.replace(/[^\n]+$/,"")),!1}),typeof i=="undefined"?i=this.raw(t,null,"beforeDecl"):i&&(i=i.replace(/\S/g,"")),i}rawBeforeDecl(e,t){let i;return e.walkDecls(n=>{if(typeof n.raws.before!="undefined")return i=n.raws.before,i.includes(`
+`)&&(i=i.replace(/[^\n]+$/,"")),!1}),typeof i=="undefined"?i=this.raw(t,null,"beforeRule"):i&&(i=i.replace(/\S/g,"")),i}rawBeforeRule(e){let t;return e.walk(i=>{if(i.nodes&&(i.parent!==e||e.first!==i)&&typeof i.raws.before!="undefined")return t=i.raws.before,t.includes(`
+`)&&(t=t.replace(/[^\n]+$/,"")),!1}),t&&(t=t.replace(/\S/g,"")),t}rawBeforeClose(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length>0&&typeof i.raws.after!="undefined")return t=i.raws.after,t.includes(`
+`)&&(t=t.replace(/[^\n]+$/,"")),!1}),t&&(t=t.replace(/\S/g,"")),t}rawBeforeOpen(e){let t;return e.walk(i=>{if(i.type!=="decl"&&(t=i.raws.between,typeof t!="undefined"))return!1}),t}rawColon(e){let t;return e.walkDecls(i=>{if(typeof i.raws.between!="undefined")return t=i.raws.between.replace(/[^\s:]/g,""),!1}),t}beforeAfter(e,t){let i;e.type==="decl"?i=this.raw(e,null,"beforeDecl"):e.type==="comment"?i=this.raw(e,null,"beforeComment"):t==="before"?i=this.raw(e,null,"beforeRule"):i=this.raw(e,null,"beforeClose");let n=e.parent,a=0;for(;n&&n.type!=="root";)a+=1,n=n.parent;if(i.includes(`
+`)){let s=this.raw(e,null,"indent");if(s.length)for(let o=0;o<a;o++)i+=s}return i}rawValue(e,t){let i=e[t],n=e.raws[t];return n&&n.value===i?n.raw:i}};cc.exports=an;an.default=an});var Vr=x((x3,pc)=>{u();"use strict";var wx=ua();function fa(r,e){new wx(e).stringify(r)}pc.exports=fa;fa.default=fa});var Hr=x((k3,dc)=>{u();"use strict";var{isClean:on,my:vx}=sn(),xx=nn(),kx=ua(),Sx=Vr();function ca(r,e){let t=new r.constructor;for(let i in r){if(!Object.prototype.hasOwnProperty.call(r,i)||i==="proxyCache")continue;let n=r[i],a=typeof n;i==="parent"&&a==="object"?e&&(t[i]=e):i==="source"?t[i]=n:Array.isArray(n)?t[i]=n.map(s=>ca(s,t)):(a==="object"&&n!==null&&(n=ca(n)),t[i]=n)}return t}var ln=class{constructor(e={}){this.raws={},this[on]=!1,this[vx]=!0;for(let t in e)if(t==="nodes"){this.nodes=[];for(let i of e[t])typeof i.clone=="function"?this.append(i.clone()):this.append(i)}else this[t]=e[t]}error(e,t={}){if(this.source){let{start:i,end:n}=this.rangeBy(t);return this.source.input.error(e,{line:i.line,column:i.column},{line:n.line,column:n.column},t)}return new xx(e)}warn(e,t,i){let n={node:this};for(let a in i)n[a]=i[a];return e.warn(t,n)}remove(){return this.parent&&this.parent.removeChild(this),this.parent=void 0,this}toString(e=Sx){e.stringify&&(e=e.stringify);let t="";return e(this,i=>{t+=i}),t}assign(e={}){for(let t in e)this[t]=e[t];return this}clone(e={}){let t=ca(this);for(let i in e)t[i]=e[i];return t}cloneBefore(e={}){let t=this.clone(e);return this.parent.insertBefore(this,t),t}cloneAfter(e={}){let t=this.clone(e);return this.parent.insertAfter(this,t),t}replaceWith(...e){if(this.parent){let t=this,i=!1;for(let n of e)n===this?i=!0:i?(this.parent.insertAfter(t,n),t=n):this.parent.insertBefore(t,n);i||this.remove()}return this}next(){if(!this.parent)return;let e=this.parent.index(this);return this.parent.nodes[e+1]}prev(){if(!this.parent)return;let e=this.parent.index(this);return this.parent.nodes[e-1]}before(e){return this.parent.insertBefore(this,e),this}after(e){return this.parent.insertAfter(this,e),this}root(){let e=this;for(;e.parent&&e.parent.type!=="document";)e=e.parent;return e}raw(e,t){return new kx().raw(this,e,t)}cleanRaws(e){delete this.raws.before,delete this.raws.after,e||delete this.raws.between}toJSON(e,t){let i={},n=t==null;t=t||new Map;let a=0;for(let s in this){if(!Object.prototype.hasOwnProperty.call(this,s)||s==="parent"||s==="proxyCache")continue;let o=this[s];if(Array.isArray(o))i[s]=o.map(l=>typeof l=="object"&&l.toJSON?l.toJSON(null,t):l);else if(typeof o=="object"&&o.toJSON)i[s]=o.toJSON(null,t);else if(s==="source"){let l=t.get(o.input);l==null&&(l=a,t.set(o.input,a),a++),i[s]={inputId:l,start:o.start,end:o.end}}else i[s]=o}return n&&(i.inputs=[...t.keys()].map(s=>s.toJSON())),i}positionInside(e){let t=this.toString(),i=this.source.start.column,n=this.source.start.line;for(let a=0;a<e;a++)t[a]===`
+`?(i=1,n+=1):i+=1;return{line:n,column:i}}positionBy(e){let t=this.source.start;if(e.index)t=this.positionInside(e.index);else if(e.word){let i=this.toString().indexOf(e.word);i!==-1&&(t=this.positionInside(i))}return t}rangeBy(e){let t={line:this.source.start.line,column:this.source.start.column},i=this.source.end?{line:this.source.end.line,column:this.source.end.column+1}:{line:t.line,column:t.column+1};if(e.word){let n=this.toString().indexOf(e.word);n!==-1&&(t=this.positionInside(n),i=this.positionInside(n+e.word.length))}else e.start?t={line:e.start.line,column:e.start.column}:e.index&&(t=this.positionInside(e.index)),e.end?i={line:e.end.line,column:e.end.column}:e.endIndex?i=this.positionInside(e.endIndex):e.index&&(i=this.positionInside(e.index+1));return(i.line<t.line||i.line===t.line&&i.column<=t.column)&&(i={line:t.line,column:t.column+1}),{start:t,end:i}}getProxyProcessor(){return{set(e,t,i){return e[t]===i||(e[t]=i,(t==="prop"||t==="value"||t==="name"||t==="params"||t==="important"||t==="text")&&e.markDirty()),!0},get(e,t){return t==="proxyOf"?e:t==="root"?()=>e.root().toProxy():e[t]}}}toProxy(){return this.proxyCache||(this.proxyCache=new Proxy(this,this.getProxyProcessor())),this.proxyCache}addToError(e){if(e.postcssNode=this,e.stack&&this.source&&/\n\s{4}at /.test(e.stack)){let t=this.source;e.stack=e.stack.replace(/\n\s{4}at /,`$&${t.input.from}:${t.start.line}:${t.start.column}$&`)}return e}markDirty(){if(this[on]){this[on]=!1;let e=this;for(;e=e.parent;)e[on]=!1}}get proxyOf(){return this}};dc.exports=ln;ln.default=ln});var Wr=x((S3,hc)=>{u();"use strict";var Ax=Hr(),un=class extends Ax{constructor(e){e&&typeof e.value!="undefined"&&typeof e.value!="string"&&(e={...e,value:String(e.value)});super(e);this.type="decl"}get variable(){return this.prop.startsWith("--")||this.prop[0]==="$"}};hc.exports=un;un.default=un});var pa=x((A3,mc)=>{u();mc.exports=function(r,e){return{generate:()=>{let t="";return r(e,i=>{t+=i}),[t]}}}});var Gr=x((C3,gc)=>{u();"use strict";var Cx=Hr(),fn=class extends Cx{constructor(e){super(e);this.type="comment"}};gc.exports=fn;fn.default=fn});var Et=x((_3,Cc)=>{u();"use strict";var{isClean:yc,my:bc}=sn(),wc=Wr(),vc=Gr(),_x=Hr(),xc,da,ha,kc;function Sc(r){return r.map(e=>(e.nodes&&(e.nodes=Sc(e.nodes)),delete e.source,e))}function Ac(r){if(r[yc]=!1,r.proxyOf.nodes)for(let e of r.proxyOf.nodes)Ac(e)}var Fe=class extends _x{push(e){return e.parent=this,this.proxyOf.nodes.push(e),this}each(e){if(!this.proxyOf.nodes)return;let t=this.getIterator(),i,n;for(;this.indexes[t]<this.proxyOf.nodes.length&&(i=this.indexes[t],n=e(this.proxyOf.nodes[i],i),n!==!1);)this.indexes[t]+=1;return delete this.indexes[t],n}walk(e){return this.each((t,i)=>{let n;try{n=e(t,i)}catch(a){throw t.addToError(a)}return n!==!1&&t.walk&&(n=t.walk(e)),n})}walkDecls(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="decl"&&e.test(i.prop))return t(i,n)}):this.walk((i,n)=>{if(i.type==="decl"&&i.prop===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="decl")return t(i,n)}))}walkRules(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="rule"&&e.test(i.selector))return t(i,n)}):this.walk((i,n)=>{if(i.type==="rule"&&i.selector===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="rule")return t(i,n)}))}walkAtRules(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="atrule"&&e.test(i.name))return t(i,n)}):this.walk((i,n)=>{if(i.type==="atrule"&&i.name===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="atrule")return t(i,n)}))}walkComments(e){return this.walk((t,i)=>{if(t.type==="comment")return e(t,i)})}append(...e){for(let t of e){let i=this.normalize(t,this.last);for(let n of i)this.proxyOf.nodes.push(n)}return this.markDirty(),this}prepend(...e){e=e.reverse();for(let t of e){let i=this.normalize(t,this.first,"prepend").reverse();for(let n of i)this.proxyOf.nodes.unshift(n);for(let n in this.indexes)this.indexes[n]=this.indexes[n]+i.length}return this.markDirty(),this}cleanRaws(e){if(super.cleanRaws(e),this.nodes)for(let t of this.nodes)t.cleanRaws(e)}insertBefore(e,t){let i=this.index(e),n=i===0?"prepend":!1,a=this.normalize(t,this.proxyOf.nodes[i],n).reverse();i=this.index(e);for(let o of a)this.proxyOf.nodes.splice(i,0,o);let s;for(let o in this.indexes)s=this.indexes[o],i<=s&&(this.indexes[o]=s+a.length);return this.markDirty(),this}insertAfter(e,t){let i=this.index(e),n=this.normalize(t,this.proxyOf.nodes[i]).reverse();i=this.index(e);for(let s of n)this.proxyOf.nodes.splice(i+1,0,s);let a;for(let s in this.indexes)a=this.indexes[s],i<a&&(this.indexes[s]=a+n.length);return this.markDirty(),this}removeChild(e){e=this.index(e),this.proxyOf.nodes[e].parent=void 0,this.proxyOf.nodes.splice(e,1);let t;for(let i in this.indexes)t=this.indexes[i],t>=e&&(this.indexes[i]=t-1);return this.markDirty(),this}removeAll(){for(let e of this.proxyOf.nodes)e.parent=void 0;return this.proxyOf.nodes=[],this.markDirty(),this}replaceValues(e,t,i){return i||(i=t,t={}),this.walkDecls(n=>{t.props&&!t.props.includes(n.prop)||t.fast&&!n.value.includes(t.fast)||(n.value=n.value.replace(e,i))}),this.markDirty(),this}every(e){return this.nodes.every(e)}some(e){return this.nodes.some(e)}index(e){return typeof e=="number"?e:(e.proxyOf&&(e=e.proxyOf),this.proxyOf.nodes.indexOf(e))}get first(){if(!!this.proxyOf.nodes)return this.proxyOf.nodes[0]}get last(){if(!!this.proxyOf.nodes)return this.proxyOf.nodes[this.proxyOf.nodes.length-1]}normalize(e,t){if(typeof e=="string")e=Sc(xc(e).nodes);else if(Array.isArray(e)){e=e.slice(0);for(let n of e)n.parent&&n.parent.removeChild(n,"ignore")}else if(e.type==="root"&&this.type!=="document"){e=e.nodes.slice(0);for(let n of e)n.parent&&n.parent.removeChild(n,"ignore")}else if(e.type)e=[e];else if(e.prop){if(typeof e.value=="undefined")throw new Error("Value field is missed in node creation");typeof e.value!="string"&&(e.value=String(e.value)),e=[new wc(e)]}else if(e.selector)e=[new da(e)];else if(e.name)e=[new ha(e)];else if(e.text)e=[new vc(e)];else throw new Error("Unknown node type in node creation");return e.map(n=>(n[bc]||Fe.rebuild(n),n=n.proxyOf,n.parent&&n.parent.removeChild(n),n[yc]&&Ac(n),typeof n.raws.before=="undefined"&&t&&typeof t.raws.before!="undefined"&&(n.raws.before=t.raws.before.replace(/\S/g,"")),n.parent=this.proxyOf,n))}getProxyProcessor(){return{set(e,t,i){return e[t]===i||(e[t]=i,(t==="name"||t==="params"||t==="selector")&&e.markDirty()),!0},get(e,t){return t==="proxyOf"?e:e[t]?t==="each"||typeof t=="string"&&t.startsWith("walk")?(...i)=>e[t](...i.map(n=>typeof n=="function"?(a,s)=>n(a.toProxy(),s):n)):t==="every"||t==="some"?i=>e[t]((n,...a)=>i(n.toProxy(),...a)):t==="root"?()=>e.root().toProxy():t==="nodes"?e.nodes.map(i=>i.toProxy()):t==="first"||t==="last"?e[t].toProxy():e[t]:e[t]}}}getIterator(){this.lastEach||(this.lastEach=0),this.indexes||(this.indexes={}),this.lastEach+=1;let e=this.lastEach;return this.indexes[e]=0,e}};Fe.registerParse=r=>{xc=r};Fe.registerRule=r=>{da=r};Fe.registerAtRule=r=>{ha=r};Fe.registerRoot=r=>{kc=r};Cc.exports=Fe;Fe.default=Fe;Fe.rebuild=r=>{r.type==="atrule"?Object.setPrototypeOf(r,ha.prototype):r.type==="rule"?Object.setPrototypeOf(r,da.prototype):r.type==="decl"?Object.setPrototypeOf(r,wc.prototype):r.type==="comment"?Object.setPrototypeOf(r,vc.prototype):r.type==="root"&&Object.setPrototypeOf(r,kc.prototype),r[bc]=!0,r.nodes&&r.nodes.forEach(e=>{Fe.rebuild(e)})}});var cn=x((E3,Oc)=>{u();"use strict";var Ex=Et(),_c,Ec,er=class extends Ex{constructor(e){super({type:"document",...e});this.nodes||(this.nodes=[])}toResult(e={}){return new _c(new Ec,this,e).stringify()}};er.registerLazyResult=r=>{_c=r};er.registerProcessor=r=>{Ec=r};Oc.exports=er;er.default=er});var ma=x((O3,Rc)=>{u();"use strict";var Tc={};Rc.exports=function(e){Tc[e]||(Tc[e]=!0,typeof console!="undefined"&&console.warn&&console.warn(e))}});var ga=x((T3,Pc)=>{u();"use strict";var pn=class{constructor(e,t={}){if(this.type="warning",this.text=e,t.node&&t.node.source){let i=t.node.rangeBy(t);this.line=i.start.line,this.column=i.start.column,this.endLine=i.end.line,this.endColumn=i.end.column}for(let i in t)this[i]=t[i]}toString(){return this.node?this.node.error(this.text,{plugin:this.plugin,index:this.index,word:this.word}).message:this.plugin?this.plugin+": "+this.text:this.text}};Pc.exports=pn;pn.default=pn});var hn=x((R3,Ic)=>{u();"use strict";var Ox=ga(),dn=class{constructor(e,t,i){this.processor=e,this.messages=[],this.root=t,this.opts=i,this.css=void 0,this.map=void 0}toString(){return this.css}warn(e,t={}){t.plugin||this.lastPlugin&&this.lastPlugin.postcssPlugin&&(t.plugin=this.lastPlugin.postcssPlugin);let i=new Ox(e,t);return this.messages.push(i),i}warnings(){return this.messages.filter(e=>e.type==="warning")}get content(){return this.css}};Ic.exports=dn;dn.default=dn});var Mc=x((P3,Lc)=>{u();"use strict";var ya="'".charCodeAt(0),Dc='"'.charCodeAt(0),mn="\\".charCodeAt(0),qc="/".charCodeAt(0),gn=`
+`.charCodeAt(0),Qr=" ".charCodeAt(0),yn="\f".charCodeAt(0),bn="	".charCodeAt(0),wn="\r".charCodeAt(0),Tx="[".charCodeAt(0),Rx="]".charCodeAt(0),Px="(".charCodeAt(0),Ix=")".charCodeAt(0),Dx="{".charCodeAt(0),qx="}".charCodeAt(0),$x=";".charCodeAt(0),Lx="*".charCodeAt(0),Mx=":".charCodeAt(0),Nx="@".charCodeAt(0),vn=/[\t\n\f\r "#'()/;[\\\]{}]/g,xn=/[\t\n\f\r !"#'():;@[\\\]{}]|\/(?=\*)/g,Bx=/.[\n"'(/\\]/,$c=/[\da-f]/i;Lc.exports=function(e,t={}){let i=e.css.valueOf(),n=t.ignoreErrors,a,s,o,l,c,f,d,p,h,b,v=i.length,y=0,w=[],k=[];function S(){return y}function E(T){throw e.error("Unclosed "+T,y)}function O(){return k.length===0&&y>=v}function B(T){if(k.length)return k.pop();if(y>=v)return;let F=T?T.ignoreUnclosed:!1;switch(a=i.charCodeAt(y),a){case gn:case Qr:case bn:case wn:case yn:{s=y;do s+=1,a=i.charCodeAt(s);while(a===Qr||a===gn||a===bn||a===wn||a===yn);b=["space",i.slice(y,s)],y=s-1;break}case Tx:case Rx:case Dx:case qx:case Mx:case $x:case Ix:{let Y=String.fromCharCode(a);b=[Y,Y,y];break}case Px:{if(p=w.length?w.pop()[1]:"",h=i.charCodeAt(y+1),p==="url"&&h!==ya&&h!==Dc&&h!==Qr&&h!==gn&&h!==bn&&h!==yn&&h!==wn){s=y;do{if(f=!1,s=i.indexOf(")",s+1),s===-1)if(n||F){s=y;break}else E("bracket");for(d=s;i.charCodeAt(d-1)===mn;)d-=1,f=!f}while(f);b=["brackets",i.slice(y,s+1),y,s],y=s}else s=i.indexOf(")",y+1),l=i.slice(y,s+1),s===-1||Bx.test(l)?b=["(","(",y]:(b=["brackets",l,y,s],y=s);break}case ya:case Dc:{o=a===ya?"'":'"',s=y;do{if(f=!1,s=i.indexOf(o,s+1),s===-1)if(n||F){s=y+1;break}else E("string");for(d=s;i.charCodeAt(d-1)===mn;)d-=1,f=!f}while(f);b=["string",i.slice(y,s+1),y,s],y=s;break}case Nx:{vn.lastIndex=y+1,vn.test(i),vn.lastIndex===0?s=i.length-1:s=vn.lastIndex-2,b=["at-word",i.slice(y,s+1),y,s],y=s;break}case mn:{for(s=y,c=!0;i.charCodeAt(s+1)===mn;)s+=1,c=!c;if(a=i.charCodeAt(s+1),c&&a!==qc&&a!==Qr&&a!==gn&&a!==bn&&a!==wn&&a!==yn&&(s+=1,$c.test(i.charAt(s)))){for(;$c.test(i.charAt(s+1));)s+=1;i.charCodeAt(s+1)===Qr&&(s+=1)}b=["word",i.slice(y,s+1),y,s],y=s;break}default:{a===qc&&i.charCodeAt(y+1)===Lx?(s=i.indexOf("*/",y+2)+1,s===0&&(n||F?s=i.length:E("comment")),b=["comment",i.slice(y,s+1),y,s],y=s):(xn.lastIndex=y+1,xn.test(i),xn.lastIndex===0?s=i.length-1:s=xn.lastIndex-2,b=["word",i.slice(y,s+1),y,s],w.push(b),y=s);break}}return y++,b}function N(T){k.push(T)}return{back:N,nextToken:B,endOfFile:O,position:S}}});var kn=x((I3,Bc)=>{u();"use strict";var Nc=Et(),Yr=class extends Nc{constructor(e){super(e);this.type="atrule"}append(...e){return this.proxyOf.nodes||(this.nodes=[]),super.append(...e)}prepend(...e){return this.proxyOf.nodes||(this.nodes=[]),super.prepend(...e)}};Bc.exports=Yr;Yr.default=Yr;Nc.registerAtRule(Yr)});var tr=x((D3,Uc)=>{u();"use strict";var Fc=Et(),jc,zc,Ut=class extends Fc{constructor(e){super(e);this.type="root",this.nodes||(this.nodes=[])}removeChild(e,t){let i=this.index(e);return!t&&i===0&&this.nodes.length>1&&(this.nodes[1].raws.before=this.nodes[i].raws.before),super.removeChild(e)}normalize(e,t,i){let n=super.normalize(e);if(t){if(i==="prepend")this.nodes.length>1?t.raws.before=this.nodes[1].raws.before:delete t.raws.before;else if(this.first!==t)for(let a of n)a.raws.before=t.raws.before}return n}toResult(e={}){return new jc(new zc,this,e).stringify()}};Ut.registerLazyResult=r=>{jc=r};Ut.registerProcessor=r=>{zc=r};Uc.exports=Ut;Ut.default=Ut;Fc.registerRoot(Ut)});var ba=x((q3,Vc)=>{u();"use strict";var Kr={split(r,e,t){let i=[],n="",a=!1,s=0,o=!1,l="",c=!1;for(let f of r)c?c=!1:f==="\\"?c=!0:o?f===l&&(o=!1):f==='"'||f==="'"?(o=!0,l=f):f==="("?s+=1:f===")"?s>0&&(s-=1):s===0&&e.includes(f)&&(a=!0),a?(n!==""&&i.push(n.trim()),n="",a=!1):n+=f;return(t||n!=="")&&i.push(n.trim()),i},space(r){let e=[" ",`
+`,"	"];return Kr.split(r,e)},comma(r){return Kr.split(r,[","],!0)}};Vc.exports=Kr;Kr.default=Kr});var Sn=x(($3,Wc)=>{u();"use strict";var Hc=Et(),Fx=ba(),Xr=class extends Hc{constructor(e){super(e);this.type="rule",this.nodes||(this.nodes=[])}get selectors(){return Fx.comma(this.selector)}set selectors(e){let t=this.selector?this.selector.match(/,\s*/):null,i=t?t[0]:","+this.raw("between","beforeOpen");this.selector=e.join(i)}};Wc.exports=Xr;Xr.default=Xr;Hc.registerRule(Xr)});var Xc=x((L3,Kc)=>{u();"use strict";var jx=Wr(),zx=Mc(),Ux=Gr(),Vx=kn(),Hx=tr(),Gc=Sn(),Qc={empty:!0,space:!0};function Wx(r){for(let e=r.length-1;e>=0;e--){let t=r[e],i=t[3]||t[2];if(i)return i}}var Yc=class{constructor(e){this.input=e,this.root=new Hx,this.current=this.root,this.spaces="",this.semicolon=!1,this.customProperty=!1,this.createTokenizer(),this.root.source={input:e,start:{offset:0,line:1,column:1}}}createTokenizer(){this.tokenizer=zx(this.input)}parse(){let e;for(;!this.tokenizer.endOfFile();)switch(e=this.tokenizer.nextToken(),e[0]){case"space":this.spaces+=e[1];break;case";":this.freeSemicolon(e);break;case"}":this.end(e);break;case"comment":this.comment(e);break;case"at-word":this.atrule(e);break;case"{":this.emptyRule(e);break;default:this.other(e);break}this.endFile()}comment(e){let t=new Ux;this.init(t,e[2]),t.source.end=this.getPosition(e[3]||e[2]);let i=e[1].slice(2,-2);if(/^\s*$/.test(i))t.text="",t.raws.left=i,t.raws.right="";else{let n=i.match(/^(\s*)([^]*\S)(\s*)$/);t.text=n[2],t.raws.left=n[1],t.raws.right=n[3]}}emptyRule(e){let t=new Gc;this.init(t,e[2]),t.selector="",t.raws.between="",this.current=t}other(e){let t=!1,i=null,n=!1,a=null,s=[],o=e[1].startsWith("--"),l=[],c=e;for(;c;){if(i=c[0],l.push(c),i==="("||i==="[")a||(a=c),s.push(i==="("?")":"]");else if(o&&n&&i==="{")a||(a=c),s.push("}");else if(s.length===0)if(i===";")if(n){this.decl(l,o);return}else break;else if(i==="{"){this.rule(l);return}else if(i==="}"){this.tokenizer.back(l.pop()),t=!0;break}else i===":"&&(n=!0);else i===s[s.length-1]&&(s.pop(),s.length===0&&(a=null));c=this.tokenizer.nextToken()}if(this.tokenizer.endOfFile()&&(t=!0),s.length>0&&this.unclosedBracket(a),t&&n){if(!o)for(;l.length&&(c=l[l.length-1][0],!(c!=="space"&&c!=="comment"));)this.tokenizer.back(l.pop());this.decl(l,o)}else this.unknownWord(l)}rule(e){e.pop();let t=new Gc;this.init(t,e[0][2]),t.raws.between=this.spacesAndCommentsFromEnd(e),this.raw(t,"selector",e),this.current=t}decl(e,t){let i=new jx;this.init(i,e[0][2]);let n=e[e.length-1];for(n[0]===";"&&(this.semicolon=!0,e.pop()),i.source.end=this.getPosition(n[3]||n[2]||Wx(e));e[0][0]!=="word";)e.length===1&&this.unknownWord(e),i.raws.before+=e.shift()[1];for(i.source.start=this.getPosition(e[0][2]),i.prop="";e.length;){let c=e[0][0];if(c===":"||c==="space"||c==="comment")break;i.prop+=e.shift()[1]}i.raws.between="";let a;for(;e.length;)if(a=e.shift(),a[0]===":"){i.raws.between+=a[1];break}else a[0]==="word"&&/\w/.test(a[1])&&this.unknownWord([a]),i.raws.between+=a[1];(i.prop[0]==="_"||i.prop[0]==="*")&&(i.raws.before+=i.prop[0],i.prop=i.prop.slice(1));let s=[],o;for(;e.length&&(o=e[0][0],!(o!=="space"&&o!=="comment"));)s.push(e.shift());this.precheckMissedSemicolon(e);for(let c=e.length-1;c>=0;c--){if(a=e[c],a[1].toLowerCase()==="!important"){i.important=!0;let f=this.stringFrom(e,c);f=this.spacesFromEnd(e)+f,f!==" !important"&&(i.raws.important=f);break}else if(a[1].toLowerCase()==="important"){let f=e.slice(0),d="";for(let p=c;p>0;p--){let h=f[p][0];if(d.trim().indexOf("!")===0&&h!=="space")break;d=f.pop()[1]+d}d.trim().indexOf("!")===0&&(i.important=!0,i.raws.important=d,e=f)}if(a[0]!=="space"&&a[0]!=="comment")break}e.some(c=>c[0]!=="space"&&c[0]!=="comment")&&(i.raws.between+=s.map(c=>c[1]).join(""),s=[]),this.raw(i,"value",s.concat(e),t),i.value.includes(":")&&!t&&this.checkMissedSemicolon(e)}atrule(e){let t=new Vx;t.name=e[1].slice(1),t.name===""&&this.unnamedAtrule(t,e),this.init(t,e[2]);let i,n,a,s=!1,o=!1,l=[],c=[];for(;!this.tokenizer.endOfFile();){if(e=this.tokenizer.nextToken(),i=e[0],i==="("||i==="["?c.push(i==="("?")":"]"):i==="{"&&c.length>0?c.push("}"):i===c[c.length-1]&&c.pop(),c.length===0)if(i===";"){t.source.end=this.getPosition(e[2]),this.semicolon=!0;break}else if(i==="{"){o=!0;break}else if(i==="}"){if(l.length>0){for(a=l.length-1,n=l[a];n&&n[0]==="space";)n=l[--a];n&&(t.source.end=this.getPosition(n[3]||n[2]))}this.end(e);break}else l.push(e);else l.push(e);if(this.tokenizer.endOfFile()){s=!0;break}}t.raws.between=this.spacesAndCommentsFromEnd(l),l.length?(t.raws.afterName=this.spacesAndCommentsFromStart(l),this.raw(t,"params",l),s&&(e=l[l.length-1],t.source.end=this.getPosition(e[3]||e[2]),this.spaces=t.raws.between,t.raws.between="")):(t.raws.afterName="",t.params=""),o&&(t.nodes=[],this.current=t)}end(e){this.current.nodes&&this.current.nodes.length&&(this.current.raws.semicolon=this.semicolon),this.semicolon=!1,this.current.raws.after=(this.current.raws.after||"")+this.spaces,this.spaces="",this.current.parent?(this.current.source.end=this.getPosition(e[2]),this.current=this.current.parent):this.unexpectedClose(e)}endFile(){this.current.parent&&this.unclosedBlock(),this.current.nodes&&this.current.nodes.length&&(this.current.raws.semicolon=this.semicolon),this.current.raws.after=(this.current.raws.after||"")+this.spaces}freeSemicolon(e){if(this.spaces+=e[1],this.current.nodes){let t=this.current.nodes[this.current.nodes.length-1];t&&t.type==="rule"&&!t.raws.ownSemicolon&&(t.raws.ownSemicolon=this.spaces,this.spaces="")}}getPosition(e){let t=this.input.fromOffset(e);return{offset:e,line:t.line,column:t.col}}init(e,t){this.current.push(e),e.source={start:this.getPosition(t),input:this.input},e.raws.before=this.spaces,this.spaces="",e.type!=="comment"&&(this.semicolon=!1)}raw(e,t,i,n){let a,s,o=i.length,l="",c=!0,f,d;for(let p=0;p<o;p+=1)a=i[p],s=a[0],s==="space"&&p===o-1&&!n?c=!1:s==="comment"?(d=i[p-1]?i[p-1][0]:"empty",f=i[p+1]?i[p+1][0]:"empty",!Qc[d]&&!Qc[f]?l.slice(-1)===","?c=!1:l+=a[1]:c=!1):l+=a[1];if(!c){let p=i.reduce((h,b)=>h+b[1],"");e.raws[t]={value:l,raw:p}}e[t]=l}spacesAndCommentsFromEnd(e){let t,i="";for(;e.length&&(t=e[e.length-1][0],!(t!=="space"&&t!=="comment"));)i=e.pop()[1]+i;return i}spacesAndCommentsFromStart(e){let t,i="";for(;e.length&&(t=e[0][0],!(t!=="space"&&t!=="comment"));)i+=e.shift()[1];return i}spacesFromEnd(e){let t,i="";for(;e.length&&(t=e[e.length-1][0],t==="space");)i=e.pop()[1]+i;return i}stringFrom(e,t){let i="";for(let n=t;n<e.length;n++)i+=e[n][1];return e.splice(t,e.length-t),i}colon(e){let t=0,i,n,a;for(let[s,o]of e.entries()){if(i=o,n=i[0],n==="("&&(t+=1),n===")"&&(t-=1),t===0&&n===":")if(!a)this.doubleColon(i);else{if(a[0]==="word"&&a[1]==="progid")continue;return s}a=i}return!1}unclosedBracket(e){throw this.input.error("Unclosed bracket",{offset:e[2]},{offset:e[2]+1})}unknownWord(e){throw this.input.error("Unknown word",{offset:e[0][2]},{offset:e[0][2]+e[0][1].length})}unexpectedClose(e){throw this.input.error("Unexpected }",{offset:e[2]},{offset:e[2]+1})}unclosedBlock(){let e=this.current.source.start;throw this.input.error("Unclosed block",e.line,e.column)}doubleColon(e){throw this.input.error("Double colon",{offset:e[2]},{offset:e[2]+e[1].length})}unnamedAtrule(e,t){throw this.input.error("At-rule without name",{offset:t[2]},{offset:t[2]+t[1].length})}precheckMissedSemicolon(){}checkMissedSemicolon(e){let t=this.colon(e);if(t===!1)return;let i=0,n;for(let a=t-1;a>=0&&(n=e[a],!(n[0]!=="space"&&(i+=1,i===2)));a--);throw this.input.error("Missed semicolon",n[0]==="word"?n[3]+1:n[2])}};Kc.exports=Yc});var Jc=x(()=>{u()});var ep=x((B3,Zc)=>{u();var Gx="useandom-26T198340PX75pxJACKVERYMINDBUSHWOLF_GQZbfghjklqvwyzrict",Qx=(r,e=21)=>(t=e)=>{let i="",n=t;for(;n--;)i+=r[Math.random()*r.length|0];return i},Yx=(r=21)=>{let e="",t=r;for(;t--;)e+=Gx[Math.random()*64|0];return e};Zc.exports={nanoid:Yx,customAlphabet:Qx}});var wa=x((F3,tp)=>{u();tp.exports={}});var Cn=x((j3,sp)=>{u();"use strict";var{SourceMapConsumer:Kx,SourceMapGenerator:Xx}=Jc(),{fileURLToPath:rp,pathToFileURL:An}=(aa(),ac),{resolve:va,isAbsolute:xa}=(et(),Ur),{nanoid:Jx}=ep(),ka=oa(),ip=nn(),Zx=wa(),Sa=Symbol("fromOffsetCache"),e1=Boolean(Kx&&Xx),np=Boolean(va&&xa),Jr=class{constructor(e,t={}){if(e===null||typeof e=="undefined"||typeof e=="object"&&!e.toString)throw new Error(`PostCSS received ${e} instead of CSS string`);if(this.css=e.toString(),this.css[0]==="\uFEFF"||this.css[0]==="\uFFFE"?(this.hasBOM=!0,this.css=this.css.slice(1)):this.hasBOM=!1,t.from&&(!np||/^\w+:\/\//.test(t.from)||xa(t.from)?this.file=t.from:this.file=va(t.from)),np&&e1){let i=new Zx(this.css,t);if(i.text){this.map=i;let n=i.consumer().file;!this.file&&n&&(this.file=this.mapResolve(n))}}this.file||(this.id="<input css "+Jx(6)+">"),this.map&&(this.map.file=this.from)}fromOffset(e){let t,i;if(this[Sa])i=this[Sa];else{let a=this.css.split(`
+`);i=new Array(a.length);let s=0;for(let o=0,l=a.length;o<l;o++)i[o]=s,s+=a[o].length+1;this[Sa]=i}t=i[i.length-1];let n=0;if(e>=t)n=i.length-1;else{let a=i.length-2,s;for(;n<a;)if(s=n+(a-n>>1),e<i[s])a=s-1;else if(e>=i[s+1])n=s+1;else{n=s;break}}return{line:n+1,col:e-i[n]+1}}error(e,t,i,n={}){let a,s,o;if(t&&typeof t=="object"){let c=t,f=i;if(typeof c.offset=="number"){let d=this.fromOffset(c.offset);t=d.line,i=d.col}else t=c.line,i=c.column;if(typeof f.offset=="number"){let d=this.fromOffset(f.offset);s=d.line,o=d.col}else s=f.line,o=f.column}else if(!i){let c=this.fromOffset(t);t=c.line,i=c.col}let l=this.origin(t,i,s,o);return l?a=new ip(e,l.endLine===void 0?l.line:{line:l.line,column:l.column},l.endLine===void 0?l.column:{line:l.endLine,column:l.endColumn},l.source,l.file,n.plugin):a=new ip(e,s===void 0?t:{line:t,column:i},s===void 0?i:{line:s,column:o},this.css,this.file,n.plugin),a.input={line:t,column:i,endLine:s,endColumn:o,source:this.css},this.file&&(An&&(a.input.url=An(this.file).toString()),a.input.file=this.file),a}origin(e,t,i,n){if(!this.map)return!1;let a=this.map.consumer(),s=a.originalPositionFor({line:e,column:t});if(!s.source)return!1;let o;typeof i=="number"&&(o=a.originalPositionFor({line:i,column:n}));let l;xa(s.source)?l=An(s.source):l=new URL(s.source,this.map.consumer().sourceRoot||An(this.map.mapFile));let c={url:l.toString(),line:s.line,column:s.column,endLine:o&&o.line,endColumn:o&&o.column};if(l.protocol==="file:")if(rp)c.file=rp(l);else throw new Error("file: protocol is not available in this PostCSS build");let f=a.sourceContentFor(s.source);return f&&(c.source=f),c}mapResolve(e){return/^\w+:\/\//.test(e)?e:va(this.map.consumer().sourceRoot||this.map.root||".",e)}get from(){return this.file||this.id}toJSON(){let e={};for(let t of["hasBOM","css","file","id"])this[t]!=null&&(e[t]=this[t]);return this.map&&(e.map={...this.map},e.map.consumerCache&&(e.map.consumerCache=void 0)),e}};sp.exports=Jr;Jr.default=Jr;ka&&ka.registerInput&&ka.registerInput(Jr)});var En=x((z3,ap)=>{u();"use strict";var t1=Et(),r1=Xc(),i1=Cn();function _n(r,e){let t=new i1(r,e),i=new r1(t);try{i.parse()}catch(n){throw n}return i.root}ap.exports=_n;_n.default=_n;t1.registerParse(_n)});var _a=x((V3,fp)=>{u();"use strict";var{isClean:tt,my:n1}=sn(),s1=pa(),a1=Vr(),o1=Et(),l1=cn(),U3=ma(),op=hn(),u1=En(),f1=tr(),c1={document:"Document",root:"Root",atrule:"AtRule",rule:"Rule",decl:"Declaration",comment:"Comment"},p1={postcssPlugin:!0,prepare:!0,Once:!0,Document:!0,Root:!0,Declaration:!0,Rule:!0,AtRule:!0,Comment:!0,DeclarationExit:!0,RuleExit:!0,AtRuleExit:!0,CommentExit:!0,RootExit:!0,DocumentExit:!0,OnceExit:!0},d1={postcssPlugin:!0,prepare:!0,Once:!0},rr=0;function Zr(r){return typeof r=="object"&&typeof r.then=="function"}function lp(r){let e=!1,t=c1[r.type];return r.type==="decl"?e=r.prop.toLowerCase():r.type==="atrule"&&(e=r.name.toLowerCase()),e&&r.append?[t,t+"-"+e,rr,t+"Exit",t+"Exit-"+e]:e?[t,t+"-"+e,t+"Exit",t+"Exit-"+e]:r.append?[t,rr,t+"Exit"]:[t,t+"Exit"]}function up(r){let e;return r.type==="document"?e=["Document",rr,"DocumentExit"]:r.type==="root"?e=["Root",rr,"RootExit"]:e=lp(r),{node:r,events:e,eventIndex:0,visitors:[],visitorIndex:0,iterator:0}}function Aa(r){return r[tt]=!1,r.nodes&&r.nodes.forEach(e=>Aa(e)),r}var Ca={},pt=class{constructor(e,t,i){this.stringified=!1,this.processed=!1;let n;if(typeof t=="object"&&t!==null&&(t.type==="root"||t.type==="document"))n=Aa(t);else if(t instanceof pt||t instanceof op)n=Aa(t.root),t.map&&(typeof i.map=="undefined"&&(i.map={}),i.map.inline||(i.map.inline=!1),i.map.prev=t.map);else{let a=u1;i.syntax&&(a=i.syntax.parse),i.parser&&(a=i.parser),a.parse&&(a=a.parse);try{n=a(t,i)}catch(s){this.processed=!0,this.error=s}n&&!n[n1]&&o1.rebuild(n)}this.result=new op(e,n,i),this.helpers={...Ca,result:this.result,postcss:Ca},this.plugins=this.processor.plugins.map(a=>typeof a=="object"&&a.prepare?{...a,...a.prepare(this.result)}:a)}get[Symbol.toStringTag](){return"LazyResult"}get processor(){return this.result.processor}get opts(){return this.result.opts}get css(){return this.stringify().css}get content(){return this.stringify().content}get map(){return this.stringify().map}get root(){return this.sync().root}get messages(){return this.sync().messages}warnings(){return this.sync().warnings()}toString(){return this.css}then(e,t){return this.async().then(e,t)}catch(e){return this.async().catch(e)}finally(e){return this.async().then(e,e)}async(){return this.error?Promise.reject(this.error):this.processed?Promise.resolve(this.result):(this.processing||(this.processing=this.runAsync()),this.processing)}sync(){if(this.error)throw this.error;if(this.processed)return this.result;if(this.processed=!0,this.processing)throw this.getAsyncError();for(let e of this.plugins){let t=this.runOnRoot(e);if(Zr(t))throw this.getAsyncError()}if(this.prepareVisitors(),this.hasListener){let e=this.result.root;for(;!e[tt];)e[tt]=!0,this.walkSync(e);if(this.listeners.OnceExit)if(e.type==="document")for(let t of e.nodes)this.visitSync(this.listeners.OnceExit,t);else this.visitSync(this.listeners.OnceExit,e)}return this.result}stringify(){if(this.error)throw this.error;if(this.stringified)return this.result;this.stringified=!0,this.sync();let e=this.result.opts,t=a1;e.syntax&&(t=e.syntax.stringify),e.stringifier&&(t=e.stringifier),t.stringify&&(t=t.stringify);let n=new s1(t,this.result.root,this.result.opts).generate();return this.result.css=n[0],this.result.map=n[1],this.result}walkSync(e){e[tt]=!0;let t=lp(e);for(let i of t)if(i===rr)e.nodes&&e.each(n=>{n[tt]||this.walkSync(n)});else{let n=this.listeners[i];if(n&&this.visitSync(n,e.toProxy()))return}}visitSync(e,t){for(let[i,n]of e){this.result.lastPlugin=i;let a;try{a=n(t,this.helpers)}catch(s){throw this.handleError(s,t.proxyOf)}if(t.type!=="root"&&t.type!=="document"&&!t.parent)return!0;if(Zr(a))throw this.getAsyncError()}}runOnRoot(e){this.result.lastPlugin=e;try{if(typeof e=="object"&&e.Once){if(this.result.root.type==="document"){let t=this.result.root.nodes.map(i=>e.Once(i,this.helpers));return Zr(t[0])?Promise.all(t):t}return e.Once(this.result.root,this.helpers)}else if(typeof e=="function")return e(this.result.root,this.result)}catch(t){throw this.handleError(t)}}getAsyncError(){throw new Error("Use process(css).then(cb) to work with async plugins")}handleError(e,t){let i=this.result.lastPlugin;try{t&&t.addToError(e),this.error=e,e.name==="CssSyntaxError"&&!e.plugin?(e.plugin=i.postcssPlugin,e.setMessage()):i.postcssVersion}catch(n){console&&console.error&&console.error(n)}return e}async runAsync(){this.plugin=0;for(let e=0;e<this.plugins.length;e++){let t=this.plugins[e],i=this.runOnRoot(t);if(Zr(i))try{await i}catch(n){throw this.handleError(n)}}if(this.prepareVisitors(),this.hasListener){let e=this.result.root;for(;!e[tt];){e[tt]=!0;let t=[up(e)];for(;t.length>0;){let i=this.visitTick(t);if(Zr(i))try{await i}catch(n){let a=t[t.length-1].node;throw this.handleError(n,a)}}}if(this.listeners.OnceExit)for(let[t,i]of this.listeners.OnceExit){this.result.lastPlugin=t;try{if(e.type==="document"){let n=e.nodes.map(a=>i(a,this.helpers));await Promise.all(n)}else await i(e,this.helpers)}catch(n){throw this.handleError(n)}}}return this.processed=!0,this.stringify()}prepareVisitors(){this.listeners={};let e=(t,i,n)=>{this.listeners[i]||(this.listeners[i]=[]),this.listeners[i].push([t,n])};for(let t of this.plugins)if(typeof t=="object")for(let i in t){if(!p1[i]&&/^[A-Z]/.test(i))throw new Error(`Unknown event ${i} in ${t.postcssPlugin}. Try to update PostCSS (${this.processor.version} now).`);if(!d1[i])if(typeof t[i]=="object")for(let n in t[i])n==="*"?e(t,i,t[i][n]):e(t,i+"-"+n.toLowerCase(),t[i][n]);else typeof t[i]=="function"&&e(t,i,t[i])}this.hasListener=Object.keys(this.listeners).length>0}visitTick(e){let t=e[e.length-1],{node:i,visitors:n}=t;if(i.type!=="root"&&i.type!=="document"&&!i.parent){e.pop();return}if(n.length>0&&t.visitorIndex<n.length){let[s,o]=n[t.visitorIndex];t.visitorIndex+=1,t.visitorIndex===n.length&&(t.visitors=[],t.visitorIndex=0),this.result.lastPlugin=s;try{return o(i.toProxy(),this.helpers)}catch(l){throw this.handleError(l,i)}}if(t.iterator!==0){let s=t.iterator,o;for(;o=i.nodes[i.indexes[s]];)if(i.indexes[s]+=1,!o[tt]){o[tt]=!0,e.push(up(o));return}t.iterator=0,delete i.indexes[s]}let a=t.events;for(;t.eventIndex<a.length;){let s=a[t.eventIndex];if(t.eventIndex+=1,s===rr){i.nodes&&i.nodes.length&&(i[tt]=!0,t.iterator=i.getIterator());return}else if(this.listeners[s]){t.visitors=this.listeners[s];return}}e.pop()}};pt.registerPostcss=r=>{Ca=r};fp.exports=pt;pt.default=pt;f1.registerLazyResult(pt);l1.registerLazyResult(pt)});var pp=x((W3,cp)=>{u();"use strict";var h1=pa(),m1=Vr(),H3=ma(),g1=En(),y1=hn(),On=class{constructor(e,t,i){t=t.toString(),this.stringified=!1,this._processor=e,this._css=t,this._opts=i,this._map=void 0;let n,a=m1;this.result=new y1(this._processor,n,this._opts),this.result.css=t;let s=this;Object.defineProperty(this.result,"root",{get(){return s.root}});let o=new h1(a,n,this._opts,t);if(o.isMap()){let[l,c]=o.generate();l&&(this.result.css=l),c&&(this.result.map=c)}}get[Symbol.toStringTag](){return"NoWorkResult"}get processor(){return this.result.processor}get opts(){return this.result.opts}get css(){return this.result.css}get content(){return this.result.css}get map(){return this.result.map}get root(){if(this._root)return this._root;let e,t=g1;try{e=t(this._css,this._opts)}catch(i){this.error=i}if(this.error)throw this.error;return this._root=e,e}get messages(){return[]}warnings(){return[]}toString(){return this._css}then(e,t){return this.async().then(e,t)}catch(e){return this.async().catch(e)}finally(e){return this.async().then(e,e)}async(){return this.error?Promise.reject(this.error):Promise.resolve(this.result)}sync(){if(this.error)throw this.error;return this.result}};cp.exports=On;On.default=On});var hp=x((G3,dp)=>{u();"use strict";var b1=pp(),w1=_a(),v1=cn(),x1=tr(),ir=class{constructor(e=[]){this.version="8.4.24",this.plugins=this.normalize(e)}use(e){return this.plugins=this.plugins.concat(this.normalize([e])),this}process(e,t={}){return this.plugins.length===0&&typeof t.parser=="undefined"&&typeof t.stringifier=="undefined"&&typeof t.syntax=="undefined"?new b1(this,e,t):new w1(this,e,t)}normalize(e){let t=[];for(let i of e)if(i.postcss===!0?i=i():i.postcss&&(i=i.postcss),typeof i=="object"&&Array.isArray(i.plugins))t=t.concat(i.plugins);else if(typeof i=="object"&&i.postcssPlugin)t.push(i);else if(typeof i=="function")t.push(i);else if(!(typeof i=="object"&&(i.parse||i.stringify)))throw new Error(i+" is not a PostCSS plugin");return t}};dp.exports=ir;ir.default=ir;x1.registerProcessor(ir);v1.registerProcessor(ir)});var gp=x((Q3,mp)=>{u();"use strict";var k1=Wr(),S1=wa(),A1=Gr(),C1=kn(),_1=Cn(),E1=tr(),O1=Sn();function ei(r,e){if(Array.isArray(r))return r.map(n=>ei(n));let{inputs:t,...i}=r;if(t){e=[];for(let n of t){let a={...n,__proto__:_1.prototype};a.map&&(a.map={...a.map,__proto__:S1.prototype}),e.push(a)}}if(i.nodes&&(i.nodes=r.nodes.map(n=>ei(n,e))),i.source){let{inputId:n,...a}=i.source;i.source=a,n!=null&&(i.source.input=e[n])}if(i.type==="root")return new E1(i);if(i.type==="decl")return new k1(i);if(i.type==="rule")return new O1(i);if(i.type==="comment")return new A1(i);if(i.type==="atrule")return new C1(i);throw new Error("Unknown node type: "+r.type)}mp.exports=ei;ei.default=ei});var $e=x((Y3,Sp)=>{u();"use strict";var T1=nn(),yp=Wr(),R1=_a(),P1=Et(),Ea=hp(),I1=Vr(),D1=gp(),bp=cn(),q1=ga(),wp=Gr(),vp=kn(),$1=hn(),L1=Cn(),M1=En(),N1=ba(),xp=Sn(),kp=tr(),B1=Hr();function Z(...r){return r.length===1&&Array.isArray(r[0])&&(r=r[0]),new Ea(r)}Z.plugin=function(e,t){let i=!1;function n(...s){console&&console.warn&&!i&&(i=!0,console.warn(e+`: postcss.plugin was deprecated. Migration guide:
+https://evilmartians.com/chronicles/postcss-8-plugin-migration`),m.env.LANG&&m.env.LANG.startsWith("cn")&&console.warn(e+`: \u91CC\u9762 postcss.plugin \u88AB\u5F03\u7528. \u8FC1\u79FB\u6307\u5357:
+https://www.w3ctech.com/topic/2226`));let o=t(...s);return o.postcssPlugin=e,o.postcssVersion=new Ea().version,o}let a;return Object.defineProperty(n,"postcss",{get(){return a||(a=n()),a}}),n.process=function(s,o,l){return Z([n(l)]).process(s,o)},n};Z.stringify=I1;Z.parse=M1;Z.fromJSON=D1;Z.list=N1;Z.comment=r=>new wp(r);Z.atRule=r=>new vp(r);Z.decl=r=>new yp(r);Z.rule=r=>new xp(r);Z.root=r=>new kp(r);Z.document=r=>new bp(r);Z.CssSyntaxError=T1;Z.Declaration=yp;Z.Container=P1;Z.Processor=Ea;Z.Document=bp;Z.Comment=wp;Z.Warning=q1;Z.AtRule=vp;Z.Result=$1;Z.Input=L1;Z.Rule=xp;Z.Root=kp;Z.Node=B1;R1.registerPostcss(Z);Sp.exports=Z;Z.default=Z});var re,ee,K3,X3,J3,Z3,eI,tI,rI,iI,nI,sI,aI,oI,lI,uI,fI,cI,pI,dI,hI,mI,gI,yI,bI,wI,Ot=R(()=>{u();re=pe($e()),ee=re.default,K3=re.default.stringify,X3=re.default.fromJSON,J3=re.default.plugin,Z3=re.default.parse,eI=re.default.list,tI=re.default.document,rI=re.default.comment,iI=re.default.atRule,nI=re.default.rule,sI=re.default.decl,aI=re.default.root,oI=re.default.CssSyntaxError,lI=re.default.Declaration,uI=re.default.Container,fI=re.default.Processor,cI=re.default.Document,pI=re.default.Comment,dI=re.default.Warning,hI=re.default.AtRule,mI=re.default.Result,gI=re.default.Input,yI=re.default.Rule,bI=re.default.Root,wI=re.default.Node});var Oa=x((xI,Ap)=>{u();Ap.exports=function(r,e,t,i,n){for(e=e.split?e.split("."):e,i=0;i<e.length;i++)r=r?r[e[i]]:n;return r===n?t:r}});var Rn=x((Tn,Cp)=>{u();"use strict";Tn.__esModule=!0;Tn.default=z1;function F1(r){for(var e=r.toLowerCase(),t="",i=!1,n=0;n<6&&e[n]!==void 0;n++){var a=e.charCodeAt(n),s=a>=97&&a<=102||a>=48&&a<=57;if(i=a===32,!s)break;t+=e[n]}if(t.length!==0){var o=parseInt(t,16),l=o>=55296&&o<=57343;return l||o===0||o>1114111?["\uFFFD",t.length+(i?1:0)]:[String.fromCodePoint(o),t.length+(i?1:0)]}}var j1=/\\/;function z1(r){var e=j1.test(r);if(!e)return r;for(var t="",i=0;i<r.length;i++){if(r[i]==="\\"){var n=F1(r.slice(i+1,i+7));if(n!==void 0){t+=n[0],i+=n[1];continue}if(r[i+1]==="\\"){t+="\\",i++;continue}r.length===i+1&&(t+=r[i]);continue}t+=r[i]}return t}Cp.exports=Tn.default});var Ep=x((Pn,_p)=>{u();"use strict";Pn.__esModule=!0;Pn.default=U1;function U1(r){for(var e=arguments.length,t=new Array(e>1?e-1:0),i=1;i<e;i++)t[i-1]=arguments[i];for(;t.length>0;){var n=t.shift();if(!r[n])return;r=r[n]}return r}_p.exports=Pn.default});var Tp=x((In,Op)=>{u();"use strict";In.__esModule=!0;In.default=V1;function V1(r){for(var e=arguments.length,t=new Array(e>1?e-1:0),i=1;i<e;i++)t[i-1]=arguments[i];for(;t.length>0;){var n=t.shift();r[n]||(r[n]={}),r=r[n]}}Op.exports=In.default});var Pp=x((Dn,Rp)=>{u();"use strict";Dn.__esModule=!0;Dn.default=H1;function H1(r){for(var e="",t=r.indexOf("/*"),i=0;t>=0;){e=e+r.slice(i,t);var n=r.indexOf("*/",t+2);if(n<0)return e;i=n+2,t=r.indexOf("/*",i)}return e=e+r.slice(i),e}Rp.exports=Dn.default});var ti=x(rt=>{u();"use strict";rt.__esModule=!0;rt.unesc=rt.stripComments=rt.getProp=rt.ensureObject=void 0;var W1=qn(Rn());rt.unesc=W1.default;var G1=qn(Ep());rt.getProp=G1.default;var Q1=qn(Tp());rt.ensureObject=Q1.default;var Y1=qn(Pp());rt.stripComments=Y1.default;function qn(r){return r&&r.__esModule?r:{default:r}}});var dt=x((ri,qp)=>{u();"use strict";ri.__esModule=!0;ri.default=void 0;var Ip=ti();function Dp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function K1(r,e,t){return e&&Dp(r.prototype,e),t&&Dp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}var X1=function r(e,t){if(typeof e!="object"||e===null)return e;var i=new e.constructor;for(var n in e)if(!!e.hasOwnProperty(n)){var a=e[n],s=typeof a;n==="parent"&&s==="object"?t&&(i[n]=t):a instanceof Array?i[n]=a.map(function(o){return r(o,i)}):i[n]=r(a,i)}return i},J1=function(){function r(t){t===void 0&&(t={}),Object.assign(this,t),this.spaces=this.spaces||{},this.spaces.before=this.spaces.before||"",this.spaces.after=this.spaces.after||""}var e=r.prototype;return e.remove=function(){return this.parent&&this.parent.removeChild(this),this.parent=void 0,this},e.replaceWith=function(){if(this.parent){for(var i in arguments)this.parent.insertBefore(this,arguments[i]);this.remove()}return this},e.next=function(){return this.parent.at(this.parent.index(this)+1)},e.prev=function(){return this.parent.at(this.parent.index(this)-1)},e.clone=function(i){i===void 0&&(i={});var n=X1(this);for(var a in i)n[a]=i[a];return n},e.appendToPropertyAndEscape=function(i,n,a){this.raws||(this.raws={});var s=this[i],o=this.raws[i];this[i]=s+n,o||a!==n?this.raws[i]=(o||s)+a:delete this.raws[i]},e.setPropertyAndEscape=function(i,n,a){this.raws||(this.raws={}),this[i]=n,this.raws[i]=a},e.setPropertyWithoutEscape=function(i,n){this[i]=n,this.raws&&delete this.raws[i]},e.isAtPosition=function(i,n){if(this.source&&this.source.start&&this.source.end)return!(this.source.start.line>i||this.source.end.line<i||this.source.start.line===i&&this.source.start.column>n||this.source.end.line===i&&this.source.end.column<n)},e.stringifyProperty=function(i){return this.raws&&this.raws[i]||this[i]},e.valueToString=function(){return String(this.stringifyProperty("value"))},e.toString=function(){return[this.rawSpaceBefore,this.valueToString(),this.rawSpaceAfter].join("")},K1(r,[{key:"rawSpaceBefore",get:function(){var i=this.raws&&this.raws.spaces&&this.raws.spaces.before;return i===void 0&&(i=this.spaces&&this.spaces.before),i||""},set:function(i){(0,Ip.ensureObject)(this,"raws","spaces"),this.raws.spaces.before=i}},{key:"rawSpaceAfter",get:function(){var i=this.raws&&this.raws.spaces&&this.raws.spaces.after;return i===void 0&&(i=this.spaces.after),i||""},set:function(i){(0,Ip.ensureObject)(this,"raws","spaces"),this.raws.spaces.after=i}}]),r}();ri.default=J1;qp.exports=ri.default});var Se=x(ie=>{u();"use strict";ie.__esModule=!0;ie.UNIVERSAL=ie.TAG=ie.STRING=ie.SELECTOR=ie.ROOT=ie.PSEUDO=ie.NESTING=ie.ID=ie.COMMENT=ie.COMBINATOR=ie.CLASS=ie.ATTRIBUTE=void 0;var Z1="tag";ie.TAG=Z1;var ek="string";ie.STRING=ek;var tk="selector";ie.SELECTOR=tk;var rk="root";ie.ROOT=rk;var ik="pseudo";ie.PSEUDO=ik;var nk="nesting";ie.NESTING=nk;var sk="id";ie.ID=sk;var ak="comment";ie.COMMENT=ak;var ok="combinator";ie.COMBINATOR=ok;var lk="class";ie.CLASS=lk;var uk="attribute";ie.ATTRIBUTE=uk;var fk="universal";ie.UNIVERSAL=fk});var $n=x((ii,Np)=>{u();"use strict";ii.__esModule=!0;ii.default=void 0;var ck=dk(dt()),ht=pk(Se());function $p(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return($p=function(n){return n?t:e})(r)}function pk(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=$p(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function dk(r){return r&&r.__esModule?r:{default:r}}function hk(r,e){var t=typeof Symbol!="undefined"&&r[Symbol.iterator]||r["@@iterator"];if(t)return(t=t.call(r)).next.bind(t);if(Array.isArray(r)||(t=mk(r))||e&&r&&typeof r.length=="number"){t&&(r=t);var i=0;return function(){return i>=r.length?{done:!0}:{done:!1,value:r[i++]}}}throw new TypeError(`Invalid attempt to iterate non-iterable instance.
+In order to be iterable, non-array objects must have a [Symbol.iterator]() method.`)}function mk(r,e){if(!!r){if(typeof r=="string")return Lp(r,e);var t=Object.prototype.toString.call(r).slice(8,-1);if(t==="Object"&&r.constructor&&(t=r.constructor.name),t==="Map"||t==="Set")return Array.from(r);if(t==="Arguments"||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t))return Lp(r,e)}}function Lp(r,e){(e==null||e>r.length)&&(e=r.length);for(var t=0,i=new Array(e);t<e;t++)i[t]=r[t];return i}function Mp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function gk(r,e,t){return e&&Mp(r.prototype,e),t&&Mp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function yk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ta(r,e)}function Ta(r,e){return Ta=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ta(r,e)}var bk=function(r){yk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.nodes||(n.nodes=[]),n}var t=e.prototype;return t.append=function(n){return n.parent=this,this.nodes.push(n),this},t.prepend=function(n){return n.parent=this,this.nodes.unshift(n),this},t.at=function(n){return this.nodes[n]},t.index=function(n){return typeof n=="number"?n:this.nodes.indexOf(n)},t.removeChild=function(n){n=this.index(n),this.at(n).parent=void 0,this.nodes.splice(n,1);var a;for(var s in this.indexes)a=this.indexes[s],a>=n&&(this.indexes[s]=a-1);return this},t.removeAll=function(){for(var n=hk(this.nodes),a;!(a=n()).done;){var s=a.value;s.parent=void 0}return this.nodes=[],this},t.empty=function(){return this.removeAll()},t.insertAfter=function(n,a){a.parent=this;var s=this.index(n);this.nodes.splice(s+1,0,a),a.parent=this;var o;for(var l in this.indexes)o=this.indexes[l],s<=o&&(this.indexes[l]=o+1);return this},t.insertBefore=function(n,a){a.parent=this;var s=this.index(n);this.nodes.splice(s,0,a),a.parent=this;var o;for(var l in this.indexes)o=this.indexes[l],o<=s&&(this.indexes[l]=o+1);return this},t._findChildAtPosition=function(n,a){var s=void 0;return this.each(function(o){if(o.atPosition){var l=o.atPosition(n,a);if(l)return s=l,!1}else if(o.isAtPosition(n,a))return s=o,!1}),s},t.atPosition=function(n,a){if(this.isAtPosition(n,a))return this._findChildAtPosition(n,a)||this},t._inferEndPosition=function(){this.last&&this.last.source&&this.last.source.end&&(this.source=this.source||{},this.source.end=this.source.end||{},Object.assign(this.source.end,this.last.source.end))},t.each=function(n){this.lastEach||(this.lastEach=0),this.indexes||(this.indexes={}),this.lastEach++;var a=this.lastEach;if(this.indexes[a]=0,!!this.length){for(var s,o;this.indexes[a]<this.length&&(s=this.indexes[a],o=n(this.at(s),s),o!==!1);)this.indexes[a]+=1;if(delete this.indexes[a],o===!1)return!1}},t.walk=function(n){return this.each(function(a,s){var o=n(a,s);if(o!==!1&&a.length&&(o=a.walk(n)),o===!1)return!1})},t.walkAttributes=function(n){var a=this;return this.walk(function(s){if(s.type===ht.ATTRIBUTE)return n.call(a,s)})},t.walkClasses=function(n){var a=this;return this.walk(function(s){if(s.type===ht.CLASS)return n.call(a,s)})},t.walkCombinators=function(n){var a=this;return this.walk(function(s){if(s.type===ht.COMBINATOR)return n.call(a,s)})},t.walkComments=function(n){var a=this;return this.walk(function(s){if(s.type===ht.COMMENT)return n.call(a,s)})},t.walkIds=function(n){var a=this;return this.walk(function(s){if(s.type===ht.ID)return n.call(a,s)})},t.walkNesting=function(n){var a=this;return this.walk(function(s){if(s.type===ht.NESTING)return n.call(a,s)})},t.walkPseudos=function(n){var a=this;return this.walk(function(s){if(s.type===ht.PSEUDO)return n.call(a,s)})},t.walkTags=function(n){var a=this;return this.walk(function(s){if(s.type===ht.TAG)return n.call(a,s)})},t.walkUniversals=function(n){var a=this;return this.walk(function(s){if(s.type===ht.UNIVERSAL)return n.call(a,s)})},t.split=function(n){var a=this,s=[];return this.reduce(function(o,l,c){var f=n.call(a,l);return s.push(l),f?(o.push(s),s=[]):c===a.length-1&&o.push(s),o},[])},t.map=function(n){return this.nodes.map(n)},t.reduce=function(n,a){return this.nodes.reduce(n,a)},t.every=function(n){return this.nodes.every(n)},t.some=function(n){return this.nodes.some(n)},t.filter=function(n){return this.nodes.filter(n)},t.sort=function(n){return this.nodes.sort(n)},t.toString=function(){return this.map(String).join("")},gk(e,[{key:"first",get:function(){return this.at(0)}},{key:"last",get:function(){return this.at(this.length-1)}},{key:"length",get:function(){return this.nodes.length}}]),e}(ck.default);ii.default=bk;Np.exports=ii.default});var Pa=x((ni,Fp)=>{u();"use strict";ni.__esModule=!0;ni.default=void 0;var wk=xk($n()),vk=Se();function xk(r){return r&&r.__esModule?r:{default:r}}function Bp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function kk(r,e,t){return e&&Bp(r.prototype,e),t&&Bp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function Sk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ra(r,e)}function Ra(r,e){return Ra=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ra(r,e)}var Ak=function(r){Sk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=vk.ROOT,n}var t=e.prototype;return t.toString=function(){var n=this.reduce(function(a,s){return a.push(String(s)),a},[]).join(",");return this.trailingComma?n+",":n},t.error=function(n,a){return this._error?this._error(n,a):new Error(n)},kk(e,[{key:"errorGenerator",set:function(n){this._error=n}}]),e}(wk.default);ni.default=Ak;Fp.exports=ni.default});var Da=x((si,jp)=>{u();"use strict";si.__esModule=!0;si.default=void 0;var Ck=Ek($n()),_k=Se();function Ek(r){return r&&r.__esModule?r:{default:r}}function Ok(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ia(r,e)}function Ia(r,e){return Ia=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ia(r,e)}var Tk=function(r){Ok(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=_k.SELECTOR,i}return e}(Ck.default);si.default=Tk;jp.exports=si.default});var Ln=x((AI,zp)=>{u();"use strict";var Rk={},Pk=Rk.hasOwnProperty,Ik=function(e,t){if(!e)return t;var i={};for(var n in t)i[n]=Pk.call(e,n)?e[n]:t[n];return i},Dk=/[ -,\.\/:-@\[-\^`\{-~]/,qk=/[ -,\.\/:-@\[\]\^`\{-~]/,$k=/(^|\\+)?(\\[A-F0-9]{1,6})\x20(?![a-fA-F0-9\x20])/g,qa=function r(e,t){t=Ik(t,r.options),t.quotes!="single"&&t.quotes!="double"&&(t.quotes="single");for(var i=t.quotes=="double"?'"':"'",n=t.isIdentifier,a=e.charAt(0),s="",o=0,l=e.length;o<l;){var c=e.charAt(o++),f=c.charCodeAt(),d=void 0;if(f<32||f>126){if(f>=55296&&f<=56319&&o<l){var p=e.charCodeAt(o++);(p&64512)==56320?f=((f&1023)<<10)+(p&1023)+65536:o--}d="\\"+f.toString(16).toUpperCase()+" "}else t.escapeEverything?Dk.test(c)?d="\\"+c:d="\\"+f.toString(16).toUpperCase()+" ":/[\t\n\f\r\x0B]/.test(c)?d="\\"+f.toString(16).toUpperCase()+" ":c=="\\"||!n&&(c=='"'&&i==c||c=="'"&&i==c)||n&&qk.test(c)?d="\\"+c:d=c;s+=d}return n&&(/^-[-\d]/.test(s)?s="\\-"+s.slice(1):/\d/.test(a)&&(s="\\3"+a+" "+s.slice(1))),s=s.replace($k,function(h,b,v){return b&&b.length%2?h:(b||"")+v}),!n&&t.wrap?i+s+i:s};qa.options={escapeEverything:!1,isIdentifier:!1,quotes:"single",wrap:!1};qa.version="3.0.0";zp.exports=qa});var La=x((ai,Hp)=>{u();"use strict";ai.__esModule=!0;ai.default=void 0;var Lk=Up(Ln()),Mk=ti(),Nk=Up(dt()),Bk=Se();function Up(r){return r&&r.__esModule?r:{default:r}}function Vp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function Fk(r,e,t){return e&&Vp(r.prototype,e),t&&Vp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function jk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,$a(r,e)}function $a(r,e){return $a=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},$a(r,e)}var zk=function(r){jk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=Bk.CLASS,n._constructed=!0,n}var t=e.prototype;return t.valueToString=function(){return"."+r.prototype.valueToString.call(this)},Fk(e,[{key:"value",get:function(){return this._value},set:function(n){if(this._constructed){var a=(0,Lk.default)(n,{isIdentifier:!0});a!==n?((0,Mk.ensureObject)(this,"raws"),this.raws.value=a):this.raws&&delete this.raws.value}this._value=n}}]),e}(Nk.default);ai.default=zk;Hp.exports=ai.default});var Na=x((oi,Wp)=>{u();"use strict";oi.__esModule=!0;oi.default=void 0;var Uk=Hk(dt()),Vk=Se();function Hk(r){return r&&r.__esModule?r:{default:r}}function Wk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ma(r,e)}function Ma(r,e){return Ma=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ma(r,e)}var Gk=function(r){Wk(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=Vk.COMMENT,i}return e}(Uk.default);oi.default=Gk;Wp.exports=oi.default});var Fa=x((li,Gp)=>{u();"use strict";li.__esModule=!0;li.default=void 0;var Qk=Kk(dt()),Yk=Se();function Kk(r){return r&&r.__esModule?r:{default:r}}function Xk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ba(r,e)}function Ba(r,e){return Ba=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ba(r,e)}var Jk=function(r){Xk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=Yk.ID,n}var t=e.prototype;return t.valueToString=function(){return"#"+r.prototype.valueToString.call(this)},e}(Qk.default);li.default=Jk;Gp.exports=li.default});var Mn=x((ui,Kp)=>{u();"use strict";ui.__esModule=!0;ui.default=void 0;var Zk=Qp(Ln()),eS=ti(),tS=Qp(dt());function Qp(r){return r&&r.__esModule?r:{default:r}}function Yp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function rS(r,e,t){return e&&Yp(r.prototype,e),t&&Yp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function iS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,ja(r,e)}function ja(r,e){return ja=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},ja(r,e)}var nS=function(r){iS(e,r);function e(){return r.apply(this,arguments)||this}var t=e.prototype;return t.qualifiedName=function(n){return this.namespace?this.namespaceString+"|"+n:n},t.valueToString=function(){return this.qualifiedName(r.prototype.valueToString.call(this))},rS(e,[{key:"namespace",get:function(){return this._namespace},set:function(n){if(n===!0||n==="*"||n==="&"){this._namespace=n,this.raws&&delete this.raws.namespace;return}var a=(0,Zk.default)(n,{isIdentifier:!0});this._namespace=n,a!==n?((0,eS.ensureObject)(this,"raws"),this.raws.namespace=a):this.raws&&delete this.raws.namespace}},{key:"ns",get:function(){return this._namespace},set:function(n){this.namespace=n}},{key:"namespaceString",get:function(){if(this.namespace){var n=this.stringifyProperty("namespace");return n===!0?"":n}else return""}}]),e}(tS.default);ui.default=nS;Kp.exports=ui.default});var Ua=x((fi,Xp)=>{u();"use strict";fi.__esModule=!0;fi.default=void 0;var sS=oS(Mn()),aS=Se();function oS(r){return r&&r.__esModule?r:{default:r}}function lS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,za(r,e)}function za(r,e){return za=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},za(r,e)}var uS=function(r){lS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=aS.TAG,i}return e}(sS.default);fi.default=uS;Xp.exports=fi.default});var Ha=x((ci,Jp)=>{u();"use strict";ci.__esModule=!0;ci.default=void 0;var fS=pS(dt()),cS=Se();function pS(r){return r&&r.__esModule?r:{default:r}}function dS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Va(r,e)}function Va(r,e){return Va=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Va(r,e)}var hS=function(r){dS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=cS.STRING,i}return e}(fS.default);ci.default=hS;Jp.exports=ci.default});var Ga=x((pi,Zp)=>{u();"use strict";pi.__esModule=!0;pi.default=void 0;var mS=yS($n()),gS=Se();function yS(r){return r&&r.__esModule?r:{default:r}}function bS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Wa(r,e)}function Wa(r,e){return Wa=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Wa(r,e)}var wS=function(r){bS(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=gS.PSEUDO,n}var t=e.prototype;return t.toString=function(){var n=this.length?"("+this.map(String).join(",")+")":"";return[this.rawSpaceBefore,this.stringifyProperty("value"),n,this.rawSpaceAfter].join("")},e}(mS.default);pi.default=wS;Zp.exports=pi.default});var Nn={};Ge(Nn,{deprecate:()=>vS});function vS(r){return r}var Bn=R(()=>{u()});var td=x((CI,ed)=>{u();ed.exports=(Bn(),Nn).deprecate});var Za=x(mi=>{u();"use strict";mi.__esModule=!0;mi.default=void 0;mi.unescapeValue=Xa;var di=Ya(Ln()),xS=Ya(Rn()),kS=Ya(Mn()),SS=Se(),Qa;function Ya(r){return r&&r.__esModule?r:{default:r}}function rd(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function AS(r,e,t){return e&&rd(r.prototype,e),t&&rd(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function CS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ka(r,e)}function Ka(r,e){return Ka=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ka(r,e)}var hi=td(),_S=/^('|")([^]*)\1$/,ES=hi(function(){},"Assigning an attribute a value containing characters that might need to be escaped is deprecated. Call attribute.setValue() instead."),OS=hi(function(){},"Assigning attr.quoted is deprecated and has no effect. Assign to attr.quoteMark instead."),TS=hi(function(){},"Constructing an Attribute selector with a value without specifying quoteMark is deprecated. Note: The value should be unescaped now.");function Xa(r){var e=!1,t=null,i=r,n=i.match(_S);return n&&(t=n[1],i=n[2]),i=(0,xS.default)(i),i!==r&&(e=!0),{deprecatedUsage:e,unescaped:i,quoteMark:t}}function RS(r){if(r.quoteMark!==void 0||r.value===void 0)return r;TS();var e=Xa(r.value),t=e.quoteMark,i=e.unescaped;return r.raws||(r.raws={}),r.raws.value===void 0&&(r.raws.value=r.value),r.value=i,r.quoteMark=t,r}var Fn=function(r){CS(e,r);function e(i){var n;return i===void 0&&(i={}),n=r.call(this,RS(i))||this,n.type=SS.ATTRIBUTE,n.raws=n.raws||{},Object.defineProperty(n.raws,"unquoted",{get:hi(function(){return n.value},"attr.raws.unquoted is deprecated. Call attr.value instead."),set:hi(function(){return n.value},"Setting attr.raws.unquoted is deprecated and has no effect. attr.value is unescaped by default now.")}),n._constructed=!0,n}var t=e.prototype;return t.getQuotedValue=function(n){n===void 0&&(n={});var a=this._determineQuoteMark(n),s=Ja[a],o=(0,di.default)(this._value,s);return o},t._determineQuoteMark=function(n){return n.smart?this.smartQuoteMark(n):this.preferredQuoteMark(n)},t.setValue=function(n,a){a===void 0&&(a={}),this._value=n,this._quoteMark=this._determineQuoteMark(a),this._syncRawValue()},t.smartQuoteMark=function(n){var a=this.value,s=a.replace(/[^']/g,"").length,o=a.replace(/[^"]/g,"").length;if(s+o===0){var l=(0,di.default)(a,{isIdentifier:!0});if(l===a)return e.NO_QUOTE;var c=this.preferredQuoteMark(n);if(c===e.NO_QUOTE){var f=this.quoteMark||n.quoteMark||e.DOUBLE_QUOTE,d=Ja[f],p=(0,di.default)(a,d);if(p.length<l.length)return f}return c}else return o===s?this.preferredQuoteMark(n):o<s?e.DOUBLE_QUOTE:e.SINGLE_QUOTE},t.preferredQuoteMark=function(n){var a=n.preferCurrentQuoteMark?this.quoteMark:n.quoteMark;return a===void 0&&(a=n.preferCurrentQuoteMark?n.quoteMark:this.quoteMark),a===void 0&&(a=e.DOUBLE_QUOTE),a},t._syncRawValue=function(){var n=(0,di.default)(this._value,Ja[this.quoteMark]);n===this._value?this.raws&&delete this.raws.value:this.raws.value=n},t._handleEscapes=function(n,a){if(this._constructed){var s=(0,di.default)(a,{isIdentifier:!0});s!==a?this.raws[n]=s:delete this.raws[n]}},t._spacesFor=function(n){var a={before:"",after:""},s=this.spaces[n]||{},o=this.raws.spaces&&this.raws.spaces[n]||{};return Object.assign(a,s,o)},t._stringFor=function(n,a,s){a===void 0&&(a=n),s===void 0&&(s=id);var o=this._spacesFor(a);return s(this.stringifyProperty(n),o)},t.offsetOf=function(n){var a=1,s=this._spacesFor("attribute");if(a+=s.before.length,n==="namespace"||n==="ns")return this.namespace?a:-1;if(n==="attributeNS"||(a+=this.namespaceString.length,this.namespace&&(a+=1),n==="attribute"))return a;a+=this.stringifyProperty("attribute").length,a+=s.after.length;var o=this._spacesFor("operator");a+=o.before.length;var l=this.stringifyProperty("operator");if(n==="operator")return l?a:-1;a+=l.length,a+=o.after.length;var c=this._spacesFor("value");a+=c.before.length;var f=this.stringifyProperty("value");if(n==="value")return f?a:-1;a+=f.length,a+=c.after.length;var d=this._spacesFor("insensitive");return a+=d.before.length,n==="insensitive"&&this.insensitive?a:-1},t.toString=function(){var n=this,a=[this.rawSpaceBefore,"["];return a.push(this._stringFor("qualifiedAttribute","attribute")),this.operator&&(this.value||this.value==="")&&(a.push(this._stringFor("operator")),a.push(this._stringFor("value")),a.push(this._stringFor("insensitiveFlag","insensitive",function(s,o){return s.length>0&&!n.quoted&&o.before.length===0&&!(n.spaces.value&&n.spaces.value.after)&&(o.before=" "),id(s,o)}))),a.push("]"),a.push(this.rawSpaceAfter),a.join("")},AS(e,[{key:"quoted",get:function(){var n=this.quoteMark;return n==="'"||n==='"'},set:function(n){OS()}},{key:"quoteMark",get:function(){return this._quoteMark},set:function(n){if(!this._constructed){this._quoteMark=n;return}this._quoteMark!==n&&(this._quoteMark=n,this._syncRawValue())}},{key:"qualifiedAttribute",get:function(){return this.qualifiedName(this.raws.attribute||this.attribute)}},{key:"insensitiveFlag",get:function(){return this.insensitive?"i":""}},{key:"value",get:function(){return this._value},set:function(n){if(this._constructed){var a=Xa(n),s=a.deprecatedUsage,o=a.unescaped,l=a.quoteMark;if(s&&ES(),o===this._value&&l===this._quoteMark)return;this._value=o,this._quoteMark=l,this._syncRawValue()}else this._value=n}},{key:"insensitive",get:function(){return this._insensitive},set:function(n){n||(this._insensitive=!1,this.raws&&(this.raws.insensitiveFlag==="I"||this.raws.insensitiveFlag==="i")&&(this.raws.insensitiveFlag=void 0)),this._insensitive=n}},{key:"attribute",get:function(){return this._attribute},set:function(n){this._handleEscapes("attribute",n),this._attribute=n}}]),e}(kS.default);mi.default=Fn;Fn.NO_QUOTE=null;Fn.SINGLE_QUOTE="'";Fn.DOUBLE_QUOTE='"';var Ja=(Qa={"'":{quotes:"single",wrap:!0},'"':{quotes:"double",wrap:!0}},Qa[null]={isIdentifier:!0},Qa);function id(r,e){return""+e.before+r+e.after}});var to=x((gi,nd)=>{u();"use strict";gi.__esModule=!0;gi.default=void 0;var PS=DS(Mn()),IS=Se();function DS(r){return r&&r.__esModule?r:{default:r}}function qS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,eo(r,e)}function eo(r,e){return eo=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},eo(r,e)}var $S=function(r){qS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=IS.UNIVERSAL,i.value="*",i}return e}(PS.default);gi.default=$S;nd.exports=gi.default});var io=x((yi,sd)=>{u();"use strict";yi.__esModule=!0;yi.default=void 0;var LS=NS(dt()),MS=Se();function NS(r){return r&&r.__esModule?r:{default:r}}function BS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,ro(r,e)}function ro(r,e){return ro=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},ro(r,e)}var FS=function(r){BS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=MS.COMBINATOR,i}return e}(LS.default);yi.default=FS;sd.exports=yi.default});var so=x((bi,ad)=>{u();"use strict";bi.__esModule=!0;bi.default=void 0;var jS=US(dt()),zS=Se();function US(r){return r&&r.__esModule?r:{default:r}}function VS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,no(r,e)}function no(r,e){return no=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},no(r,e)}var HS=function(r){VS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=zS.NESTING,i.value="&",i}return e}(jS.default);bi.default=HS;ad.exports=bi.default});var ld=x((jn,od)=>{u();"use strict";jn.__esModule=!0;jn.default=WS;function WS(r){return r.sort(function(e,t){return e-t})}od.exports=jn.default});var ao=x(M=>{u();"use strict";M.__esModule=!0;M.word=M.tilde=M.tab=M.str=M.space=M.slash=M.singleQuote=M.semicolon=M.plus=M.pipe=M.openSquare=M.openParenthesis=M.newline=M.greaterThan=M.feed=M.equals=M.doubleQuote=M.dollar=M.cr=M.comment=M.comma=M.combinator=M.colon=M.closeSquare=M.closeParenthesis=M.caret=M.bang=M.backslash=M.at=M.asterisk=M.ampersand=void 0;var GS=38;M.ampersand=GS;var QS=42;M.asterisk=QS;var YS=64;M.at=YS;var KS=44;M.comma=KS;var XS=58;M.colon=XS;var JS=59;M.semicolon=JS;var ZS=40;M.openParenthesis=ZS;var eA=41;M.closeParenthesis=eA;var tA=91;M.openSquare=tA;var rA=93;M.closeSquare=rA;var iA=36;M.dollar=iA;var nA=126;M.tilde=nA;var sA=94;M.caret=sA;var aA=43;M.plus=aA;var oA=61;M.equals=oA;var lA=124;M.pipe=lA;var uA=62;M.greaterThan=uA;var fA=32;M.space=fA;var ud=39;M.singleQuote=ud;var cA=34;M.doubleQuote=cA;var pA=47;M.slash=pA;var dA=33;M.bang=dA;var hA=92;M.backslash=hA;var mA=13;M.cr=mA;var gA=12;M.feed=gA;var yA=10;M.newline=yA;var bA=9;M.tab=bA;var wA=ud;M.str=wA;var vA=-1;M.comment=vA;var xA=-2;M.word=xA;var kA=-3;M.combinator=kA});var pd=x(wi=>{u();"use strict";wi.__esModule=!0;wi.FIELDS=void 0;wi.default=TA;var D=SA(ao()),nr,te;function fd(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(fd=function(n){return n?t:e})(r)}function SA(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=fd(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}var AA=(nr={},nr[D.tab]=!0,nr[D.newline]=!0,nr[D.cr]=!0,nr[D.feed]=!0,nr),CA=(te={},te[D.space]=!0,te[D.tab]=!0,te[D.newline]=!0,te[D.cr]=!0,te[D.feed]=!0,te[D.ampersand]=!0,te[D.asterisk]=!0,te[D.bang]=!0,te[D.comma]=!0,te[D.colon]=!0,te[D.semicolon]=!0,te[D.openParenthesis]=!0,te[D.closeParenthesis]=!0,te[D.openSquare]=!0,te[D.closeSquare]=!0,te[D.singleQuote]=!0,te[D.doubleQuote]=!0,te[D.plus]=!0,te[D.pipe]=!0,te[D.tilde]=!0,te[D.greaterThan]=!0,te[D.equals]=!0,te[D.dollar]=!0,te[D.caret]=!0,te[D.slash]=!0,te),oo={},cd="0123456789abcdefABCDEF";for(zn=0;zn<cd.length;zn++)oo[cd.charCodeAt(zn)]=!0;var zn;function _A(r,e){var t=e,i;do{if(i=r.charCodeAt(t),CA[i])return t-1;i===D.backslash?t=EA(r,t)+1:t++}while(t<r.length);return t-1}function EA(r,e){var t=e,i=r.charCodeAt(t+1);if(!AA[i])if(oo[i]){var n=0;do t++,n++,i=r.charCodeAt(t+1);while(oo[i]&&n<6);n<6&&i===D.space&&t++}else t++;return t}var OA={TYPE:0,START_LINE:1,START_COL:2,END_LINE:3,END_COL:4,START_POS:5,END_POS:6};wi.FIELDS=OA;function TA(r){var e=[],t=r.css.valueOf(),i=t,n=i.length,a=-1,s=1,o=0,l=0,c,f,d,p,h,b,v,y,w,k,S,E,O;function B(N,T){if(r.safe)t+=T,w=t.length-1;else throw r.error("Unclosed "+N,s,o-a,o)}for(;o<n;){switch(c=t.charCodeAt(o),c===D.newline&&(a=o,s+=1),c){case D.space:case D.tab:case D.newline:case D.cr:case D.feed:w=o;do w+=1,c=t.charCodeAt(w),c===D.newline&&(a=w,s+=1);while(c===D.space||c===D.newline||c===D.tab||c===D.cr||c===D.feed);O=D.space,p=s,d=w-a-1,l=w;break;case D.plus:case D.greaterThan:case D.tilde:case D.pipe:w=o;do w+=1,c=t.charCodeAt(w);while(c===D.plus||c===D.greaterThan||c===D.tilde||c===D.pipe);O=D.combinator,p=s,d=o-a,l=w;break;case D.asterisk:case D.ampersand:case D.bang:case D.comma:case D.equals:case D.dollar:case D.caret:case D.openSquare:case D.closeSquare:case D.colon:case D.semicolon:case D.openParenthesis:case D.closeParenthesis:w=o,O=c,p=s,d=o-a,l=w+1;break;case D.singleQuote:case D.doubleQuote:E=c===D.singleQuote?"'":'"',w=o;do for(h=!1,w=t.indexOf(E,w+1),w===-1&&B("quote",E),b=w;t.charCodeAt(b-1)===D.backslash;)b-=1,h=!h;while(h);O=D.str,p=s,d=o-a,l=w+1;break;default:c===D.slash&&t.charCodeAt(o+1)===D.asterisk?(w=t.indexOf("*/",o+2)+1,w===0&&B("comment","*/"),f=t.slice(o,w+1),y=f.split(`
+`),v=y.length-1,v>0?(k=s+v,S=w-y[v].length):(k=s,S=a),O=D.comment,s=k,p=k,d=w-S):c===D.slash?(w=o,O=c,p=s,d=o-a,l=w+1):(w=_A(t,o),O=D.word,p=s,d=w-a),l=w+1;break}e.push([O,s,o-a,p,d,o,l]),S&&(a=S,S=null),o=l}return e}});var vd=x((vi,wd)=>{u();"use strict";vi.__esModule=!0;vi.default=void 0;var RA=je(Pa()),lo=je(Da()),PA=je(La()),dd=je(Na()),IA=je(Fa()),DA=je(Ua()),uo=je(Ha()),qA=je(Ga()),hd=Un(Za()),$A=je(to()),fo=je(io()),LA=je(so()),MA=je(ld()),P=Un(pd()),$=Un(ao()),NA=Un(Se()),le=ti(),Vt,co;function md(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(md=function(n){return n?t:e})(r)}function Un(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=md(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function je(r){return r&&r.__esModule?r:{default:r}}function gd(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function BA(r,e,t){return e&&gd(r.prototype,e),t&&gd(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}var po=(Vt={},Vt[$.space]=!0,Vt[$.cr]=!0,Vt[$.feed]=!0,Vt[$.newline]=!0,Vt[$.tab]=!0,Vt),FA=Object.assign({},po,(co={},co[$.comment]=!0,co));function yd(r){return{line:r[P.FIELDS.START_LINE],column:r[P.FIELDS.START_COL]}}function bd(r){return{line:r[P.FIELDS.END_LINE],column:r[P.FIELDS.END_COL]}}function Ht(r,e,t,i){return{start:{line:r,column:e},end:{line:t,column:i}}}function sr(r){return Ht(r[P.FIELDS.START_LINE],r[P.FIELDS.START_COL],r[P.FIELDS.END_LINE],r[P.FIELDS.END_COL])}function ho(r,e){if(!!r)return Ht(r[P.FIELDS.START_LINE],r[P.FIELDS.START_COL],e[P.FIELDS.END_LINE],e[P.FIELDS.END_COL])}function ar(r,e){var t=r[e];if(typeof t=="string")return t.indexOf("\\")!==-1&&((0,le.ensureObject)(r,"raws"),r[e]=(0,le.unesc)(t),r.raws[e]===void 0&&(r.raws[e]=t)),r}function mo(r,e){for(var t=-1,i=[];(t=r.indexOf(e,t+1))!==-1;)i.push(t);return i}function jA(){var r=Array.prototype.concat.apply([],arguments);return r.filter(function(e,t){return t===r.indexOf(e)})}var zA=function(){function r(t,i){i===void 0&&(i={}),this.rule=t,this.options=Object.assign({lossy:!1,safe:!1},i),this.position=0,this.css=typeof this.rule=="string"?this.rule:this.rule.selector,this.tokens=(0,P.default)({css:this.css,error:this._errorGenerator(),safe:this.options.safe});var n=ho(this.tokens[0],this.tokens[this.tokens.length-1]);this.root=new RA.default({source:n}),this.root.errorGenerator=this._errorGenerator();var a=new lo.default({source:{start:{line:1,column:1}}});this.root.append(a),this.current=a,this.loop()}var e=r.prototype;return e._errorGenerator=function(){var i=this;return function(n,a){return typeof i.rule=="string"?new Error(n):i.rule.error(n,a)}},e.attribute=function(){var i=[],n=this.currToken;for(this.position++;this.position<this.tokens.length&&this.currToken[P.FIELDS.TYPE]!==$.closeSquare;)i.push(this.currToken),this.position++;if(this.currToken[P.FIELDS.TYPE]!==$.closeSquare)return this.expected("closing square bracket",this.currToken[P.FIELDS.START_POS]);var a=i.length,s={source:Ht(n[1],n[2],this.currToken[3],this.currToken[4]),sourceIndex:n[P.FIELDS.START_POS]};if(a===1&&!~[$.word].indexOf(i[0][P.FIELDS.TYPE]))return this.expected("attribute",i[0][P.FIELDS.START_POS]);for(var o=0,l="",c="",f=null,d=!1;o<a;){var p=i[o],h=this.content(p),b=i[o+1];switch(p[P.FIELDS.TYPE]){case $.space:if(d=!0,this.options.lossy)break;if(f){(0,le.ensureObject)(s,"spaces",f);var v=s.spaces[f].after||"";s.spaces[f].after=v+h;var y=(0,le.getProp)(s,"raws","spaces",f,"after")||null;y&&(s.raws.spaces[f].after=y+h)}else l=l+h,c=c+h;break;case $.asterisk:if(b[P.FIELDS.TYPE]===$.equals)s.operator=h,f="operator";else if((!s.namespace||f==="namespace"&&!d)&&b){l&&((0,le.ensureObject)(s,"spaces","attribute"),s.spaces.attribute.before=l,l=""),c&&((0,le.ensureObject)(s,"raws","spaces","attribute"),s.raws.spaces.attribute.before=l,c=""),s.namespace=(s.namespace||"")+h;var w=(0,le.getProp)(s,"raws","namespace")||null;w&&(s.raws.namespace+=h),f="namespace"}d=!1;break;case $.dollar:if(f==="value"){var k=(0,le.getProp)(s,"raws","value");s.value+="$",k&&(s.raws.value=k+"$");break}case $.caret:b[P.FIELDS.TYPE]===$.equals&&(s.operator=h,f="operator"),d=!1;break;case $.combinator:if(h==="~"&&b[P.FIELDS.TYPE]===$.equals&&(s.operator=h,f="operator"),h!=="|"){d=!1;break}b[P.FIELDS.TYPE]===$.equals?(s.operator=h,f="operator"):!s.namespace&&!s.attribute&&(s.namespace=!0),d=!1;break;case $.word:if(b&&this.content(b)==="|"&&i[o+2]&&i[o+2][P.FIELDS.TYPE]!==$.equals&&!s.operator&&!s.namespace)s.namespace=h,f="namespace";else if(!s.attribute||f==="attribute"&&!d){l&&((0,le.ensureObject)(s,"spaces","attribute"),s.spaces.attribute.before=l,l=""),c&&((0,le.ensureObject)(s,"raws","spaces","attribute"),s.raws.spaces.attribute.before=c,c=""),s.attribute=(s.attribute||"")+h;var S=(0,le.getProp)(s,"raws","attribute")||null;S&&(s.raws.attribute+=h),f="attribute"}else if(!s.value&&s.value!==""||f==="value"&&!(d||s.quoteMark)){var E=(0,le.unesc)(h),O=(0,le.getProp)(s,"raws","value")||"",B=s.value||"";s.value=B+E,s.quoteMark=null,(E!==h||O)&&((0,le.ensureObject)(s,"raws"),s.raws.value=(O||B)+h),f="value"}else{var N=h==="i"||h==="I";(s.value||s.value==="")&&(s.quoteMark||d)?(s.insensitive=N,(!N||h==="I")&&((0,le.ensureObject)(s,"raws"),s.raws.insensitiveFlag=h),f="insensitive",l&&((0,le.ensureObject)(s,"spaces","insensitive"),s.spaces.insensitive.before=l,l=""),c&&((0,le.ensureObject)(s,"raws","spaces","insensitive"),s.raws.spaces.insensitive.before=c,c="")):(s.value||s.value==="")&&(f="value",s.value+=h,s.raws.value&&(s.raws.value+=h))}d=!1;break;case $.str:if(!s.attribute||!s.operator)return this.error("Expected an attribute followed by an operator preceding the string.",{index:p[P.FIELDS.START_POS]});var T=(0,hd.unescapeValue)(h),F=T.unescaped,Y=T.quoteMark;s.value=F,s.quoteMark=Y,f="value",(0,le.ensureObject)(s,"raws"),s.raws.value=h,d=!1;break;case $.equals:if(!s.attribute)return this.expected("attribute",p[P.FIELDS.START_POS],h);if(s.value)return this.error('Unexpected "=" found; an operator was already defined.',{index:p[P.FIELDS.START_POS]});s.operator=s.operator?s.operator+h:h,f="operator",d=!1;break;case $.comment:if(f)if(d||b&&b[P.FIELDS.TYPE]===$.space||f==="insensitive"){var _=(0,le.getProp)(s,"spaces",f,"after")||"",Q=(0,le.getProp)(s,"raws","spaces",f,"after")||_;(0,le.ensureObject)(s,"raws","spaces",f),s.raws.spaces[f].after=Q+h}else{var U=s[f]||"",oe=(0,le.getProp)(s,"raws",f)||U;(0,le.ensureObject)(s,"raws"),s.raws[f]=oe+h}else c=c+h;break;default:return this.error('Unexpected "'+h+'" found.',{index:p[P.FIELDS.START_POS]})}o++}ar(s,"attribute"),ar(s,"namespace"),this.newNode(new hd.default(s)),this.position++},e.parseWhitespaceEquivalentTokens=function(i){i<0&&(i=this.tokens.length);var n=this.position,a=[],s="",o=void 0;do if(po[this.currToken[P.FIELDS.TYPE]])this.options.lossy||(s+=this.content());else if(this.currToken[P.FIELDS.TYPE]===$.comment){var l={};s&&(l.before=s,s=""),o=new dd.default({value:this.content(),source:sr(this.currToken),sourceIndex:this.currToken[P.FIELDS.START_POS],spaces:l}),a.push(o)}while(++this.position<i);if(s){if(o)o.spaces.after=s;else if(!this.options.lossy){var c=this.tokens[n],f=this.tokens[this.position-1];a.push(new uo.default({value:"",source:Ht(c[P.FIELDS.START_LINE],c[P.FIELDS.START_COL],f[P.FIELDS.END_LINE],f[P.FIELDS.END_COL]),sourceIndex:c[P.FIELDS.START_POS],spaces:{before:s,after:""}}))}}return a},e.convertWhitespaceNodesToSpace=function(i,n){var a=this;n===void 0&&(n=!1);var s="",o="";i.forEach(function(c){var f=a.lossySpace(c.spaces.before,n),d=a.lossySpace(c.rawSpaceBefore,n);s+=f+a.lossySpace(c.spaces.after,n&&f.length===0),o+=f+c.value+a.lossySpace(c.rawSpaceAfter,n&&d.length===0)}),o===s&&(o=void 0);var l={space:s,rawSpace:o};return l},e.isNamedCombinator=function(i){return i===void 0&&(i=this.position),this.tokens[i+0]&&this.tokens[i+0][P.FIELDS.TYPE]===$.slash&&this.tokens[i+1]&&this.tokens[i+1][P.FIELDS.TYPE]===$.word&&this.tokens[i+2]&&this.tokens[i+2][P.FIELDS.TYPE]===$.slash},e.namedCombinator=function(){if(this.isNamedCombinator()){var i=this.content(this.tokens[this.position+1]),n=(0,le.unesc)(i).toLowerCase(),a={};n!==i&&(a.value="/"+i+"/");var s=new fo.default({value:"/"+n+"/",source:Ht(this.currToken[P.FIELDS.START_LINE],this.currToken[P.FIELDS.START_COL],this.tokens[this.position+2][P.FIELDS.END_LINE],this.tokens[this.position+2][P.FIELDS.END_COL]),sourceIndex:this.currToken[P.FIELDS.START_POS],raws:a});return this.position=this.position+3,s}else this.unexpected()},e.combinator=function(){var i=this;if(this.content()==="|")return this.namespace();var n=this.locateNextMeaningfulToken(this.position);if(n<0||this.tokens[n][P.FIELDS.TYPE]===$.comma){var a=this.parseWhitespaceEquivalentTokens(n);if(a.length>0){var s=this.current.last;if(s){var o=this.convertWhitespaceNodesToSpace(a),l=o.space,c=o.rawSpace;c!==void 0&&(s.rawSpaceAfter+=c),s.spaces.after+=l}else a.forEach(function(O){return i.newNode(O)})}return}var f=this.currToken,d=void 0;n>this.position&&(d=this.parseWhitespaceEquivalentTokens(n));var p;if(this.isNamedCombinator()?p=this.namedCombinator():this.currToken[P.FIELDS.TYPE]===$.combinator?(p=new fo.default({value:this.content(),source:sr(this.currToken),sourceIndex:this.currToken[P.FIELDS.START_POS]}),this.position++):po[this.currToken[P.FIELDS.TYPE]]||d||this.unexpected(),p){if(d){var h=this.convertWhitespaceNodesToSpace(d),b=h.space,v=h.rawSpace;p.spaces.before=b,p.rawSpaceBefore=v}}else{var y=this.convertWhitespaceNodesToSpace(d,!0),w=y.space,k=y.rawSpace;k||(k=w);var S={},E={spaces:{}};w.endsWith(" ")&&k.endsWith(" ")?(S.before=w.slice(0,w.length-1),E.spaces.before=k.slice(0,k.length-1)):w.startsWith(" ")&&k.startsWith(" ")?(S.after=w.slice(1),E.spaces.after=k.slice(1)):E.value=k,p=new fo.default({value:" ",source:ho(f,this.tokens[this.position-1]),sourceIndex:f[P.FIELDS.START_POS],spaces:S,raws:E})}return this.currToken&&this.currToken[P.FIELDS.TYPE]===$.space&&(p.spaces.after=this.optionalSpace(this.content()),this.position++),this.newNode(p)},e.comma=function(){if(this.position===this.tokens.length-1){this.root.trailingComma=!0,this.position++;return}this.current._inferEndPosition();var i=new lo.default({source:{start:yd(this.tokens[this.position+1])}});this.current.parent.append(i),this.current=i,this.position++},e.comment=function(){var i=this.currToken;this.newNode(new dd.default({value:this.content(),source:sr(i),sourceIndex:i[P.FIELDS.START_POS]})),this.position++},e.error=function(i,n){throw this.root.error(i,n)},e.missingBackslash=function(){return this.error("Expected a backslash preceding the semicolon.",{index:this.currToken[P.FIELDS.START_POS]})},e.missingParenthesis=function(){return this.expected("opening parenthesis",this.currToken[P.FIELDS.START_POS])},e.missingSquareBracket=function(){return this.expected("opening square bracket",this.currToken[P.FIELDS.START_POS])},e.unexpected=function(){return this.error("Unexpected '"+this.content()+"'. Escaping special characters with \\ may help.",this.currToken[P.FIELDS.START_POS])},e.unexpectedPipe=function(){return this.error("Unexpected '|'.",this.currToken[P.FIELDS.START_POS])},e.namespace=function(){var i=this.prevToken&&this.content(this.prevToken)||!0;if(this.nextToken[P.FIELDS.TYPE]===$.word)return this.position++,this.word(i);if(this.nextToken[P.FIELDS.TYPE]===$.asterisk)return this.position++,this.universal(i);this.unexpectedPipe()},e.nesting=function(){if(this.nextToken){var i=this.content(this.nextToken);if(i==="|"){this.position++;return}}var n=this.currToken;this.newNode(new LA.default({value:this.content(),source:sr(n),sourceIndex:n[P.FIELDS.START_POS]})),this.position++},e.parentheses=function(){var i=this.current.last,n=1;if(this.position++,i&&i.type===NA.PSEUDO){var a=new lo.default({source:{start:yd(this.tokens[this.position-1])}}),s=this.current;for(i.append(a),this.current=a;this.position<this.tokens.length&&n;)this.currToken[P.FIELDS.TYPE]===$.openParenthesis&&n++,this.currToken[P.FIELDS.TYPE]===$.closeParenthesis&&n--,n?this.parse():(this.current.source.end=bd(this.currToken),this.current.parent.source.end=bd(this.currToken),this.position++);this.current=s}else{for(var o=this.currToken,l="(",c;this.position<this.tokens.length&&n;)this.currToken[P.FIELDS.TYPE]===$.openParenthesis&&n++,this.currToken[P.FIELDS.TYPE]===$.closeParenthesis&&n--,c=this.currToken,l+=this.parseParenthesisToken(this.currToken),this.position++;i?i.appendToPropertyAndEscape("value",l,l):this.newNode(new uo.default({value:l,source:Ht(o[P.FIELDS.START_LINE],o[P.FIELDS.START_COL],c[P.FIELDS.END_LINE],c[P.FIELDS.END_COL]),sourceIndex:o[P.FIELDS.START_POS]}))}if(n)return this.expected("closing parenthesis",this.currToken[P.FIELDS.START_POS])},e.pseudo=function(){for(var i=this,n="",a=this.currToken;this.currToken&&this.currToken[P.FIELDS.TYPE]===$.colon;)n+=this.content(),this.position++;if(!this.currToken)return this.expected(["pseudo-class","pseudo-element"],this.position-1);if(this.currToken[P.FIELDS.TYPE]===$.word)this.splitWord(!1,function(s,o){n+=s,i.newNode(new qA.default({value:n,source:ho(a,i.currToken),sourceIndex:a[P.FIELDS.START_POS]})),o>1&&i.nextToken&&i.nextToken[P.FIELDS.TYPE]===$.openParenthesis&&i.error("Misplaced parenthesis.",{index:i.nextToken[P.FIELDS.START_POS]})});else return this.expected(["pseudo-class","pseudo-element"],this.currToken[P.FIELDS.START_POS])},e.space=function(){var i=this.content();this.position===0||this.prevToken[P.FIELDS.TYPE]===$.comma||this.prevToken[P.FIELDS.TYPE]===$.openParenthesis||this.current.nodes.every(function(n){return n.type==="comment"})?(this.spaces=this.optionalSpace(i),this.position++):this.position===this.tokens.length-1||this.nextToken[P.FIELDS.TYPE]===$.comma||this.nextToken[P.FIELDS.TYPE]===$.closeParenthesis?(this.current.last.spaces.after=this.optionalSpace(i),this.position++):this.combinator()},e.string=function(){var i=this.currToken;this.newNode(new uo.default({value:this.content(),source:sr(i),sourceIndex:i[P.FIELDS.START_POS]})),this.position++},e.universal=function(i){var n=this.nextToken;if(n&&this.content(n)==="|")return this.position++,this.namespace();var a=this.currToken;this.newNode(new $A.default({value:this.content(),source:sr(a),sourceIndex:a[P.FIELDS.START_POS]}),i),this.position++},e.splitWord=function(i,n){for(var a=this,s=this.nextToken,o=this.content();s&&~[$.dollar,$.caret,$.equals,$.word].indexOf(s[P.FIELDS.TYPE]);){this.position++;var l=this.content();if(o+=l,l.lastIndexOf("\\")===l.length-1){var c=this.nextToken;c&&c[P.FIELDS.TYPE]===$.space&&(o+=this.requiredSpace(this.content(c)),this.position++)}s=this.nextToken}var f=mo(o,".").filter(function(b){var v=o[b-1]==="\\",y=/^\d+\.\d+%$/.test(o);return!v&&!y}),d=mo(o,"#").filter(function(b){return o[b-1]!=="\\"}),p=mo(o,"#{");p.length&&(d=d.filter(function(b){return!~p.indexOf(b)}));var h=(0,MA.default)(jA([0].concat(f,d)));h.forEach(function(b,v){var y=h[v+1]||o.length,w=o.slice(b,y);if(v===0&&n)return n.call(a,w,h.length);var k,S=a.currToken,E=S[P.FIELDS.START_POS]+h[v],O=Ht(S[1],S[2]+b,S[3],S[2]+(y-1));if(~f.indexOf(b)){var B={value:w.slice(1),source:O,sourceIndex:E};k=new PA.default(ar(B,"value"))}else if(~d.indexOf(b)){var N={value:w.slice(1),source:O,sourceIndex:E};k=new IA.default(ar(N,"value"))}else{var T={value:w,source:O,sourceIndex:E};ar(T,"value"),k=new DA.default(T)}a.newNode(k,i),i=null}),this.position++},e.word=function(i){var n=this.nextToken;return n&&this.content(n)==="|"?(this.position++,this.namespace()):this.splitWord(i)},e.loop=function(){for(;this.position<this.tokens.length;)this.parse(!0);return this.current._inferEndPosition(),this.root},e.parse=function(i){switch(this.currToken[P.FIELDS.TYPE]){case $.space:this.space();break;case $.comment:this.comment();break;case $.openParenthesis:this.parentheses();break;case $.closeParenthesis:i&&this.missingParenthesis();break;case $.openSquare:this.attribute();break;case $.dollar:case $.caret:case $.equals:case $.word:this.word();break;case $.colon:this.pseudo();break;case $.comma:this.comma();break;case $.asterisk:this.universal();break;case $.ampersand:this.nesting();break;case $.slash:case $.combinator:this.combinator();break;case $.str:this.string();break;case $.closeSquare:this.missingSquareBracket();case $.semicolon:this.missingBackslash();default:this.unexpected()}},e.expected=function(i,n,a){if(Array.isArray(i)){var s=i.pop();i=i.join(", ")+" or "+s}var o=/^[aeiou]/.test(i[0])?"an":"a";return a?this.error("Expected "+o+" "+i+', found "'+a+'" instead.',{index:n}):this.error("Expected "+o+" "+i+".",{index:n})},e.requiredSpace=function(i){return this.options.lossy?" ":i},e.optionalSpace=function(i){return this.options.lossy?"":i},e.lossySpace=function(i,n){return this.options.lossy?n?" ":"":i},e.parseParenthesisToken=function(i){var n=this.content(i);return i[P.FIELDS.TYPE]===$.space?this.requiredSpace(n):n},e.newNode=function(i,n){return n&&(/^ +$/.test(n)&&(this.options.lossy||(this.spaces=(this.spaces||"")+n),n=!0),i.namespace=n,ar(i,"namespace")),this.spaces&&(i.spaces.before=this.spaces,this.spaces=""),this.current.append(i)},e.content=function(i){return i===void 0&&(i=this.currToken),this.css.slice(i[P.FIELDS.START_POS],i[P.FIELDS.END_POS])},e.locateNextMeaningfulToken=function(i){i===void 0&&(i=this.position+1);for(var n=i;n<this.tokens.length;)if(FA[this.tokens[n][P.FIELDS.TYPE]]){n++;continue}else return n;return-1},BA(r,[{key:"currToken",get:function(){return this.tokens[this.position]}},{key:"nextToken",get:function(){return this.tokens[this.position+1]}},{key:"prevToken",get:function(){return this.tokens[this.position-1]}}]),r}();vi.default=zA;wd.exports=vi.default});var kd=x((xi,xd)=>{u();"use strict";xi.__esModule=!0;xi.default=void 0;var UA=VA(vd());function VA(r){return r&&r.__esModule?r:{default:r}}var HA=function(){function r(t,i){this.func=t||function(){},this.funcRes=null,this.options=i}var e=r.prototype;return e._shouldUpdateSelector=function(i,n){n===void 0&&(n={});var a=Object.assign({},this.options,n);return a.updateSelector===!1?!1:typeof i!="string"},e._isLossy=function(i){i===void 0&&(i={});var n=Object.assign({},this.options,i);return n.lossless===!1},e._root=function(i,n){n===void 0&&(n={});var a=new UA.default(i,this._parseOptions(n));return a.root},e._parseOptions=function(i){return{lossy:this._isLossy(i)}},e._run=function(i,n){var a=this;return n===void 0&&(n={}),new Promise(function(s,o){try{var l=a._root(i,n);Promise.resolve(a.func(l)).then(function(c){var f=void 0;return a._shouldUpdateSelector(i,n)&&(f=l.toString(),i.selector=f),{transform:c,root:l,string:f}}).then(s,o)}catch(c){o(c);return}})},e._runSync=function(i,n){n===void 0&&(n={});var a=this._root(i,n),s=this.func(a);if(s&&typeof s.then=="function")throw new Error("Selector processor returned a promise to a synchronous call.");var o=void 0;return n.updateSelector&&typeof i!="string"&&(o=a.toString(),i.selector=o),{transform:s,root:a,string:o}},e.ast=function(i,n){return this._run(i,n).then(function(a){return a.root})},e.astSync=function(i,n){return this._runSync(i,n).root},e.transform=function(i,n){return this._run(i,n).then(function(a){return a.transform})},e.transformSync=function(i,n){return this._runSync(i,n).transform},e.process=function(i,n){return this._run(i,n).then(function(a){return a.string||a.root.toString()})},e.processSync=function(i,n){var a=this._runSync(i,n);return a.string||a.root.toString()},r}();xi.default=HA;xd.exports=xi.default});var Sd=x(ne=>{u();"use strict";ne.__esModule=!0;ne.universal=ne.tag=ne.string=ne.selector=ne.root=ne.pseudo=ne.nesting=ne.id=ne.comment=ne.combinator=ne.className=ne.attribute=void 0;var WA=ze(Za()),GA=ze(La()),QA=ze(io()),YA=ze(Na()),KA=ze(Fa()),XA=ze(so()),JA=ze(Ga()),ZA=ze(Pa()),eC=ze(Da()),tC=ze(Ha()),rC=ze(Ua()),iC=ze(to());function ze(r){return r&&r.__esModule?r:{default:r}}var nC=function(e){return new WA.default(e)};ne.attribute=nC;var sC=function(e){return new GA.default(e)};ne.className=sC;var aC=function(e){return new QA.default(e)};ne.combinator=aC;var oC=function(e){return new YA.default(e)};ne.comment=oC;var lC=function(e){return new KA.default(e)};ne.id=lC;var uC=function(e){return new XA.default(e)};ne.nesting=uC;var fC=function(e){return new JA.default(e)};ne.pseudo=fC;var cC=function(e){return new ZA.default(e)};ne.root=cC;var pC=function(e){return new eC.default(e)};ne.selector=pC;var dC=function(e){return new tC.default(e)};ne.string=dC;var hC=function(e){return new rC.default(e)};ne.tag=hC;var mC=function(e){return new iC.default(e)};ne.universal=mC});var Ed=x(J=>{u();"use strict";J.__esModule=!0;J.isComment=J.isCombinator=J.isClassName=J.isAttribute=void 0;J.isContainer=EC;J.isIdentifier=void 0;J.isNamespace=OC;J.isNesting=void 0;J.isNode=go;J.isPseudo=void 0;J.isPseudoClass=_C;J.isPseudoElement=_d;J.isUniversal=J.isTag=J.isString=J.isSelector=J.isRoot=void 0;var ue=Se(),Oe,gC=(Oe={},Oe[ue.ATTRIBUTE]=!0,Oe[ue.CLASS]=!0,Oe[ue.COMBINATOR]=!0,Oe[ue.COMMENT]=!0,Oe[ue.ID]=!0,Oe[ue.NESTING]=!0,Oe[ue.PSEUDO]=!0,Oe[ue.ROOT]=!0,Oe[ue.SELECTOR]=!0,Oe[ue.STRING]=!0,Oe[ue.TAG]=!0,Oe[ue.UNIVERSAL]=!0,Oe);function go(r){return typeof r=="object"&&gC[r.type]}function Ue(r,e){return go(e)&&e.type===r}var Ad=Ue.bind(null,ue.ATTRIBUTE);J.isAttribute=Ad;var yC=Ue.bind(null,ue.CLASS);J.isClassName=yC;var bC=Ue.bind(null,ue.COMBINATOR);J.isCombinator=bC;var wC=Ue.bind(null,ue.COMMENT);J.isComment=wC;var vC=Ue.bind(null,ue.ID);J.isIdentifier=vC;var xC=Ue.bind(null,ue.NESTING);J.isNesting=xC;var yo=Ue.bind(null,ue.PSEUDO);J.isPseudo=yo;var kC=Ue.bind(null,ue.ROOT);J.isRoot=kC;var SC=Ue.bind(null,ue.SELECTOR);J.isSelector=SC;var AC=Ue.bind(null,ue.STRING);J.isString=AC;var Cd=Ue.bind(null,ue.TAG);J.isTag=Cd;var CC=Ue.bind(null,ue.UNIVERSAL);J.isUniversal=CC;function _d(r){return yo(r)&&r.value&&(r.value.startsWith("::")||r.value.toLowerCase()===":before"||r.value.toLowerCase()===":after"||r.value.toLowerCase()===":first-letter"||r.value.toLowerCase()===":first-line")}function _C(r){return yo(r)&&!_d(r)}function EC(r){return!!(go(r)&&r.walk)}function OC(r){return Ad(r)||Cd(r)}});var Od=x(Ke=>{u();"use strict";Ke.__esModule=!0;var bo=Se();Object.keys(bo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===bo[r]||(Ke[r]=bo[r])});var wo=Sd();Object.keys(wo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===wo[r]||(Ke[r]=wo[r])});var vo=Ed();Object.keys(vo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===vo[r]||(Ke[r]=vo[r])})});var it=x((ki,Rd)=>{u();"use strict";ki.__esModule=!0;ki.default=void 0;var TC=IC(kd()),RC=PC(Od());function Td(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(Td=function(n){return n?t:e})(r)}function PC(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=Td(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function IC(r){return r&&r.__esModule?r:{default:r}}var xo=function(e){return new TC.default(e)};Object.assign(xo,RC);delete xo.__esModule;var DC=xo;ki.default=DC;Rd.exports=ki.default});function mt(r){return["fontSize","outline"].includes(r)?e=>(typeof e=="function"&&(e=e({})),Array.isArray(e)&&(e=e[0]),e):r==="fontFamily"?e=>{typeof e=="function"&&(e=e({}));let t=Array.isArray(e)&&ke(e[1])?e[0]:e;return Array.isArray(t)?t.join(", "):t}:["boxShadow","transitionProperty","transitionDuration","transitionDelay","transitionTimingFunction","backgroundImage","backgroundSize","backgroundColor","cursor","animation"].includes(r)?e=>(typeof e=="function"&&(e=e({})),Array.isArray(e)&&(e=e.join(", ")),e):["gridTemplateColumns","gridTemplateRows","objectPosition"].includes(r)?e=>(typeof e=="function"&&(e=e({})),typeof e=="string"&&(e=ee.list.comma(e).join(" ")),e):(e,t={})=>(typeof e=="function"&&(e=e(t)),e)}var Si=R(()=>{u();Ot();Kt()});var Md=x(($I,_o)=>{u();var{Rule:Pd,AtRule:qC}=$e(),Id=it();function ko(r,e){let t;try{Id(i=>{t=i}).processSync(r)}catch(i){throw r.includes(":")?e?e.error("Missed semicolon"):i:e?e.error(i.message):i}return t.at(0)}function Dd(r,e){let t=!1;return r.each(i=>{if(i.type==="nesting"){let n=e.clone({});i.value!=="&"?i.replaceWith(ko(i.value.replace("&",n.toString()))):i.replaceWith(n),t=!0}else"nodes"in i&&i.nodes&&Dd(i,e)&&(t=!0)}),t}function qd(r,e){let t=[];return r.selectors.forEach(i=>{let n=ko(i,r);e.selectors.forEach(a=>{if(!a)return;let s=ko(a,e);Dd(s,n)||(s.prepend(Id.combinator({value:" "})),s.prepend(n.clone({}))),t.push(s.toString())})}),t}function Vn(r,e){let t=r.prev();for(e.after(r);t&&t.type==="comment";){let i=t.prev();e.after(t),t=i}return r}function $C(r){return function e(t,i,n,a=n){let s=[];if(i.each(o=>{o.type==="rule"&&n?a&&(o.selectors=qd(t,o)):o.type==="atrule"&&o.nodes?r[o.name]?e(t,o,a):i[Ao]!==!1&&s.push(o):s.push(o)}),n&&s.length){let o=t.clone({nodes:[]});for(let l of s)o.append(l);i.prepend(o)}}}function So(r,e,t){let i=new Pd({selector:r,nodes:[]});return i.append(e),t.after(i),i}function $d(r,e){let t={};for(let i of r)t[i]=!0;if(e)for(let i of e)t[i.replace(/^@/,"")]=!0;return t}function LC(r){r=r.trim();let e=r.match(/^\((.*)\)$/);if(!e)return{type:"basic",selector:r};let t=e[1].match(/^(with(?:out)?):(.+)$/);if(t){let i=t[1]==="with",n=Object.fromEntries(t[2].trim().split(/\s+/).map(s=>[s,!0]));if(i&&n.all)return{type:"noop"};let a=s=>!!n[s];return n.all?a=()=>!0:i&&(a=s=>s==="all"?!1:!n[s]),{type:"withrules",escapes:a}}return{type:"unknown"}}function MC(r){let e=[],t=r.parent;for(;t&&t instanceof qC;)e.push(t),t=t.parent;return e}function NC(r){let e=r[Ld];if(!e)r.after(r.nodes);else{let t=r.nodes,i,n=-1,a,s,o,l=MC(r);if(l.forEach((c,f)=>{if(e(c.name))i=c,n=f,s=o;else{let d=o;o=c.clone({nodes:[]}),d&&o.append(d),a=a||o}}),i?s?(a.append(t),i.after(s)):i.after(t):r.after(t),r.next()&&i){let c;l.slice(0,n+1).forEach((f,d,p)=>{let h=c;c=f.clone({nodes:[]}),h&&c.append(h);let b=[],y=(p[d-1]||r).next();for(;y;)b.push(y),y=y.next();c.append(b)}),c&&(s||t[t.length-1]).after(c)}}r.remove()}var Ao=Symbol("rootRuleMergeSel"),Ld=Symbol("rootRuleEscapes");function BC(r){let{params:e}=r,{type:t,selector:i,escapes:n}=LC(e);if(t==="unknown")throw r.error(`Unknown @${r.name} parameter ${JSON.stringify(e)}`);if(t==="basic"&&i){let a=new Pd({selector:i,nodes:r.nodes});r.removeAll(),r.append(a)}r[Ld]=n,r[Ao]=n?!n("all"):t==="noop"}var Co=Symbol("hasRootRule");_o.exports=(r={})=>{let e=$d(["media","supports","layer","container"],r.bubble),t=$C(e),i=$d(["document","font-face","keyframes","-webkit-keyframes","-moz-keyframes"],r.unwrap),n=(r.rootRuleName||"at-root").replace(/^@/,""),a=r.preserveEmpty;return{postcssPlugin:"postcss-nested",Once(s){s.walkAtRules(n,o=>{BC(o),s[Co]=!0})},Rule(s){let o=!1,l=s,c=!1,f=[];s.each(d=>{d.type==="rule"?(f.length&&(l=So(s.selector,f,l),f=[]),c=!0,o=!0,d.selectors=qd(s,d),l=Vn(d,l)):d.type==="atrule"?(f.length&&(l=So(s.selector,f,l),f=[]),d.name===n?(o=!0,t(s,d,!0,d[Ao]),l=Vn(d,l)):e[d.name]?(c=!0,o=!0,t(s,d,!0),l=Vn(d,l)):i[d.name]?(c=!0,o=!0,t(s,d,!1),l=Vn(d,l)):c&&f.push(d)):d.type==="decl"&&c&&f.push(d)}),f.length&&(l=So(s.selector,f,l)),o&&a!==!0&&(s.raws.semicolon=!0,s.nodes.length===0&&s.remove())},RootExit(s){s[Co]&&(s.walkAtRules(n,NC),s[Co]=!1)}}};_o.exports.postcss=!0});var jd=x((LI,Fd)=>{u();"use strict";var Nd=/-(\w|$)/g,Bd=(r,e)=>e.toUpperCase(),FC=r=>(r=r.toLowerCase(),r==="float"?"cssFloat":r.startsWith("-ms-")?r.substr(1).replace(Nd,Bd):r.replace(Nd,Bd));Fd.exports=FC});var To=x((MI,zd)=>{u();var jC=jd(),zC={boxFlex:!0,boxFlexGroup:!0,columnCount:!0,flex:!0,flexGrow:!0,flexPositive:!0,flexShrink:!0,flexNegative:!0,fontWeight:!0,lineClamp:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,tabSize:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,strokeDashoffset:!0,strokeOpacity:!0,strokeWidth:!0};function Eo(r){return typeof r.nodes=="undefined"?!0:Oo(r)}function Oo(r){let e,t={};return r.each(i=>{if(i.type==="atrule")e="@"+i.name,i.params&&(e+=" "+i.params),typeof t[e]=="undefined"?t[e]=Eo(i):Array.isArray(t[e])?t[e].push(Eo(i)):t[e]=[t[e],Eo(i)];else if(i.type==="rule"){let n=Oo(i);if(t[i.selector])for(let a in n)t[i.selector][a]=n[a];else t[i.selector]=n}else if(i.type==="decl"){i.prop[0]==="-"&&i.prop[1]==="-"||i.parent&&i.parent.selector===":export"?e=i.prop:e=jC(i.prop);let n=i.value;!isNaN(i.value)&&zC[e]&&(n=parseFloat(i.value)),i.important&&(n+=" !important"),typeof t[e]=="undefined"?t[e]=n:Array.isArray(t[e])?t[e].push(n):t[e]=[t[e],n]}}),t}zd.exports=Oo});var Hn=x((NI,Wd)=>{u();var Ai=$e(),Ud=/\s*!important\s*$/i,UC={"box-flex":!0,"box-flex-group":!0,"column-count":!0,flex:!0,"flex-grow":!0,"flex-positive":!0,"flex-shrink":!0,"flex-negative":!0,"font-weight":!0,"line-clamp":!0,"line-height":!0,opacity:!0,order:!0,orphans:!0,"tab-size":!0,widows:!0,"z-index":!0,zoom:!0,"fill-opacity":!0,"stroke-dashoffset":!0,"stroke-opacity":!0,"stroke-width":!0};function VC(r){return r.replace(/([A-Z])/g,"-$1").replace(/^ms-/,"-ms-").toLowerCase()}function Vd(r,e,t){t===!1||t===null||(e.startsWith("--")||(e=VC(e)),typeof t=="number"&&(t===0||UC[e]?t=t.toString():t+="px"),e==="css-float"&&(e="float"),Ud.test(t)?(t=t.replace(Ud,""),r.push(Ai.decl({prop:e,value:t,important:!0}))):r.push(Ai.decl({prop:e,value:t})))}function Hd(r,e,t){let i=Ai.atRule({name:e[1],params:e[3]||""});typeof t=="object"&&(i.nodes=[],Ro(t,i)),r.push(i)}function Ro(r,e){let t,i,n;for(t in r)if(i=r[t],!(i===null||typeof i=="undefined"))if(t[0]==="@"){let a=t.match(/@(\S+)(\s+([\W\w]*)\s*)?/);if(Array.isArray(i))for(let s of i)Hd(e,a,s);else Hd(e,a,i)}else if(Array.isArray(i))for(let a of i)Vd(e,t,a);else typeof i=="object"?(n=Ai.rule({selector:t}),Ro(i,n),e.push(n)):Vd(e,t,i)}Wd.exports=function(r){let e=Ai.root();return Ro(r,e),e}});var Po=x((BI,Gd)=>{u();var HC=To();Gd.exports=function(e){return console&&console.warn&&e.warnings().forEach(t=>{let i=t.plugin||"PostCSS";console.warn(i+": "+t.text)}),HC(e.root)}});var Yd=x((FI,Qd)=>{u();var WC=$e(),GC=Po(),QC=Hn();Qd.exports=function(e){let t=WC(e);return async i=>{let n=await t.process(i,{parser:QC,from:void 0});return GC(n)}}});var Xd=x((jI,Kd)=>{u();var YC=$e(),KC=Po(),XC=Hn();Kd.exports=function(r){let e=YC(r);return t=>{let i=e.process(t,{parser:XC,from:void 0});return KC(i)}}});var Zd=x((zI,Jd)=>{u();var JC=To(),ZC=Hn(),e_=Yd(),t_=Xd();Jd.exports={objectify:JC,parse:ZC,async:e_,sync:t_}});var or,eh,UI,VI,HI,WI,th=R(()=>{u();or=pe(Zd()),eh=or.default,UI=or.default.objectify,VI=or.default.parse,HI=or.default.async,WI=or.default.sync});function lr(r){return Array.isArray(r)?r.flatMap(e=>ee([(0,rh.default)({bubble:["screen"]})]).process(e,{parser:eh}).root.nodes):lr([r])}var rh,Io=R(()=>{u();Ot();rh=pe(Md());th()});function ur(r,e,t=!1){if(r==="")return e;let i=typeof e=="string"?(0,ih.default)().astSync(e):e;return i.walkClasses(n=>{let a=n.value,s=t&&a.startsWith("-");n.value=s?`-${r}${a.slice(1)}`:`${r}${a}`}),typeof e=="string"?i.toString():i}var ih,Wn=R(()=>{u();ih=pe(it())});function Te(r){let e=nh.default.className();return e.value=r,jt(e?.raws?.value??e.value)}var nh,fr=R(()=>{u();nh=pe(it());Ki()});function Do(r){return jt(`.${Te(r)}`)}function Gn(r,e){return Do(Ci(r,e))}function Ci(r,e){return e==="DEFAULT"?r:e==="-"||e==="-DEFAULT"?`-${r}`:e.startsWith("-")?`-${r}${e}`:e.startsWith("/")?`${r}${e}`:`${r}-${e}`}var qo=R(()=>{u();fr();Ki()});function L(r,e=[[r,[r]]],{filterDefault:t=!1,...i}={}){let n=mt(r);return function({matchUtilities:a,theme:s}){for(let o of e){let l=Array.isArray(o[0])?o:[o];a(l.reduce((c,[f,d])=>Object.assign(c,{[f]:p=>d.reduce((h,b)=>Array.isArray(b)?Object.assign(h,{[b[0]]:b[1]}):Object.assign(h,{[b]:n(p)}),{})}),{}),{...i,values:t?Object.fromEntries(Object.entries(s(r)??{}).filter(([c])=>c!=="DEFAULT")):s(r)})}}}var sh=R(()=>{u();Si()});function Tt(r){return r=Array.isArray(r)?r:[r],r.map(e=>{let t=e.values.map(i=>i.raw!==void 0?i.raw:[i.min&&`(min-width: ${i.min})`,i.max&&`(max-width: ${i.max})`].filter(Boolean).join(" and "));return e.not?`not all and ${t}`:t}).join(", ")}var Qn=R(()=>{u()});function $o(r){return r.split(l_).map(t=>{let i=t.trim(),n={value:i},a=i.split(u_),s=new Set;for(let o of a)!s.has("DIRECTIONS")&&r_.has(o)?(n.direction=o,s.add("DIRECTIONS")):!s.has("PLAY_STATES")&&i_.has(o)?(n.playState=o,s.add("PLAY_STATES")):!s.has("FILL_MODES")&&n_.has(o)?(n.fillMode=o,s.add("FILL_MODES")):!s.has("ITERATION_COUNTS")&&(s_.has(o)||f_.test(o))?(n.iterationCount=o,s.add("ITERATION_COUNTS")):!s.has("TIMING_FUNCTION")&&a_.has(o)||!s.has("TIMING_FUNCTION")&&o_.some(l=>o.startsWith(`${l}(`))?(n.timingFunction=o,s.add("TIMING_FUNCTION")):!s.has("DURATION")&&ah.test(o)?(n.duration=o,s.add("DURATION")):!s.has("DELAY")&&ah.test(o)?(n.delay=o,s.add("DELAY")):s.has("NAME")?(n.unknown||(n.unknown=[]),n.unknown.push(o)):(n.name=o,s.add("NAME"));return n})}var r_,i_,n_,s_,a_,o_,l_,u_,ah,f_,oh=R(()=>{u();r_=new Set(["normal","reverse","alternate","alternate-reverse"]),i_=new Set(["running","paused"]),n_=new Set(["none","forwards","backwards","both"]),s_=new Set(["infinite"]),a_=new Set(["linear","ease","ease-in","ease-out","ease-in-out","step-start","step-end"]),o_=["cubic-bezier","steps"],l_=/\,(?![^(]*\))/g,u_=/\ +(?![^(]*\))/g,ah=/^(-?[\d.]+m?s)$/,f_=/^(\d+)$/});var lh,xe,uh=R(()=>{u();lh=r=>Object.assign({},...Object.entries(r??{}).flatMap(([e,t])=>typeof t=="object"?Object.entries(lh(t)).map(([i,n])=>({[e+(i==="DEFAULT"?"":`-${i}`)]:n})):[{[`${e}`]:t}])),xe=lh});var ch,fh=R(()=>{ch="3.4.14"});function Rt(r,e=!0){return Array.isArray(r)?r.map(t=>{if(e&&Array.isArray(t))throw new Error("The tuple syntax is not supported for `screens`.");if(typeof t=="string")return{name:t.toString(),not:!1,values:[{min:t,max:void 0}]};let[i,n]=t;return i=i.toString(),typeof n=="string"?{name:i,not:!1,values:[{min:n,max:void 0}]}:Array.isArray(n)?{name:i,not:!1,values:n.map(a=>dh(a))}:{name:i,not:!1,values:[dh(n)]}}):Rt(Object.entries(r??{}),!1)}function Yn(r){return r.values.length!==1?{result:!1,reason:"multiple-values"}:r.values[0].raw!==void 0?{result:!1,reason:"raw-values"}:r.values[0].min!==void 0&&r.values[0].max!==void 0?{result:!1,reason:"min-and-max"}:{result:!0,reason:null}}function ph(r,e,t){let i=Kn(e,r),n=Kn(t,r),a=Yn(i),s=Yn(n);if(a.reason==="multiple-values"||s.reason==="multiple-values")throw new Error("Attempted to sort a screen with multiple values. This should never happen. Please open a bug report.");if(a.reason==="raw-values"||s.reason==="raw-values")throw new Error("Attempted to sort a screen with raw values. This should never happen. Please open a bug report.");if(a.reason==="min-and-max"||s.reason==="min-and-max")throw new Error("Attempted to sort a screen with both min and max values. This should never happen. Please open a bug report.");let{min:o,max:l}=i.values[0],{min:c,max:f}=n.values[0];e.not&&([o,l]=[l,o]),t.not&&([c,f]=[f,c]),o=o===void 0?o:parseFloat(o),l=l===void 0?l:parseFloat(l),c=c===void 0?c:parseFloat(c),f=f===void 0?f:parseFloat(f);let[d,p]=r==="min"?[o,c]:[f,l];return d-p}function Kn(r,e){return typeof r=="object"?r:{name:"arbitrary-screen",values:[{[e]:r}]}}function dh({"min-width":r,min:e=r,max:t,raw:i}={}){return{min:e,max:t,raw:i}}var Xn=R(()=>{u()});function Jn(r,e){r.walkDecls(t=>{if(e.includes(t.prop)){t.remove();return}for(let i of e)t.value.includes(`/ var(${i})`)&&(t.value=t.value.replace(`/ var(${i})`,""))})}var hh=R(()=>{u()});var se,Xe,nt,ge,mh,gh=R(()=>{u();ft();et();Ot();sh();Qn();fr();oh();uh();Lr();ea();Kt();Si();fh();Be();Xn();Gs();hh();ct();Br();_i();se={childVariant:({addVariant:r})=>{r("*","& > *")},pseudoElementVariants:({addVariant:r})=>{r("first-letter","&::first-letter"),r("first-line","&::first-line"),r("marker",[({container:e})=>(Jn(e,["--tw-text-opacity"]),"& *::marker"),({container:e})=>(Jn(e,["--tw-text-opacity"]),"&::marker")]),r("selection",["& *::selection","&::selection"]),r("file","&::file-selector-button"),r("placeholder","&::placeholder"),r("backdrop","&::backdrop"),r("before",({container:e})=>(e.walkRules(t=>{let i=!1;t.walkDecls("content",()=>{i=!0}),i||t.prepend(ee.decl({prop:"content",value:"var(--tw-content)"}))}),"&::before")),r("after",({container:e})=>(e.walkRules(t=>{let i=!1;t.walkDecls("content",()=>{i=!0}),i||t.prepend(ee.decl({prop:"content",value:"var(--tw-content)"}))}),"&::after"))},pseudoClassVariants:({addVariant:r,matchVariant:e,config:t,prefix:i})=>{let n=[["first","&:first-child"],["last","&:last-child"],["only","&:only-child"],["odd","&:nth-child(odd)"],["even","&:nth-child(even)"],"first-of-type","last-of-type","only-of-type",["visited",({container:s})=>(Jn(s,["--tw-text-opacity","--tw-border-opacity","--tw-bg-opacity"]),"&:visited")],"target",["open","&[open]"],"default","checked","indeterminate","placeholder-shown","autofill","optional","required","valid","invalid","in-range","out-of-range","read-only","empty","focus-within",["hover",we(t(),"hoverOnlyWhenSupported")?"@media (hover: hover) and (pointer: fine) { &:hover }":"&:hover"],"focus","focus-visible","active","enabled","disabled"].map(s=>Array.isArray(s)?s:[s,`&:${s}`]);for(let[s,o]of n)r(s,l=>typeof o=="function"?o(l):o);let a={group:(s,{modifier:o})=>o?[`:merge(${i(".group")}\\/${Te(o)})`," &"]:[`:merge(${i(".group")})`," &"],peer:(s,{modifier:o})=>o?[`:merge(${i(".peer")}\\/${Te(o)})`," ~ &"]:[`:merge(${i(".peer")})`," ~ &"]};for(let[s,o]of Object.entries(a))e(s,(l="",c)=>{let f=K(typeof l=="function"?l(c):l);f.includes("&")||(f="&"+f);let[d,p]=o("",c),h=null,b=null,v=0;for(let y=0;y<f.length;++y){let w=f[y];w==="&"?h=y:w==="'"||w==='"'?v+=1:h!==null&&w===" "&&!v&&(b=y)}return h!==null&&b===null&&(b=f.length),f.slice(0,h)+d+f.slice(h+1,b)+p+f.slice(b)},{values:Object.fromEntries(n),[Pt]:{respectPrefix:!1}})},directionVariants:({addVariant:r})=>{r("ltr",'&:where([dir="ltr"], [dir="ltr"] *)'),r("rtl",'&:where([dir="rtl"], [dir="rtl"] *)')},reducedMotionVariants:({addVariant:r})=>{r("motion-safe","@media (prefers-reduced-motion: no-preference)"),r("motion-reduce","@media (prefers-reduced-motion: reduce)")},darkVariants:({config:r,addVariant:e})=>{let[t,i=".dark"]=[].concat(r("darkMode","media"));if(t===!1&&(t="media",G.warn("darkmode-false",["The `darkMode` option in your Tailwind CSS configuration is set to `false`, which now behaves the same as `media`.","Change `darkMode` to `media` or remove it entirely.","https://tailwindcss.com/docs/upgrade-guide#remove-dark-mode-configuration"])),t==="variant"){let n;if(Array.isArray(i)||typeof i=="function"?n=i:typeof i=="string"&&(n=[i]),Array.isArray(n))for(let a of n)a===".dark"?(t=!1,G.warn("darkmode-variant-without-selector",["When using `variant` for `darkMode`, you must provide a selector.",'Example: `darkMode: ["variant", ".your-selector &"]`'])):a.includes("&")||(t=!1,G.warn("darkmode-variant-without-ampersand",["When using `variant` for `darkMode`, your selector must contain `&`.",'Example `darkMode: ["variant", ".your-selector &"]`']));i=n}t==="selector"?e("dark",`&:where(${i}, ${i} *)`):t==="media"?e("dark","@media (prefers-color-scheme: dark)"):t==="variant"?e("dark",i):t==="class"&&e("dark",`&:is(${i} *)`)},printVariant:({addVariant:r})=>{r("print","@media print")},screenVariants:({theme:r,addVariant:e,matchVariant:t})=>{let i=r("screens")??{},n=Object.values(i).every(w=>typeof w=="string"),a=Rt(r("screens")),s=new Set([]);function o(w){return w.match(/(\D+)$/)?.[1]??"(none)"}function l(w){w!==void 0&&s.add(o(w))}function c(w){return l(w),s.size===1}for(let w of a)for(let k of w.values)l(k.min),l(k.max);let f=s.size<=1;function d(w){return Object.fromEntries(a.filter(k=>Yn(k).result).map(k=>{let{min:S,max:E}=k.values[0];if(w==="min"&&S!==void 0)return k;if(w==="min"&&E!==void 0)return{...k,not:!k.not};if(w==="max"&&E!==void 0)return k;if(w==="max"&&S!==void 0)return{...k,not:!k.not}}).map(k=>[k.name,k]))}function p(w){return(k,S)=>ph(w,k.value,S.value)}let h=p("max"),b=p("min");function v(w){return k=>{if(n)if(f){if(typeof k=="string"&&!c(k))return G.warn("minmax-have-mixed-units",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing mixed units."]),[]}else return G.warn("mixed-screen-units",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing mixed units."]),[];else return G.warn("complex-screen-config",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing objects."]),[];return[`@media ${Tt(Kn(k,w))}`]}}t("max",v("max"),{sort:h,values:n?d("max"):{}});let y="min-screens";for(let w of a)e(w.name,`@media ${Tt(w)}`,{id:y,sort:n&&f?b:void 0,value:w});t("min",v("min"),{id:y,sort:b})},supportsVariants:({matchVariant:r,theme:e})=>{r("supports",(t="")=>{let i=K(t),n=/^\w*\s*\(/.test(i);return i=n?i.replace(/\b(and|or|not)\b/g," $1 "):i,n?`@supports ${i}`:(i.includes(":")||(i=`${i}: var(--tw)`),i.startsWith("(")&&i.endsWith(")")||(i=`(${i})`),`@supports ${i}`)},{values:e("supports")??{}})},hasVariants:({matchVariant:r,prefix:e})=>{r("has",t=>`&:has(${K(t)})`,{values:{},[Pt]:{respectPrefix:!1}}),r("group-has",(t,{modifier:i})=>i?`:merge(${e(".group")}\\/${i}):has(${K(t)}) &`:`:merge(${e(".group")}):has(${K(t)}) &`,{values:{},[Pt]:{respectPrefix:!1}}),r("peer-has",(t,{modifier:i})=>i?`:merge(${e(".peer")}\\/${i}):has(${K(t)}) ~ &`:`:merge(${e(".peer")}):has(${K(t)}) ~ &`,{values:{},[Pt]:{respectPrefix:!1}})},ariaVariants:({matchVariant:r,theme:e})=>{r("aria",t=>`&[aria-${Ye(K(t))}]`,{values:e("aria")??{}}),r("group-aria",(t,{modifier:i})=>i?`:merge(.group\\/${i})[aria-${Ye(K(t))}] &`:`:merge(.group)[aria-${Ye(K(t))}] &`,{values:e("aria")??{}}),r("peer-aria",(t,{modifier:i})=>i?`:merge(.peer\\/${i})[aria-${Ye(K(t))}] ~ &`:`:merge(.peer)[aria-${Ye(K(t))}] ~ &`,{values:e("aria")??{}})},dataVariants:({matchVariant:r,theme:e})=>{r("data",t=>`&[data-${Ye(K(t))}]`,{values:e("data")??{}}),r("group-data",(t,{modifier:i})=>i?`:merge(.group\\/${i})[data-${Ye(K(t))}] &`:`:merge(.group)[data-${Ye(K(t))}] &`,{values:e("data")??{}}),r("peer-data",(t,{modifier:i})=>i?`:merge(.peer\\/${i})[data-${Ye(K(t))}] ~ &`:`:merge(.peer)[data-${Ye(K(t))}] ~ &`,{values:e("data")??{}})},orientationVariants:({addVariant:r})=>{r("portrait","@media (orientation: portrait)"),r("landscape","@media (orientation: landscape)")},prefersContrastVariants:({addVariant:r})=>{r("contrast-more","@media (prefers-contrast: more)"),r("contrast-less","@media (prefers-contrast: less)")},forcedColorsVariants:({addVariant:r})=>{r("forced-colors","@media (forced-colors: active)")}},Xe=["translate(var(--tw-translate-x), var(--tw-translate-y))","rotate(var(--tw-rotate))","skewX(var(--tw-skew-x))","skewY(var(--tw-skew-y))","scaleX(var(--tw-scale-x))","scaleY(var(--tw-scale-y))"].join(" "),nt=["var(--tw-blur)","var(--tw-brightness)","var(--tw-contrast)","var(--tw-grayscale)","var(--tw-hue-rotate)","var(--tw-invert)","var(--tw-saturate)","var(--tw-sepia)","var(--tw-drop-shadow)"].join(" "),ge=["var(--tw-backdrop-blur)","var(--tw-backdrop-brightness)","var(--tw-backdrop-contrast)","var(--tw-backdrop-grayscale)","var(--tw-backdrop-hue-rotate)","var(--tw-backdrop-invert)","var(--tw-backdrop-opacity)","var(--tw-backdrop-saturate)","var(--tw-backdrop-sepia)"].join(" "),mh={preflight:({addBase:r})=>{let e=ee.parse(`*,::after,::before{box-sizing:border-box;border-width:0;border-style:solid;border-color:theme('borderColor.DEFAULT', currentColor)}::after,::before{--tw-content:''}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;tab-size:4;font-family:theme('fontFamily.sans', ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji");font-feature-settings:theme('fontFamily.sans[1].fontFeatureSettings', normal);font-variation-settings:theme('fontFamily.sans[1].fontVariationSettings', normal);-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:theme('fontFamily.mono', ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace);font-feature-settings:theme('fontFamily.mono[1].fontFeatureSettings', normal);font-variation-settings:theme('fontFamily.mono[1].fontVariationSettings', normal);font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::placeholder,textarea::placeholder{opacity:1;color:theme('colors.gray.4', #9ca3af)}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}`);r([ee.comment({text:`! tailwindcss v${ch} | MIT License | https://tailwindcss.com`}),...e.nodes])},container:(()=>{function r(t=[]){return t.flatMap(i=>i.values.map(n=>n.min)).filter(i=>i!==void 0)}function e(t,i,n){if(typeof n=="undefined")return[];if(!(typeof n=="object"&&n!==null))return[{screen:"DEFAULT",minWidth:0,padding:n}];let a=[];n.DEFAULT&&a.push({screen:"DEFAULT",minWidth:0,padding:n.DEFAULT});for(let s of t)for(let o of i)for(let{min:l}of o.values)l===s&&a.push({minWidth:s,padding:n[o.name]});return a}return function({addComponents:t,theme:i}){let n=Rt(i("container.screens",i("screens"))),a=r(n),s=e(a,n,i("container.padding")),o=c=>{let f=s.find(d=>d.minWidth===c);return f?{paddingRight:f.padding,paddingLeft:f.padding}:{}},l=Array.from(new Set(a.slice().sort((c,f)=>parseInt(c)-parseInt(f)))).map(c=>({[`@media (min-width: ${c})`]:{".container":{"max-width":c,...o(c)}}}));t([{".container":Object.assign({width:"100%"},i("container.center",!1)?{marginRight:"auto",marginLeft:"auto"}:{},o(0))},...l])}})(),accessibility:({addUtilities:r})=>{r({".sr-only":{position:"absolute",width:"1px",height:"1px",padding:"0",margin:"-1px",overflow:"hidden",clip:"rect(0, 0, 0, 0)",whiteSpace:"nowrap",borderWidth:"0"},".not-sr-only":{position:"static",width:"auto",height:"auto",padding:"0",margin:"0",overflow:"visible",clip:"auto",whiteSpace:"normal"}})},pointerEvents:({addUtilities:r})=>{r({".pointer-events-none":{"pointer-events":"none"},".pointer-events-auto":{"pointer-events":"auto"}})},visibility:({addUtilities:r})=>{r({".visible":{visibility:"visible"},".invisible":{visibility:"hidden"},".collapse":{visibility:"collapse"}})},position:({addUtilities:r})=>{r({".static":{position:"static"},".fixed":{position:"fixed"},".absolute":{position:"absolute"},".relative":{position:"relative"},".sticky":{position:"sticky"}})},inset:L("inset",[["inset",["inset"]],[["inset-x",["left","right"]],["inset-y",["top","bottom"]]],[["start",["inset-inline-start"]],["end",["inset-inline-end"]],["top",["top"]],["right",["right"]],["bottom",["bottom"]],["left",["left"]]]],{supportsNegativeValues:!0}),isolation:({addUtilities:r})=>{r({".isolate":{isolation:"isolate"},".isolation-auto":{isolation:"auto"}})},zIndex:L("zIndex",[["z",["zIndex"]]],{supportsNegativeValues:!0}),order:L("order",void 0,{supportsNegativeValues:!0}),gridColumn:L("gridColumn",[["col",["gridColumn"]]]),gridColumnStart:L("gridColumnStart",[["col-start",["gridColumnStart"]]],{supportsNegativeValues:!0}),gridColumnEnd:L("gridColumnEnd",[["col-end",["gridColumnEnd"]]],{supportsNegativeValues:!0}),gridRow:L("gridRow",[["row",["gridRow"]]]),gridRowStart:L("gridRowStart",[["row-start",["gridRowStart"]]],{supportsNegativeValues:!0}),gridRowEnd:L("gridRowEnd",[["row-end",["gridRowEnd"]]],{supportsNegativeValues:!0}),float:({addUtilities:r})=>{r({".float-start":{float:"inline-start"},".float-end":{float:"inline-end"},".float-right":{float:"right"},".float-left":{float:"left"},".float-none":{float:"none"}})},clear:({addUtilities:r})=>{r({".clear-start":{clear:"inline-start"},".clear-end":{clear:"inline-end"},".clear-left":{clear:"left"},".clear-right":{clear:"right"},".clear-both":{clear:"both"},".clear-none":{clear:"none"}})},margin:L("margin",[["m",["margin"]],[["mx",["margin-left","margin-right"]],["my",["margin-top","margin-bottom"]]],[["ms",["margin-inline-start"]],["me",["margin-inline-end"]],["mt",["margin-top"]],["mr",["margin-right"]],["mb",["margin-bottom"]],["ml",["margin-left"]]]],{supportsNegativeValues:!0}),boxSizing:({addUtilities:r})=>{r({".box-border":{"box-sizing":"border-box"},".box-content":{"box-sizing":"content-box"}})},lineClamp:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"line-clamp":i=>({overflow:"hidden",display:"-webkit-box","-webkit-box-orient":"vertical","-webkit-line-clamp":`${i}`})},{values:t("lineClamp")}),e({".line-clamp-none":{overflow:"visible",display:"block","-webkit-box-orient":"horizontal","-webkit-line-clamp":"none"}})},display:({addUtilities:r})=>{r({".block":{display:"block"},".inline-block":{display:"inline-block"},".inline":{display:"inline"},".flex":{display:"flex"},".inline-flex":{display:"inline-flex"},".table":{display:"table"},".inline-table":{display:"inline-table"},".table-caption":{display:"table-caption"},".table-cell":{display:"table-cell"},".table-column":{display:"table-column"},".table-column-group":{display:"table-column-group"},".table-footer-group":{display:"table-footer-group"},".table-header-group":{display:"table-header-group"},".table-row-group":{display:"table-row-group"},".table-row":{display:"table-row"},".flow-root":{display:"flow-root"},".grid":{display:"grid"},".inline-grid":{display:"inline-grid"},".contents":{display:"contents"},".list-item":{display:"list-item"},".hidden":{display:"none"}})},aspectRatio:L("aspectRatio",[["aspect",["aspect-ratio"]]]),size:L("size",[["size",["width","height"]]]),height:L("height",[["h",["height"]]]),maxHeight:L("maxHeight",[["max-h",["maxHeight"]]]),minHeight:L("minHeight",[["min-h",["minHeight"]]]),width:L("width",[["w",["width"]]]),minWidth:L("minWidth",[["min-w",["minWidth"]]]),maxWidth:L("maxWidth",[["max-w",["maxWidth"]]]),flex:L("flex"),flexShrink:L("flexShrink",[["flex-shrink",["flex-shrink"]],["shrink",["flex-shrink"]]]),flexGrow:L("flexGrow",[["flex-grow",["flex-grow"]],["grow",["flex-grow"]]]),flexBasis:L("flexBasis",[["basis",["flex-basis"]]]),tableLayout:({addUtilities:r})=>{r({".table-auto":{"table-layout":"auto"},".table-fixed":{"table-layout":"fixed"}})},captionSide:({addUtilities:r})=>{r({".caption-top":{"caption-side":"top"},".caption-bottom":{"caption-side":"bottom"}})},borderCollapse:({addUtilities:r})=>{r({".border-collapse":{"border-collapse":"collapse"},".border-separate":{"border-collapse":"separate"}})},borderSpacing:({addDefaults:r,matchUtilities:e,theme:t})=>{r("border-spacing",{"--tw-border-spacing-x":0,"--tw-border-spacing-y":0}),e({"border-spacing":i=>({"--tw-border-spacing-x":i,"--tw-border-spacing-y":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"}),"border-spacing-x":i=>({"--tw-border-spacing-x":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"}),"border-spacing-y":i=>({"--tw-border-spacing-y":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"})},{values:t("borderSpacing")})},transformOrigin:L("transformOrigin",[["origin",["transformOrigin"]]]),translate:L("translate",[[["translate-x",[["@defaults transform",{}],"--tw-translate-x",["transform",Xe]]],["translate-y",[["@defaults transform",{}],"--tw-translate-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),rotate:L("rotate",[["rotate",[["@defaults transform",{}],"--tw-rotate",["transform",Xe]]]],{supportsNegativeValues:!0}),skew:L("skew",[[["skew-x",[["@defaults transform",{}],"--tw-skew-x",["transform",Xe]]],["skew-y",[["@defaults transform",{}],"--tw-skew-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),scale:L("scale",[["scale",[["@defaults transform",{}],"--tw-scale-x","--tw-scale-y",["transform",Xe]]],[["scale-x",[["@defaults transform",{}],"--tw-scale-x",["transform",Xe]]],["scale-y",[["@defaults transform",{}],"--tw-scale-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),transform:({addDefaults:r,addUtilities:e})=>{r("transform",{"--tw-translate-x":"0","--tw-translate-y":"0","--tw-rotate":"0","--tw-skew-x":"0","--tw-skew-y":"0","--tw-scale-x":"1","--tw-scale-y":"1"}),e({".transform":{"@defaults transform":{},transform:Xe},".transform-cpu":{transform:Xe},".transform-gpu":{transform:Xe.replace("translate(var(--tw-translate-x), var(--tw-translate-y))","translate3d(var(--tw-translate-x), var(--tw-translate-y), 0)")},".transform-none":{transform:"none"}})},animation:({matchUtilities:r,theme:e,config:t})=>{let i=a=>Te(t("prefix")+a),n=Object.fromEntries(Object.entries(e("keyframes")??{}).map(([a,s])=>[a,{[`@keyframes ${i(a)}`]:s}]));r({animate:a=>{let s=$o(a);return[...s.flatMap(o=>n[o.name]),{animation:s.map(({name:o,value:l})=>o===void 0||n[o]===void 0?l:l.replace(o,i(o))).join(", ")}]}},{values:e("animation")})},cursor:L("cursor"),touchAction:({addDefaults:r,addUtilities:e})=>{r("touch-action",{"--tw-pan-x":" ","--tw-pan-y":" ","--tw-pinch-zoom":" "});let t="var(--tw-pan-x) var(--tw-pan-y) var(--tw-pinch-zoom)";e({".touch-auto":{"touch-action":"auto"},".touch-none":{"touch-action":"none"},".touch-pan-x":{"@defaults touch-action":{},"--tw-pan-x":"pan-x","touch-action":t},".touch-pan-left":{"@defaults touch-action":{},"--tw-pan-x":"pan-left","touch-action":t},".touch-pan-right":{"@defaults touch-action":{},"--tw-pan-x":"pan-right","touch-action":t},".touch-pan-y":{"@defaults touch-action":{},"--tw-pan-y":"pan-y","touch-action":t},".touch-pan-up":{"@defaults touch-action":{},"--tw-pan-y":"pan-up","touch-action":t},".touch-pan-down":{"@defaults touch-action":{},"--tw-pan-y":"pan-down","touch-action":t},".touch-pinch-zoom":{"@defaults touch-action":{},"--tw-pinch-zoom":"pinch-zoom","touch-action":t},".touch-manipulation":{"touch-action":"manipulation"}})},userSelect:({addUtilities:r})=>{r({".select-none":{"user-select":"none"},".select-text":{"user-select":"text"},".select-all":{"user-select":"all"},".select-auto":{"user-select":"auto"}})},resize:({addUtilities:r})=>{r({".resize-none":{resize:"none"},".resize-y":{resize:"vertical"},".resize-x":{resize:"horizontal"},".resize":{resize:"both"}})},scrollSnapType:({addDefaults:r,addUtilities:e})=>{r("scroll-snap-type",{"--tw-scroll-snap-strictness":"proximity"}),e({".snap-none":{"scroll-snap-type":"none"},".snap-x":{"@defaults scroll-snap-type":{},"scroll-snap-type":"x var(--tw-scroll-snap-strictness)"},".snap-y":{"@defaults scroll-snap-type":{},"scroll-snap-type":"y var(--tw-scroll-snap-strictness)"},".snap-both":{"@defaults scroll-snap-type":{},"scroll-snap-type":"both var(--tw-scroll-snap-strictness)"},".snap-mandatory":{"--tw-scroll-snap-strictness":"mandatory"},".snap-proximity":{"--tw-scroll-snap-strictness":"proximity"}})},scrollSnapAlign:({addUtilities:r})=>{r({".snap-start":{"scroll-snap-align":"start"},".snap-end":{"scroll-snap-align":"end"},".snap-center":{"scroll-snap-align":"center"},".snap-align-none":{"scroll-snap-align":"none"}})},scrollSnapStop:({addUtilities:r})=>{r({".snap-normal":{"scroll-snap-stop":"normal"},".snap-always":{"scroll-snap-stop":"always"}})},scrollMargin:L("scrollMargin",[["scroll-m",["scroll-margin"]],[["scroll-mx",["scroll-margin-left","scroll-margin-right"]],["scroll-my",["scroll-margin-top","scroll-margin-bottom"]]],[["scroll-ms",["scroll-margin-inline-start"]],["scroll-me",["scroll-margin-inline-end"]],["scroll-mt",["scroll-margin-top"]],["scroll-mr",["scroll-margin-right"]],["scroll-mb",["scroll-margin-bottom"]],["scroll-ml",["scroll-margin-left"]]]],{supportsNegativeValues:!0}),scrollPadding:L("scrollPadding",[["scroll-p",["scroll-padding"]],[["scroll-px",["scroll-padding-left","scroll-padding-right"]],["scroll-py",["scroll-padding-top","scroll-padding-bottom"]]],[["scroll-ps",["scroll-padding-inline-start"]],["scroll-pe",["scroll-padding-inline-end"]],["scroll-pt",["scroll-padding-top"]],["scroll-pr",["scroll-padding-right"]],["scroll-pb",["scroll-padding-bottom"]],["scroll-pl",["scroll-padding-left"]]]]),listStylePosition:({addUtilities:r})=>{r({".list-inside":{"list-style-position":"inside"},".list-outside":{"list-style-position":"outside"}})},listStyleType:L("listStyleType",[["list",["listStyleType"]]]),listStyleImage:L("listStyleImage",[["list-image",["listStyleImage"]]]),appearance:({addUtilities:r})=>{r({".appearance-none":{appearance:"none"},".appearance-auto":{appearance:"auto"}})},columns:L("columns",[["columns",["columns"]]]),breakBefore:({addUtilities:r})=>{r({".break-before-auto":{"break-before":"auto"},".break-before-avoid":{"break-before":"avoid"},".break-before-all":{"break-before":"all"},".break-before-avoid-page":{"break-before":"avoid-page"},".break-before-page":{"break-before":"page"},".break-before-left":{"break-before":"left"},".break-before-right":{"break-before":"right"},".break-before-column":{"break-before":"column"}})},breakInside:({addUtilities:r})=>{r({".break-inside-auto":{"break-inside":"auto"},".break-inside-avoid":{"break-inside":"avoid"},".break-inside-avoid-page":{"break-inside":"avoid-page"},".break-inside-avoid-column":{"break-inside":"avoid-column"}})},breakAfter:({addUtilities:r})=>{r({".break-after-auto":{"break-after":"auto"},".break-after-avoid":{"break-after":"avoid"},".break-after-all":{"break-after":"all"},".break-after-avoid-page":{"break-after":"avoid-page"},".break-after-page":{"break-after":"page"},".break-after-left":{"break-after":"left"},".break-after-right":{"break-after":"right"},".break-after-column":{"break-after":"column"}})},gridAutoColumns:L("gridAutoColumns",[["auto-cols",["gridAutoColumns"]]]),gridAutoFlow:({addUtilities:r})=>{r({".grid-flow-row":{gridAutoFlow:"row"},".grid-flow-col":{gridAutoFlow:"column"},".grid-flow-dense":{gridAutoFlow:"dense"},".grid-flow-row-dense":{gridAutoFlow:"row dense"},".grid-flow-col-dense":{gridAutoFlow:"column dense"}})},gridAutoRows:L("gridAutoRows",[["auto-rows",["gridAutoRows"]]]),gridTemplateColumns:L("gridTemplateColumns",[["grid-cols",["gridTemplateColumns"]]]),gridTemplateRows:L("gridTemplateRows",[["grid-rows",["gridTemplateRows"]]]),flexDirection:({addUtilities:r})=>{r({".flex-row":{"flex-direction":"row"},".flex-row-reverse":{"flex-direction":"row-reverse"},".flex-col":{"flex-direction":"column"},".flex-col-reverse":{"flex-direction":"column-reverse"}})},flexWrap:({addUtilities:r})=>{r({".flex-wrap":{"flex-wrap":"wrap"},".flex-wrap-reverse":{"flex-wrap":"wrap-reverse"},".flex-nowrap":{"flex-wrap":"nowrap"}})},placeContent:({addUtilities:r})=>{r({".place-content-center":{"place-content":"center"},".place-content-start":{"place-content":"start"},".place-content-end":{"place-content":"end"},".place-content-between":{"place-content":"space-between"},".place-content-around":{"place-content":"space-around"},".place-content-evenly":{"place-content":"space-evenly"},".place-content-baseline":{"place-content":"baseline"},".place-content-stretch":{"place-content":"stretch"}})},placeItems:({addUtilities:r})=>{r({".place-items-start":{"place-items":"start"},".place-items-end":{"place-items":"end"},".place-items-center":{"place-items":"center"},".place-items-baseline":{"place-items":"baseline"},".place-items-stretch":{"place-items":"stretch"}})},alignContent:({addUtilities:r})=>{r({".content-normal":{"align-content":"normal"},".content-center":{"align-content":"center"},".content-start":{"align-content":"flex-start"},".content-end":{"align-content":"flex-end"},".content-between":{"align-content":"space-between"},".content-around":{"align-content":"space-around"},".content-evenly":{"align-content":"space-evenly"},".content-baseline":{"align-content":"baseline"},".content-stretch":{"align-content":"stretch"}})},alignItems:({addUtilities:r})=>{r({".items-start":{"align-items":"flex-start"},".items-end":{"align-items":"flex-end"},".items-center":{"align-items":"center"},".items-baseline":{"align-items":"baseline"},".items-stretch":{"align-items":"stretch"}})},justifyContent:({addUtilities:r})=>{r({".justify-normal":{"justify-content":"normal"},".justify-start":{"justify-content":"flex-start"},".justify-end":{"justify-content":"flex-end"},".justify-center":{"justify-content":"center"},".justify-between":{"justify-content":"space-between"},".justify-around":{"justify-content":"space-around"},".justify-evenly":{"justify-content":"space-evenly"},".justify-stretch":{"justify-content":"stretch"}})},justifyItems:({addUtilities:r})=>{r({".justify-items-start":{"justify-items":"start"},".justify-items-end":{"justify-items":"end"},".justify-items-center":{"justify-items":"center"},".justify-items-stretch":{"justify-items":"stretch"}})},gap:L("gap",[["gap",["gap"]],[["gap-x",["columnGap"]],["gap-y",["rowGap"]]]]),space:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"space-x":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"--tw-space-x-reverse":"0","margin-right":`calc(${i} * var(--tw-space-x-reverse))`,"margin-left":`calc(${i} * calc(1 - var(--tw-space-x-reverse)))`}}),"space-y":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"--tw-space-y-reverse":"0","margin-top":`calc(${i} * calc(1 - var(--tw-space-y-reverse)))`,"margin-bottom":`calc(${i} * var(--tw-space-y-reverse))`}})},{values:t("space"),supportsNegativeValues:!0}),e({".space-y-reverse > :not([hidden]) ~ :not([hidden])":{"--tw-space-y-reverse":"1"},".space-x-reverse > :not([hidden]) ~ :not([hidden])":{"--tw-space-x-reverse":"1"}})},divideWidth:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"divide-x":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-x-reverse":"0","border-right-width":`calc(${i} * var(--tw-divide-x-reverse))`,"border-left-width":`calc(${i} * calc(1 - var(--tw-divide-x-reverse)))`}}),"divide-y":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-y-reverse":"0","border-top-width":`calc(${i} * calc(1 - var(--tw-divide-y-reverse)))`,"border-bottom-width":`calc(${i} * var(--tw-divide-y-reverse))`}})},{values:t("divideWidth"),type:["line-width","length","any"]}),e({".divide-y-reverse > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-y-reverse":"1"},".divide-x-reverse > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-x-reverse":"1"}})},divideStyle:({addUtilities:r})=>{r({".divide-solid > :not([hidden]) ~ :not([hidden])":{"border-style":"solid"},".divide-dashed > :not([hidden]) ~ :not([hidden])":{"border-style":"dashed"},".divide-dotted > :not([hidden]) ~ :not([hidden])":{"border-style":"dotted"},".divide-double > :not([hidden]) ~ :not([hidden])":{"border-style":"double"},".divide-none > :not([hidden]) ~ :not([hidden])":{"border-style":"none"}})},divideColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({divide:i=>t("divideOpacity")?{["& > :not([hidden]) ~ :not([hidden])"]:Ae({color:i,property:"border-color",variable:"--tw-divide-opacity"})}:{["& > :not([hidden]) ~ :not([hidden])"]:{"border-color":X(i)}}},{values:(({DEFAULT:i,...n})=>n)(xe(e("divideColor"))),type:["color","any"]})},divideOpacity:({matchUtilities:r,theme:e})=>{r({"divide-opacity":t=>({["& > :not([hidden]) ~ :not([hidden])"]:{"--tw-divide-opacity":t}})},{values:e("divideOpacity")})},placeSelf:({addUtilities:r})=>{r({".place-self-auto":{"place-self":"auto"},".place-self-start":{"place-self":"start"},".place-self-end":{"place-self":"end"},".place-self-center":{"place-self":"center"},".place-self-stretch":{"place-self":"stretch"}})},alignSelf:({addUtilities:r})=>{r({".self-auto":{"align-self":"auto"},".self-start":{"align-self":"flex-start"},".self-end":{"align-self":"flex-end"},".self-center":{"align-self":"center"},".self-stretch":{"align-self":"stretch"},".self-baseline":{"align-self":"baseline"}})},justifySelf:({addUtilities:r})=>{r({".justify-self-auto":{"justify-self":"auto"},".justify-self-start":{"justify-self":"start"},".justify-self-end":{"justify-self":"end"},".justify-self-center":{"justify-self":"center"},".justify-self-stretch":{"justify-self":"stretch"}})},overflow:({addUtilities:r})=>{r({".overflow-auto":{overflow:"auto"},".overflow-hidden":{overflow:"hidden"},".overflow-clip":{overflow:"clip"},".overflow-visible":{overflow:"visible"},".overflow-scroll":{overflow:"scroll"},".overflow-x-auto":{"overflow-x":"auto"},".overflow-y-auto":{"overflow-y":"auto"},".overflow-x-hidden":{"overflow-x":"hidden"},".overflow-y-hidden":{"overflow-y":"hidden"},".overflow-x-clip":{"overflow-x":"clip"},".overflow-y-clip":{"overflow-y":"clip"},".overflow-x-visible":{"overflow-x":"visible"},".overflow-y-visible":{"overflow-y":"visible"},".overflow-x-scroll":{"overflow-x":"scroll"},".overflow-y-scroll":{"overflow-y":"scroll"}})},overscrollBehavior:({addUtilities:r})=>{r({".overscroll-auto":{"overscroll-behavior":"auto"},".overscroll-contain":{"overscroll-behavior":"contain"},".overscroll-none":{"overscroll-behavior":"none"},".overscroll-y-auto":{"overscroll-behavior-y":"auto"},".overscroll-y-contain":{"overscroll-behavior-y":"contain"},".overscroll-y-none":{"overscroll-behavior-y":"none"},".overscroll-x-auto":{"overscroll-behavior-x":"auto"},".overscroll-x-contain":{"overscroll-behavior-x":"contain"},".overscroll-x-none":{"overscroll-behavior-x":"none"}})},scrollBehavior:({addUtilities:r})=>{r({".scroll-auto":{"scroll-behavior":"auto"},".scroll-smooth":{"scroll-behavior":"smooth"}})},textOverflow:({addUtilities:r})=>{r({".truncate":{overflow:"hidden","text-overflow":"ellipsis","white-space":"nowrap"},".overflow-ellipsis":{"text-overflow":"ellipsis"},".text-ellipsis":{"text-overflow":"ellipsis"},".text-clip":{"text-overflow":"clip"}})},hyphens:({addUtilities:r})=>{r({".hyphens-none":{hyphens:"none"},".hyphens-manual":{hyphens:"manual"},".hyphens-auto":{hyphens:"auto"}})},whitespace:({addUtilities:r})=>{r({".whitespace-normal":{"white-space":"normal"},".whitespace-nowrap":{"white-space":"nowrap"},".whitespace-pre":{"white-space":"pre"},".whitespace-pre-line":{"white-space":"pre-line"},".whitespace-pre-wrap":{"white-space":"pre-wrap"},".whitespace-break-spaces":{"white-space":"break-spaces"}})},textWrap:({addUtilities:r})=>{r({".text-wrap":{"text-wrap":"wrap"},".text-nowrap":{"text-wrap":"nowrap"},".text-balance":{"text-wrap":"balance"},".text-pretty":{"text-wrap":"pretty"}})},wordBreak:({addUtilities:r})=>{r({".break-normal":{"overflow-wrap":"normal","word-break":"normal"},".break-words":{"overflow-wrap":"break-word"},".break-all":{"word-break":"break-all"},".break-keep":{"word-break":"keep-all"}})},borderRadius:L("borderRadius",[["rounded",["border-radius"]],[["rounded-s",["border-start-start-radius","border-end-start-radius"]],["rounded-e",["border-start-end-radius","border-end-end-radius"]],["rounded-t",["border-top-left-radius","border-top-right-radius"]],["rounded-r",["border-top-right-radius","border-bottom-right-radius"]],["rounded-b",["border-bottom-right-radius","border-bottom-left-radius"]],["rounded-l",["border-top-left-radius","border-bottom-left-radius"]]],[["rounded-ss",["border-start-start-radius"]],["rounded-se",["border-start-end-radius"]],["rounded-ee",["border-end-end-radius"]],["rounded-es",["border-end-start-radius"]],["rounded-tl",["border-top-left-radius"]],["rounded-tr",["border-top-right-radius"]],["rounded-br",["border-bottom-right-radius"]],["rounded-bl",["border-bottom-left-radius"]]]]),borderWidth:L("borderWidth",[["border",[["@defaults border-width",{}],"border-width"]],[["border-x",[["@defaults border-width",{}],"border-left-width","border-right-width"]],["border-y",[["@defaults border-width",{}],"border-top-width","border-bottom-width"]]],[["border-s",[["@defaults border-width",{}],"border-inline-start-width"]],["border-e",[["@defaults border-width",{}],"border-inline-end-width"]],["border-t",[["@defaults border-width",{}],"border-top-width"]],["border-r",[["@defaults border-width",{}],"border-right-width"]],["border-b",[["@defaults border-width",{}],"border-bottom-width"]],["border-l",[["@defaults border-width",{}],"border-left-width"]]]],{type:["line-width","length"]}),borderStyle:({addUtilities:r})=>{r({".border-solid":{"border-style":"solid"},".border-dashed":{"border-style":"dashed"},".border-dotted":{"border-style":"dotted"},".border-double":{"border-style":"double"},".border-hidden":{"border-style":"hidden"},".border-none":{"border-style":"none"}})},borderColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({border:i=>t("borderOpacity")?Ae({color:i,property:"border-color",variable:"--tw-border-opacity"}):{"border-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]}),r({"border-x":i=>t("borderOpacity")?Ae({color:i,property:["border-left-color","border-right-color"],variable:"--tw-border-opacity"}):{"border-left-color":X(i),"border-right-color":X(i)},"border-y":i=>t("borderOpacity")?Ae({color:i,property:["border-top-color","border-bottom-color"],variable:"--tw-border-opacity"}):{"border-top-color":X(i),"border-bottom-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]}),r({"border-s":i=>t("borderOpacity")?Ae({color:i,property:"border-inline-start-color",variable:"--tw-border-opacity"}):{"border-inline-start-color":X(i)},"border-e":i=>t("borderOpacity")?Ae({color:i,property:"border-inline-end-color",variable:"--tw-border-opacity"}):{"border-inline-end-color":X(i)},"border-t":i=>t("borderOpacity")?Ae({color:i,property:"border-top-color",variable:"--tw-border-opacity"}):{"border-top-color":X(i)},"border-r":i=>t("borderOpacity")?Ae({color:i,property:"border-right-color",variable:"--tw-border-opacity"}):{"border-right-color":X(i)},"border-b":i=>t("borderOpacity")?Ae({color:i,property:"border-bottom-color",variable:"--tw-border-opacity"}):{"border-bottom-color":X(i)},"border-l":i=>t("borderOpacity")?Ae({color:i,property:"border-left-color",variable:"--tw-border-opacity"}):{"border-left-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]})},borderOpacity:L("borderOpacity",[["border-opacity",["--tw-border-opacity"]]]),backgroundColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({bg:i=>t("backgroundOpacity")?Ae({color:i,property:"background-color",variable:"--tw-bg-opacity"}):{"background-color":X(i)}},{values:xe(e("backgroundColor")),type:["color","any"]})},backgroundOpacity:L("backgroundOpacity",[["bg-opacity",["--tw-bg-opacity"]]]),backgroundImage:L("backgroundImage",[["bg",["background-image"]]],{type:["lookup","image","url"]}),gradientColorStops:(()=>{function r(e){return Ze(e,0,"rgb(255 255 255 / 0)")}return function({matchUtilities:e,theme:t,addDefaults:i}){i("gradient-color-stops",{"--tw-gradient-from-position":" ","--tw-gradient-via-position":" ","--tw-gradient-to-position":" "});let n={values:xe(t("gradientColorStops")),type:["color","any"]},a={values:t("gradientColorStopPositions"),type:["length","percentage"]};e({from:s=>{let o=r(s);return{"@defaults gradient-color-stops":{},"--tw-gradient-from":`${X(s)} var(--tw-gradient-from-position)`,"--tw-gradient-to":`${o} var(--tw-gradient-to-position)`,"--tw-gradient-stops":"var(--tw-gradient-from), var(--tw-gradient-to)"}}},n),e({from:s=>({"--tw-gradient-from-position":s})},a),e({via:s=>{let o=r(s);return{"@defaults gradient-color-stops":{},"--tw-gradient-to":`${o}  var(--tw-gradient-to-position)`,"--tw-gradient-stops":`var(--tw-gradient-from), ${X(s)} var(--tw-gradient-via-position), var(--tw-gradient-to)`}}},n),e({via:s=>({"--tw-gradient-via-position":s})},a),e({to:s=>({"@defaults gradient-color-stops":{},"--tw-gradient-to":`${X(s)} var(--tw-gradient-to-position)`})},n),e({to:s=>({"--tw-gradient-to-position":s})},a)}})(),boxDecorationBreak:({addUtilities:r})=>{r({".decoration-slice":{"box-decoration-break":"slice"},".decoration-clone":{"box-decoration-break":"clone"},".box-decoration-slice":{"box-decoration-break":"slice"},".box-decoration-clone":{"box-decoration-break":"clone"}})},backgroundSize:L("backgroundSize",[["bg",["background-size"]]],{type:["lookup","length","percentage","size"]}),backgroundAttachment:({addUtilities:r})=>{r({".bg-fixed":{"background-attachment":"fixed"},".bg-local":{"background-attachment":"local"},".bg-scroll":{"background-attachment":"scroll"}})},backgroundClip:({addUtilities:r})=>{r({".bg-clip-border":{"background-clip":"border-box"},".bg-clip-padding":{"background-clip":"padding-box"},".bg-clip-content":{"background-clip":"content-box"},".bg-clip-text":{"background-clip":"text"}})},backgroundPosition:L("backgroundPosition",[["bg",["background-position"]]],{type:["lookup",["position",{preferOnConflict:!0}]]}),backgroundRepeat:({addUtilities:r})=>{r({".bg-repeat":{"background-repeat":"repeat"},".bg-no-repeat":{"background-repeat":"no-repeat"},".bg-repeat-x":{"background-repeat":"repeat-x"},".bg-repeat-y":{"background-repeat":"repeat-y"},".bg-repeat-round":{"background-repeat":"round"},".bg-repeat-space":{"background-repeat":"space"}})},backgroundOrigin:({addUtilities:r})=>{r({".bg-origin-border":{"background-origin":"border-box"},".bg-origin-padding":{"background-origin":"padding-box"},".bg-origin-content":{"background-origin":"content-box"}})},fill:({matchUtilities:r,theme:e})=>{r({fill:t=>({fill:X(t)})},{values:xe(e("fill")),type:["color","any"]})},stroke:({matchUtilities:r,theme:e})=>{r({stroke:t=>({stroke:X(t)})},{values:xe(e("stroke")),type:["color","url","any"]})},strokeWidth:L("strokeWidth",[["stroke",["stroke-width"]]],{type:["length","number","percentage"]}),objectFit:({addUtilities:r})=>{r({".object-contain":{"object-fit":"contain"},".object-cover":{"object-fit":"cover"},".object-fill":{"object-fit":"fill"},".object-none":{"object-fit":"none"},".object-scale-down":{"object-fit":"scale-down"}})},objectPosition:L("objectPosition",[["object",["object-position"]]]),padding:L("padding",[["p",["padding"]],[["px",["padding-left","padding-right"]],["py",["padding-top","padding-bottom"]]],[["ps",["padding-inline-start"]],["pe",["padding-inline-end"]],["pt",["padding-top"]],["pr",["padding-right"]],["pb",["padding-bottom"]],["pl",["padding-left"]]]]),textAlign:({addUtilities:r})=>{r({".text-left":{"text-align":"left"},".text-center":{"text-align":"center"},".text-right":{"text-align":"right"},".text-justify":{"text-align":"justify"},".text-start":{"text-align":"start"},".text-end":{"text-align":"end"}})},textIndent:L("textIndent",[["indent",["text-indent"]]],{supportsNegativeValues:!0}),verticalAlign:({addUtilities:r,matchUtilities:e})=>{r({".align-baseline":{"vertical-align":"baseline"},".align-top":{"vertical-align":"top"},".align-middle":{"vertical-align":"middle"},".align-bottom":{"vertical-align":"bottom"},".align-text-top":{"vertical-align":"text-top"},".align-text-bottom":{"vertical-align":"text-bottom"},".align-sub":{"vertical-align":"sub"},".align-super":{"vertical-align":"super"}}),e({align:t=>({"vertical-align":t})})},fontFamily:({matchUtilities:r,theme:e})=>{r({font:t=>{let[i,n={}]=Array.isArray(t)&&ke(t[1])?t:[t],{fontFeatureSettings:a,fontVariationSettings:s}=n;return{"font-family":Array.isArray(i)?i.join(", "):i,...a===void 0?{}:{"font-feature-settings":a},...s===void 0?{}:{"font-variation-settings":s}}}},{values:e("fontFamily"),type:["lookup","generic-name","family-name"]})},fontSize:({matchUtilities:r,theme:e})=>{r({text:(t,{modifier:i})=>{let[n,a]=Array.isArray(t)?t:[t];if(i)return{"font-size":n,"line-height":i};let{lineHeight:s,letterSpacing:o,fontWeight:l}=ke(a)?a:{lineHeight:a};return{"font-size":n,...s===void 0?{}:{"line-height":s},...o===void 0?{}:{"letter-spacing":o},...l===void 0?{}:{"font-weight":l}}}},{values:e("fontSize"),modifiers:e("lineHeight"),type:["absolute-size","relative-size","length","percentage"]})},fontWeight:L("fontWeight",[["font",["fontWeight"]]],{type:["lookup","number","any"]}),textTransform:({addUtilities:r})=>{r({".uppercase":{"text-transform":"uppercase"},".lowercase":{"text-transform":"lowercase"},".capitalize":{"text-transform":"capitalize"},".normal-case":{"text-transform":"none"}})},fontStyle:({addUtilities:r})=>{r({".italic":{"font-style":"italic"},".not-italic":{"font-style":"normal"}})},fontVariantNumeric:({addDefaults:r,addUtilities:e})=>{let t="var(--tw-ordinal) var(--tw-slashed-zero) var(--tw-numeric-figure) var(--tw-numeric-spacing) var(--tw-numeric-fraction)";r("font-variant-numeric",{"--tw-ordinal":" ","--tw-slashed-zero":" ","--tw-numeric-figure":" ","--tw-numeric-spacing":" ","--tw-numeric-fraction":" "}),e({".normal-nums":{"font-variant-numeric":"normal"},".ordinal":{"@defaults font-variant-numeric":{},"--tw-ordinal":"ordinal","font-variant-numeric":t},".slashed-zero":{"@defaults font-variant-numeric":{},"--tw-slashed-zero":"slashed-zero","font-variant-numeric":t},".lining-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-figure":"lining-nums","font-variant-numeric":t},".oldstyle-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-figure":"oldstyle-nums","font-variant-numeric":t},".proportional-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-spacing":"proportional-nums","font-variant-numeric":t},".tabular-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-spacing":"tabular-nums","font-variant-numeric":t},".diagonal-fractions":{"@defaults font-variant-numeric":{},"--tw-numeric-fraction":"diagonal-fractions","font-variant-numeric":t},".stacked-fractions":{"@defaults font-variant-numeric":{},"--tw-numeric-fraction":"stacked-fractions","font-variant-numeric":t}})},lineHeight:L("lineHeight",[["leading",["lineHeight"]]]),letterSpacing:L("letterSpacing",[["tracking",["letterSpacing"]]],{supportsNegativeValues:!0}),textColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({text:i=>t("textOpacity")?Ae({color:i,property:"color",variable:"--tw-text-opacity"}):{color:X(i)}},{values:xe(e("textColor")),type:["color","any"]})},textOpacity:L("textOpacity",[["text-opacity",["--tw-text-opacity"]]]),textDecoration:({addUtilities:r})=>{r({".underline":{"text-decoration-line":"underline"},".overline":{"text-decoration-line":"overline"},".line-through":{"text-decoration-line":"line-through"},".no-underline":{"text-decoration-line":"none"}})},textDecorationColor:({matchUtilities:r,theme:e})=>{r({decoration:t=>({"text-decoration-color":X(t)})},{values:xe(e("textDecorationColor")),type:["color","any"]})},textDecorationStyle:({addUtilities:r})=>{r({".decoration-solid":{"text-decoration-style":"solid"},".decoration-double":{"text-decoration-style":"double"},".decoration-dotted":{"text-decoration-style":"dotted"},".decoration-dashed":{"text-decoration-style":"dashed"},".decoration-wavy":{"text-decoration-style":"wavy"}})},textDecorationThickness:L("textDecorationThickness",[["decoration",["text-decoration-thickness"]]],{type:["length","percentage"]}),textUnderlineOffset:L("textUnderlineOffset",[["underline-offset",["text-underline-offset"]]],{type:["length","percentage","any"]}),fontSmoothing:({addUtilities:r})=>{r({".antialiased":{"-webkit-font-smoothing":"antialiased","-moz-osx-font-smoothing":"grayscale"},".subpixel-antialiased":{"-webkit-font-smoothing":"auto","-moz-osx-font-smoothing":"auto"}})},placeholderColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({placeholder:i=>t("placeholderOpacity")?{"&::placeholder":Ae({color:i,property:"color",variable:"--tw-placeholder-opacity"})}:{"&::placeholder":{color:X(i)}}},{values:xe(e("placeholderColor")),type:["color","any"]})},placeholderOpacity:({matchUtilities:r,theme:e})=>{r({"placeholder-opacity":t=>({["&::placeholder"]:{"--tw-placeholder-opacity":t}})},{values:e("placeholderOpacity")})},caretColor:({matchUtilities:r,theme:e})=>{r({caret:t=>({"caret-color":X(t)})},{values:xe(e("caretColor")),type:["color","any"]})},accentColor:({matchUtilities:r,theme:e})=>{r({accent:t=>({"accent-color":X(t)})},{values:xe(e("accentColor")),type:["color","any"]})},opacity:L("opacity",[["opacity",["opacity"]]]),backgroundBlendMode:({addUtilities:r})=>{r({".bg-blend-normal":{"background-blend-mode":"normal"},".bg-blend-multiply":{"background-blend-mode":"multiply"},".bg-blend-screen":{"background-blend-mode":"screen"},".bg-blend-overlay":{"background-blend-mode":"overlay"},".bg-blend-darken":{"background-blend-mode":"darken"},".bg-blend-lighten":{"background-blend-mode":"lighten"},".bg-blend-color-dodge":{"background-blend-mode":"color-dodge"},".bg-blend-color-burn":{"background-blend-mode":"color-burn"},".bg-blend-hard-light":{"background-blend-mode":"hard-light"},".bg-blend-soft-light":{"background-blend-mode":"soft-light"},".bg-blend-difference":{"background-blend-mode":"difference"},".bg-blend-exclusion":{"background-blend-mode":"exclusion"},".bg-blend-hue":{"background-blend-mode":"hue"},".bg-blend-saturation":{"background-blend-mode":"saturation"},".bg-blend-color":{"background-blend-mode":"color"},".bg-blend-luminosity":{"background-blend-mode":"luminosity"}})},mixBlendMode:({addUtilities:r})=>{r({".mix-blend-normal":{"mix-blend-mode":"normal"},".mix-blend-multiply":{"mix-blend-mode":"multiply"},".mix-blend-screen":{"mix-blend-mode":"screen"},".mix-blend-overlay":{"mix-blend-mode":"overlay"},".mix-blend-darken":{"mix-blend-mode":"darken"},".mix-blend-lighten":{"mix-blend-mode":"lighten"},".mix-blend-color-dodge":{"mix-blend-mode":"color-dodge"},".mix-blend-color-burn":{"mix-blend-mode":"color-burn"},".mix-blend-hard-light":{"mix-blend-mode":"hard-light"},".mix-blend-soft-light":{"mix-blend-mode":"soft-light"},".mix-blend-difference":{"mix-blend-mode":"difference"},".mix-blend-exclusion":{"mix-blend-mode":"exclusion"},".mix-blend-hue":{"mix-blend-mode":"hue"},".mix-blend-saturation":{"mix-blend-mode":"saturation"},".mix-blend-color":{"mix-blend-mode":"color"},".mix-blend-luminosity":{"mix-blend-mode":"luminosity"},".mix-blend-plus-darker":{"mix-blend-mode":"plus-darker"},".mix-blend-plus-lighter":{"mix-blend-mode":"plus-lighter"}})},boxShadow:(()=>{let r=mt("boxShadow"),e=["var(--tw-ring-offset-shadow, 0 0 #0000)","var(--tw-ring-shadow, 0 0 #0000)","var(--tw-shadow)"].join(", ");return function({matchUtilities:t,addDefaults:i,theme:n}){i("box-shadow",{"--tw-ring-offset-shadow":"0 0 #0000","--tw-ring-shadow":"0 0 #0000","--tw-shadow":"0 0 #0000","--tw-shadow-colored":"0 0 #0000"}),t({shadow:a=>{a=r(a);let s=Ji(a);for(let o of s)!o.valid||(o.color="var(--tw-shadow-color)");return{"@defaults box-shadow":{},"--tw-shadow":a==="none"?"0 0 #0000":a,"--tw-shadow-colored":a==="none"?"0 0 #0000":qf(s),"box-shadow":e}}},{values:n("boxShadow"),type:["shadow"]})}})(),boxShadowColor:({matchUtilities:r,theme:e})=>{r({shadow:t=>({"--tw-shadow-color":X(t),"--tw-shadow":"var(--tw-shadow-colored)"})},{values:xe(e("boxShadowColor")),type:["color","any"]})},outlineStyle:({addUtilities:r})=>{r({".outline-none":{outline:"2px solid transparent","outline-offset":"2px"},".outline":{"outline-style":"solid"},".outline-dashed":{"outline-style":"dashed"},".outline-dotted":{"outline-style":"dotted"},".outline-double":{"outline-style":"double"}})},outlineWidth:L("outlineWidth",[["outline",["outline-width"]]],{type:["length","number","percentage"]}),outlineOffset:L("outlineOffset",[["outline-offset",["outline-offset"]]],{type:["length","number","percentage","any"],supportsNegativeValues:!0}),outlineColor:({matchUtilities:r,theme:e})=>{r({outline:t=>({"outline-color":X(t)})},{values:xe(e("outlineColor")),type:["color","any"]})},ringWidth:({matchUtilities:r,addDefaults:e,addUtilities:t,theme:i,config:n})=>{let a=(()=>{if(we(n(),"respectDefaultRingColorOpacity"))return i("ringColor.DEFAULT");let s=i("ringOpacity.DEFAULT","0.5");return i("ringColor")?.DEFAULT?Ze(i("ringColor")?.DEFAULT,s,`rgb(147 197 253 / ${s})`):`rgb(147 197 253 / ${s})`})();e("ring-width",{"--tw-ring-inset":" ","--tw-ring-offset-width":i("ringOffsetWidth.DEFAULT","0px"),"--tw-ring-offset-color":i("ringOffsetColor.DEFAULT","#fff"),"--tw-ring-color":a,"--tw-ring-offset-shadow":"0 0 #0000","--tw-ring-shadow":"0 0 #0000","--tw-shadow":"0 0 #0000","--tw-shadow-colored":"0 0 #0000"}),r({ring:s=>({"@defaults ring-width":{},"--tw-ring-offset-shadow":"var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color)","--tw-ring-shadow":`var(--tw-ring-inset) 0 0 0 calc(${s} + var(--tw-ring-offset-width)) var(--tw-ring-color)`,"box-shadow":["var(--tw-ring-offset-shadow)","var(--tw-ring-shadow)","var(--tw-shadow, 0 0 #0000)"].join(", ")})},{values:i("ringWidth"),type:"length"}),t({".ring-inset":{"@defaults ring-width":{},"--tw-ring-inset":"inset"}})},ringColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({ring:i=>t("ringOpacity")?Ae({color:i,property:"--tw-ring-color",variable:"--tw-ring-opacity"}):{"--tw-ring-color":X(i)}},{values:Object.fromEntries(Object.entries(xe(e("ringColor"))).filter(([i])=>i!=="DEFAULT")),type:["color","any"]})},ringOpacity:r=>{let{config:e}=r;return L("ringOpacity",[["ring-opacity",["--tw-ring-opacity"]]],{filterDefault:!we(e(),"respectDefaultRingColorOpacity")})(r)},ringOffsetWidth:L("ringOffsetWidth",[["ring-offset",["--tw-ring-offset-width"]]],{type:"length"}),ringOffsetColor:({matchUtilities:r,theme:e})=>{r({"ring-offset":t=>({"--tw-ring-offset-color":X(t)})},{values:xe(e("ringOffsetColor")),type:["color","any"]})},blur:({matchUtilities:r,theme:e})=>{r({blur:t=>({"--tw-blur":t.trim()===""?" ":`blur(${t})`,"@defaults filter":{},filter:nt})},{values:e("blur")})},brightness:({matchUtilities:r,theme:e})=>{r({brightness:t=>({"--tw-brightness":`brightness(${t})`,"@defaults filter":{},filter:nt})},{values:e("brightness")})},contrast:({matchUtilities:r,theme:e})=>{r({contrast:t=>({"--tw-contrast":`contrast(${t})`,"@defaults filter":{},filter:nt})},{values:e("contrast")})},dropShadow:({matchUtilities:r,theme:e})=>{r({"drop-shadow":t=>({"--tw-drop-shadow":Array.isArray(t)?t.map(i=>`drop-shadow(${i})`).join(" "):`drop-shadow(${t})`,"@defaults filter":{},filter:nt})},{values:e("dropShadow")})},grayscale:({matchUtilities:r,theme:e})=>{r({grayscale:t=>({"--tw-grayscale":`grayscale(${t})`,"@defaults filter":{},filter:nt})},{values:e("grayscale")})},hueRotate:({matchUtilities:r,theme:e})=>{r({"hue-rotate":t=>({"--tw-hue-rotate":`hue-rotate(${t})`,"@defaults filter":{},filter:nt})},{values:e("hueRotate"),supportsNegativeValues:!0})},invert:({matchUtilities:r,theme:e})=>{r({invert:t=>({"--tw-invert":`invert(${t})`,"@defaults filter":{},filter:nt})},{values:e("invert")})},saturate:({matchUtilities:r,theme:e})=>{r({saturate:t=>({"--tw-saturate":`saturate(${t})`,"@defaults filter":{},filter:nt})},{values:e("saturate")})},sepia:({matchUtilities:r,theme:e})=>{r({sepia:t=>({"--tw-sepia":`sepia(${t})`,"@defaults filter":{},filter:nt})},{values:e("sepia")})},filter:({addDefaults:r,addUtilities:e})=>{r("filter",{"--tw-blur":" ","--tw-brightness":" ","--tw-contrast":" ","--tw-grayscale":" ","--tw-hue-rotate":" ","--tw-invert":" ","--tw-saturate":" ","--tw-sepia":" ","--tw-drop-shadow":" "}),e({".filter":{"@defaults filter":{},filter:nt},".filter-none":{filter:"none"}})},backdropBlur:({matchUtilities:r,theme:e})=>{r({"backdrop-blur":t=>({"--tw-backdrop-blur":t.trim()===""?" ":`blur(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropBlur")})},backdropBrightness:({matchUtilities:r,theme:e})=>{r({"backdrop-brightness":t=>({"--tw-backdrop-brightness":`brightness(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropBrightness")})},backdropContrast:({matchUtilities:r,theme:e})=>{r({"backdrop-contrast":t=>({"--tw-backdrop-contrast":`contrast(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropContrast")})},backdropGrayscale:({matchUtilities:r,theme:e})=>{r({"backdrop-grayscale":t=>({"--tw-backdrop-grayscale":`grayscale(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropGrayscale")})},backdropHueRotate:({matchUtilities:r,theme:e})=>{r({"backdrop-hue-rotate":t=>({"--tw-backdrop-hue-rotate":`hue-rotate(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropHueRotate"),supportsNegativeValues:!0})},backdropInvert:({matchUtilities:r,theme:e})=>{r({"backdrop-invert":t=>({"--tw-backdrop-invert":`invert(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropInvert")})},backdropOpacity:({matchUtilities:r,theme:e})=>{r({"backdrop-opacity":t=>({"--tw-backdrop-opacity":`opacity(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropOpacity")})},backdropSaturate:({matchUtilities:r,theme:e})=>{r({"backdrop-saturate":t=>({"--tw-backdrop-saturate":`saturate(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropSaturate")})},backdropSepia:({matchUtilities:r,theme:e})=>{r({"backdrop-sepia":t=>({"--tw-backdrop-sepia":`sepia(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropSepia")})},backdropFilter:({addDefaults:r,addUtilities:e})=>{r("backdrop-filter",{"--tw-backdrop-blur":" ","--tw-backdrop-brightness":" ","--tw-backdrop-contrast":" ","--tw-backdrop-grayscale":" ","--tw-backdrop-hue-rotate":" ","--tw-backdrop-invert":" ","--tw-backdrop-opacity":" ","--tw-backdrop-saturate":" ","--tw-backdrop-sepia":" "}),e({".backdrop-filter":{"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge},".backdrop-filter-none":{"-webkit-backdrop-filter":"none","backdrop-filter":"none"}})},transitionProperty:({matchUtilities:r,theme:e})=>{let t=e("transitionTimingFunction.DEFAULT"),i=e("transitionDuration.DEFAULT");r({transition:n=>({"transition-property":n,...n==="none"?{}:{"transition-timing-function":t,"transition-duration":i}})},{values:e("transitionProperty")})},transitionDelay:L("transitionDelay",[["delay",["transitionDelay"]]]),transitionDuration:L("transitionDuration",[["duration",["transitionDuration"]]],{filterDefault:!0}),transitionTimingFunction:L("transitionTimingFunction",[["ease",["transitionTimingFunction"]]],{filterDefault:!0}),willChange:L("willChange",[["will-change",["will-change"]]]),contain:({addDefaults:r,addUtilities:e})=>{let t="var(--tw-contain-size) var(--tw-contain-layout) var(--tw-contain-paint) var(--tw-contain-style)";r("contain",{"--tw-contain-size":" ","--tw-contain-layout":" ","--tw-contain-paint":" ","--tw-contain-style":" "}),e({".contain-none":{contain:"none"},".contain-content":{contain:"content"},".contain-strict":{contain:"strict"},".contain-size":{"@defaults contain":{},"--tw-contain-size":"size",contain:t},".contain-inline-size":{"@defaults contain":{},"--tw-contain-size":"inline-size",contain:t},".contain-layout":{"@defaults contain":{},"--tw-contain-layout":"layout",contain:t},".contain-paint":{"@defaults contain":{},"--tw-contain-paint":"paint",contain:t},".contain-style":{"@defaults contain":{},"--tw-contain-style":"style",contain:t}})},content:L("content",[["content",["--tw-content",["content","var(--tw-content)"]]]]),forcedColorAdjust:({addUtilities:r})=>{r({".forced-color-adjust-auto":{"forced-color-adjust":"auto"},".forced-color-adjust-none":{"forced-color-adjust":"none"}})}}});function p_(r){if(r===void 0)return!1;if(r==="true"||r==="1")return!0;if(r==="false"||r==="0")return!1;if(r==="*")return!0;let e=r.split(",").map(t=>t.split(":")[0]);return e.includes("-tailwindcss")?!1:!!e.includes("tailwindcss")}var Je,yh,bh,Zn,Lo,gt,Ei,It=R(()=>{u();Je=typeof m!="undefined"?{NODE_ENV:"production",DEBUG:p_(m.env.DEBUG)}:{NODE_ENV:"production",DEBUG:!1},yh=new Map,bh=new Map,Zn=new Map,Lo=new Map,gt=new String("*"),Ei=Symbol("__NONE__")});function cr(r){let e=[],t=!1;for(let i=0;i<r.length;i++){let n=r[i];if(n===":"&&!t&&e.length===0)return!1;if(d_.has(n)&&r[i-1]!=="\\"&&(t=!t),!t&&r[i-1]!=="\\"){if(wh.has(n))e.push(n);else if(vh.has(n)){let a=vh.get(n);if(e.length<=0||e.pop()!==a)return!1}}}return!(e.length>0)}var wh,vh,d_,Mo=R(()=>{u();wh=new Map([["{","}"],["[","]"],["(",")"]]),vh=new Map(Array.from(wh.entries()).map(([r,e])=>[e,r])),d_=new Set(['"',"'","`"])});function pr(r){let[e]=xh(r);return e.forEach(([t,i])=>t.removeChild(i)),r.nodes.push(...e.map(([,t])=>t)),r}function xh(r){let e=[],t=null;for(let i of r.nodes)if(i.type==="combinator")e=e.filter(([,n])=>Bo(n).includes("jumpable")),t=null;else if(i.type==="pseudo"){h_(i)?(t=i,e.push([r,i,null])):t&&m_(i,t)?e.push([r,i,t]):t=null;for(let n of i.nodes??[]){let[a,s]=xh(n);t=s||t,e.push(...a)}}return[e,t]}function kh(r){return r.value.startsWith("::")||No[r.value]!==void 0}function h_(r){return kh(r)&&Bo(r).includes("terminal")}function m_(r,e){return r.type!=="pseudo"||kh(r)?!1:Bo(e).includes("actionable")}function Bo(r){return No[r.value]??No.__default__}var No,es=R(()=>{u();No={"::after":["terminal","jumpable"],"::backdrop":["terminal","jumpable"],"::before":["terminal","jumpable"],"::cue":["terminal"],"::cue-region":["terminal"],"::first-letter":["terminal","jumpable"],"::first-line":["terminal","jumpable"],"::grammar-error":["terminal"],"::marker":["terminal","jumpable"],"::part":["terminal","actionable"],"::placeholder":["terminal","jumpable"],"::selection":["terminal","jumpable"],"::slotted":["terminal"],"::spelling-error":["terminal"],"::target-text":["terminal"],"::file-selector-button":["terminal","actionable"],"::deep":["actionable"],"::v-deep":["actionable"],"::ng-deep":["actionable"],":after":["terminal","jumpable"],":before":["terminal","jumpable"],":first-letter":["terminal","jumpable"],":first-line":["terminal","jumpable"],":where":[],":is":[],":has":[],__default__:["terminal","actionable"]}});function dr(r,{context:e,candidate:t}){let i=e?.tailwindConfig.prefix??"",n=r.map(s=>{let o=(0,st.default)().astSync(s.format);return{...s,ast:s.respectPrefix?ur(i,o):o}}),a=st.default.root({nodes:[st.default.selector({nodes:[st.default.className({value:Te(t)})]})]});for(let{ast:s}of n)[a,s]=y_(a,s),s.walkNesting(o=>o.replaceWith(...a.nodes[0].nodes)),a=s;return a}function Ah(r){let e=[];for(;r.prev()&&r.prev().type!=="combinator";)r=r.prev();for(;r&&r.type!=="combinator";)e.push(r),r=r.next();return e}function g_(r){return r.sort((e,t)=>e.type==="tag"&&t.type==="class"?-1:e.type==="class"&&t.type==="tag"?1:e.type==="class"&&t.type==="pseudo"&&t.value.startsWith("::")?-1:e.type==="pseudo"&&e.value.startsWith("::")&&t.type==="class"?1:r.index(e)-r.index(t)),r}function jo(r,e){let t=!1;r.walk(i=>{if(i.type==="class"&&i.value===e)return t=!0,!1}),t||r.remove()}function ts(r,e,{context:t,candidate:i,base:n}){let a=t?.tailwindConfig?.separator??":";n=n??ve(i,a).pop();let s=(0,st.default)().astSync(r);if(s.walkClasses(f=>{f.raws&&f.value.includes(n)&&(f.raws.value=Te((0,Sh.default)(f.raws.value)))}),s.each(f=>jo(f,n)),s.length===0)return null;let o=Array.isArray(e)?dr(e,{context:t,candidate:i}):e;if(o===null)return s.toString();let l=st.default.comment({value:"/*__simple__*/"}),c=st.default.comment({value:"/*__simple__*/"});return s.walkClasses(f=>{if(f.value!==n)return;let d=f.parent,p=o.nodes[0].nodes;if(d.nodes.length===1){f.replaceWith(...p);return}let h=Ah(f);d.insertBefore(h[0],l),d.insertAfter(h[h.length-1],c);for(let v of p)d.insertBefore(h[0],v.clone());f.remove(),h=Ah(l);let b=d.index(l);d.nodes.splice(b,h.length,...g_(st.default.selector({nodes:h})).nodes),l.remove(),c.remove()}),s.walkPseudos(f=>{f.value===Fo&&f.replaceWith(f.nodes)}),s.each(f=>pr(f)),s.toString()}function y_(r,e){let t=[];return r.walkPseudos(i=>{i.value===Fo&&t.push({pseudo:i,value:i.nodes[0].toString()})}),e.walkPseudos(i=>{if(i.value!==Fo)return;let n=i.nodes[0].toString(),a=t.find(c=>c.value===n);if(!a)return;let s=[],o=i.next();for(;o&&o.type!=="combinator";)s.push(o),o=o.next();let l=o;a.pseudo.parent.insertAfter(a.pseudo,st.default.selector({nodes:s.map(c=>c.clone())})),i.remove(),s.forEach(c=>c.remove()),l&&l.type==="combinator"&&l.remove()}),[r,e]}var st,Sh,Fo,zo=R(()=>{u();st=pe(it()),Sh=pe(Rn());fr();Wn();es();zt();Fo=":merge"});function rs(r,e){let t=(0,Uo.default)().astSync(r);return t.each(i=>{i.nodes.some(a=>a.type==="combinator")&&(i.nodes=[Uo.default.pseudo({value:":is",nodes:[i.clone()]})]),pr(i)}),`${e} ${t.toString()}`}var Uo,Vo=R(()=>{u();Uo=pe(it());es()});function Ho(r){return b_.transformSync(r)}function*w_(r){let e=1/0;for(;e>=0;){let t,i=!1;if(e===1/0&&r.endsWith("]")){let s=r.indexOf("[");r[s-1]==="-"?t=s-1:r[s-1]==="/"?(t=s-1,i=!0):t=-1}else e===1/0&&r.includes("/")?(t=r.lastIndexOf("/"),i=!0):t=r.lastIndexOf("-",e);if(t<0)break;let n=r.slice(0,t),a=r.slice(i?t:t+1);e=t-1,!(n===""||a==="/")&&(yield[n,a])}}function v_(r,e){if(r.length===0||e.tailwindConfig.prefix==="")return r;for(let t of r){let[i]=t;if(i.options.respectPrefix){let n=ee.root({nodes:[t[1].clone()]}),a=t[1].raws.tailwind.classCandidate;n.walkRules(s=>{let o=a.startsWith("-");s.selector=ur(e.tailwindConfig.prefix,s.selector,o)}),t[1]=n.nodes[0]}}return r}function x_(r,e){if(r.length===0)return r;let t=[];function i(n){return n.parent&&n.parent.type==="atrule"&&n.parent.name==="keyframes"}for(let[n,a]of r){let s=ee.root({nodes:[a.clone()]});s.walkRules(o=>{if(i(o))return;let l=(0,is.default)().astSync(o.selector);l.each(c=>jo(c,e)),Wf(l,c=>c===e?`!${c}`:c),o.selector=l.toString(),o.walkDecls(c=>c.important=!0)}),t.push([{...n,important:!0},s.nodes[0]])}return t}function k_(r,e,t){if(e.length===0)return e;let i={modifier:null,value:Ei};{let[n,...a]=ve(r,"/");if(a.length>1&&(n=n+"/"+a.slice(0,-1).join("/"),a=a.slice(-1)),a.length&&!t.variantMap.has(r)&&(r=n,i.modifier=a[0],!we(t.tailwindConfig,"generalizedModifiers")))return[]}if(r.endsWith("]")&&!r.startsWith("[")){let n=/(.)(-?)\[(.*)\]/g.exec(r);if(n){let[,a,s,o]=n;if(a==="@"&&s==="-")return[];if(a!=="@"&&s==="")return[];r=r.replace(`${s}[${o}]`,""),i.value=o}}if(Qo(r)&&!t.variantMap.has(r)){let n=t.offsets.recordVariant(r),a=K(r.slice(1,-1)),s=ve(a,",");if(s.length>1)return[];if(!s.every(os))return[];let o=s.map((l,c)=>[t.offsets.applyParallelOffset(n,c),Oi(l.trim())]);t.variantMap.set(r,o)}if(t.variantMap.has(r)){let n=Qo(r),a=t.variantOptions.get(r)?.[Pt]??{},s=t.variantMap.get(r).slice(),o=[],l=(()=>!(n||a.respectPrefix===!1))();for(let[c,f]of e){if(c.layer==="user")continue;let d=ee.root({nodes:[f.clone()]});for(let[p,h,b]of s){let w=function(){v.raws.neededBackup||(v.raws.neededBackup=!0,v.walkRules(O=>O.raws.originalSelector=O.selector))},k=function(O){return w(),v.each(B=>{B.type==="rule"&&(B.selectors=B.selectors.map(N=>O({get className(){return Ho(N)},selector:N})))}),v},v=(b??d).clone(),y=[],S=h({get container(){return w(),v},separator:t.tailwindConfig.separator,modifySelectors:k,wrap(O){let B=v.nodes;v.removeAll(),O.append(B),v.append(O)},format(O){y.push({format:O,respectPrefix:l})},args:i});if(Array.isArray(S)){for(let[O,B]of S.entries())s.push([t.offsets.applyParallelOffset(p,O),B,v.clone()]);continue}if(typeof S=="string"&&y.push({format:S,respectPrefix:l}),S===null)continue;v.raws.neededBackup&&(delete v.raws.neededBackup,v.walkRules(O=>{let B=O.raws.originalSelector;if(!B||(delete O.raws.originalSelector,B===O.selector))return;let N=O.selector,T=(0,is.default)(F=>{F.walkClasses(Y=>{Y.value=`${r}${t.tailwindConfig.separator}${Y.value}`})}).processSync(B);y.push({format:N.replace(T,"&"),respectPrefix:l}),O.selector=B})),v.nodes[0].raws.tailwind={...v.nodes[0].raws.tailwind,parentLayer:c.layer};let E=[{...c,sort:t.offsets.applyVariantOffset(c.sort,p,Object.assign(i,t.variantOptions.get(r))),collectedFormats:(c.collectedFormats??[]).concat(y)},v.nodes[0]];o.push(E)}}return o}return[]}function Wo(r,e,t={}){return!ke(r)&&!Array.isArray(r)?[[r],t]:Array.isArray(r)?Wo(r[0],e,r[1]):(e.has(r)||e.set(r,lr(r)),[e.get(r),t])}function A_(r){return S_.test(r)}function C_(r){if(!r.includes("://"))return!1;try{let e=new URL(r);return e.scheme!==""&&e.host!==""}catch(e){return!1}}function Ch(r){let e=!0;return r.walkDecls(t=>{if(!_h(t.prop,t.value))return e=!1,!1}),e}function _h(r,e){if(C_(`${r}:${e}`))return!1;try{return ee.parse(`a{${r}:${e}}`).toResult(),!0}catch(t){return!1}}function __(r,e){let[,t,i]=r.match(/^\[([a-zA-Z0-9-_]+):(\S+)\]$/)??[];if(i===void 0||!A_(t)||!cr(i))return null;let n=K(i,{property:t});return _h(t,n)?[[{sort:e.offsets.arbitraryProperty(r),layer:"utilities",options:{respectImportant:!0}},()=>({[Do(r)]:{[t]:n}})]]:null}function*E_(r,e){e.candidateRuleMap.has(r)&&(yield[e.candidateRuleMap.get(r),"DEFAULT"]),yield*function*(o){o!==null&&(yield[o,"DEFAULT"])}(__(r,e));let t=r,i=!1,n=e.tailwindConfig.prefix,a=n.length,s=t.startsWith(n)||t.startsWith(`-${n}`);t[a]==="-"&&s&&(i=!0,t=n+t.slice(a+1)),i&&e.candidateRuleMap.has(t)&&(yield[e.candidateRuleMap.get(t),"-DEFAULT"]);for(let[o,l]of w_(t))e.candidateRuleMap.has(o)&&(yield[e.candidateRuleMap.get(o),i?`-${l}`:l])}function O_(r,e){return r===gt?[gt]:ve(r,e)}function*T_(r,e){for(let t of r)t[1].raws.tailwind={...t[1].raws.tailwind,classCandidate:e,preserveSource:t[0].options?.preserveSource??!1},yield t}function*Go(r,e){let t=e.tailwindConfig.separator,[i,...n]=O_(r,t).reverse(),a=!1;i.startsWith("!")&&(a=!0,i=i.slice(1));for(let s of E_(i,e)){let o=[],l=new Map,[c,f]=s,d=c.length===1;for(let[p,h]of c){let b=[];if(typeof h=="function")for(let v of[].concat(h(f,{isOnlyPlugin:d}))){let[y,w]=Wo(v,e.postCssNodeCache);for(let k of y)b.push([{...p,options:{...p.options,...w}},k])}else if(f==="DEFAULT"||f==="-DEFAULT"){let v=h,[y,w]=Wo(v,e.postCssNodeCache);for(let k of y)b.push([{...p,options:{...p.options,...w}},k])}if(b.length>0){let v=Array.from(Zs(p.options?.types??[],f,p.options??{},e.tailwindConfig)).map(([y,w])=>w);v.length>0&&l.set(b,v),o.push(b)}}if(Qo(f)){if(o.length>1){let b=function(y){return y.length===1?y[0]:y.find(w=>{let k=l.get(w);return w.some(([{options:S},E])=>Ch(E)?S.types.some(({type:O,preferOnConflict:B})=>k.includes(O)&&B):!1)})},[p,h]=o.reduce((y,w)=>(w.some(([{options:S}])=>S.types.some(({type:E})=>E==="any"))?y[0].push(w):y[1].push(w),y),[[],[]]),v=b(h)??b(p);if(v)o=[v];else{let y=o.map(k=>new Set([...l.get(k)??[]]));for(let k of y)for(let S of k){let E=!1;for(let O of y)k!==O&&O.has(S)&&(O.delete(S),E=!0);E&&k.delete(S)}let w=[];for(let[k,S]of y.entries())for(let E of S){let O=o[k].map(([,B])=>B).flat().map(B=>B.toString().split(`
+`).slice(1,-1).map(N=>N.trim()).map(N=>`      ${N}`).join(`
+`)).join(`
+
+`);w.push(`  Use \`${r.replace("[",`[${E}:`)}\` for \`${O.trim()}\``);break}G.warn([`The class \`${r}\` is ambiguous and matches multiple utilities.`,...w,`If this is content and not a class, replace it with \`${r.replace("[","&lsqb;").replace("]","&rsqb;")}\` to silence this warning.`]);continue}}o=o.map(p=>p.filter(h=>Ch(h[1])))}o=o.flat(),o=Array.from(T_(o,i)),o=v_(o,e),a&&(o=x_(o,i));for(let p of n)o=k_(p,o,e);for(let p of o)p[1].raws.tailwind={...p[1].raws.tailwind,candidate:r},p=R_(p,{context:e,candidate:r}),p!==null&&(yield p)}}function R_(r,{context:e,candidate:t}){if(!r[0].collectedFormats)return r;let i=!0,n;try{n=dr(r[0].collectedFormats,{context:e,candidate:t})}catch{return null}let a=ee.root({nodes:[r[1].clone()]});return a.walkRules(s=>{if(!ns(s))try{let o=ts(s.selector,n,{candidate:t,context:e});if(o===null){s.remove();return}s.selector=o}catch{return i=!1,!1}}),!i||a.nodes.length===0?null:(r[1]=a.nodes[0],r)}function ns(r){return r.parent&&r.parent.type==="atrule"&&r.parent.name==="keyframes"}function P_(r){if(r===!0)return e=>{ns(e)||e.walkDecls(t=>{t.parent.type==="rule"&&!ns(t.parent)&&(t.important=!0)})};if(typeof r=="string")return e=>{ns(e)||(e.selectors=e.selectors.map(t=>rs(t,r)))}}function ss(r,e,t=!1){let i=[],n=P_(e.tailwindConfig.important);for(let a of r){if(e.notClassCache.has(a))continue;if(e.candidateRuleCache.has(a)){i=i.concat(Array.from(e.candidateRuleCache.get(a)));continue}let s=Array.from(Go(a,e));if(s.length===0){e.notClassCache.add(a);continue}e.classCache.set(a,s);let o=e.candidateRuleCache.get(a)??new Set;e.candidateRuleCache.set(a,o);for(let l of s){let[{sort:c,options:f},d]=l;if(f.respectImportant&&n){let h=ee.root({nodes:[d.clone()]});h.walkRules(n),d=h.nodes[0]}let p=[c,t?d.clone():d];o.add(p),e.ruleCache.add(p),i.push(p)}}return i}function Qo(r){return r.startsWith("[")&&r.endsWith("]")}var is,b_,S_,as=R(()=>{u();Ot();is=pe(it());Io();Kt();Wn();Fr();Be();It();zo();qo();Br();_i();Mo();zt();ct();Vo();b_=(0,is.default)(r=>r.first.filter(({type:e})=>e==="class").pop().value);S_=/^[a-z_-]/});var Eh,Oh=R(()=>{u();Eh={}});function I_(r){try{return Eh.createHash("md5").update(r,"utf-8").digest("binary")}catch(e){return""}}function Th(r,e){let t=e.toString();if(!t.includes("@tailwind"))return!1;let i=Lo.get(r),n=I_(t),a=i!==n;return Lo.set(r,n),a}var Rh=R(()=>{u();Oh();It()});function ls(r){return(r>0n)-(r<0n)}var Ph=R(()=>{u()});function Ih(r,e){let t=0n,i=0n;for(let[n,a]of e)r&n&&(t=t|n,i=i|a);return r&~t|i}var Dh=R(()=>{u()});function qh(r){let e=null;for(let t of r)e=e??t,e=e>t?e:t;return e}function D_(r,e){let t=r.length,i=e.length,n=t<i?t:i;for(let a=0;a<n;a++){let s=r.charCodeAt(a)-e.charCodeAt(a);if(s!==0)return s}return t-i}var Yo,$h=R(()=>{u();Ph();Dh();Yo=class{constructor(){this.offsets={defaults:0n,base:0n,components:0n,utilities:0n,variants:0n,user:0n},this.layerPositions={defaults:0n,base:1n,components:2n,utilities:3n,user:4n,variants:5n},this.reservedVariantBits=0n,this.variantOffsets=new Map}create(e){return{layer:e,parentLayer:e,arbitrary:0n,variants:0n,parallelIndex:0n,index:this.offsets[e]++,propertyOffset:0n,property:"",options:[]}}arbitraryProperty(e){return{...this.create("utilities"),arbitrary:1n,property:e}}forVariant(e,t=0){let i=this.variantOffsets.get(e);if(i===void 0)throw new Error(`Cannot find offset for unknown variant ${e}`);return{...this.create("variants"),variants:i<<BigInt(t)}}applyVariantOffset(e,t,i){return i.variant=t.variants,{...e,layer:"variants",parentLayer:e.layer==="variants"?e.parentLayer:e.layer,variants:e.variants|t.variants,options:i.sort?[].concat(i,e.options):e.options,parallelIndex:qh([e.parallelIndex,t.parallelIndex])}}applyParallelOffset(e,t){return{...e,parallelIndex:BigInt(t)}}recordVariants(e,t){for(let i of e)this.recordVariant(i,t(i))}recordVariant(e,t=1){return this.variantOffsets.set(e,1n<<this.reservedVariantBits),this.reservedVariantBits+=BigInt(t),{...this.create("variants"),variants:this.variantOffsets.get(e)}}compare(e,t){if(e.layer!==t.layer)return this.layerPositions[e.layer]-this.layerPositions[t.layer];if(e.parentLayer!==t.parentLayer)return this.layerPositions[e.parentLayer]-this.layerPositions[t.parentLayer];for(let i of e.options)for(let n of t.options){if(i.id!==n.id||!i.sort||!n.sort)continue;let a=qh([i.variant,n.variant])??0n,s=~(a|a-1n),o=e.variants&s,l=t.variants&s;if(o!==l)continue;let c=i.sort({value:i.value,modifier:i.modifier},{value:n.value,modifier:n.modifier});if(c!==0)return c}return e.variants!==t.variants?e.variants-t.variants:e.parallelIndex!==t.parallelIndex?e.parallelIndex-t.parallelIndex:e.arbitrary!==t.arbitrary?e.arbitrary-t.arbitrary:e.propertyOffset!==t.propertyOffset?e.propertyOffset-t.propertyOffset:e.index-t.index}recalculateVariantOffsets(){let e=Array.from(this.variantOffsets.entries()).filter(([n])=>n.startsWith("[")).sort(([n],[a])=>D_(n,a)),t=e.map(([,n])=>n).sort((n,a)=>ls(n-a));return e.map(([,n],a)=>[n,t[a]]).filter(([n,a])=>n!==a)}remapArbitraryVariantOffsets(e){let t=this.recalculateVariantOffsets();return t.length===0?e:e.map(i=>{let[n,a]=i;return n={...n,variants:Ih(n.variants,t)},[n,a]})}sortArbitraryProperties(e){let t=new Set;for(let[s]of e)s.arbitrary===1n&&t.add(s.property);if(t.size===0)return e;let i=Array.from(t).sort(),n=new Map,a=1n;for(let s of i)n.set(s,a++);return e.map(s=>{let[o,l]=s;return o={...o,propertyOffset:n.get(o.property)??0n},[o,l]})}sort(e){return e=this.remapArbitraryVariantOffsets(e),e=this.sortArbitraryProperties(e),e.sort(([t],[i])=>ls(this.compare(t,i)))}}});function Zo(r,e){let t=r.tailwindConfig.prefix;return typeof t=="function"?t(e):t+e}function Mh({type:r="any",...e}){let t=[].concat(r);return{...e,types:t.map(i=>Array.isArray(i)?{type:i[0],...i[1]}:{type:i,preferOnConflict:!1})}}function q_(r){let e=[],t="",i=0;for(let n=0;n<r.length;n++){let a=r[n];if(a==="\\")t+="\\"+r[++n];else if(a==="{")++i,e.push(t.trim()),t="";else if(a==="}"){if(--i<0)throw new Error("Your { and } are unbalanced.");e.push(t.trim()),t=""}else t+=a}return t.length>0&&e.push(t.trim()),e=e.filter(n=>n!==""),e}function $_(r,e,{before:t=[]}={}){if(t=[].concat(t),t.length<=0){r.push(e);return}let i=r.length-1;for(let n of t){let a=r.indexOf(n);a!==-1&&(i=Math.min(i,a))}r.splice(i,0,e)}function Nh(r){return Array.isArray(r)?r.flatMap(e=>!Array.isArray(e)&&!ke(e)?e:lr(e)):Nh([r])}function L_(r,e){return(0,Ko.default)(i=>{let n=[];return e&&e(i),i.walkClasses(a=>{n.push(a.value)}),n}).transformSync(r)}function M_(r){r.walkPseudos(e=>{e.value===":not"&&e.remove()})}function N_(r,e={containsNonOnDemandable:!1},t=0){let i=[],n=[];r.type==="rule"?n.push(...r.selectors):r.type==="atrule"&&r.walkRules(a=>n.push(...a.selectors));for(let a of n){let s=L_(a,M_);s.length===0&&(e.containsNonOnDemandable=!0);for(let o of s)i.push(o)}return t===0?[e.containsNonOnDemandable||i.length===0,i]:i}function us(r){return Nh(r).flatMap(e=>{let t=new Map,[i,n]=N_(e);return i&&n.unshift(gt),n.map(a=>(t.has(e)||t.set(e,e),[a,t.get(e)]))})}function os(r){return r.startsWith("@")||r.includes("&")}function Oi(r){r=r.replace(/\n+/g,"").replace(/\s{1,}/g," ").trim();let e=q_(r).map(t=>{if(!t.startsWith("@"))return({format:a})=>a(t);let[,i,n]=/@(\S*)( .+|[({].*)?/g.exec(t);return({wrap:a})=>a(ee.atRule({name:i,params:n?.trim()??""}))}).reverse();return t=>{for(let i of e)i(t)}}function B_(r,e,{variantList:t,variantMap:i,offsets:n,classList:a}){function s(p,h){return p?(0,Lh.default)(r,p,h):r}function o(p){return ur(r.prefix,p)}function l(p,h){return p===gt?gt:h.respectPrefix?e.tailwindConfig.prefix+p:p}function c(p,h,b={}){let v=kt(p),y=s(["theme",...v],h);return mt(v[0])(y,b)}let f=0,d={postcss:ee,prefix:o,e:Te,config:s,theme:c,corePlugins:p=>Array.isArray(r.corePlugins)?r.corePlugins.includes(p):s(["corePlugins",p],!0),variants:()=>[],addBase(p){for(let[h,b]of us(p)){let v=l(h,{}),y=n.create("base");e.candidateRuleMap.has(v)||e.candidateRuleMap.set(v,[]),e.candidateRuleMap.get(v).push([{sort:y,layer:"base"},b])}},addDefaults(p,h){let b={[`@defaults ${p}`]:h};for(let[v,y]of us(b)){let w=l(v,{});e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("defaults"),layer:"defaults"},y])}},addComponents(p,h){h=Object.assign({},{preserveSource:!1,respectPrefix:!0,respectImportant:!1},Array.isArray(h)?{}:h);for(let[v,y]of us(p)){let w=l(v,h);a.add(w),e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("components"),layer:"components",options:h},y])}},addUtilities(p,h){h=Object.assign({},{preserveSource:!1,respectPrefix:!0,respectImportant:!0},Array.isArray(h)?{}:h);for(let[v,y]of us(p)){let w=l(v,h);a.add(w),e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("utilities"),layer:"utilities",options:h},y])}},matchUtilities:function(p,h){h=Mh({...{respectPrefix:!0,respectImportant:!0,modifiers:!1},...h});let v=n.create("utilities");for(let y in p){let S=function(O,{isOnlyPlugin:B}){let[N,T,F]=Js(h.types,O,h,r);if(N===void 0)return[];if(!h.types.some(({type:U})=>U===T))if(B)G.warn([`Unnecessary typehint \`${T}\` in \`${y}-${O}\`.`,`You can safely update it to \`${y}-${O.replace(T+":","")}\`.`]);else return[];if(!cr(N))return[];let Y={get modifier(){return h.modifiers||G.warn(`modifier-used-without-options-for-${y}`,["Your plugin must set `modifiers: true` in its options to support modifiers."]),F}},_=we(r,"generalizedModifiers");return[].concat(_?k(N,Y):k(N)).filter(Boolean).map(U=>({[Gn(y,O)]:U}))},w=l(y,h),k=p[y];a.add([w,h]);let E=[{sort:v,layer:"utilities",options:h},S];e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push(E)}},matchComponents:function(p,h){h=Mh({...{respectPrefix:!0,respectImportant:!1,modifiers:!1},...h});let v=n.create("components");for(let y in p){let S=function(O,{isOnlyPlugin:B}){let[N,T,F]=Js(h.types,O,h,r);if(N===void 0)return[];if(!h.types.some(({type:U})=>U===T))if(B)G.warn([`Unnecessary typehint \`${T}\` in \`${y}-${O}\`.`,`You can safely update it to \`${y}-${O.replace(T+":","")}\`.`]);else return[];if(!cr(N))return[];let Y={get modifier(){return h.modifiers||G.warn(`modifier-used-without-options-for-${y}`,["Your plugin must set `modifiers: true` in its options to support modifiers."]),F}},_=we(r,"generalizedModifiers");return[].concat(_?k(N,Y):k(N)).filter(Boolean).map(U=>({[Gn(y,O)]:U}))},w=l(y,h),k=p[y];a.add([w,h]);let E=[{sort:v,layer:"components",options:h},S];e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push(E)}},addVariant(p,h,b={}){h=[].concat(h).map(v=>{if(typeof v!="string")return(y={})=>{let{args:w,modifySelectors:k,container:S,separator:E,wrap:O,format:B}=y,N=v(Object.assign({modifySelectors:k,container:S,separator:E},b.type===Xo.MatchVariant&&{args:w,wrap:O,format:B}));if(typeof N=="string"&&!os(N))throw new Error(`Your custom variant \`${p}\` has an invalid format string. Make sure it's an at-rule or contains a \`&\` placeholder.`);return Array.isArray(N)?N.filter(T=>typeof T=="string").map(T=>Oi(T)):N&&typeof N=="string"&&Oi(N)(y)};if(!os(v))throw new Error(`Your custom variant \`${p}\` has an invalid format string. Make sure it's an at-rule or contains a \`&\` placeholder.`);return Oi(v)}),$_(t,p,b),i.set(p,h),e.variantOptions.set(p,b)},matchVariant(p,h,b){let v=b?.id??++f,y=p==="@",w=we(r,"generalizedModifiers");for(let[S,E]of Object.entries(b?.values??{}))S!=="DEFAULT"&&d.addVariant(y?`${p}${S}`:`${p}-${S}`,({args:O,container:B})=>h(E,w?{modifier:O?.modifier,container:B}:{container:B}),{...b,value:E,id:v,type:Xo.MatchVariant,variantInfo:Jo.Base});let k="DEFAULT"in(b?.values??{});d.addVariant(p,({args:S,container:E})=>S?.value===Ei&&!k?null:h(S?.value===Ei?b.values.DEFAULT:S?.value??(typeof S=="string"?S:""),w?{modifier:S?.modifier,container:E}:{container:E}),{...b,id:v,type:Xo.MatchVariant,variantInfo:Jo.Dynamic})}};return d}function fs(r){return el.has(r)||el.set(r,new Map),el.get(r)}function Bh(r,e){let t=!1,i=new Map;for(let n of r){if(!n)continue;let a=sa.parse(n),s=a.hash?a.href.replace(a.hash,""):a.href;s=a.search?s.replace(a.search,""):s;let o=be.statSync(decodeURIComponent(s),{throwIfNoEntry:!1})?.mtimeMs;!o||((!e.has(n)||o>e.get(n))&&(t=!0),i.set(n,o))}return[t,i]}function Fh(r){r.walkAtRules(e=>{["responsive","variants"].includes(e.name)&&(Fh(e),e.before(e.nodes),e.remove())})}function F_(r){let e=[];return r.each(t=>{t.type==="atrule"&&["responsive","variants"].includes(t.name)&&(t.name="layer",t.params="utilities")}),r.walkAtRules("layer",t=>{if(Fh(t),t.params==="base"){for(let i of t.nodes)e.push(function({addBase:n}){n(i,{respectPrefix:!1})});t.remove()}else if(t.params==="components"){for(let i of t.nodes)e.push(function({addComponents:n}){n(i,{respectPrefix:!1,preserveSource:!0})});t.remove()}else if(t.params==="utilities"){for(let i of t.nodes)e.push(function({addUtilities:n}){n(i,{respectPrefix:!1,preserveSource:!0})});t.remove()}}),e}function j_(r,e){let t=Object.entries({...se,...mh}).map(([l,c])=>r.tailwindConfig.corePlugins.includes(l)?c:null).filter(Boolean),i=r.tailwindConfig.plugins.map(l=>(l.__isOptionsFunction&&(l=l()),typeof l=="function"?l:l.handler)),n=F_(e),a=[se.childVariant,se.pseudoElementVariants,se.pseudoClassVariants,se.hasVariants,se.ariaVariants,se.dataVariants],s=[se.supportsVariants,se.reducedMotionVariants,se.prefersContrastVariants,se.screenVariants,se.orientationVariants,se.directionVariants,se.darkVariants,se.forcedColorsVariants,se.printVariant];return(r.tailwindConfig.darkMode==="class"||Array.isArray(r.tailwindConfig.darkMode)&&r.tailwindConfig.darkMode[0]==="class")&&(s=[se.supportsVariants,se.reducedMotionVariants,se.prefersContrastVariants,se.darkVariants,se.screenVariants,se.orientationVariants,se.directionVariants,se.forcedColorsVariants,se.printVariant]),[...t,...a,...i,...s,...n]}function z_(r,e){let t=[],i=new Map;e.variantMap=i;let n=new Yo;e.offsets=n;let a=new Set,s=B_(e.tailwindConfig,e,{variantList:t,variantMap:i,offsets:n,classList:a});for(let f of r)if(Array.isArray(f))for(let d of f)d(s);else f?.(s);n.recordVariants(t,f=>i.get(f).length);for(let[f,d]of i.entries())e.variantMap.set(f,d.map((p,h)=>[n.forVariant(f,h),p]));let o=(e.tailwindConfig.safelist??[]).filter(Boolean);if(o.length>0){let f=[];for(let d of o){if(typeof d=="string"){e.changedContent.push({content:d,extension:"html"});continue}if(d instanceof RegExp){G.warn("root-regex",["Regular expressions in `safelist` work differently in Tailwind CSS v3.0.","Update your `safelist` configuration to eliminate this warning.","https://tailwindcss.com/docs/content-configuration#safelisting-classes"]);continue}f.push(d)}if(f.length>0){let d=new Map,p=e.tailwindConfig.prefix.length,h=f.some(b=>b.pattern.source.includes("!"));for(let b of a){let v=Array.isArray(b)?(()=>{let[y,w]=b,S=Object.keys(w?.values??{}).map(E=>Ci(y,E));return w?.supportsNegativeValues&&(S=[...S,...S.map(E=>"-"+E)],S=[...S,...S.map(E=>E.slice(0,p)+"-"+E.slice(p))]),w.types.some(({type:E})=>E==="color")&&(S=[...S,...S.flatMap(E=>Object.keys(e.tailwindConfig.theme.opacity).map(O=>`${E}/${O}`))]),h&&w?.respectImportant&&(S=[...S,...S.map(E=>"!"+E)]),S})():[b];for(let y of v)for(let{pattern:w,variants:k=[]}of f)if(w.lastIndex=0,d.has(w)||d.set(w,0),!!w.test(y)){d.set(w,d.get(w)+1),e.changedContent.push({content:y,extension:"html"});for(let S of k)e.changedContent.push({content:S+e.tailwindConfig.separator+y,extension:"html"})}}for(let[b,v]of d.entries())v===0&&G.warn([`The safelist pattern \`${b}\` doesn't match any Tailwind CSS classes.`,"Fix this pattern or remove it from your `safelist` configuration.","https://tailwindcss.com/docs/content-configuration#safelisting-classes"])}}let l=[].concat(e.tailwindConfig.darkMode??"media")[1]??"dark",c=[Zo(e,l),Zo(e,"group"),Zo(e,"peer")];e.getClassOrder=function(d){let p=[...d].sort((y,w)=>y===w?0:y<w?-1:1),h=new Map(p.map(y=>[y,null])),b=ss(new Set(p),e,!0);b=e.offsets.sort(b);let v=BigInt(c.length);for(let[,y]of b){let w=y.raws.tailwind.candidate;h.set(w,h.get(w)??v++)}return d.map(y=>{let w=h.get(y)??null,k=c.indexOf(y);return w===null&&k!==-1&&(w=BigInt(k)),[y,w]})},e.getClassList=function(d={}){let p=[];for(let h of a)if(Array.isArray(h)){let[b,v]=h,y=[],w=Object.keys(v?.modifiers??{});v?.types?.some(({type:E})=>E==="color")&&w.push(...Object.keys(e.tailwindConfig.theme.opacity??{}));let k={modifiers:w},S=d.includeMetadata&&w.length>0;for(let[E,O]of Object.entries(v?.values??{})){if(O==null)continue;let B=Ci(b,E);if(p.push(S?[B,k]:B),v?.supportsNegativeValues&&xt(O)){let N=Ci(b,`-${E}`);y.push(S?[N,k]:N)}}p.push(...y)}else p.push(h);return p},e.getVariants=function(){let d=Math.random().toString(36).substring(7).toUpperCase(),p=[];for(let[h,b]of e.variantOptions.entries())b.variantInfo!==Jo.Base&&p.push({name:h,isArbitrary:b.type===Symbol.for("MATCH_VARIANT"),values:Object.keys(b.values??{}),hasDash:h!=="@",selectors({modifier:v,value:y}={}){let w=`TAILWINDPLACEHOLDER${d}`,k=ee.rule({selector:`.${w}`}),S=ee.root({nodes:[k.clone()]}),E=S.toString(),O=(e.variantMap.get(h)??[]).flatMap(([oe,A])=>A),B=[];for(let oe of O){let A=[],C={args:{modifier:v,value:b.values?.[y]??y},separator:e.tailwindConfig.separator,modifySelectors(V){return S.each(Ee=>{Ee.type==="rule"&&(Ee.selectors=Ee.selectors.map(Ie=>V({get className(){return Ho(Ie)},selector:Ie})))}),S},format(V){A.push(V)},wrap(V){A.push(`@${V.name} ${V.params} { & }`)},container:S},he=oe(C);if(A.length>0&&B.push(A),Array.isArray(he))for(let V of he)A=[],V(C),B.push(A)}let N=[],T=S.toString();E!==T&&(S.walkRules(oe=>{let A=oe.selector,C=(0,Ko.default)(he=>{he.walkClasses(V=>{V.value=`${h}${e.tailwindConfig.separator}${V.value}`})}).processSync(A);N.push(A.replace(C,"&").replace(w,"&"))}),S.walkAtRules(oe=>{N.push(`@${oe.name} (${oe.params}) { & }`)}));let F=!(y in(b.values??{})),Y=b[Pt]??{},_=(()=>!(F||Y.respectPrefix===!1))();B=B.map(oe=>oe.map(A=>({format:A,respectPrefix:_}))),N=N.map(oe=>({format:oe,respectPrefix:_}));let Q={candidate:w,context:e},U=B.map(oe=>ts(`.${w}`,dr(oe,Q),Q).replace(`.${w}`,"&").replace("{ & }","").trim());return N.length>0&&U.push(dr(N,Q).toString().replace(`.${w}`,"&")),U}});return p}}function jh(r,e){!r.classCache.has(e)||(r.notClassCache.add(e),r.classCache.delete(e),r.applyClassCache.delete(e),r.candidateRuleMap.delete(e),r.candidateRuleCache.delete(e),r.stylesheetCache=null)}function U_(r,e){let t=e.raws.tailwind.candidate;if(!!t){for(let i of r.ruleCache)i[1].raws.tailwind.candidate===t&&r.ruleCache.delete(i);jh(r,t)}}function tl(r,e=[],t=ee.root()){let i={disposables:[],ruleCache:new Set,candidateRuleCache:new Map,classCache:new Map,applyClassCache:new Map,notClassCache:new Set(r.blocklist??[]),postCssNodeCache:new Map,candidateRuleMap:new Map,tailwindConfig:r,changedContent:e,variantMap:new Map,stylesheetCache:null,variantOptions:new Map,markInvalidUtilityCandidate:a=>jh(i,a),markInvalidUtilityNode:a=>U_(i,a)},n=j_(i,t);return z_(n,i),i}function zh(r,e,t,i,n,a){let s=e.opts.from,o=i!==null;Je.DEBUG&&console.log("Source path:",s);let l;if(o&&hr.has(s))l=hr.get(s);else if(Ti.has(n)){let p=Ti.get(n);Dt.get(p).add(s),hr.set(s,p),l=p}let c=Th(s,r);if(l){let[p,h]=Bh([...a],fs(l));if(!p&&!c)return[l,!1,h]}if(hr.has(s)){let p=hr.get(s);if(Dt.has(p)&&(Dt.get(p).delete(s),Dt.get(p).size===0)){Dt.delete(p);for(let[h,b]of Ti)b===p&&Ti.delete(h);for(let h of p.disposables.splice(0))h(p)}}Je.DEBUG&&console.log("Setting up new context...");let f=tl(t,[],r);Object.assign(f,{userConfigPath:i});let[,d]=Bh([...a],fs(f));return Ti.set(n,f),hr.set(s,f),Dt.has(f)||Dt.set(f,new Set),Dt.get(f).add(s),[f,!0,d]}var Lh,Ko,Pt,Xo,Jo,el,hr,Ti,Dt,_i=R(()=>{u();ft();aa();Ot();Lh=pe(Oa()),Ko=pe(it());Si();Io();Wn();Kt();fr();qo();Fr();gh();It();It();Gi();Be();Hi();Mo();as();Rh();$h();ct();zo();Pt=Symbol(),Xo={AddVariant:Symbol.for("ADD_VARIANT"),MatchVariant:Symbol.for("MATCH_VARIANT")},Jo={Base:1<<0,Dynamic:1<<1};el=new WeakMap;hr=yh,Ti=bh,Dt=Zn});function rl(r){return r.ignore?[]:r.glob?m.env.ROLLUP_WATCH==="true"?[{type:"dependency",file:r.base}]:[{type:"dir-dependency",dir:r.base,glob:r.glob}]:[{type:"dependency",file:r.base}]}var Uh=R(()=>{u()});function Vh(r,e){return{handler:r,config:e}}var Hh,Wh=R(()=>{u();Vh.withOptions=function(r,e=()=>({})){let t=function(i){return{__options:i,handler:r(i),config:e(i)}};return t.__isOptionsFunction=!0,t.__pluginFunction=r,t.__configFunction=e,t};Hh=Vh});var il={};Ge(il,{default:()=>V_});var V_,nl=R(()=>{u();Wh();V_=Hh});var Qh=x((F4,Gh)=>{u();var H_=(nl(),il).default,W_={overflow:"hidden",display:"-webkit-box","-webkit-box-orient":"vertical"},G_=H_(function({matchUtilities:r,addUtilities:e,theme:t,variants:i}){let n=t("lineClamp");r({"line-clamp":a=>({...W_,"-webkit-line-clamp":`${a}`})},{values:n}),e([{".line-clamp-none":{"-webkit-line-clamp":"unset"}}],i("lineClamp"))},{theme:{lineClamp:{1:"1",2:"2",3:"3",4:"4",5:"5",6:"6"}},variants:{lineClamp:["responsive"]}});Gh.exports=G_});function sl(r){r.content.files.length===0&&G.warn("content-problems",["The `content` option in your Tailwind CSS configuration is missing or empty.","Configure your content sources or your generated CSS will be missing styles.","https://tailwindcss.com/docs/content-configuration"]);try{let e=Qh();r.plugins.includes(e)&&(G.warn("line-clamp-in-core",["As of Tailwind CSS v3.3, the `@tailwindcss/line-clamp` plugin is now included by default.","Remove it from the `plugins` array in your configuration to eliminate this warning."]),r.plugins=r.plugins.filter(t=>t!==e))}catch{}return r}var Yh=R(()=>{u();Be()});var Kh,Xh=R(()=>{u();Kh=()=>!1});var cs,Jh=R(()=>{u();cs={sync:r=>[].concat(r),generateTasks:r=>[{dynamic:!1,base:".",negative:[],positive:[].concat(r),patterns:[].concat(r)}],escapePath:r=>r}});var al,Zh=R(()=>{u();al=r=>r});var em,tm=R(()=>{u();em=()=>""});function rm(r){let e=r,t=em(r);return t!=="."&&(e=r.substr(t.length),e.charAt(0)==="/"&&(e=e.substr(1))),e.substr(0,2)==="./"?e=e.substr(2):e.charAt(0)==="/"&&(e=e.substr(1)),{base:t,glob:e}}var im=R(()=>{u();tm()});var ps=x(Ve=>{u();"use strict";Ve.isInteger=r=>typeof r=="number"?Number.isInteger(r):typeof r=="string"&&r.trim()!==""?Number.isInteger(Number(r)):!1;Ve.find=(r,e)=>r.nodes.find(t=>t.type===e);Ve.exceedsLimit=(r,e,t=1,i)=>i===!1||!Ve.isInteger(r)||!Ve.isInteger(e)?!1:(Number(e)-Number(r))/Number(t)>=i;Ve.escapeNode=(r,e=0,t)=>{let i=r.nodes[e];!i||(t&&i.type===t||i.type==="open"||i.type==="close")&&i.escaped!==!0&&(i.value="\\"+i.value,i.escaped=!0)};Ve.encloseBrace=r=>r.type!=="brace"?!1:r.commas>>0+r.ranges>>0==0?(r.invalid=!0,!0):!1;Ve.isInvalidBrace=r=>r.type!=="brace"?!1:r.invalid===!0||r.dollar?!0:r.commas>>0+r.ranges>>0==0||r.open!==!0||r.close!==!0?(r.invalid=!0,!0):!1;Ve.isOpenOrClose=r=>r.type==="open"||r.type==="close"?!0:r.open===!0||r.close===!0;Ve.reduce=r=>r.reduce((e,t)=>(t.type==="text"&&e.push(t.value),t.type==="range"&&(t.type="text"),e),[]);Ve.flatten=(...r)=>{let e=[],t=i=>{for(let n=0;n<i.length;n++){let a=i[n];Array.isArray(a)?t(a,e):a!==void 0&&e.push(a)}return e};return t(r),e}});var ds=x((K4,sm)=>{u();"use strict";var nm=ps();sm.exports=(r,e={})=>{let t=(i,n={})=>{let a=e.escapeInvalid&&nm.isInvalidBrace(n),s=i.invalid===!0&&e.escapeInvalid===!0,o="";if(i.value)return(a||s)&&nm.isOpenOrClose(i)?"\\"+i.value:i.value;if(i.value)return i.value;if(i.nodes)for(let l of i.nodes)o+=t(l);return o};return t(r)}});var om=x((X4,am)=>{u();"use strict";am.exports=function(r){return typeof r=="number"?r-r==0:typeof r=="string"&&r.trim()!==""?Number.isFinite?Number.isFinite(+r):isFinite(+r):!1}});var gm=x((J4,mm)=>{u();"use strict";var lm=om(),Wt=(r,e,t)=>{if(lm(r)===!1)throw new TypeError("toRegexRange: expected the first argument to be a number");if(e===void 0||r===e)return String(r);if(lm(e)===!1)throw new TypeError("toRegexRange: expected the second argument to be a number.");let i={relaxZeros:!0,...t};typeof i.strictZeros=="boolean"&&(i.relaxZeros=i.strictZeros===!1);let n=String(i.relaxZeros),a=String(i.shorthand),s=String(i.capture),o=String(i.wrap),l=r+":"+e+"="+n+a+s+o;if(Wt.cache.hasOwnProperty(l))return Wt.cache[l].result;let c=Math.min(r,e),f=Math.max(r,e);if(Math.abs(c-f)===1){let v=r+"|"+e;return i.capture?`(${v})`:i.wrap===!1?v:`(?:${v})`}let d=hm(r)||hm(e),p={min:r,max:e,a:c,b:f},h=[],b=[];if(d&&(p.isPadded=d,p.maxLen=String(p.max).length),c<0){let v=f<0?Math.abs(f):1;b=um(v,Math.abs(c),p,i),c=p.a=0}return f>=0&&(h=um(c,f,p,i)),p.negatives=b,p.positives=h,p.result=Q_(b,h,i),i.capture===!0?p.result=`(${p.result})`:i.wrap!==!1&&h.length+b.length>1&&(p.result=`(?:${p.result})`),Wt.cache[l]=p,p.result};function Q_(r,e,t){let i=ol(r,e,"-",!1,t)||[],n=ol(e,r,"",!1,t)||[],a=ol(r,e,"-?",!0,t)||[];return i.concat(a).concat(n).join("|")}function Y_(r,e){let t=1,i=1,n=cm(r,t),a=new Set([e]);for(;r<=n&&n<=e;)a.add(n),t+=1,n=cm(r,t);for(n=pm(e+1,i)-1;r<n&&n<=e;)a.add(n),i+=1,n=pm(e+1,i)-1;return a=[...a],a.sort(J_),a}function K_(r,e,t){if(r===e)return{pattern:r,count:[],digits:0};let i=X_(r,e),n=i.length,a="",s=0;for(let o=0;o<n;o++){let[l,c]=i[o];l===c?a+=l:l!=="0"||c!=="9"?a+=Z_(l,c,t):s++}return s&&(a+=t.shorthand===!0?"\\d":"[0-9]"),{pattern:a,count:[s],digits:n}}function um(r,e,t,i){let n=Y_(r,e),a=[],s=r,o;for(let l=0;l<n.length;l++){let c=n[l],f=K_(String(s),String(c),i),d="";if(!t.isPadded&&o&&o.pattern===f.pattern){o.count.length>1&&o.count.pop(),o.count.push(f.count[0]),o.string=o.pattern+dm(o.count),s=c+1;continue}t.isPadded&&(d=eE(c,t,i)),f.string=d+f.pattern+dm(f.count),a.push(f),s=c+1,o=f}return a}function ol(r,e,t,i,n){let a=[];for(let s of r){let{string:o}=s;!i&&!fm(e,"string",o)&&a.push(t+o),i&&fm(e,"string",o)&&a.push(t+o)}return a}function X_(r,e){let t=[];for(let i=0;i<r.length;i++)t.push([r[i],e[i]]);return t}function J_(r,e){return r>e?1:e>r?-1:0}function fm(r,e,t){return r.some(i=>i[e]===t)}function cm(r,e){return Number(String(r).slice(0,-e)+"9".repeat(e))}function pm(r,e){return r-r%Math.pow(10,e)}function dm(r){let[e=0,t=""]=r;return t||e>1?`{${e+(t?","+t:"")}}`:""}function Z_(r,e,t){return`[${r}${e-r==1?"":"-"}${e}]`}function hm(r){return/^-?(0+)\d/.test(r)}function eE(r,e,t){if(!e.isPadded)return r;let i=Math.abs(e.maxLen-String(r).length),n=t.relaxZeros!==!1;switch(i){case 0:return"";case 1:return n?"0?":"0";case 2:return n?"0{0,2}":"00";default:return n?`0{0,${i}}`:`0{${i}}`}}Wt.cache={};Wt.clearCache=()=>Wt.cache={};mm.exports=Wt});var fl=x((Z4,Am)=>{u();"use strict";var tE=(Bn(),Nn),ym=gm(),bm=r=>r!==null&&typeof r=="object"&&!Array.isArray(r),rE=r=>e=>r===!0?Number(e):String(e),ll=r=>typeof r=="number"||typeof r=="string"&&r!=="",Ri=r=>Number.isInteger(+r),ul=r=>{let e=`${r}`,t=-1;if(e[0]==="-"&&(e=e.slice(1)),e==="0")return!1;for(;e[++t]==="0";);return t>0},iE=(r,e,t)=>typeof r=="string"||typeof e=="string"?!0:t.stringify===!0,nE=(r,e,t)=>{if(e>0){let i=r[0]==="-"?"-":"";i&&(r=r.slice(1)),r=i+r.padStart(i?e-1:e,"0")}return t===!1?String(r):r},wm=(r,e)=>{let t=r[0]==="-"?"-":"";for(t&&(r=r.slice(1),e--);r.length<e;)r="0"+r;return t?"-"+r:r},sE=(r,e)=>{r.negatives.sort((s,o)=>s<o?-1:s>o?1:0),r.positives.sort((s,o)=>s<o?-1:s>o?1:0);let t=e.capture?"":"?:",i="",n="",a;return r.positives.length&&(i=r.positives.join("|")),r.negatives.length&&(n=`-(${t}${r.negatives.join("|")})`),i&&n?a=`${i}|${n}`:a=i||n,e.wrap?`(${t}${a})`:a},vm=(r,e,t,i)=>{if(t)return ym(r,e,{wrap:!1,...i});let n=String.fromCharCode(r);if(r===e)return n;let a=String.fromCharCode(e);return`[${n}-${a}]`},xm=(r,e,t)=>{if(Array.isArray(r)){let i=t.wrap===!0,n=t.capture?"":"?:";return i?`(${n}${r.join("|")})`:r.join("|")}return ym(r,e,t)},km=(...r)=>new RangeError("Invalid range arguments: "+tE.inspect(...r)),Sm=(r,e,t)=>{if(t.strictRanges===!0)throw km([r,e]);return[]},aE=(r,e)=>{if(e.strictRanges===!0)throw new TypeError(`Expected step "${r}" to be a number`);return[]},oE=(r,e,t=1,i={})=>{let n=Number(r),a=Number(e);if(!Number.isInteger(n)||!Number.isInteger(a)){if(i.strictRanges===!0)throw km([r,e]);return[]}n===0&&(n=0),a===0&&(a=0);let s=n>a,o=String(r),l=String(e),c=String(t);t=Math.max(Math.abs(t),1);let f=ul(o)||ul(l)||ul(c),d=f?Math.max(o.length,l.length,c.length):0,p=f===!1&&iE(r,e,i)===!1,h=i.transform||rE(p);if(i.toRegex&&t===1)return vm(wm(r,d),wm(e,d),!0,i);let b={negatives:[],positives:[]},v=k=>b[k<0?"negatives":"positives"].push(Math.abs(k)),y=[],w=0;for(;s?n>=a:n<=a;)i.toRegex===!0&&t>1?v(n):y.push(nE(h(n,w),d,p)),n=s?n-t:n+t,w++;return i.toRegex===!0?t>1?sE(b,i):xm(y,null,{wrap:!1,...i}):y},lE=(r,e,t=1,i={})=>{if(!Ri(r)&&r.length>1||!Ri(e)&&e.length>1)return Sm(r,e,i);let n=i.transform||(p=>String.fromCharCode(p)),a=`${r}`.charCodeAt(0),s=`${e}`.charCodeAt(0),o=a>s,l=Math.min(a,s),c=Math.max(a,s);if(i.toRegex&&t===1)return vm(l,c,!1,i);let f=[],d=0;for(;o?a>=s:a<=s;)f.push(n(a,d)),a=o?a-t:a+t,d++;return i.toRegex===!0?xm(f,null,{wrap:!1,options:i}):f},hs=(r,e,t,i={})=>{if(e==null&&ll(r))return[r];if(!ll(r)||!ll(e))return Sm(r,e,i);if(typeof t=="function")return hs(r,e,1,{transform:t});if(bm(t))return hs(r,e,0,t);let n={...i};return n.capture===!0&&(n.wrap=!0),t=t||n.step||1,Ri(t)?Ri(r)&&Ri(e)?oE(r,e,t,n):lE(r,e,Math.max(Math.abs(t),1),n):t!=null&&!bm(t)?aE(t,n):hs(r,e,1,t)};Am.exports=hs});var Em=x((e6,_m)=>{u();"use strict";var uE=fl(),Cm=ps(),fE=(r,e={})=>{let t=(i,n={})=>{let a=Cm.isInvalidBrace(n),s=i.invalid===!0&&e.escapeInvalid===!0,o=a===!0||s===!0,l=e.escapeInvalid===!0?"\\":"",c="";if(i.isOpen===!0||i.isClose===!0)return l+i.value;if(i.type==="open")return o?l+i.value:"(";if(i.type==="close")return o?l+i.value:")";if(i.type==="comma")return i.prev.type==="comma"?"":o?i.value:"|";if(i.value)return i.value;if(i.nodes&&i.ranges>0){let f=Cm.reduce(i.nodes),d=uE(...f,{...e,wrap:!1,toRegex:!0});if(d.length!==0)return f.length>1&&d.length>1?`(${d})`:d}if(i.nodes)for(let f of i.nodes)c+=t(f,i);return c};return t(r)};_m.exports=fE});var Rm=x((t6,Tm)=>{u();"use strict";var cE=fl(),Om=ds(),mr=ps(),Gt=(r="",e="",t=!1)=>{let i=[];if(r=[].concat(r),e=[].concat(e),!e.length)return r;if(!r.length)return t?mr.flatten(e).map(n=>`{${n}}`):e;for(let n of r)if(Array.isArray(n))for(let a of n)i.push(Gt(a,e,t));else for(let a of e)t===!0&&typeof a=="string"&&(a=`{${a}}`),i.push(Array.isArray(a)?Gt(n,a,t):n+a);return mr.flatten(i)},pE=(r,e={})=>{let t=e.rangeLimit===void 0?1e3:e.rangeLimit,i=(n,a={})=>{n.queue=[];let s=a,o=a.queue;for(;s.type!=="brace"&&s.type!=="root"&&s.parent;)s=s.parent,o=s.queue;if(n.invalid||n.dollar){o.push(Gt(o.pop(),Om(n,e)));return}if(n.type==="brace"&&n.invalid!==!0&&n.nodes.length===2){o.push(Gt(o.pop(),["{}"]));return}if(n.nodes&&n.ranges>0){let d=mr.reduce(n.nodes);if(mr.exceedsLimit(...d,e.step,t))throw new RangeError("expanded array length exceeds range limit. Use options.rangeLimit to increase or disable the limit.");let p=cE(...d,e);p.length===0&&(p=Om(n,e)),o.push(Gt(o.pop(),p)),n.nodes=[];return}let l=mr.encloseBrace(n),c=n.queue,f=n;for(;f.type!=="brace"&&f.type!=="root"&&f.parent;)f=f.parent,c=f.queue;for(let d=0;d<n.nodes.length;d++){let p=n.nodes[d];if(p.type==="comma"&&n.type==="brace"){d===1&&c.push(""),c.push("");continue}if(p.type==="close"){o.push(Gt(o.pop(),c,l));continue}if(p.value&&p.type!=="open"){c.push(Gt(c.pop(),p.value));continue}p.nodes&&i(p,n)}return c};return mr.flatten(i(r))};Tm.exports=pE});var Im=x((r6,Pm)=>{u();"use strict";Pm.exports={MAX_LENGTH:1024*64,CHAR_0:"0",CHAR_9:"9",CHAR_UPPERCASE_A:"A",CHAR_LOWERCASE_A:"a",CHAR_UPPERCASE_Z:"Z",CHAR_LOWERCASE_Z:"z",CHAR_LEFT_PARENTHESES:"(",CHAR_RIGHT_PARENTHESES:")",CHAR_ASTERISK:"*",CHAR_AMPERSAND:"&",CHAR_AT:"@",CHAR_BACKSLASH:"\\",CHAR_BACKTICK:"`",CHAR_CARRIAGE_RETURN:"\r",CHAR_CIRCUMFLEX_ACCENT:"^",CHAR_COLON:":",CHAR_COMMA:",",CHAR_DOLLAR:"$",CHAR_DOT:".",CHAR_DOUBLE_QUOTE:'"',CHAR_EQUAL:"=",CHAR_EXCLAMATION_MARK:"!",CHAR_FORM_FEED:"\f",CHAR_FORWARD_SLASH:"/",CHAR_HASH:"#",CHAR_HYPHEN_MINUS:"-",CHAR_LEFT_ANGLE_BRACKET:"<",CHAR_LEFT_CURLY_BRACE:"{",CHAR_LEFT_SQUARE_BRACKET:"[",CHAR_LINE_FEED:`
+`,CHAR_NO_BREAK_SPACE:"\xA0",CHAR_PERCENT:"%",CHAR_PLUS:"+",CHAR_QUESTION_MARK:"?",CHAR_RIGHT_ANGLE_BRACKET:">",CHAR_RIGHT_CURLY_BRACE:"}",CHAR_RIGHT_SQUARE_BRACKET:"]",CHAR_SEMICOLON:";",CHAR_SINGLE_QUOTE:"'",CHAR_SPACE:" ",CHAR_TAB:"	",CHAR_UNDERSCORE:"_",CHAR_VERTICAL_LINE:"|",CHAR_ZERO_WIDTH_NOBREAK_SPACE:"\uFEFF"}});var Mm=x((i6,Lm)=>{u();"use strict";var dE=ds(),{MAX_LENGTH:Dm,CHAR_BACKSLASH:cl,CHAR_BACKTICK:hE,CHAR_COMMA:mE,CHAR_DOT:gE,CHAR_LEFT_PARENTHESES:yE,CHAR_RIGHT_PARENTHESES:bE,CHAR_LEFT_CURLY_BRACE:wE,CHAR_RIGHT_CURLY_BRACE:vE,CHAR_LEFT_SQUARE_BRACKET:qm,CHAR_RIGHT_SQUARE_BRACKET:$m,CHAR_DOUBLE_QUOTE:xE,CHAR_SINGLE_QUOTE:kE,CHAR_NO_BREAK_SPACE:SE,CHAR_ZERO_WIDTH_NOBREAK_SPACE:AE}=Im(),CE=(r,e={})=>{if(typeof r!="string")throw new TypeError("Expected a string");let t=e||{},i=typeof t.maxLength=="number"?Math.min(Dm,t.maxLength):Dm;if(r.length>i)throw new SyntaxError(`Input length (${r.length}), exceeds max characters (${i})`);let n={type:"root",input:r,nodes:[]},a=[n],s=n,o=n,l=0,c=r.length,f=0,d=0,p,h={},b=()=>r[f++],v=y=>{if(y.type==="text"&&o.type==="dot"&&(o.type="text"),o&&o.type==="text"&&y.type==="text"){o.value+=y.value;return}return s.nodes.push(y),y.parent=s,y.prev=o,o=y,y};for(v({type:"bos"});f<c;)if(s=a[a.length-1],p=b(),!(p===AE||p===SE)){if(p===cl){v({type:"text",value:(e.keepEscaping?p:"")+b()});continue}if(p===$m){v({type:"text",value:"\\"+p});continue}if(p===qm){l++;let y=!0,w;for(;f<c&&(w=b());){if(p+=w,w===qm){l++;continue}if(w===cl){p+=b();continue}if(w===$m&&(l--,l===0))break}v({type:"text",value:p});continue}if(p===yE){s=v({type:"paren",nodes:[]}),a.push(s),v({type:"text",value:p});continue}if(p===bE){if(s.type!=="paren"){v({type:"text",value:p});continue}s=a.pop(),v({type:"text",value:p}),s=a[a.length-1];continue}if(p===xE||p===kE||p===hE){let y=p,w;for(e.keepQuotes!==!0&&(p="");f<c&&(w=b());){if(w===cl){p+=w+b();continue}if(w===y){e.keepQuotes===!0&&(p+=w);break}p+=w}v({type:"text",value:p});continue}if(p===wE){d++;let y=o.value&&o.value.slice(-1)==="$"||s.dollar===!0;s=v({type:"brace",open:!0,close:!1,dollar:y,depth:d,commas:0,ranges:0,nodes:[]}),a.push(s),v({type:"open",value:p});continue}if(p===vE){if(s.type!=="brace"){v({type:"text",value:p});continue}let y="close";s=a.pop(),s.close=!0,v({type:y,value:p}),d--,s=a[a.length-1];continue}if(p===mE&&d>0){if(s.ranges>0){s.ranges=0;let y=s.nodes.shift();s.nodes=[y,{type:"text",value:dE(s)}]}v({type:"comma",value:p}),s.commas++;continue}if(p===gE&&d>0&&s.commas===0){let y=s.nodes;if(d===0||y.length===0){v({type:"text",value:p});continue}if(o.type==="dot"){if(s.range=[],o.value+=p,o.type="range",s.nodes.length!==3&&s.nodes.length!==5){s.invalid=!0,s.ranges=0,o.type="text";continue}s.ranges++,s.args=[];continue}if(o.type==="range"){y.pop();let w=y[y.length-1];w.value+=o.value+p,o=w,s.ranges--;continue}v({type:"dot",value:p});continue}v({type:"text",value:p})}do if(s=a.pop(),s.type!=="root"){s.nodes.forEach(k=>{k.nodes||(k.type==="open"&&(k.isOpen=!0),k.type==="close"&&(k.isClose=!0),k.nodes||(k.type="text"),k.invalid=!0)});let y=a[a.length-1],w=y.nodes.indexOf(s);y.nodes.splice(w,1,...s.nodes)}while(a.length>0);return v({type:"eos"}),n};Lm.exports=CE});var Fm=x((n6,Bm)=>{u();"use strict";var Nm=ds(),_E=Em(),EE=Rm(),OE=Mm(),Le=(r,e={})=>{let t=[];if(Array.isArray(r))for(let i of r){let n=Le.create(i,e);Array.isArray(n)?t.push(...n):t.push(n)}else t=[].concat(Le.create(r,e));return e&&e.expand===!0&&e.nodupes===!0&&(t=[...new Set(t)]),t};Le.parse=(r,e={})=>OE(r,e);Le.stringify=(r,e={})=>typeof r=="string"?Nm(Le.parse(r,e),e):Nm(r,e);Le.compile=(r,e={})=>(typeof r=="string"&&(r=Le.parse(r,e)),_E(r,e));Le.expand=(r,e={})=>{typeof r=="string"&&(r=Le.parse(r,e));let t=EE(r,e);return e.noempty===!0&&(t=t.filter(Boolean)),e.nodupes===!0&&(t=[...new Set(t)]),t};Le.create=(r,e={})=>r===""||r.length<3?[r]:e.expand!==!0?Le.compile(r,e):Le.expand(r,e);Bm.exports=Le});var Pi=x((s6,Hm)=>{u();"use strict";var TE=(et(),Ur),at="\\\\/",jm=`[^${at}]`,yt="\\.",RE="\\+",PE="\\?",ms="\\/",IE="(?=.)",zm="[^/]",pl=`(?:${ms}|$)`,Um=`(?:^|${ms})`,dl=`${yt}{1,2}${pl}`,DE=`(?!${yt})`,qE=`(?!${Um}${dl})`,$E=`(?!${yt}{0,1}${pl})`,LE=`(?!${dl})`,ME=`[^.${ms}]`,NE=`${zm}*?`,Vm={DOT_LITERAL:yt,PLUS_LITERAL:RE,QMARK_LITERAL:PE,SLASH_LITERAL:ms,ONE_CHAR:IE,QMARK:zm,END_ANCHOR:pl,DOTS_SLASH:dl,NO_DOT:DE,NO_DOTS:qE,NO_DOT_SLASH:$E,NO_DOTS_SLASH:LE,QMARK_NO_DOT:ME,STAR:NE,START_ANCHOR:Um},BE={...Vm,SLASH_LITERAL:`[${at}]`,QMARK:jm,STAR:`${jm}*?`,DOTS_SLASH:`${yt}{1,2}(?:[${at}]|$)`,NO_DOT:`(?!${yt})`,NO_DOTS:`(?!(?:^|[${at}])${yt}{1,2}(?:[${at}]|$))`,NO_DOT_SLASH:`(?!${yt}{0,1}(?:[${at}]|$))`,NO_DOTS_SLASH:`(?!${yt}{1,2}(?:[${at}]|$))`,QMARK_NO_DOT:`[^.${at}]`,START_ANCHOR:`(?:^|[${at}])`,END_ANCHOR:`(?:[${at}]|$)`},FE={alnum:"a-zA-Z0-9",alpha:"a-zA-Z",ascii:"\\x00-\\x7F",blank:" \\t",cntrl:"\\x00-\\x1F\\x7F",digit:"0-9",graph:"\\x21-\\x7E",lower:"a-z",print:"\\x20-\\x7E ",punct:"\\-!\"#$%&'()\\*+,./:;<=>?@[\\]^_`{|}~",space:" \\t\\r\\n\\v\\f",upper:"A-Z",word:"A-Za-z0-9_",xdigit:"A-Fa-f0-9"};Hm.exports={MAX_LENGTH:1024*64,POSIX_REGEX_SOURCE:FE,REGEX_BACKSLASH:/\\(?![*+?^${}(|)[\]])/g,REGEX_NON_SPECIAL_CHARS:/^[^@![\].,$*+?^{}()|\\/]+/,REGEX_SPECIAL_CHARS:/[-*+?.^${}(|)[\]]/,REGEX_SPECIAL_CHARS_BACKREF:/(\\?)((\W)(\3*))/g,REGEX_SPECIAL_CHARS_GLOBAL:/([-*+?.^${}(|)[\]])/g,REGEX_REMOVE_BACKSLASH:/(?:\[.*?[^\\]\]|\\(?=.))/g,REPLACEMENTS:{"***":"*","**/**":"**","**/**/**":"**"},CHAR_0:48,CHAR_9:57,CHAR_UPPERCASE_A:65,CHAR_LOWERCASE_A:97,CHAR_UPPERCASE_Z:90,CHAR_LOWERCASE_Z:122,CHAR_LEFT_PARENTHESES:40,CHAR_RIGHT_PARENTHESES:41,CHAR_ASTERISK:42,CHAR_AMPERSAND:38,CHAR_AT:64,CHAR_BACKWARD_SLASH:92,CHAR_CARRIAGE_RETURN:13,CHAR_CIRCUMFLEX_ACCENT:94,CHAR_COLON:58,CHAR_COMMA:44,CHAR_DOT:46,CHAR_DOUBLE_QUOTE:34,CHAR_EQUAL:61,CHAR_EXCLAMATION_MARK:33,CHAR_FORM_FEED:12,CHAR_FORWARD_SLASH:47,CHAR_GRAVE_ACCENT:96,CHAR_HASH:35,CHAR_HYPHEN_MINUS:45,CHAR_LEFT_ANGLE_BRACKET:60,CHAR_LEFT_CURLY_BRACE:123,CHAR_LEFT_SQUARE_BRACKET:91,CHAR_LINE_FEED:10,CHAR_NO_BREAK_SPACE:160,CHAR_PERCENT:37,CHAR_PLUS:43,CHAR_QUESTION_MARK:63,CHAR_RIGHT_ANGLE_BRACKET:62,CHAR_RIGHT_CURLY_BRACE:125,CHAR_RIGHT_SQUARE_BRACKET:93,CHAR_SEMICOLON:59,CHAR_SINGLE_QUOTE:39,CHAR_SPACE:32,CHAR_TAB:9,CHAR_UNDERSCORE:95,CHAR_VERTICAL_LINE:124,CHAR_ZERO_WIDTH_NOBREAK_SPACE:65279,SEP:TE.sep,extglobChars(r){return{"!":{type:"negate",open:"(?:(?!(?:",close:`))${r.STAR})`},"?":{type:"qmark",open:"(?:",close:")?"},"+":{type:"plus",open:"(?:",close:")+"},"*":{type:"star",open:"(?:",close:")*"},"@":{type:"at",open:"(?:",close:")"}}},globChars(r){return r===!0?BE:Vm}}});var Ii=x(Re=>{u();"use strict";var jE=(et(),Ur),zE=m.platform==="win32",{REGEX_BACKSLASH:UE,REGEX_REMOVE_BACKSLASH:VE,REGEX_SPECIAL_CHARS:HE,REGEX_SPECIAL_CHARS_GLOBAL:WE}=Pi();Re.isObject=r=>r!==null&&typeof r=="object"&&!Array.isArray(r);Re.hasRegexChars=r=>HE.test(r);Re.isRegexChar=r=>r.length===1&&Re.hasRegexChars(r);Re.escapeRegex=r=>r.replace(WE,"\\$1");Re.toPosixSlashes=r=>r.replace(UE,"/");Re.removeBackslashes=r=>r.replace(VE,e=>e==="\\"?"":e);Re.supportsLookbehinds=()=>{let r=m.version.slice(1).split(".").map(Number);return r.length===3&&r[0]>=9||r[0]===8&&r[1]>=10};Re.isWindows=r=>r&&typeof r.windows=="boolean"?r.windows:zE===!0||jE.sep==="\\";Re.escapeLast=(r,e,t)=>{let i=r.lastIndexOf(e,t);return i===-1?r:r[i-1]==="\\"?Re.escapeLast(r,e,i-1):`${r.slice(0,i)}\\${r.slice(i)}`};Re.removePrefix=(r,e={})=>{let t=r;return t.startsWith("./")&&(t=t.slice(2),e.prefix="./"),t};Re.wrapOutput=(r,e={},t={})=>{let i=t.contains?"":"^",n=t.contains?"":"$",a=`${i}(?:${r})${n}`;return e.negated===!0&&(a=`(?:^(?!${a}).*$)`),a}});var Zm=x((o6,Jm)=>{u();"use strict";var Wm=Ii(),{CHAR_ASTERISK:hl,CHAR_AT:GE,CHAR_BACKWARD_SLASH:Di,CHAR_COMMA:QE,CHAR_DOT:ml,CHAR_EXCLAMATION_MARK:gl,CHAR_FORWARD_SLASH:Gm,CHAR_LEFT_CURLY_BRACE:yl,CHAR_LEFT_PARENTHESES:bl,CHAR_LEFT_SQUARE_BRACKET:YE,CHAR_PLUS:KE,CHAR_QUESTION_MARK:Qm,CHAR_RIGHT_CURLY_BRACE:XE,CHAR_RIGHT_PARENTHESES:Ym,CHAR_RIGHT_SQUARE_BRACKET:JE}=Pi(),Km=r=>r===Gm||r===Di,Xm=r=>{r.isPrefix!==!0&&(r.depth=r.isGlobstar?1/0:1)},ZE=(r,e)=>{let t=e||{},i=r.length-1,n=t.parts===!0||t.scanToEnd===!0,a=[],s=[],o=[],l=r,c=-1,f=0,d=0,p=!1,h=!1,b=!1,v=!1,y=!1,w=!1,k=!1,S=!1,E=!1,O=!1,B=0,N,T,F={value:"",depth:0,isGlob:!1},Y=()=>c>=i,_=()=>l.charCodeAt(c+1),Q=()=>(N=T,l.charCodeAt(++c));for(;c<i;){T=Q();let he;if(T===Di){k=F.backslashes=!0,T=Q(),T===yl&&(w=!0);continue}if(w===!0||T===yl){for(B++;Y()!==!0&&(T=Q());){if(T===Di){k=F.backslashes=!0,Q();continue}if(T===yl){B++;continue}if(w!==!0&&T===ml&&(T=Q())===ml){if(p=F.isBrace=!0,b=F.isGlob=!0,O=!0,n===!0)continue;break}if(w!==!0&&T===QE){if(p=F.isBrace=!0,b=F.isGlob=!0,O=!0,n===!0)continue;break}if(T===XE&&(B--,B===0)){w=!1,p=F.isBrace=!0,O=!0;break}}if(n===!0)continue;break}if(T===Gm){if(a.push(c),s.push(F),F={value:"",depth:0,isGlob:!1},O===!0)continue;if(N===ml&&c===f+1){f+=2;continue}d=c+1;continue}if(t.noext!==!0&&(T===KE||T===GE||T===hl||T===Qm||T===gl)===!0&&_()===bl){if(b=F.isGlob=!0,v=F.isExtglob=!0,O=!0,T===gl&&c===f&&(E=!0),n===!0){for(;Y()!==!0&&(T=Q());){if(T===Di){k=F.backslashes=!0,T=Q();continue}if(T===Ym){b=F.isGlob=!0,O=!0;break}}continue}break}if(T===hl){if(N===hl&&(y=F.isGlobstar=!0),b=F.isGlob=!0,O=!0,n===!0)continue;break}if(T===Qm){if(b=F.isGlob=!0,O=!0,n===!0)continue;break}if(T===YE){for(;Y()!==!0&&(he=Q());){if(he===Di){k=F.backslashes=!0,Q();continue}if(he===JE){h=F.isBracket=!0,b=F.isGlob=!0,O=!0;break}}if(n===!0)continue;break}if(t.nonegate!==!0&&T===gl&&c===f){S=F.negated=!0,f++;continue}if(t.noparen!==!0&&T===bl){if(b=F.isGlob=!0,n===!0){for(;Y()!==!0&&(T=Q());){if(T===bl){k=F.backslashes=!0,T=Q();continue}if(T===Ym){O=!0;break}}continue}break}if(b===!0){if(O=!0,n===!0)continue;break}}t.noext===!0&&(v=!1,b=!1);let U=l,oe="",A="";f>0&&(oe=l.slice(0,f),l=l.slice(f),d-=f),U&&b===!0&&d>0?(U=l.slice(0,d),A=l.slice(d)):b===!0?(U="",A=l):U=l,U&&U!==""&&U!=="/"&&U!==l&&Km(U.charCodeAt(U.length-1))&&(U=U.slice(0,-1)),t.unescape===!0&&(A&&(A=Wm.removeBackslashes(A)),U&&k===!0&&(U=Wm.removeBackslashes(U)));let C={prefix:oe,input:r,start:f,base:U,glob:A,isBrace:p,isBracket:h,isGlob:b,isExtglob:v,isGlobstar:y,negated:S,negatedExtglob:E};if(t.tokens===!0&&(C.maxDepth=0,Km(T)||s.push(F),C.tokens=s),t.parts===!0||t.tokens===!0){let he;for(let V=0;V<a.length;V++){let Ee=he?he+1:f,Ie=a[V],De=r.slice(Ee,Ie);t.tokens&&(V===0&&f!==0?(s[V].isPrefix=!0,s[V].value=oe):s[V].value=De,Xm(s[V]),C.maxDepth+=s[V].depth),(V!==0||De!=="")&&o.push(De),he=Ie}if(he&&he+1<r.length){let V=r.slice(he+1);o.push(V),t.tokens&&(s[s.length-1].value=V,Xm(s[s.length-1]),C.maxDepth+=s[s.length-1].depth)}C.slashes=a,C.parts=o}return C};Jm.exports=ZE});var rg=x((l6,tg)=>{u();"use strict";var gs=Pi(),Me=Ii(),{MAX_LENGTH:ys,POSIX_REGEX_SOURCE:e2,REGEX_NON_SPECIAL_CHARS:t2,REGEX_SPECIAL_CHARS_BACKREF:r2,REPLACEMENTS:eg}=gs,i2=(r,e)=>{if(typeof e.expandRange=="function")return e.expandRange(...r,e);r.sort();let t=`[${r.join("-")}]`;try{new RegExp(t)}catch(i){return r.map(n=>Me.escapeRegex(n)).join("..")}return t},gr=(r,e)=>`Missing ${r}: "${e}" - use "\\\\${e}" to match literal characters`,wl=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");r=eg[r]||r;let t={...e},i=typeof t.maxLength=="number"?Math.min(ys,t.maxLength):ys,n=r.length;if(n>i)throw new SyntaxError(`Input length: ${n}, exceeds maximum allowed length: ${i}`);let a={type:"bos",value:"",output:t.prepend||""},s=[a],o=t.capture?"":"?:",l=Me.isWindows(e),c=gs.globChars(l),f=gs.extglobChars(c),{DOT_LITERAL:d,PLUS_LITERAL:p,SLASH_LITERAL:h,ONE_CHAR:b,DOTS_SLASH:v,NO_DOT:y,NO_DOT_SLASH:w,NO_DOTS_SLASH:k,QMARK:S,QMARK_NO_DOT:E,STAR:O,START_ANCHOR:B}=c,N=q=>`(${o}(?:(?!${B}${q.dot?v:d}).)*?)`,T=t.dot?"":y,F=t.dot?S:E,Y=t.bash===!0?N(t):O;t.capture&&(Y=`(${Y})`),typeof t.noext=="boolean"&&(t.noextglob=t.noext);let _={input:r,index:-1,start:0,dot:t.dot===!0,consumed:"",output:"",prefix:"",backtrack:!1,negated:!1,brackets:0,braces:0,parens:0,quotes:0,globstar:!1,tokens:s};r=Me.removePrefix(r,_),n=r.length;let Q=[],U=[],oe=[],A=a,C,he=()=>_.index===n-1,V=_.peek=(q=1)=>r[_.index+q],Ee=_.advance=()=>r[++_.index]||"",Ie=()=>r.slice(_.index+1),De=(q="",ae=0)=>{_.consumed+=q,_.index+=ae},Bi=q=>{_.output+=q.output!=null?q.output:q.value,De(q.value)},Rv=()=>{let q=1;for(;V()==="!"&&(V(2)!=="("||V(3)==="?");)Ee(),_.start++,q++;return q%2==0?!1:(_.negated=!0,_.start++,!0)},Fi=q=>{_[q]++,oe.push(q)},Ft=q=>{_[q]--,oe.pop()},W=q=>{if(A.type==="globstar"){let ae=_.braces>0&&(q.type==="comma"||q.type==="brace"),I=q.extglob===!0||Q.length&&(q.type==="pipe"||q.type==="paren");q.type!=="slash"&&q.type!=="paren"&&!ae&&!I&&(_.output=_.output.slice(0,-A.output.length),A.type="star",A.value="*",A.output=Y,_.output+=A.output)}if(Q.length&&q.type!=="paren"&&(Q[Q.length-1].inner+=q.value),(q.value||q.output)&&Bi(q),A&&A.type==="text"&&q.type==="text"){A.value+=q.value,A.output=(A.output||"")+q.value;return}q.prev=A,s.push(q),A=q},ji=(q,ae)=>{let I={...f[ae],conditions:1,inner:""};I.prev=A,I.parens=_.parens,I.output=_.output;let H=(t.capture?"(":"")+I.open;Fi("parens"),W({type:q,value:ae,output:_.output?"":b}),W({type:"paren",extglob:!0,value:Ee(),output:H}),Q.push(I)},Pv=q=>{let ae=q.close+(t.capture?")":""),I;if(q.type==="negate"){let H=Y;if(q.inner&&q.inner.length>1&&q.inner.includes("/")&&(H=N(t)),(H!==Y||he()||/^\)+$/.test(Ie()))&&(ae=q.close=`)$))${H}`),q.inner.includes("*")&&(I=Ie())&&/^\.[^\\/.]+$/.test(I)){let ce=wl(I,{...e,fastpaths:!1}).output;ae=q.close=`)${ce})${H})`}q.prev.type==="bos"&&(_.negatedExtglob=!0)}W({type:"paren",extglob:!0,value:C,output:ae}),Ft("parens")};if(t.fastpaths!==!1&&!/(^[*!]|[/()[\]{}"])/.test(r)){let q=!1,ae=r.replace(r2,(I,H,ce,Ce,ye,Ms)=>Ce==="\\"?(q=!0,I):Ce==="?"?H?H+Ce+(ye?S.repeat(ye.length):""):Ms===0?F+(ye?S.repeat(ye.length):""):S.repeat(ce.length):Ce==="."?d.repeat(ce.length):Ce==="*"?H?H+Ce+(ye?Y:""):Y:H?I:`\\${I}`);return q===!0&&(t.unescape===!0?ae=ae.replace(/\\/g,""):ae=ae.replace(/\\+/g,I=>I.length%2==0?"\\\\":I?"\\":"")),ae===r&&t.contains===!0?(_.output=r,_):(_.output=Me.wrapOutput(ae,_,e),_)}for(;!he();){if(C=Ee(),C==="\0")continue;if(C==="\\"){let I=V();if(I==="/"&&t.bash!==!0||I==="."||I===";")continue;if(!I){C+="\\",W({type:"text",value:C});continue}let H=/^\\+/.exec(Ie()),ce=0;if(H&&H[0].length>2&&(ce=H[0].length,_.index+=ce,ce%2!=0&&(C+="\\")),t.unescape===!0?C=Ee():C+=Ee(),_.brackets===0){W({type:"text",value:C});continue}}if(_.brackets>0&&(C!=="]"||A.value==="["||A.value==="[^")){if(t.posix!==!1&&C===":"){let I=A.value.slice(1);if(I.includes("[")&&(A.posix=!0,I.includes(":"))){let H=A.value.lastIndexOf("["),ce=A.value.slice(0,H),Ce=A.value.slice(H+2),ye=e2[Ce];if(ye){A.value=ce+ye,_.backtrack=!0,Ee(),!a.output&&s.indexOf(A)===1&&(a.output=b);continue}}}(C==="["&&V()!==":"||C==="-"&&V()==="]")&&(C=`\\${C}`),C==="]"&&(A.value==="["||A.value==="[^")&&(C=`\\${C}`),t.posix===!0&&C==="!"&&A.value==="["&&(C="^"),A.value+=C,Bi({value:C});continue}if(_.quotes===1&&C!=='"'){C=Me.escapeRegex(C),A.value+=C,Bi({value:C});continue}if(C==='"'){_.quotes=_.quotes===1?0:1,t.keepQuotes===!0&&W({type:"text",value:C});continue}if(C==="("){Fi("parens"),W({type:"paren",value:C});continue}if(C===")"){if(_.parens===0&&t.strictBrackets===!0)throw new SyntaxError(gr("opening","("));let I=Q[Q.length-1];if(I&&_.parens===I.parens+1){Pv(Q.pop());continue}W({type:"paren",value:C,output:_.parens?")":"\\)"}),Ft("parens");continue}if(C==="["){if(t.nobracket===!0||!Ie().includes("]")){if(t.nobracket!==!0&&t.strictBrackets===!0)throw new SyntaxError(gr("closing","]"));C=`\\${C}`}else Fi("brackets");W({type:"bracket",value:C});continue}if(C==="]"){if(t.nobracket===!0||A&&A.type==="bracket"&&A.value.length===1){W({type:"text",value:C,output:`\\${C}`});continue}if(_.brackets===0){if(t.strictBrackets===!0)throw new SyntaxError(gr("opening","["));W({type:"text",value:C,output:`\\${C}`});continue}Ft("brackets");let I=A.value.slice(1);if(A.posix!==!0&&I[0]==="^"&&!I.includes("/")&&(C=`/${C}`),A.value+=C,Bi({value:C}),t.literalBrackets===!1||Me.hasRegexChars(I))continue;let H=Me.escapeRegex(A.value);if(_.output=_.output.slice(0,-A.value.length),t.literalBrackets===!0){_.output+=H,A.value=H;continue}A.value=`(${o}${H}|${A.value})`,_.output+=A.value;continue}if(C==="{"&&t.nobrace!==!0){Fi("braces");let I={type:"brace",value:C,output:"(",outputIndex:_.output.length,tokensIndex:_.tokens.length};U.push(I),W(I);continue}if(C==="}"){let I=U[U.length-1];if(t.nobrace===!0||!I){W({type:"text",value:C,output:C});continue}let H=")";if(I.dots===!0){let ce=s.slice(),Ce=[];for(let ye=ce.length-1;ye>=0&&(s.pop(),ce[ye].type!=="brace");ye--)ce[ye].type!=="dots"&&Ce.unshift(ce[ye].value);H=i2(Ce,t),_.backtrack=!0}if(I.comma!==!0&&I.dots!==!0){let ce=_.output.slice(0,I.outputIndex),Ce=_.tokens.slice(I.tokensIndex);I.value=I.output="\\{",C=H="\\}",_.output=ce;for(let ye of Ce)_.output+=ye.output||ye.value}W({type:"brace",value:C,output:H}),Ft("braces"),U.pop();continue}if(C==="|"){Q.length>0&&Q[Q.length-1].conditions++,W({type:"text",value:C});continue}if(C===","){let I=C,H=U[U.length-1];H&&oe[oe.length-1]==="braces"&&(H.comma=!0,I="|"),W({type:"comma",value:C,output:I});continue}if(C==="/"){if(A.type==="dot"&&_.index===_.start+1){_.start=_.index+1,_.consumed="",_.output="",s.pop(),A=a;continue}W({type:"slash",value:C,output:h});continue}if(C==="."){if(_.braces>0&&A.type==="dot"){A.value==="."&&(A.output=d);let I=U[U.length-1];A.type="dots",A.output+=C,A.value+=C,I.dots=!0;continue}if(_.braces+_.parens===0&&A.type!=="bos"&&A.type!=="slash"){W({type:"text",value:C,output:d});continue}W({type:"dot",value:C,output:d});continue}if(C==="?"){if(!(A&&A.value==="(")&&t.noextglob!==!0&&V()==="("&&V(2)!=="?"){ji("qmark",C);continue}if(A&&A.type==="paren"){let H=V(),ce=C;if(H==="<"&&!Me.supportsLookbehinds())throw new Error("Node.js v10 or higher is required for regex lookbehinds");(A.value==="("&&!/[!=<:]/.test(H)||H==="<"&&!/<([!=]|\w+>)/.test(Ie()))&&(ce=`\\${C}`),W({type:"text",value:C,output:ce});continue}if(t.dot!==!0&&(A.type==="slash"||A.type==="bos")){W({type:"qmark",value:C,output:E});continue}W({type:"qmark",value:C,output:S});continue}if(C==="!"){if(t.noextglob!==!0&&V()==="("&&(V(2)!=="?"||!/[!=<:]/.test(V(3)))){ji("negate",C);continue}if(t.nonegate!==!0&&_.index===0){Rv();continue}}if(C==="+"){if(t.noextglob!==!0&&V()==="("&&V(2)!=="?"){ji("plus",C);continue}if(A&&A.value==="("||t.regex===!1){W({type:"plus",value:C,output:p});continue}if(A&&(A.type==="bracket"||A.type==="paren"||A.type==="brace")||_.parens>0){W({type:"plus",value:C});continue}W({type:"plus",value:p});continue}if(C==="@"){if(t.noextglob!==!0&&V()==="("&&V(2)!=="?"){W({type:"at",extglob:!0,value:C,output:""});continue}W({type:"text",value:C});continue}if(C!=="*"){(C==="$"||C==="^")&&(C=`\\${C}`);let I=t2.exec(Ie());I&&(C+=I[0],_.index+=I[0].length),W({type:"text",value:C});continue}if(A&&(A.type==="globstar"||A.star===!0)){A.type="star",A.star=!0,A.value+=C,A.output=Y,_.backtrack=!0,_.globstar=!0,De(C);continue}let q=Ie();if(t.noextglob!==!0&&/^\([^?]/.test(q)){ji("star",C);continue}if(A.type==="star"){if(t.noglobstar===!0){De(C);continue}let I=A.prev,H=I.prev,ce=I.type==="slash"||I.type==="bos",Ce=H&&(H.type==="star"||H.type==="globstar");if(t.bash===!0&&(!ce||q[0]&&q[0]!=="/")){W({type:"star",value:C,output:""});continue}let ye=_.braces>0&&(I.type==="comma"||I.type==="brace"),Ms=Q.length&&(I.type==="pipe"||I.type==="paren");if(!ce&&I.type!=="paren"&&!ye&&!Ms){W({type:"star",value:C,output:""});continue}for(;q.slice(0,3)==="/**";){let zi=r[_.index+4];if(zi&&zi!=="/")break;q=q.slice(3),De("/**",3)}if(I.type==="bos"&&he()){A.type="globstar",A.value+=C,A.output=N(t),_.output=A.output,_.globstar=!0,De(C);continue}if(I.type==="slash"&&I.prev.type!=="bos"&&!Ce&&he()){_.output=_.output.slice(0,-(I.output+A.output).length),I.output=`(?:${I.output}`,A.type="globstar",A.output=N(t)+(t.strictSlashes?")":"|$)"),A.value+=C,_.globstar=!0,_.output+=I.output+A.output,De(C);continue}if(I.type==="slash"&&I.prev.type!=="bos"&&q[0]==="/"){let zi=q[1]!==void 0?"|$":"";_.output=_.output.slice(0,-(I.output+A.output).length),I.output=`(?:${I.output}`,A.type="globstar",A.output=`${N(t)}${h}|${h}${zi})`,A.value+=C,_.output+=I.output+A.output,_.globstar=!0,De(C+Ee()),W({type:"slash",value:"/",output:""});continue}if(I.type==="bos"&&q[0]==="/"){A.type="globstar",A.value+=C,A.output=`(?:^|${h}|${N(t)}${h})`,_.output=A.output,_.globstar=!0,De(C+Ee()),W({type:"slash",value:"/",output:""});continue}_.output=_.output.slice(0,-A.output.length),A.type="globstar",A.output=N(t),A.value+=C,_.output+=A.output,_.globstar=!0,De(C);continue}let ae={type:"star",value:C,output:Y};if(t.bash===!0){ae.output=".*?",(A.type==="bos"||A.type==="slash")&&(ae.output=T+ae.output),W(ae);continue}if(A&&(A.type==="bracket"||A.type==="paren")&&t.regex===!0){ae.output=C,W(ae);continue}(_.index===_.start||A.type==="slash"||A.type==="dot")&&(A.type==="dot"?(_.output+=w,A.output+=w):t.dot===!0?(_.output+=k,A.output+=k):(_.output+=T,A.output+=T),V()!=="*"&&(_.output+=b,A.output+=b)),W(ae)}for(;_.brackets>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing","]"));_.output=Me.escapeLast(_.output,"["),Ft("brackets")}for(;_.parens>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing",")"));_.output=Me.escapeLast(_.output,"("),Ft("parens")}for(;_.braces>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing","}"));_.output=Me.escapeLast(_.output,"{"),Ft("braces")}if(t.strictSlashes!==!0&&(A.type==="star"||A.type==="bracket")&&W({type:"maybe_slash",value:"",output:`${h}?`}),_.backtrack===!0){_.output="";for(let q of _.tokens)_.output+=q.output!=null?q.output:q.value,q.suffix&&(_.output+=q.suffix)}return _};wl.fastpaths=(r,e)=>{let t={...e},i=typeof t.maxLength=="number"?Math.min(ys,t.maxLength):ys,n=r.length;if(n>i)throw new SyntaxError(`Input length: ${n}, exceeds maximum allowed length: ${i}`);r=eg[r]||r;let a=Me.isWindows(e),{DOT_LITERAL:s,SLASH_LITERAL:o,ONE_CHAR:l,DOTS_SLASH:c,NO_DOT:f,NO_DOTS:d,NO_DOTS_SLASH:p,STAR:h,START_ANCHOR:b}=gs.globChars(a),v=t.dot?d:f,y=t.dot?p:f,w=t.capture?"":"?:",k={negated:!1,prefix:""},S=t.bash===!0?".*?":h;t.capture&&(S=`(${S})`);let E=T=>T.noglobstar===!0?S:`(${w}(?:(?!${b}${T.dot?c:s}).)*?)`,O=T=>{switch(T){case"*":return`${v}${l}${S}`;case".*":return`${s}${l}${S}`;case"*.*":return`${v}${S}${s}${l}${S}`;case"*/*":return`${v}${S}${o}${l}${y}${S}`;case"**":return v+E(t);case"**/*":return`(?:${v}${E(t)}${o})?${y}${l}${S}`;case"**/*.*":return`(?:${v}${E(t)}${o})?${y}${S}${s}${l}${S}`;case"**/.*":return`(?:${v}${E(t)}${o})?${s}${l}${S}`;default:{let F=/^(.*?)\.(\w+)$/.exec(T);if(!F)return;let Y=O(F[1]);return Y?Y+s+F[2]:void 0}}},B=Me.removePrefix(r,k),N=O(B);return N&&t.strictSlashes!==!0&&(N+=`${o}?`),N};tg.exports=wl});var ng=x((u6,ig)=>{u();"use strict";var n2=(et(),Ur),s2=Zm(),vl=rg(),xl=Ii(),a2=Pi(),o2=r=>r&&typeof r=="object"&&!Array.isArray(r),de=(r,e,t=!1)=>{if(Array.isArray(r)){let f=r.map(p=>de(p,e,t));return p=>{for(let h of f){let b=h(p);if(b)return b}return!1}}let i=o2(r)&&r.tokens&&r.input;if(r===""||typeof r!="string"&&!i)throw new TypeError("Expected pattern to be a non-empty string");let n=e||{},a=xl.isWindows(e),s=i?de.compileRe(r,e):de.makeRe(r,e,!1,!0),o=s.state;delete s.state;let l=()=>!1;if(n.ignore){let f={...e,ignore:null,onMatch:null,onResult:null};l=de(n.ignore,f,t)}let c=(f,d=!1)=>{let{isMatch:p,match:h,output:b}=de.test(f,s,e,{glob:r,posix:a}),v={glob:r,state:o,regex:s,posix:a,input:f,output:b,match:h,isMatch:p};return typeof n.onResult=="function"&&n.onResult(v),p===!1?(v.isMatch=!1,d?v:!1):l(f)?(typeof n.onIgnore=="function"&&n.onIgnore(v),v.isMatch=!1,d?v:!1):(typeof n.onMatch=="function"&&n.onMatch(v),d?v:!0)};return t&&(c.state=o),c};de.test=(r,e,t,{glob:i,posix:n}={})=>{if(typeof r!="string")throw new TypeError("Expected input to be a string");if(r==="")return{isMatch:!1,output:""};let a=t||{},s=a.format||(n?xl.toPosixSlashes:null),o=r===i,l=o&&s?s(r):r;return o===!1&&(l=s?s(r):r,o=l===i),(o===!1||a.capture===!0)&&(a.matchBase===!0||a.basename===!0?o=de.matchBase(r,e,t,n):o=e.exec(l)),{isMatch:Boolean(o),match:o,output:l}};de.matchBase=(r,e,t,i=xl.isWindows(t))=>(e instanceof RegExp?e:de.makeRe(e,t)).test(n2.basename(r));de.isMatch=(r,e,t)=>de(e,t)(r);de.parse=(r,e)=>Array.isArray(r)?r.map(t=>de.parse(t,e)):vl(r,{...e,fastpaths:!1});de.scan=(r,e)=>s2(r,e);de.compileRe=(r,e,t=!1,i=!1)=>{if(t===!0)return r.output;let n=e||{},a=n.contains?"":"^",s=n.contains?"":"$",o=`${a}(?:${r.output})${s}`;r&&r.negated===!0&&(o=`^(?!${o}).*$`);let l=de.toRegex(o,e);return i===!0&&(l.state=r),l};de.makeRe=(r,e={},t=!1,i=!1)=>{if(!r||typeof r!="string")throw new TypeError("Expected a non-empty string");let n={negated:!1,fastpaths:!0};return e.fastpaths!==!1&&(r[0]==="."||r[0]==="*")&&(n.output=vl.fastpaths(r,e)),n.output||(n=vl(r,e)),de.compileRe(n,e,t,i)};de.toRegex=(r,e)=>{try{let t=e||{};return new RegExp(r,t.flags||(t.nocase?"i":""))}catch(t){if(e&&e.debug===!0)throw t;return/$^/}};de.constants=a2;ig.exports=de});var ag=x((f6,sg)=>{u();"use strict";sg.exports=ng()});var cg=x((c6,fg)=>{u();"use strict";var og=(Bn(),Nn),lg=Fm(),ot=ag(),kl=Ii(),ug=r=>r===""||r==="./",fe=(r,e,t)=>{e=[].concat(e),r=[].concat(r);let i=new Set,n=new Set,a=new Set,s=0,o=f=>{a.add(f.output),t&&t.onResult&&t.onResult(f)};for(let f=0;f<e.length;f++){let d=ot(String(e[f]),{...t,onResult:o},!0),p=d.state.negated||d.state.negatedExtglob;p&&s++;for(let h of r){let b=d(h,!0);!(p?!b.isMatch:b.isMatch)||(p?i.add(b.output):(i.delete(b.output),n.add(b.output)))}}let c=(s===e.length?[...a]:[...n]).filter(f=>!i.has(f));if(t&&c.length===0){if(t.failglob===!0)throw new Error(`No matches found for "${e.join(", ")}"`);if(t.nonull===!0||t.nullglob===!0)return t.unescape?e.map(f=>f.replace(/\\/g,"")):e}return c};fe.match=fe;fe.matcher=(r,e)=>ot(r,e);fe.isMatch=(r,e,t)=>ot(e,t)(r);fe.any=fe.isMatch;fe.not=(r,e,t={})=>{e=[].concat(e).map(String);let i=new Set,n=[],a=o=>{t.onResult&&t.onResult(o),n.push(o.output)},s=new Set(fe(r,e,{...t,onResult:a}));for(let o of n)s.has(o)||i.add(o);return[...i]};fe.contains=(r,e,t)=>{if(typeof r!="string")throw new TypeError(`Expected a string: "${og.inspect(r)}"`);if(Array.isArray(e))return e.some(i=>fe.contains(r,i,t));if(typeof e=="string"){if(ug(r)||ug(e))return!1;if(r.includes(e)||r.startsWith("./")&&r.slice(2).includes(e))return!0}return fe.isMatch(r,e,{...t,contains:!0})};fe.matchKeys=(r,e,t)=>{if(!kl.isObject(r))throw new TypeError("Expected the first argument to be an object");let i=fe(Object.keys(r),e,t),n={};for(let a of i)n[a]=r[a];return n};fe.some=(r,e,t)=>{let i=[].concat(r);for(let n of[].concat(e)){let a=ot(String(n),t);if(i.some(s=>a(s)))return!0}return!1};fe.every=(r,e,t)=>{let i=[].concat(r);for(let n of[].concat(e)){let a=ot(String(n),t);if(!i.every(s=>a(s)))return!1}return!0};fe.all=(r,e,t)=>{if(typeof r!="string")throw new TypeError(`Expected a string: "${og.inspect(r)}"`);return[].concat(e).every(i=>ot(i,t)(r))};fe.capture=(r,e,t)=>{let i=kl.isWindows(t),a=ot.makeRe(String(r),{...t,capture:!0}).exec(i?kl.toPosixSlashes(e):e);if(a)return a.slice(1).map(s=>s===void 0?"":s)};fe.makeRe=(...r)=>ot.makeRe(...r);fe.scan=(...r)=>ot.scan(...r);fe.parse=(r,e)=>{let t=[];for(let i of[].concat(r||[]))for(let n of lg(String(i),e))t.push(ot.parse(n,e));return t};fe.braces=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");return e&&e.nobrace===!0||!/\{.*\}/.test(r)?[r]:lg(r,e)};fe.braceExpand=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");return fe.braces(r,{...e,expand:!0})};fg.exports=fe});function dg(r,e){let t=e.content.files;t=t.filter(o=>typeof o=="string"),t=t.map(al);let i=cs.generateTasks(t),n=[],a=[];for(let o of i)n.push(...o.positive.map(l=>hg(l,!1))),a.push(...o.negative.map(l=>hg(l,!0)));let s=[...n,...a];return s=u2(r,s),s=s.flatMap(f2),s=s.map(l2),s}function hg(r,e){let t={original:r,base:r,ignore:e,pattern:r,glob:null};return Kh(r)&&Object.assign(t,rm(r)),t}function l2(r){let e=al(r.base);return e=cs.escapePath(e),r.pattern=r.glob?`${e}/${r.glob}`:e,r.pattern=r.ignore?`!${r.pattern}`:r.pattern,r}function u2(r,e){let t=[];return r.userConfigPath&&r.tailwindConfig.content.relative&&(t=[me.dirname(r.userConfigPath)]),e.map(i=>(i.base=me.resolve(...t,i.base),i))}function f2(r){let e=[r];try{let t=be.realpathSync(r.base);t!==r.base&&e.push({...r,base:t})}catch{}return e}function mg(r,e,t){let i=r.tailwindConfig.content.files.filter(s=>typeof s.raw=="string").map(({raw:s,extension:o="html"})=>({content:s,extension:o})),[n,a]=p2(e,t);for(let s of n){let o=me.extname(s).slice(1);i.push({file:s,extension:o})}return[i,a]}function c2(r){if(!r.some(a=>a.includes("**")&&!yg.test(a)))return()=>{};let t=[],i=[];for(let a of r){let s=pg.default.matcher(a);yg.test(a)&&i.push(s),t.push(s)}let n=!1;return a=>{if(n||i.some(f=>f(a)))return;let s=t.findIndex(f=>f(a));if(s===-1)return;let o=r[s],l=me.relative(m.cwd(),o);l[0]!=="."&&(l=`./${l}`);let c=gg.find(f=>a.includes(f));c&&(n=!0,G.warn("broad-content-glob-pattern",[`Your \`content\` configuration includes a pattern which looks like it's accidentally matching all of \`${c}\` and can cause serious performance issues.`,`Pattern: \`${l}\``,"See our documentation for recommendations:","https://tailwindcss.com/docs/content-configuration#pattern-recommendations"]))}}function p2(r,e){let t=r.map(o=>o.pattern),i=new Map,n=c2(t),a=new Set;Je.DEBUG&&console.time("Finding changed files");let s=cs.sync(t,{absolute:!0});for(let o of s){n(o);let l=e.get(o)||-1/0,c=be.statSync(o).mtimeMs;c>l&&(a.add(o),i.set(o,c))}return Je.DEBUG&&console.timeEnd("Finding changed files"),[a,i]}var pg,gg,yg,bg=R(()=>{u();ft();et();Xh();Jh();Zh();im();It();Be();pg=pe(cg());gg=["node_modules"],yg=new RegExp(`(${gg.map(r=>String.raw`\b${r}\b`).join("|")})`)});function wg(){}var vg=R(()=>{u()});function g2(r,e){for(let t of e){let i=`${r}${t}`;if(be.existsSync(i)&&be.statSync(i).isFile())return i}for(let t of e){let i=`${r}/index${t}`;if(be.existsSync(i))return i}return null}function*xg(r,e,t,i=me.extname(r)){let n=g2(me.resolve(e,r),d2.includes(i)?h2:m2);if(n===null||t.has(n))return;t.add(n),yield n,e=me.dirname(n),i=me.extname(n);let a=be.readFileSync(n,"utf-8");for(let s of[...a.matchAll(/import[\s\S]*?['"](.{3,}?)['"]/gi),...a.matchAll(/import[\s\S]*from[\s\S]*?['"](.{3,}?)['"]/gi),...a.matchAll(/require\(['"`](.+)['"`]\)/gi)])!s[1].startsWith(".")||(yield*xg(s[1],e,t,i))}function Sl(r){return r===null?new Set:new Set(xg(r,me.dirname(r),new Set))}var d2,h2,m2,kg=R(()=>{u();ft();et();d2=[".js",".cjs",".mjs"],h2=["",".js",".cjs",".mjs",".ts",".cts",".mts",".jsx",".tsx"],m2=["",".ts",".cts",".mts",".tsx",".js",".cjs",".mjs",".jsx"]});function y2(r,e){if(Al.has(r))return Al.get(r);let t=dg(r,e);return Al.set(r,t).get(r)}function b2(r){let e=na(r);if(e!==null){let[i,n,a,s]=Ag.get(e)||[],o=Sl(e),l=!1,c=new Map;for(let p of o){let h=be.statSync(p).mtimeMs;c.set(p,h),(!s||!s.has(p)||h>s.get(p))&&(l=!0)}if(!l)return[i,e,n,a];for(let p of o)delete pf.cache[p];let f=sl(zr(wg(e))),d=Vi(f);return Ag.set(e,[f,d,o,c]),[f,e,d,o]}let t=zr(r?.config??r??{});return t=sl(t),[t,null,Vi(t),[]]}function Cl(r){return({tailwindDirectives:e,registerDependency:t})=>(i,n)=>{let[a,s,o,l]=b2(r),c=new Set(l);if(e.size>0){c.add(n.opts.from);for(let b of n.messages)b.type==="dependency"&&c.add(b.file)}let[f,,d]=zh(i,n,a,s,o,c),p=fs(f),h=y2(f,a);if(e.size>0){for(let y of h)for(let w of rl(y))t(w);let[b,v]=mg(f,h,p);for(let y of b)f.changedContent.push(y);for(let[y,w]of v.entries())d.set(y,w)}for(let b of l)t({type:"dependency",file:b});for(let[b,v]of d.entries())p.set(b,v);return f}}var Sg,Ag,Al,Cg=R(()=>{u();ft();Sg=pe(Ns());yf();ia();sc();_i();Uh();Yh();bg();vg();kg();Ag=new Sg.default({maxSize:100}),Al=new WeakMap});function _l(r){let e=new Set,t=new Set,i=new Set;if(r.walkAtRules(n=>{n.name==="apply"&&i.add(n),n.name==="import"&&(n.params==='"tailwindcss/base"'||n.params==="'tailwindcss/base'"?(n.name="tailwind",n.params="base"):n.params==='"tailwindcss/components"'||n.params==="'tailwindcss/components'"?(n.name="tailwind",n.params="components"):n.params==='"tailwindcss/utilities"'||n.params==="'tailwindcss/utilities'"?(n.name="tailwind",n.params="utilities"):(n.params==='"tailwindcss/screens"'||n.params==="'tailwindcss/screens'"||n.params==='"tailwindcss/variants"'||n.params==="'tailwindcss/variants'")&&(n.name="tailwind",n.params="variants")),n.name==="tailwind"&&(n.params==="screens"&&(n.params="variants"),e.add(n.params)),["layer","responsive","variants"].includes(n.name)&&(["responsive","variants"].includes(n.name)&&G.warn(`${n.name}-at-rule-deprecated`,[`The \`@${n.name}\` directive has been deprecated in Tailwind CSS v3.0.`,"Use `@layer utilities` or `@layer components` instead.","https://tailwindcss.com/docs/upgrade-guide#replace-variants-with-layer"]),t.add(n))}),!e.has("base")||!e.has("components")||!e.has("utilities")){for(let n of t)if(n.name==="layer"&&["base","components","utilities"].includes(n.params)){if(!e.has(n.params))throw n.error(`\`@layer ${n.params}\` is used but no matching \`@tailwind ${n.params}\` directive is present.`)}else if(n.name==="responsive"){if(!e.has("utilities"))throw n.error("`@responsive` is used but `@tailwind utilities` is missing.")}else if(n.name==="variants"&&!e.has("utilities"))throw n.error("`@variants` is used but `@tailwind utilities` is missing.")}return{tailwindDirectives:e,applyDirectives:i}}var _g=R(()=>{u();Be()});function Qt(r,e=void 0,t=void 0){return r.map(i=>{let n=i.clone();return t!==void 0&&(n.raws.tailwind={...n.raws.tailwind,...t}),e!==void 0&&Eg(n,a=>{if(a.raws.tailwind?.preserveSource===!0&&a.source)return!1;a.source=e}),n})}function Eg(r,e){e(r)!==!1&&r.each?.(t=>Eg(t,e))}var Og=R(()=>{u()});function El(r){return r=Array.isArray(r)?r:[r],r=r.map(e=>e instanceof RegExp?e.source:e),r.join("")}function Ne(r){return new RegExp(El(r),"g")}function qt(r){return`(?:${r.map(El).join("|")})`}function Ol(r){return`(?:${El(r)})?`}function Rg(r){return r&&w2.test(r)?r.replace(Tg,"\\$&"):r||""}var Tg,w2,Pg=R(()=>{u();Tg=/[\\^$.*+?()[\]{}|]/g,w2=RegExp(Tg.source)});function Ig(r){let e=Array.from(v2(r));return t=>{let i=[];for(let n of e)for(let a of t.match(n)??[])i.push(S2(a));for(let n of i.slice()){let a=ve(n,".");for(let s=0;s<a.length;s++){let o=a[s];if(s>=a.length-1){i.push(o);continue}let l=Number(a[s+1]);isNaN(l)?i.push(o):s++}}return i}}function*v2(r){let e=r.tailwindConfig.separator,t=r.tailwindConfig.prefix!==""?Ol(Ne([/-?/,Rg(r.tailwindConfig.prefix)])):"",i=qt([/\[[^\s:'"`]+:[^\s\[\]]+\]/,/\[[^\s:'"`\]]+:[^\s]+?\[[^\s]+\][^\s]+?\]/,Ne([qt([/-?(?:\w+)/,/@(?:\w+)/]),Ol(qt([Ne([qt([/-(?:\w+-)*\['[^\s]+'\]/,/-(?:\w+-)*\["[^\s]+"\]/,/-(?:\w+-)*\[`[^\s]+`\]/,/-(?:\w+-)*\[(?:[^\s\[\]]+\[[^\s\[\]]+\])*[^\s:\[\]]+\]/]),/(?![{([]])/,/(?:\/[^\s'"`\\><$]*)?/]),Ne([qt([/-(?:\w+-)*\['[^\s]+'\]/,/-(?:\w+-)*\["[^\s]+"\]/,/-(?:\w+-)*\[`[^\s]+`\]/,/-(?:\w+-)*\[(?:[^\s\[\]]+\[[^\s\[\]]+\])*[^\s\[\]]+\]/]),/(?![{([]])/,/(?:\/[^\s'"`\\$]*)?/]),/[-\/][^\s'"`\\$={><]*/]))])]),n=[qt([Ne([/@\[[^\s"'`]+\](\/[^\s"'`]+)?/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s"'`]+\]\/[\w_-]+/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s"'`]+\]/,e]),Ne([/[^\s"'`\[\\]+/,e])]),qt([Ne([/([^\s"'`\[\\]+-)?\[[^\s`]+\]\/[\w_-]+/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s`]+\]/,e]),Ne([/[^\s`\[\\]+/,e])])];for(let a of n)yield Ne(["((?=((",a,")+))\\2)?",/!?/,t,i]);yield/[^<>"'`\s.(){}[\]#=%$][^<>"'`\s(){}[\]#=%$]*[^<>"'`\s.(){}[\]#=%:$]/g}function S2(r){if(!r.includes("-["))return r;let e=0,t=[],i=r.matchAll(x2);i=Array.from(i).flatMap(n=>{let[,...a]=n;return a.map((s,o)=>Object.assign([],n,{index:n.index+o,0:s}))});for(let n of i){let a=n[0],s=t[t.length-1];if(a===s?t.pop():(a==="'"||a==='"'||a==="`")&&t.push(a),!s){if(a==="["){e++;continue}else if(a==="]"){e--;continue}if(e<0)return r.substring(0,n.index-1);if(e===0&&!k2.test(a))return r.substring(0,n.index)}}return r}var x2,k2,Dg=R(()=>{u();Pg();zt();x2=/([\[\]'"`])([^\[\]'"`])?/g,k2=/[^"'`\s<>\]]+/});function A2(r,e){let t=r.tailwindConfig.content.extract;return t[e]||t.DEFAULT||$g[e]||$g.DEFAULT(r)}function C2(r,e){let t=r.content.transform;return t[e]||t.DEFAULT||Lg[e]||Lg.DEFAULT}function _2(r,e,t,i){qi.has(e)||qi.set(e,new qg.default({maxSize:25e3}));for(let n of r.split(`
+`))if(n=n.trim(),!i.has(n))if(i.add(n),qi.get(e).has(n))for(let a of qi.get(e).get(n))t.add(a);else{let a=e(n).filter(o=>o!=="!*"),s=new Set(a);for(let o of s)t.add(o);qi.get(e).set(n,s)}}function E2(r,e){let t=e.offsets.sort(r),i={base:new Set,defaults:new Set,components:new Set,utilities:new Set,variants:new Set};for(let[n,a]of t)i[n.layer].add(a);return i}function Tl(r){return async e=>{let t={base:null,components:null,utilities:null,variants:null};if(e.walkAtRules(y=>{y.name==="tailwind"&&Object.keys(t).includes(y.params)&&(t[y.params]=y)}),Object.values(t).every(y=>y===null))return e;let i=new Set([...r.candidates??[],gt]),n=new Set;bt.DEBUG&&console.time("Reading changed files");let a=[];for(let y of r.changedContent){let w=C2(r.tailwindConfig,y.extension),k=A2(r,y.extension);a.push([y,{transformer:w,extractor:k}])}let s=500;for(let y=0;y<a.length;y+=s){let w=a.slice(y,y+s);await Promise.all(w.map(async([{file:k,content:S},{transformer:E,extractor:O}])=>{S=k?await be.promises.readFile(k,"utf8"):S,_2(E(S),O,i,n)}))}bt.DEBUG&&console.timeEnd("Reading changed files");let o=r.classCache.size;bt.DEBUG&&console.time("Generate rules"),bt.DEBUG&&console.time("Sorting candidates");let l=new Set([...i].sort((y,w)=>y===w?0:y<w?-1:1));bt.DEBUG&&console.timeEnd("Sorting candidates"),ss(l,r),bt.DEBUG&&console.timeEnd("Generate rules"),bt.DEBUG&&console.time("Build stylesheet"),(r.stylesheetCache===null||r.classCache.size!==o)&&(r.stylesheetCache=E2([...r.ruleCache],r)),bt.DEBUG&&console.timeEnd("Build stylesheet");let{defaults:c,base:f,components:d,utilities:p,variants:h}=r.stylesheetCache;t.base&&(t.base.before(Qt([...c,...f],t.base.source,{layer:"base"})),t.base.remove()),t.components&&(t.components.before(Qt([...d],t.components.source,{layer:"components"})),t.components.remove()),t.utilities&&(t.utilities.before(Qt([...p],t.utilities.source,{layer:"utilities"})),t.utilities.remove());let b=Array.from(h).filter(y=>{let w=y.raws.tailwind?.parentLayer;return w==="components"?t.components!==null:w==="utilities"?t.utilities!==null:!0});t.variants?(t.variants.before(Qt(b,t.variants.source,{layer:"variants"})),t.variants.remove()):b.length>0&&e.append(Qt(b,e.source,{layer:"variants"})),e.source.end=e.source.end??e.source.start;let v=b.some(y=>y.raws.tailwind?.parentLayer==="utilities");t.utilities&&p.size===0&&!v&&G.warn("content-problems",["No utility classes were detected in your source files. If this is unexpected, double-check the `content` option in your Tailwind CSS configuration.","https://tailwindcss.com/docs/content-configuration"]),bt.DEBUG&&(console.log("Potential classes: ",i.size),console.log("Active contexts: ",Zn.size)),r.changedContent=[],e.walkAtRules("layer",y=>{Object.keys(t).includes(y.params)&&y.remove()})}}var qg,bt,$g,Lg,qi,Mg=R(()=>{u();ft();qg=pe(Ns());It();as();Be();Og();Dg();bt=Je,$g={DEFAULT:Ig},Lg={DEFAULT:r=>r,svelte:r=>r.replace(/(?:^|\s)class:/g," ")};qi=new WeakMap});function ws(r){let e=new Map;ee.root({nodes:[r.clone()]}).walkRules(a=>{(0,bs.default)(s=>{s.walkClasses(o=>{let l=o.parent.toString(),c=e.get(l);c||e.set(l,c=new Set),c.add(o.value)})}).processSync(a.selector)});let i=Array.from(e.values(),a=>Array.from(a)),n=i.flat();return Object.assign(n,{groups:i})}function Rl(r){return O2.astSync(r)}function Ng(r,e){let t=new Set;for(let i of r)t.add(i.split(e).pop());return Array.from(t)}function Bg(r,e){let t=r.tailwindConfig.prefix;return typeof t=="function"?t(e):t+e}function*Fg(r){for(yield r;r.parent;)yield r.parent,r=r.parent}function T2(r,e={}){let t=r.nodes;r.nodes=[];let i=r.clone(e);return r.nodes=t,i}function R2(r){for(let e of Fg(r))if(r!==e){if(e.type==="root")break;r=T2(e,{nodes:[r]})}return r}function P2(r,e){let t=new Map;return r.walkRules(i=>{for(let s of Fg(i))if(s.raws.tailwind?.layer!==void 0)return;let n=R2(i),a=e.offsets.create("user");for(let s of ws(i)){let o=t.get(s)||[];t.set(s,o),o.push([{layer:"user",sort:a,important:!1},n])}}),t}function I2(r,e){for(let t of r){if(e.notClassCache.has(t)||e.applyClassCache.has(t))continue;if(e.classCache.has(t)){e.applyClassCache.set(t,e.classCache.get(t).map(([n,a])=>[n,a.clone()]));continue}let i=Array.from(Go(t,e));if(i.length===0){e.notClassCache.add(t);continue}e.applyClassCache.set(t,i)}return e.applyClassCache}function D2(r){let e=null;return{get:t=>(e=e||r(),e.get(t)),has:t=>(e=e||r(),e.has(t))}}function q2(r){return{get:e=>r.flatMap(t=>t.get(e)||[]),has:e=>r.some(t=>t.has(e))}}function jg(r){let e=r.split(/[\s\t\n]+/g);return e[e.length-1]==="!important"?[e.slice(0,-1),!0]:[e,!1]}function zg(r,e,t){let i=new Set,n=[];if(r.walkAtRules("apply",l=>{let[c]=jg(l.params);for(let f of c)i.add(f);n.push(l)}),n.length===0)return;let a=q2([t,I2(i,e)]);function s(l,c,f){let d=Rl(l),p=Rl(c),b=Rl(`.${Te(f)}`).nodes[0].nodes[0];return d.each(v=>{let y=new Set;p.each(w=>{let k=!1;w=w.clone(),w.walkClasses(S=>{S.value===b.value&&(k||(S.replaceWith(...v.nodes.map(E=>E.clone())),y.add(w),k=!0))})});for(let w of y){let k=[[]];for(let S of w.nodes)S.type==="combinator"?(k.push(S),k.push([])):k[k.length-1].push(S);w.nodes=[];for(let S of k)Array.isArray(S)&&S.sort((E,O)=>E.type==="tag"&&O.type==="class"?-1:E.type==="class"&&O.type==="tag"?1:E.type==="class"&&O.type==="pseudo"&&O.value.startsWith("::")?-1:E.type==="pseudo"&&E.value.startsWith("::")&&O.type==="class"?1:0),w.nodes=w.nodes.concat(S)}v.replaceWith(...y)}),d.toString()}let o=new Map;for(let l of n){let[c]=o.get(l.parent)||[[],l.source];o.set(l.parent,[c,l.source]);let[f,d]=jg(l.params);if(l.parent.type==="atrule"){if(l.parent.name==="screen"){let p=l.parent.params;throw l.error(`@apply is not supported within nested at-rules like @screen. We suggest you write this as @apply ${f.map(h=>`${p}:${h}`).join(" ")} instead.`)}throw l.error(`@apply is not supported within nested at-rules like @${l.parent.name}. You can fix this by un-nesting @${l.parent.name}.`)}for(let p of f){if([Bg(e,"group"),Bg(e,"peer")].includes(p))throw l.error(`@apply should not be used with the '${p}' utility`);if(!a.has(p))throw l.error(`The \`${p}\` class does not exist. If \`${p}\` is a custom class, make sure it is defined within a \`@layer\` directive.`);let h=a.get(p);for(let[,b]of h)b.type!=="atrule"&&b.walkRules(()=>{throw l.error([`The \`${p}\` class cannot be used with \`@apply\` because \`@apply\` does not currently support nested CSS.`,"Rewrite the selector without nesting or configure the `tailwindcss/nesting` plugin:","https://tailwindcss.com/docs/using-with-preprocessors#nesting"].join(`
+`))});c.push([p,d,h])}}for(let[l,[c,f]]of o){let d=[];for(let[h,b,v]of c){let y=[h,...Ng([h],e.tailwindConfig.separator)];for(let[w,k]of v){let S=ws(l),E=ws(k);if(E=E.groups.filter(T=>T.some(F=>y.includes(F))).flat(),E=E.concat(Ng(E,e.tailwindConfig.separator)),S.some(T=>E.includes(T)))throw k.error(`You cannot \`@apply\` the \`${h}\` utility here because it creates a circular dependency.`);let B=ee.root({nodes:[k.clone()]});B.walk(T=>{T.source=f}),(k.type!=="atrule"||k.type==="atrule"&&k.name!=="keyframes")&&B.walkRules(T=>{if(!ws(T).some(U=>U===h)){T.remove();return}let F=typeof e.tailwindConfig.important=="string"?e.tailwindConfig.important:null,_=l.raws.tailwind!==void 0&&F&&l.selector.indexOf(F)===0?l.selector.slice(F.length):l.selector;_===""&&(_=l.selector),T.selector=s(_,T.selector,h),F&&_!==l.selector&&(T.selector=rs(T.selector,F)),T.walkDecls(U=>{U.important=w.important||b});let Q=(0,bs.default)().astSync(T.selector);Q.each(U=>pr(U)),T.selector=Q.toString()}),!!B.nodes[0]&&d.push([w.sort,B.nodes[0]])}}let p=e.offsets.sort(d).map(h=>h[1]);l.after(p)}for(let l of n)l.parent.nodes.length>1?l.remove():l.parent.remove();zg(r,e,t)}function Pl(r){return e=>{let t=D2(()=>P2(e,r));zg(e,r,t)}}var bs,O2,Ug=R(()=>{u();Ot();bs=pe(it());as();fr();Vo();es();O2=(0,bs.default)()});var Vg=x((rq,vs)=>{u();(function(){"use strict";function r(i,n,a){if(!i)return null;r.caseSensitive||(i=i.toLowerCase());var s=r.threshold===null?null:r.threshold*i.length,o=r.thresholdAbsolute,l;s!==null&&o!==null?l=Math.min(s,o):s!==null?l=s:o!==null?l=o:l=null;var c,f,d,p,h,b=n.length;for(h=0;h<b;h++)if(f=n[h],a&&(f=f[a]),!!f&&(r.caseSensitive?d=f:d=f.toLowerCase(),p=t(i,d,l),(l===null||p<l)&&(l=p,a&&r.returnWinningObject?c=n[h]:c=f,r.returnFirstMatch)))return c;return c||r.nullResultValue}r.threshold=.4,r.thresholdAbsolute=20,r.caseSensitive=!1,r.nullResultValue=null,r.returnWinningObject=null,r.returnFirstMatch=!1,typeof vs!="undefined"&&vs.exports?vs.exports=r:window.didYouMean=r;var e=Math.pow(2,32)-1;function t(i,n,a){a=a||a===0?a:e;var s=i.length,o=n.length;if(s===0)return Math.min(a+1,o);if(o===0)return Math.min(a+1,s);if(Math.abs(s-o)>a)return a+1;var l=[],c,f,d,p,h;for(c=0;c<=o;c++)l[c]=[c];for(f=0;f<=s;f++)l[0][f]=f;for(c=1;c<=o;c++){for(d=e,p=1,c>a&&(p=c-a),h=o+1,h>a+c&&(h=a+c),f=1;f<=s;f++)fh?l[c][f]=a+1:n.charAt(c-1)===i.charAt(f-1)?l[c][f]=l[c-1][f-1]:l[c][f]=Math.min(l[c-1][f-1]+1,Math.min(l[c][f-1]+1,l[c-1][f]+1)),l[c][f]<d&&(d=l[c][f]);if(d>a)return a+1}return l[o][s]}})()});var Wg=x((iq,Hg)=>{u();var Il="(".charCodeAt(0),Dl=")".charCodeAt(0),xs="'".charCodeAt(0),ql='"'.charCodeAt(0),$l="\\".charCodeAt(0),yr="/".charCodeAt(0),Ll=",".charCodeAt(0),Ml=":".charCodeAt(0),ks="*".charCodeAt(0),$2="u".charCodeAt(0),L2="U".charCodeAt(0),M2="+".charCodeAt(0),N2=/^[a-f0-9?-]+$/i;Hg.exports=function(r){for(var e=[],t=r,i,n,a,s,o,l,c,f,d=0,p=t.charCodeAt(d),h=t.length,b=[{nodes:e}],v=0,y,w="",k="",S="";d<h;)if(p<=32){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);s=t.slice(d,i),a=e[e.length-1],p===Dl&&v?S=s:a&&a.type==="div"?(a.after=s,a.sourceEndIndex+=s.length):p===Ll||p===Ml||p===yr&&t.charCodeAt(i+1)!==ks&&(!y||y&&y.type==="function"&&!1)?k=s:e.push({type:"space",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}else if(p===xs||p===ql){i=d,n=p===xs?"'":'"',s={type:"string",sourceIndex:d,quote:n};do if(o=!1,i=t.indexOf(n,i+1),~i)for(l=i;t.charCodeAt(l-1)===$l;)l-=1,o=!o;else t+=n,i=t.length-1,s.unclosed=!0;while(o);s.value=t.slice(d+1,i),s.sourceEndIndex=s.unclosed?i:i+1,e.push(s),d=i+1,p=t.charCodeAt(d)}else if(p===yr&&t.charCodeAt(d+1)===ks)i=t.indexOf("*/",d),s={type:"comment",sourceIndex:d,sourceEndIndex:i+2},i===-1&&(s.unclosed=!0,i=t.length,s.sourceEndIndex=i),s.value=t.slice(d+2,i),e.push(s),d=i+2,p=t.charCodeAt(d);else if((p===yr||p===ks)&&y&&y.type==="function")s=t[d],e.push({type:"word",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s}),d+=1,p=t.charCodeAt(d);else if(p===yr||p===Ll||p===Ml)s=t[d],e.push({type:"div",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s,before:k,after:""}),k="",d+=1,p=t.charCodeAt(d);else if(Il===p){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);if(f=d,s={type:"function",sourceIndex:d-w.length,value:w,before:t.slice(f+1,i)},d=i,w==="url"&&p!==xs&&p!==ql){i-=1;do if(o=!1,i=t.indexOf(")",i+1),~i)for(l=i;t.charCodeAt(l-1)===$l;)l-=1,o=!o;else t+=")",i=t.length-1,s.unclosed=!0;while(o);c=i;do c-=1,p=t.charCodeAt(c);while(p<=32);f<c?(d!==c+1?s.nodes=[{type:"word",sourceIndex:d,sourceEndIndex:c+1,value:t.slice(d,c+1)}]:s.nodes=[],s.unclosed&&c+1!==i?(s.after="",s.nodes.push({type:"space",sourceIndex:c+1,sourceEndIndex:i,value:t.slice(c+1,i)})):(s.after=t.slice(c+1,i),s.sourceEndIndex=i)):(s.after="",s.nodes=[]),d=i+1,s.sourceEndIndex=s.unclosed?i:d,p=t.charCodeAt(d),e.push(s)}else v+=1,s.after="",s.sourceEndIndex=d+1,e.push(s),b.push(s),e=s.nodes=[],y=s;w=""}else if(Dl===p&&v)d+=1,p=t.charCodeAt(d),y.after=S,y.sourceEndIndex+=S.length,S="",v-=1,b[b.length-1].sourceEndIndex=d,b.pop(),y=b[v],e=y.nodes;else{i=d;do p===$l&&(i+=1),i+=1,p=t.charCodeAt(i);while(i<h&&!(p<=32||p===xs||p===ql||p===Ll||p===Ml||p===yr||p===Il||p===ks&&y&&y.type==="function"&&!0||p===yr&&y.type==="function"&&!0||p===Dl&&v));s=t.slice(d,i),Il===p?w=s:($2===s.charCodeAt(0)||L2===s.charCodeAt(0))&&M2===s.charCodeAt(1)&&N2.test(s.slice(2))?e.push({type:"unicode-range",sourceIndex:d,sourceEndIndex:i,value:s}):e.push({type:"word",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}for(d=b.length-1;d;d-=1)b[d].unclosed=!0,b[d].sourceEndIndex=t.length;return b[0].nodes}});var Qg=x((nq,Gg)=>{u();Gg.exports=function r(e,t,i){var n,a,s,o;for(n=0,a=e.length;n<a;n+=1)s=e[n],i||(o=t(s,n,e)),o!==!1&&s.type==="function"&&Array.isArray(s.nodes)&&r(s.nodes,t,i),i&&t(s,n,e)}});var Jg=x((sq,Xg)=>{u();function Yg(r,e){var t=r.type,i=r.value,n,a;return e&&(a=e(r))!==void 0?a:t==="word"||t==="space"?i:t==="string"?(n=r.quote||"",n+i+(r.unclosed?"":n)):t==="comment"?"/*"+i+(r.unclosed?"":"*/"):t==="div"?(r.before||"")+i+(r.after||""):Array.isArray(r.nodes)?(n=Kg(r.nodes,e),t!=="function"?n:i+"("+(r.before||"")+n+(r.after||"")+(r.unclosed?"":")")):i}function Kg(r,e){var t,i;if(Array.isArray(r)){for(t="",i=r.length-1;~i;i-=1)t=Yg(r[i],e)+t;return t}return Yg(r,e)}Xg.exports=Kg});var ey=x((aq,Zg)=>{u();var Ss="-".charCodeAt(0),As="+".charCodeAt(0),Nl=".".charCodeAt(0),B2="e".charCodeAt(0),F2="E".charCodeAt(0);function j2(r){var e=r.charCodeAt(0),t;if(e===As||e===Ss){if(t=r.charCodeAt(1),t>=48&&t<=57)return!0;var i=r.charCodeAt(2);return t===Nl&&i>=48&&i<=57}return e===Nl?(t=r.charCodeAt(1),t>=48&&t<=57):e>=48&&e<=57}Zg.exports=function(r){var e=0,t=r.length,i,n,a;if(t===0||!j2(r))return!1;for(i=r.charCodeAt(e),(i===As||i===Ss)&&e++;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),i===Nl&&n>=48&&n<=57)for(e+=2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),a=r.charCodeAt(e+2),(i===B2||i===F2)&&(n>=48&&n<=57||(n===As||n===Ss)&&a>=48&&a<=57))for(e+=n===As||n===Ss?3:2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;return{number:r.slice(0,e),unit:r.slice(e)}}});var ny=x((oq,iy)=>{u();var z2=Wg(),ty=Qg(),ry=Jg();function $t(r){return this instanceof $t?(this.nodes=z2(r),this):new $t(r)}$t.prototype.toString=function(){return Array.isArray(this.nodes)?ry(this.nodes):""};$t.prototype.walk=function(r,e){return ty(this.nodes,r,e),this};$t.unit=ey();$t.walk=ty;$t.stringify=ry;iy.exports=$t});function Fl(r){return typeof r=="object"&&r!==null}function U2(r,e){let t=kt(e);do if(t.pop(),(0,$i.default)(r,t)!==void 0)break;while(t.length);return t.length?t:void 0}function br(r){return typeof r=="string"?r:r.reduce((e,t,i)=>t.includes(".")?`${e}[${t}]`:i===0?t:`${e}.${t}`,"")}function ay(r){return r.map(e=>`'${e}'`).join(", ")}function oy(r){return ay(Object.keys(r))}function jl(r,e,t,i={}){let n=Array.isArray(e)?br(e):e.replace(/^['"]+|['"]+$/g,""),a=Array.isArray(e)?e:kt(n),s=(0,$i.default)(r.theme,a,t);if(s===void 0){let l=`'${n}' does not exist in your theme config.`,c=a.slice(0,-1),f=(0,$i.default)(r.theme,c);if(Fl(f)){let d=Object.keys(f).filter(h=>jl(r,[...c,h]).isValid),p=(0,sy.default)(a[a.length-1],d);p?l+=` Did you mean '${br([...c,p])}'?`:d.length>0&&(l+=` '${br(c)}' has the following valid keys: ${ay(d)}`)}else{let d=U2(r.theme,n);if(d){let p=(0,$i.default)(r.theme,d);Fl(p)?l+=` '${br(d)}' has the following keys: ${oy(p)}`:l+=` '${br(d)}' is not an object.`}else l+=` Your theme has the following top-level keys: ${oy(r.theme)}`}return{isValid:!1,error:l}}if(!(typeof s=="string"||typeof s=="number"||typeof s=="function"||s instanceof String||s instanceof Number||Array.isArray(s))){let l=`'${n}' was found but does not resolve to a string.`;if(Fl(s)){let c=Object.keys(s).filter(f=>jl(r,[...a,f]).isValid);c.length&&(l+=` Did you mean something like '${br([...a,c[0]])}'?`)}return{isValid:!1,error:l}}let[o]=a;return{isValid:!0,value:mt(o)(s,i)}}function V2(r,e,t){e=e.map(n=>ly(r,n,t));let i=[""];for(let n of e)n.type==="div"&&n.value===","?i.push(""):i[i.length-1]+=Bl.default.stringify(n);return i}function ly(r,e,t){if(e.type==="function"&&t[e.value]!==void 0){let i=V2(r,e.nodes,t);e.type="word",e.value=t[e.value](r,...i)}return e}function H2(r,e,t){return Object.keys(t).some(n=>e.includes(`${n}(`))?(0,Bl.default)(e).walk(n=>{ly(r,n,t)}).toString():e}function*G2(r){r=r.replace(/^['"]+|['"]+$/g,"");let e=r.match(/^([^\s]+)(?![^\[]*\])(?:\s*\/\s*([^\/\s]+))$/),t;yield[r,void 0],e&&(r=e[1],t=e[2],yield[r,t])}function Q2(r,e,t){let i=Array.from(G2(e)).map(([n,a])=>Object.assign(jl(r,n,t,{opacityValue:a}),{resolvedPath:n,alpha:a}));return i.find(n=>n.isValid)??i[0]}function uy(r){let e=r.tailwindConfig,t={theme:(i,n,...a)=>{let{isValid:s,value:o,error:l,alpha:c}=Q2(e,n,a.length?a:void 0);if(!s){let p=i.parent,h=p?.raws.tailwind?.candidate;if(p&&h!==void 0){r.markInvalidUtilityNode(p),p.remove(),G.warn("invalid-theme-key-in-class",[`The utility \`${h}\` contains an invalid theme value and was not generated.`]);return}throw i.error(l)}let f=Xt(o),d=f!==void 0&&typeof f=="function";return(c!==void 0||d)&&(c===void 0&&(c=1),o=Ze(f,c,f)),o},screen:(i,n)=>{n=n.replace(/^['"]+/g,"").replace(/['"]+$/g,"");let s=Rt(e.theme.screens).find(({name:o})=>o===n);if(!s)throw i.error(`The '${n}' screen does not exist in your theme.`);return Tt(s)}};return i=>{i.walk(n=>{let a=W2[n.type];a!==void 0&&(n[a]=H2(n,n[a],t))})}}var $i,sy,Bl,W2,fy=R(()=>{u();$i=pe(Oa()),sy=pe(Vg());Si();Bl=pe(ny());Xn();Qn();Gi();Lr();Fr();Be();W2={atrule:"params",decl:"value"}});function cy({tailwindConfig:{theme:r}}){return function(e){e.walkAtRules("screen",t=>{let i=t.params,a=Rt(r.screens).find(({name:s})=>s===i);if(!a)throw t.error(`No \`${i}\` screen found.`);t.name="media",t.params=Tt(a)})}}var py=R(()=>{u();Xn();Qn()});function Y2(r){let e=r.filter(o=>o.type!=="pseudo"||o.nodes.length>0?!0:o.value.startsWith("::")||[":before",":after",":first-line",":first-letter"].includes(o.value)).reverse(),t=new Set(["tag","class","id","attribute"]),i=e.findIndex(o=>t.has(o.type));if(i===-1)return e.reverse().join("").trim();let n=e[i],a=dy[n.type]?dy[n.type](n):n;e=e.slice(0,i);let s=e.findIndex(o=>o.type==="combinator"&&o.value===">");return s!==-1&&(e.splice(0,s),e.unshift(Cs.default.universal())),[a,...e.reverse()].join("").trim()}function X2(r){return zl.has(r)||zl.set(r,K2.transformSync(r)),zl.get(r)}function Ul({tailwindConfig:r}){return e=>{let t=new Map,i=new Set;if(e.walkAtRules("defaults",n=>{if(n.nodes&&n.nodes.length>0){i.add(n);return}let a=n.params;t.has(a)||t.set(a,new Set),t.get(a).add(n.parent),n.remove()}),we(r,"optimizeUniversalDefaults"))for(let n of i){let a=new Map,s=t.get(n.params)??[];for(let o of s)for(let l of X2(o.selector)){let c=l.includes(":-")||l.includes("::-")||l.includes(":has")?l:"__DEFAULT__",f=a.get(c)??new Set;a.set(c,f),f.add(l)}if(a.size===0){n.remove();continue}for(let[,o]of a){let l=ee.rule({source:n.source});l.selectors=[...o],l.append(n.nodes.map(c=>c.clone())),n.before(l)}n.remove()}else if(i.size){let n=ee.rule({selectors:["*","::before","::after"]});for(let s of i)n.append(s.nodes),n.parent||s.before(n),n.source||(n.source=s.source),s.remove();let a=n.clone({selectors:["::backdrop"]});n.after(a)}}}var Cs,dy,K2,zl,hy=R(()=>{u();Ot();Cs=pe(it());ct();dy={id(r){return Cs.default.attribute({attribute:"id",operator:"=",value:r.value,quoteMark:'"'})}};K2=(0,Cs.default)(r=>r.map(e=>{let t=e.split(i=>i.type==="combinator"&&i.value===" ").pop();return Y2(t)})),zl=new Map});function Vl(){function r(e){let t=null;e.each(i=>{if(!J2.has(i.type)){t=null;return}if(t===null){t=i;return}let n=my[i.type];i.type==="atrule"&&i.name==="font-face"?t=i:n.every(a=>(i[a]??"").replace(/\s+/g," ")===(t[a]??"").replace(/\s+/g," "))?(i.nodes&&t.append(i.nodes),i.remove()):t=i}),e.each(i=>{i.type==="atrule"&&r(i)})}return e=>{r(e)}}var my,J2,gy=R(()=>{u();my={atrule:["name","params"],rule:["selector"]},J2=new Set(Object.keys(my))});function Hl(){return r=>{r.walkRules(e=>{let t=new Map,i=new Set([]),n=new Map;e.walkDecls(a=>{if(a.parent===e){if(t.has(a.prop)){if(t.get(a.prop).value===a.value){i.add(t.get(a.prop)),t.set(a.prop,a);return}n.has(a.prop)||n.set(a.prop,new Set),n.get(a.prop).add(t.get(a.prop)),n.get(a.prop).add(a)}t.set(a.prop,a)}});for(let a of i)a.remove();for(let a of n.values()){let s=new Map;for(let o of a){let l=eO(o.value);l!==null&&(s.has(l)||s.set(l,new Set),s.get(l).add(o))}for(let o of s.values()){let l=Array.from(o).slice(0,-1);for(let c of l)c.remove()}}})}}function eO(r){let e=/^-?\d*.?\d+([\w%]+)?$/g.exec(r);return e?e[1]??Z2:null}var Z2,yy=R(()=>{u();Z2=Symbol("unitless-number")});function tO(r){if(!r.walkAtRules)return;let e=new Set;if(r.walkAtRules("apply",t=>{e.add(t.parent)}),e.size!==0)for(let t of e){let i=[],n=[];for(let a of t.nodes)a.type==="atrule"&&a.name==="apply"?(n.length>0&&(i.push(n),n=[]),i.push([a])):n.push(a);if(n.length>0&&i.push(n),i.length!==1){for(let a of[...i].reverse()){let s=t.clone({nodes:[]});s.append(a),t.after(s)}t.remove()}}}function _s(){return r=>{tO(r)}}var by=R(()=>{u()});function Es(r){return async function(e,t){let{tailwindDirectives:i,applyDirectives:n}=_l(e);_s()(e,t);let a=r({tailwindDirectives:i,applyDirectives:n,registerDependency(s){t.messages.push({plugin:"tailwindcss",parent:t.opts.from,...s})},createContext(s,o){return tl(s,o,e)}})(e,t);if(a.tailwindConfig.separator==="-")throw new Error("The '-' character cannot be used as a custom separator in JIT mode due to parsing ambiguity. Please use another character like '_' instead.");Of(a.tailwindConfig),await Tl(a)(e,t),_s()(e,t),Pl(a)(e,t),uy(a)(e,t),cy(a)(e,t),Ul(a)(e,t),Vl(a)(e,t),Hl(a)(e,t)}}var wy=R(()=>{u();_g();Mg();Ug();fy();py();hy();gy();yy();by();_i();ct()});function vy(r,e){let t=null,i=null;return r.walkAtRules("config",n=>{if(i=n.source?.input.file??e.opts.from??null,i===null)throw n.error("The `@config` directive cannot be used without setting `from` in your PostCSS config.");if(t)throw n.error("Only one `@config` directive is allowed per file.");let a=n.params.match(/(['"])(.*?)\1/);if(!a)throw n.error("A path is required when using the `@config` directive.");let s=a[2];if(me.isAbsolute(s))throw n.error("The `@config` directive cannot be used with an absolute path.");if(t=me.resolve(me.dirname(i),s),!be.existsSync(t))throw n.error(`The config file at "${s}" does not exist. Make sure the path is correct and the file exists.`);n.remove()}),t||null}var xy=R(()=>{u();ft();et()});var ky=x((Vq,Wl)=>{u();Cg();wy();It();xy();Wl.exports=function(e){return{postcssPlugin:"tailwindcss",plugins:[Je.DEBUG&&function(t){return console.log(`
+`),console.time("JIT TOTAL"),t},async function(t,i){e=vy(t,i)??e;let n=Cl(e);if(t.type==="document"){let a=t.nodes.filter(s=>s.type==="root");for(let s of a)s.type==="root"&&await Es(n)(s,i);return}await Es(n)(t,i)},Je.DEBUG&&function(t){return console.timeEnd("JIT TOTAL"),console.log(`
+`),t}].filter(Boolean)}};Wl.exports.postcss=!0});var Ay=x((Hq,Sy)=>{u();Sy.exports=ky()});var Gl=x((Wq,Cy)=>{u();Cy.exports=()=>["and_chr 114","and_uc 15.5","chrome 114","chrome 113","chrome 109","edge 114","firefox 114","ios_saf 16.5","ios_saf 16.4","ios_saf 16.3","ios_saf 16.1","opera 99","safari 16.5","samsung 21"]});var Os={};Ge(Os,{agents:()=>rO,feature:()=>iO});function iO(){return{status:"cr",title:"CSS Feature Queries",stats:{ie:{"6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","5.5":"n"},edge:{"12":"y","13":"y","14":"y","15":"y","16":"y","17":"y","18":"y","79":"y","80":"y","81":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y"},firefox:{"2":"n","3":"n","4":"n","5":"n","6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","12":"n","13":"n","14":"n","15":"n","16":"n","17":"n","18":"n","19":"n","20":"n","21":"n","22":"y","23":"y","24":"y","25":"y","26":"y","27":"y","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","59":"y","60":"y","61":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","82":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y","115":"y","116":"y","117":"y","3.5":"n","3.6":"n"},chrome:{"4":"n","5":"n","6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","12":"n","13":"n","14":"n","15":"n","16":"n","17":"n","18":"n","19":"n","20":"n","21":"n","22":"n","23":"n","24":"n","25":"n","26":"n","27":"n","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","59":"y","60":"y","61":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y","115":"y","116":"y","117":"y"},safari:{"4":"n","5":"n","6":"n","7":"n","8":"n","9":"y","10":"y","11":"y","12":"y","13":"y","14":"y","15":"y","17":"y","9.1":"y","10.1":"y","11.1":"y","12.1":"y","13.1":"y","14.1":"y","15.1":"y","15.2-15.3":"y","15.4":"y","15.5":"y","15.6":"y","16.0":"y","16.1":"y","16.2":"y","16.3":"y","16.4":"y","16.5":"y","16.6":"y",TP:"y","3.1":"n","3.2":"n","5.1":"n","6.1":"n","7.1":"n"},opera:{"9":"n","11":"n","12":"n","15":"y","16":"y","17":"y","18":"y","19":"y","20":"y","21":"y","22":"y","23":"y","24":"y","25":"y","26":"y","27":"y","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","60":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","82":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","12.1":"y","9.5-9.6":"n","10.0-10.1":"n","10.5":"n","10.6":"n","11.1":"n","11.5":"n","11.6":"n"},ios_saf:{"8":"n","17":"y","9.0-9.2":"y","9.3":"y","10.0-10.2":"y","10.3":"y","11.0-11.2":"y","11.3-11.4":"y","12.0-12.1":"y","12.2-12.5":"y","13.0-13.1":"y","13.2":"y","13.3":"y","13.4-13.7":"y","14.0-14.4":"y","14.5-14.8":"y","15.0-15.1":"y","15.2-15.3":"y","15.4":"y","15.5":"y","15.6":"y","16.0":"y","16.1":"y","16.2":"y","16.3":"y","16.4":"y","16.5":"y","16.6":"y","3.2":"n","4.0-4.1":"n","4.2-4.3":"n","5.0-5.1":"n","6.0-6.1":"n","7.0-7.1":"n","8.1-8.4":"n"},op_mini:{all:"y"},android:{"3":"n","4":"n","114":"y","4.4":"y","4.4.3-4.4.4":"y","2.1":"n","2.2":"n","2.3":"n","4.1":"n","4.2-4.3":"n"},bb:{"7":"n","10":"n"},op_mob:{"10":"n","11":"n","12":"n","73":"y","11.1":"n","11.5":"n","12.1":"n"},and_chr:{"114":"y"},and_ff:{"115":"y"},ie_mob:{"10":"n","11":"n"},and_uc:{"15.5":"y"},samsung:{"4":"y","20":"y","21":"y","5.0-5.4":"y","6.2-6.4":"y","7.2-7.4":"y","8.2":"y","9.2":"y","10.1":"y","11.1-11.2":"y","12.0":"y","13.0":"y","14.0":"y","15.0":"y","16.0":"y","17.0":"y","18.0":"y","19.0":"y"},and_qq:{"13.1":"y"},baidu:{"13.18":"y"},kaios:{"2.5":"y","3.0-3.1":"y"}}}}var rO,Ts=R(()=>{u();rO={ie:{prefix:"ms"},edge:{prefix:"webkit",prefix_exceptions:{"12":"ms","13":"ms","14":"ms","15":"ms","16":"ms","17":"ms","18":"ms"}},firefox:{prefix:"moz"},chrome:{prefix:"webkit"},safari:{prefix:"webkit"},opera:{prefix:"webkit",prefix_exceptions:{"9":"o","11":"o","12":"o","9.5-9.6":"o","10.0-10.1":"o","10.5":"o","10.6":"o","11.1":"o","11.5":"o","11.6":"o","12.1":"o"}},ios_saf:{prefix:"webkit"},op_mini:{prefix:"o"},android:{prefix:"webkit"},bb:{prefix:"webkit"},op_mob:{prefix:"o",prefix_exceptions:{"73":"webkit"}},and_chr:{prefix:"webkit"},and_ff:{prefix:"moz"},ie_mob:{prefix:"ms"},and_uc:{prefix:"webkit",prefix_exceptions:{"15.5":"webkit"}},samsung:{prefix:"webkit"},and_qq:{prefix:"webkit"},baidu:{prefix:"webkit"},kaios:{prefix:"moz"}}});var _y=x(()=>{u()});var _e=x((Yq,Lt)=>{u();var{list:Ql}=$e();Lt.exports.error=function(r){let e=new Error(r);throw e.autoprefixer=!0,e};Lt.exports.uniq=function(r){return[...new Set(r)]};Lt.exports.removeNote=function(r){return r.includes(" ")?r.split(" ")[0]:r};Lt.exports.escapeRegexp=function(r){return r.replace(/[$()*+-.?[\\\]^{|}]/g,"\\$&")};Lt.exports.regexp=function(r,e=!0){return e&&(r=this.escapeRegexp(r)),new RegExp(`(^|[\\s,(])(${r}($|[\\s(,]))`,"gi")};Lt.exports.editList=function(r,e){let t=Ql.comma(r),i=e(t,[]);if(t===i)return r;let n=r.match(/,\s*/);return n=n?n[0]:", ",i.join(n)};Lt.exports.splitSelector=function(r){return Ql.comma(r).map(e=>Ql.space(e).map(t=>t.split(/(?=\.|#)/g)))}});var Mt=x((Kq,Ty)=>{u();var nO=Gl(),Ey=(Ts(),Os).agents,sO=_e(),Oy=class{static prefixes(){if(this.prefixesCache)return this.prefixesCache;this.prefixesCache=[];for(let e in Ey)this.prefixesCache.push(`-${Ey[e].prefix}-`);return this.prefixesCache=sO.uniq(this.prefixesCache).sort((e,t)=>t.length-e.length),this.prefixesCache}static withPrefix(e){return this.prefixesRegexp||(this.prefixesRegexp=new RegExp(this.prefixes().join("|"))),this.prefixesRegexp.test(e)}constructor(e,t,i,n){this.data=e,this.options=i||{},this.browserslistOpts=n||{},this.selected=this.parse(t)}parse(e){let t={};for(let i in this.browserslistOpts)t[i]=this.browserslistOpts[i];return t.path=this.options.from,nO(e,t)}prefix(e){let[t,i]=e.split(" "),n=this.data[t],a=n.prefix_exceptions&&n.prefix_exceptions[i];return a||(a=n.prefix),`-${a}-`}isSelected(e){return this.selected.includes(e)}};Ty.exports=Oy});var Li=x((Xq,Ry)=>{u();Ry.exports={prefix(r){let e=r.match(/^(-\w+-)/);return e?e[0]:""},unprefixed(r){return r.replace(/^-\w+-/,"")}}});var wr=x((Jq,Iy)=>{u();var aO=Mt(),Py=Li(),oO=_e();function Yl(r,e){let t=new r.constructor;for(let i of Object.keys(r||{})){let n=r[i];i==="parent"&&typeof n=="object"?e&&(t[i]=e):i==="source"||i===null?t[i]=n:Array.isArray(n)?t[i]=n.map(a=>Yl(a,t)):i!=="_autoprefixerPrefix"&&i!=="_autoprefixerValues"&&i!=="proxyCache"&&(typeof n=="object"&&n!==null&&(n=Yl(n,t)),t[i]=n)}return t}var Rs=class{static hack(e){return this.hacks||(this.hacks={}),e.names.map(t=>(this.hacks[t]=e,this.hacks[t]))}static load(e,t,i){let n=this.hacks&&this.hacks[e];return n?new n(e,t,i):new this(e,t,i)}static clone(e,t){let i=Yl(e);for(let n in t)i[n]=t[n];return i}constructor(e,t,i){this.prefixes=t,this.name=e,this.all=i}parentPrefix(e){let t;return typeof e._autoprefixerPrefix!="undefined"?t=e._autoprefixerPrefix:e.type==="decl"&&e.prop[0]==="-"?t=Py.prefix(e.prop):e.type==="root"?t=!1:e.type==="rule"&&e.selector.includes(":-")&&/:(-\w+-)/.test(e.selector)?t=e.selector.match(/:(-\w+-)/)[1]:e.type==="atrule"&&e.name[0]==="-"?t=Py.prefix(e.name):t=this.parentPrefix(e.parent),aO.prefixes().includes(t)||(t=!1),e._autoprefixerPrefix=t,e._autoprefixerPrefix}process(e,t){if(!this.check(e))return;let i=this.parentPrefix(e),n=this.prefixes.filter(s=>!i||i===oO.removeNote(s)),a=[];for(let s of n)this.add(e,s,a.concat([s]),t)&&a.push(s);return a}clone(e,t){return Rs.clone(e,t)}};Iy.exports=Rs});var j=x((Zq,$y)=>{u();var lO=wr(),uO=Mt(),Dy=_e(),qy=class extends lO{check(){return!0}prefixed(e,t){return t+e}normalize(e){return e}otherPrefixes(e,t){for(let i of uO.prefixes())if(i!==t&&e.includes(i))return!0;return!1}set(e,t){return e.prop=this.prefixed(e.prop,t),e}needCascade(e){return e._autoprefixerCascade||(e._autoprefixerCascade=this.all.options.cascade!==!1&&e.raw("before").includes(`
+`)),e._autoprefixerCascade}maxPrefixed(e,t){if(t._autoprefixerMax)return t._autoprefixerMax;let i=0;for(let n of e)n=Dy.removeNote(n),n.length>i&&(i=n.length);return t._autoprefixerMax=i,t._autoprefixerMax}calcBefore(e,t,i=""){let a=this.maxPrefixed(e,t)-Dy.removeNote(i).length,s=t.raw("before");return a>0&&(s+=Array(a).fill(" ").join("")),s}restoreBefore(e){let t=e.raw("before").split(`
+`),i=t[t.length-1];this.all.group(e).up(n=>{let a=n.raw("before").split(`
+`),s=a[a.length-1];s.length<i.length&&(i=s)}),t[t.length-1]=i,e.raws.before=t.join(`
+`)}insert(e,t,i){let n=this.set(this.clone(e),t);if(!(!n||e.parent.some(s=>s.prop===n.prop&&s.value===n.value)))return this.needCascade(e)&&(n.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,n)}isAlready(e,t){let i=this.all.group(e).up(n=>n.prop===t);return i||(i=this.all.group(e).down(n=>n.prop===t)),i}add(e,t,i,n){let a=this.prefixed(e.prop,t);if(!(this.isAlready(e,a)||this.otherPrefixes(e.value,t)))return this.insert(e,t,i,n)}process(e,t){if(!this.needCascade(e)){super.process(e,t);return}let i=super.process(e,t);!i||!i.length||(this.restoreBefore(e),e.raws.before=this.calcBefore(i,e))}old(e,t){return[this.prefixed(e,t)]}};$y.exports=qy});var My=x((e$,Ly)=>{u();Ly.exports=function r(e){return{mul:t=>new r(e*t),div:t=>new r(e/t),simplify:()=>new r(e),toString:()=>e.toString()}}});var Fy=x((t$,By)=>{u();var fO=My(),cO=wr(),Kl=_e(),pO=/(min|max)-resolution\s*:\s*\d*\.?\d+(dppx|dpcm|dpi|x)/gi,dO=/(min|max)-resolution(\s*:\s*)(\d*\.?\d+)(dppx|dpcm|dpi|x)/i,Ny=class extends cO{prefixName(e,t){return e==="-moz-"?t+"--moz-device-pixel-ratio":e+t+"-device-pixel-ratio"}prefixQuery(e,t,i,n,a){return n=new fO(n),a==="dpi"?n=n.div(96):a==="dpcm"&&(n=n.mul(2.54).div(96)),n=n.simplify(),e==="-o-"&&(n=n.n+"/"+n.d),this.prefixName(e,t)+i+n}clean(e){if(!this.bad){this.bad=[];for(let t of this.prefixes)this.bad.push(this.prefixName(t,"min")),this.bad.push(this.prefixName(t,"max"))}e.params=Kl.editList(e.params,t=>t.filter(i=>this.bad.every(n=>!i.includes(n))))}process(e){let t=this.parentPrefix(e),i=t?[t]:this.prefixes;e.params=Kl.editList(e.params,(n,a)=>{for(let s of n){if(!s.includes("min-resolution")&&!s.includes("max-resolution")){a.push(s);continue}for(let o of i){let l=s.replace(pO,c=>{let f=c.match(dO);return this.prefixQuery(o,f[1],f[2],f[3],f[4])});a.push(l)}a.push(s)}return Kl.uniq(a)})}};By.exports=Ny});var zy=x((r$,jy)=>{u();var Xl="(".charCodeAt(0),Jl=")".charCodeAt(0),Ps="'".charCodeAt(0),Zl='"'.charCodeAt(0),eu="\\".charCodeAt(0),vr="/".charCodeAt(0),tu=",".charCodeAt(0),ru=":".charCodeAt(0),Is="*".charCodeAt(0),hO="u".charCodeAt(0),mO="U".charCodeAt(0),gO="+".charCodeAt(0),yO=/^[a-f0-9?-]+$/i;jy.exports=function(r){for(var e=[],t=r,i,n,a,s,o,l,c,f,d=0,p=t.charCodeAt(d),h=t.length,b=[{nodes:e}],v=0,y,w="",k="",S="";d<h;)if(p<=32){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);s=t.slice(d,i),a=e[e.length-1],p===Jl&&v?S=s:a&&a.type==="div"?(a.after=s,a.sourceEndIndex+=s.length):p===tu||p===ru||p===vr&&t.charCodeAt(i+1)!==Is&&(!y||y&&y.type==="function"&&y.value!=="calc")?k=s:e.push({type:"space",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}else if(p===Ps||p===Zl){i=d,n=p===Ps?"'":'"',s={type:"string",sourceIndex:d,quote:n};do if(o=!1,i=t.indexOf(n,i+1),~i)for(l=i;t.charCodeAt(l-1)===eu;)l-=1,o=!o;else t+=n,i=t.length-1,s.unclosed=!0;while(o);s.value=t.slice(d+1,i),s.sourceEndIndex=s.unclosed?i:i+1,e.push(s),d=i+1,p=t.charCodeAt(d)}else if(p===vr&&t.charCodeAt(d+1)===Is)i=t.indexOf("*/",d),s={type:"comment",sourceIndex:d,sourceEndIndex:i+2},i===-1&&(s.unclosed=!0,i=t.length,s.sourceEndIndex=i),s.value=t.slice(d+2,i),e.push(s),d=i+2,p=t.charCodeAt(d);else if((p===vr||p===Is)&&y&&y.type==="function"&&y.value==="calc")s=t[d],e.push({type:"word",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s}),d+=1,p=t.charCodeAt(d);else if(p===vr||p===tu||p===ru)s=t[d],e.push({type:"div",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s,before:k,after:""}),k="",d+=1,p=t.charCodeAt(d);else if(Xl===p){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);if(f=d,s={type:"function",sourceIndex:d-w.length,value:w,before:t.slice(f+1,i)},d=i,w==="url"&&p!==Ps&&p!==Zl){i-=1;do if(o=!1,i=t.indexOf(")",i+1),~i)for(l=i;t.charCodeAt(l-1)===eu;)l-=1,o=!o;else t+=")",i=t.length-1,s.unclosed=!0;while(o);c=i;do c-=1,p=t.charCodeAt(c);while(p<=32);f<c?(d!==c+1?s.nodes=[{type:"word",sourceIndex:d,sourceEndIndex:c+1,value:t.slice(d,c+1)}]:s.nodes=[],s.unclosed&&c+1!==i?(s.after="",s.nodes.push({type:"space",sourceIndex:c+1,sourceEndIndex:i,value:t.slice(c+1,i)})):(s.after=t.slice(c+1,i),s.sourceEndIndex=i)):(s.after="",s.nodes=[]),d=i+1,s.sourceEndIndex=s.unclosed?i:d,p=t.charCodeAt(d),e.push(s)}else v+=1,s.after="",s.sourceEndIndex=d+1,e.push(s),b.push(s),e=s.nodes=[],y=s;w=""}else if(Jl===p&&v)d+=1,p=t.charCodeAt(d),y.after=S,y.sourceEndIndex+=S.length,S="",v-=1,b[b.length-1].sourceEndIndex=d,b.pop(),y=b[v],e=y.nodes;else{i=d;do p===eu&&(i+=1),i+=1,p=t.charCodeAt(i);while(i<h&&!(p<=32||p===Ps||p===Zl||p===tu||p===ru||p===vr||p===Xl||p===Is&&y&&y.type==="function"&&y.value==="calc"||p===vr&&y.type==="function"&&y.value==="calc"||p===Jl&&v));s=t.slice(d,i),Xl===p?w=s:(hO===s.charCodeAt(0)||mO===s.charCodeAt(0))&&gO===s.charCodeAt(1)&&yO.test(s.slice(2))?e.push({type:"unicode-range",sourceIndex:d,sourceEndIndex:i,value:s}):e.push({type:"word",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}for(d=b.length-1;d;d-=1)b[d].unclosed=!0,b[d].sourceEndIndex=t.length;return b[0].nodes}});var Vy=x((i$,Uy)=>{u();Uy.exports=function r(e,t,i){var n,a,s,o;for(n=0,a=e.length;n<a;n+=1)s=e[n],i||(o=t(s,n,e)),o!==!1&&s.type==="function"&&Array.isArray(s.nodes)&&r(s.nodes,t,i),i&&t(s,n,e)}});var Qy=x((n$,Gy)=>{u();function Hy(r,e){var t=r.type,i=r.value,n,a;return e&&(a=e(r))!==void 0?a:t==="word"||t==="space"?i:t==="string"?(n=r.quote||"",n+i+(r.unclosed?"":n)):t==="comment"?"/*"+i+(r.unclosed?"":"*/"):t==="div"?(r.before||"")+i+(r.after||""):Array.isArray(r.nodes)?(n=Wy(r.nodes,e),t!=="function"?n:i+"("+(r.before||"")+n+(r.after||"")+(r.unclosed?"":")")):i}function Wy(r,e){var t,i;if(Array.isArray(r)){for(t="",i=r.length-1;~i;i-=1)t=Hy(r[i],e)+t;return t}return Hy(r,e)}Gy.exports=Wy});var Ky=x((s$,Yy)=>{u();var Ds="-".charCodeAt(0),qs="+".charCodeAt(0),iu=".".charCodeAt(0),bO="e".charCodeAt(0),wO="E".charCodeAt(0);function vO(r){var e=r.charCodeAt(0),t;if(e===qs||e===Ds){if(t=r.charCodeAt(1),t>=48&&t<=57)return!0;var i=r.charCodeAt(2);return t===iu&&i>=48&&i<=57}return e===iu?(t=r.charCodeAt(1),t>=48&&t<=57):e>=48&&e<=57}Yy.exports=function(r){var e=0,t=r.length,i,n,a;if(t===0||!vO(r))return!1;for(i=r.charCodeAt(e),(i===qs||i===Ds)&&e++;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),i===iu&&n>=48&&n<=57)for(e+=2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),a=r.charCodeAt(e+2),(i===bO||i===wO)&&(n>=48&&n<=57||(n===qs||n===Ds)&&a>=48&&a<=57))for(e+=n===qs||n===Ds?3:2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;return{number:r.slice(0,e),unit:r.slice(e)}}});var $s=x((a$,Zy)=>{u();var xO=zy(),Xy=Vy(),Jy=Qy();function Nt(r){return this instanceof Nt?(this.nodes=xO(r),this):new Nt(r)}Nt.prototype.toString=function(){return Array.isArray(this.nodes)?Jy(this.nodes):""};Nt.prototype.walk=function(r,e){return Xy(this.nodes,r,e),this};Nt.unit=Ky();Nt.walk=Xy;Nt.stringify=Jy;Zy.exports=Nt});var nb=x((o$,ib)=>{u();var{list:kO}=$e(),eb=$s(),SO=Mt(),tb=Li(),rb=class{constructor(e){this.props=["transition","transition-property"],this.prefixes=e}add(e,t){let i,n,a=this.prefixes.add[e.prop],s=this.ruleVendorPrefixes(e),o=s||a&&a.prefixes||[],l=this.parse(e.value),c=l.map(h=>this.findProp(h)),f=[];if(c.some(h=>h[0]==="-"))return;for(let h of l){if(n=this.findProp(h),n[0]==="-")continue;let b=this.prefixes.add[n];if(!(!b||!b.prefixes))for(i of b.prefixes){if(s&&!s.some(y=>i.includes(y)))continue;let v=this.prefixes.prefixed(n,i);v!=="-ms-transform"&&!c.includes(v)&&(this.disabled(n,i)||f.push(this.clone(n,v,h)))}}l=l.concat(f);let d=this.stringify(l),p=this.stringify(this.cleanFromUnprefixed(l,"-webkit-"));if(o.includes("-webkit-")&&this.cloneBefore(e,`-webkit-${e.prop}`,p),this.cloneBefore(e,e.prop,p),o.includes("-o-")){let h=this.stringify(this.cleanFromUnprefixed(l,"-o-"));this.cloneBefore(e,`-o-${e.prop}`,h)}for(i of o)if(i!=="-webkit-"&&i!=="-o-"){let h=this.stringify(this.cleanOtherPrefixes(l,i));this.cloneBefore(e,i+e.prop,h)}d!==e.value&&!this.already(e,e.prop,d)&&(this.checkForWarning(t,e),e.cloneBefore(),e.value=d)}findProp(e){let t=e[0].value;if(/^\d/.test(t)){for(let[i,n]of e.entries())if(i!==0&&n.type==="word")return n.value}return t}already(e,t,i){return e.parent.some(n=>n.prop===t&&n.value===i)}cloneBefore(e,t,i){this.already(e,t,i)||e.cloneBefore({prop:t,value:i})}checkForWarning(e,t){if(t.prop!=="transition-property")return;let i=!1,n=!1;t.parent.each(a=>{if(a.type!=="decl"||a.prop.indexOf("transition-")!==0)return;let s=kO.comma(a.value);if(a.prop==="transition-property"){s.forEach(o=>{let l=this.prefixes.add[o];l&&l.prefixes&&l.prefixes.length>0&&(i=!0)});return}return n=n||s.length>1,!1}),i&&n&&t.warn(e,"Replace transition-property to transition, because Autoprefixer could not support any cases of transition-property and other transition-*")}remove(e){let t=this.parse(e.value);t=t.filter(s=>{let o=this.prefixes.remove[this.findProp(s)];return!o||!o.remove});let i=this.stringify(t);if(e.value===i)return;if(t.length===0){e.remove();return}let n=e.parent.some(s=>s.prop===e.prop&&s.value===i),a=e.parent.some(s=>s!==e&&s.prop===e.prop&&s.value.length>i.length);if(n||a){e.remove();return}e.value=i}parse(e){let t=eb(e),i=[],n=[];for(let a of t.nodes)n.push(a),a.type==="div"&&a.value===","&&(i.push(n),n=[]);return i.push(n),i.filter(a=>a.length>0)}stringify(e){if(e.length===0)return"";let t=[];for(let i of e)i[i.length-1].type!=="div"&&i.push(this.div(e)),t=t.concat(i);return t[0].type==="div"&&(t=t.slice(1)),t[t.length-1].type==="div"&&(t=t.slice(0,-2+1||void 0)),eb.stringify({nodes:t})}clone(e,t,i){let n=[],a=!1;for(let s of i)!a&&s.type==="word"&&s.value===e?(n.push({type:"word",value:t}),a=!0):n.push(s);return n}div(e){for(let t of e)for(let i of t)if(i.type==="div"&&i.value===",")return i;return{type:"div",value:",",after:" "}}cleanOtherPrefixes(e,t){return e.filter(i=>{let n=tb.prefix(this.findProp(i));return n===""||n===t})}cleanFromUnprefixed(e,t){let i=e.map(a=>this.findProp(a)).filter(a=>a.slice(0,t.length)===t).map(a=>this.prefixes.unprefixed(a)),n=[];for(let a of e){let s=this.findProp(a),o=tb.prefix(s);!i.includes(s)&&(o===t||o==="")&&n.push(a)}return n}disabled(e,t){let i=["order","justify-content","align-self","align-content"];if(e.includes("flex")||i.includes(e)){if(this.prefixes.options.flexbox===!1)return!0;if(this.prefixes.options.flexbox==="no-2009")return t.includes("2009")}}ruleVendorPrefixes(e){let{parent:t}=e;if(t.type!=="rule")return!1;if(!t.selector.includes(":-"))return!1;let i=SO.prefixes().filter(n=>t.selector.includes(":"+n));return i.length>0?i:!1}};ib.exports=rb});var xr=x((l$,ab)=>{u();var AO=_e(),sb=class{constructor(e,t,i,n){this.unprefixed=e,this.prefixed=t,this.string=i||t,this.regexp=n||AO.regexp(t)}check(e){return e.includes(this.string)?!!e.match(this.regexp):!1}};ab.exports=sb});var He=x((u$,lb)=>{u();var CO=wr(),_O=xr(),EO=Li(),OO=_e(),ob=class extends CO{static save(e,t){let i=t.prop,n=[];for(let a in t._autoprefixerValues){let s=t._autoprefixerValues[a];if(s===t.value)continue;let o,l=EO.prefix(i);if(l==="-pie-")continue;if(l===a){o=t.value=s,n.push(o);continue}let c=e.prefixed(i,a),f=t.parent;if(!f.every(b=>b.prop!==c)){n.push(o);continue}let d=s.replace(/\s+/," ");if(f.some(b=>b.prop===t.prop&&b.value.replace(/\s+/," ")===d)){n.push(o);continue}let h=this.clone(t,{value:s});o=t.parent.insertBefore(t,h),n.push(o)}return n}check(e){let t=e.value;return t.includes(this.name)?!!t.match(this.regexp()):!1}regexp(){return this.regexpCache||(this.regexpCache=OO.regexp(this.name))}replace(e,t){return e.replace(this.regexp(),`$1${t}$2`)}value(e){return e.raws.value&&e.raws.value.value===e.value?e.raws.value.raw:e.value}add(e,t){e._autoprefixerValues||(e._autoprefixerValues={});let i=e._autoprefixerValues[t]||this.value(e),n;do if(n=i,i=this.replace(i,t),i===!1)return;while(i!==n);e._autoprefixerValues[t]=i}old(e){return new _O(this.name,e+this.name)}};lb.exports=ob});var Bt=x((f$,ub)=>{u();ub.exports={}});var su=x((c$,pb)=>{u();var fb=$s(),TO=He(),RO=Bt().insertAreas,PO=/(^|[^-])linear-gradient\(\s*(top|left|right|bottom)/i,IO=/(^|[^-])radial-gradient\(\s*\d+(\w*|%)\s+\d+(\w*|%)\s*,/i,DO=/(!\s*)?autoprefixer:\s*ignore\s+next/i,qO=/(!\s*)?autoprefixer\s*grid:\s*(on|off|(no-)?autoplace)/i,$O=["width","height","min-width","max-width","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size"];function nu(r){return r.parent.some(e=>e.prop==="grid-template"||e.prop==="grid-template-areas")}function LO(r){let e=r.parent.some(i=>i.prop==="grid-template-rows"),t=r.parent.some(i=>i.prop==="grid-template-columns");return e&&t}var cb=class{constructor(e){this.prefixes=e}add(e,t){let i=this.prefixes.add["@resolution"],n=this.prefixes.add["@keyframes"],a=this.prefixes.add["@viewport"],s=this.prefixes.add["@supports"];e.walkAtRules(f=>{if(f.name==="keyframes"){if(!this.disabled(f,t))return n&&n.process(f)}else if(f.name==="viewport"){if(!this.disabled(f,t))return a&&a.process(f)}else if(f.name==="supports"){if(this.prefixes.options.supports!==!1&&!this.disabled(f,t))return s.process(f)}else if(f.name==="media"&&f.params.includes("-resolution")&&!this.disabled(f,t))return i&&i.process(f)}),e.walkRules(f=>{if(!this.disabled(f,t))return this.prefixes.add.selectors.map(d=>d.process(f,t))});function o(f){return f.parent.nodes.some(d=>{if(d.type!=="decl")return!1;let p=d.prop==="display"&&/(inline-)?grid/.test(d.value),h=d.prop.startsWith("grid-template"),b=/^grid-([A-z]+-)?gap/.test(d.prop);return p||h||b})}function l(f){return f.parent.some(d=>d.prop==="display"&&/(inline-)?flex/.test(d.value))}let c=this.gridStatus(e,t)&&this.prefixes.add["grid-area"]&&this.prefixes.add["grid-area"].prefixes;return e.walkDecls(f=>{if(this.disabledDecl(f,t))return;let d=f.parent,p=f.prop,h=f.value;if(p==="grid-row-span"){t.warn("grid-row-span is not part of final Grid Layout. Use grid-row.",{node:f});return}else if(p==="grid-column-span"){t.warn("grid-column-span is not part of final Grid Layout. Use grid-column.",{node:f});return}else if(p==="display"&&h==="box"){t.warn("You should write display: flex by final spec instead of display: box",{node:f});return}else if(p==="text-emphasis-position")(h==="under"||h==="over")&&t.warn("You should use 2 values for text-emphasis-position For example, `under left` instead of just `under`.",{node:f});else if(/^(align|justify|place)-(items|content)$/.test(p)&&l(f))(h==="start"||h==="end")&&t.warn(`${h} value has mixed support, consider using flex-${h} instead`,{node:f});else if(p==="text-decoration-skip"&&h==="ink")t.warn("Replace text-decoration-skip: ink to text-decoration-skip-ink: auto, because spec had been changed",{node:f});else{if(c&&this.gridStatus(f,t))if(f.value==="subgrid"&&t.warn("IE does not support subgrid",{node:f}),/^(align|justify|place)-items$/.test(p)&&o(f)){let v=p.replace("-items","-self");t.warn(`IE does not support ${p} on grid containers. Try using ${v} on child elements instead: ${f.parent.selector} > * { ${v}: ${f.value} }`,{node:f})}else if(/^(align|justify|place)-content$/.test(p)&&o(f))t.warn(`IE does not support ${f.prop} on grid containers`,{node:f});else if(p==="display"&&f.value==="contents"){t.warn("Please do not use display: contents; if you have grid setting enabled",{node:f});return}else if(f.prop==="grid-gap"){let v=this.gridStatus(f,t);v==="autoplace"&&!LO(f)&&!nu(f)?t.warn("grid-gap only works if grid-template(-areas) is being used or both rows and columns have been declared and cells have not been manually placed inside the explicit grid",{node:f}):(v===!0||v==="no-autoplace")&&!nu(f)&&t.warn("grid-gap only works if grid-template(-areas) is being used",{node:f})}else if(p==="grid-auto-columns"){t.warn("grid-auto-columns is not supported by IE",{node:f});return}else if(p==="grid-auto-rows"){t.warn("grid-auto-rows is not supported by IE",{node:f});return}else if(p==="grid-auto-flow"){let v=d.some(w=>w.prop==="grid-template-rows"),y=d.some(w=>w.prop==="grid-template-columns");nu(f)?t.warn("grid-auto-flow is not supported by IE",{node:f}):h.includes("dense")?t.warn("grid-auto-flow: dense is not supported by IE",{node:f}):!v&&!y&&t.warn("grid-auto-flow works only if grid-template-rows and grid-template-columns are present in the same rule",{node:f});return}else if(h.includes("auto-fit")){t.warn("auto-fit value is not supported by IE",{node:f,word:"auto-fit"});return}else if(h.includes("auto-fill")){t.warn("auto-fill value is not supported by IE",{node:f,word:"auto-fill"});return}else p.startsWith("grid-template")&&h.includes("[")&&t.warn("Autoprefixer currently does not support line names. Try using grid-template-areas instead.",{node:f,word:"["});if(h.includes("radial-gradient"))if(IO.test(f.value))t.warn("Gradient has outdated direction syntax. New syntax is like `closest-side at 0 0` instead of `0 0, closest-side`.",{node:f});else{let v=fb(h);for(let y of v.nodes)if(y.type==="function"&&y.value==="radial-gradient")for(let w of y.nodes)w.type==="word"&&(w.value==="cover"?t.warn("Gradient has outdated direction syntax. Replace `cover` to `farthest-corner`.",{node:f}):w.value==="contain"&&t.warn("Gradient has outdated direction syntax. Replace `contain` to `closest-side`.",{node:f}))}h.includes("linear-gradient")&&PO.test(h)&&t.warn("Gradient has outdated direction syntax. New syntax is like `to left` instead of `right`.",{node:f})}$O.includes(f.prop)&&(f.value.includes("-fill-available")||(f.value.includes("fill-available")?t.warn("Replace fill-available to stretch, because spec had been changed",{node:f}):f.value.includes("fill")&&fb(h).nodes.some(y=>y.type==="word"&&y.value==="fill")&&t.warn("Replace fill to stretch, because spec had been changed",{node:f})));let b;if(f.prop==="transition"||f.prop==="transition-property")return this.prefixes.transition.add(f,t);if(f.prop==="align-self"){if(this.displayType(f)!=="grid"&&this.prefixes.options.flexbox!==!1&&(b=this.prefixes.add["align-self"],b&&b.prefixes&&b.process(f)),this.gridStatus(f,t)!==!1&&(b=this.prefixes.add["grid-row-align"],b&&b.prefixes))return b.process(f,t)}else if(f.prop==="justify-self"){if(this.gridStatus(f,t)!==!1&&(b=this.prefixes.add["grid-column-align"],b&&b.prefixes))return b.process(f,t)}else if(f.prop==="place-self"){if(b=this.prefixes.add["place-self"],b&&b.prefixes&&this.gridStatus(f,t)!==!1)return b.process(f,t)}else if(b=this.prefixes.add[f.prop],b&&b.prefixes)return b.process(f,t)}),this.gridStatus(e,t)&&RO(e,this.disabled),e.walkDecls(f=>{if(this.disabledValue(f,t))return;let d=this.prefixes.unprefixed(f.prop),p=this.prefixes.values("add",d);if(Array.isArray(p))for(let h of p)h.process&&h.process(f,t);TO.save(this.prefixes,f)})}remove(e,t){let i=this.prefixes.remove["@resolution"];e.walkAtRules((n,a)=>{this.prefixes.remove[`@${n.name}`]?this.disabled(n,t)||n.parent.removeChild(a):n.name==="media"&&n.params.includes("-resolution")&&i&&i.clean(n)});for(let n of this.prefixes.remove.selectors)e.walkRules((a,s)=>{n.check(a)&&(this.disabled(a,t)||a.parent.removeChild(s))});return e.walkDecls((n,a)=>{if(this.disabled(n,t))return;let s=n.parent,o=this.prefixes.unprefixed(n.prop);if((n.prop==="transition"||n.prop==="transition-property")&&this.prefixes.transition.remove(n),this.prefixes.remove[n.prop]&&this.prefixes.remove[n.prop].remove){let l=this.prefixes.group(n).down(c=>this.prefixes.normalize(c.prop)===o);if(o==="flex-flow"&&(l=!0),n.prop==="-webkit-box-orient"){let c={"flex-direction":!0,"flex-flow":!0};if(!n.parent.some(f=>c[f.prop]))return}if(l&&!this.withHackValue(n)){n.raw("before").includes(`
+`)&&this.reduceSpaces(n),s.removeChild(a);return}}for(let l of this.prefixes.values("remove",o)){if(!l.check||!l.check(n.value))continue;if(o=l.unprefixed,this.prefixes.group(n).down(f=>f.value.includes(o))){s.removeChild(a);return}}})}withHackValue(e){return e.prop==="-webkit-background-clip"&&e.value==="text"}disabledValue(e,t){return this.gridStatus(e,t)===!1&&e.type==="decl"&&e.prop==="display"&&e.value.includes("grid")||this.prefixes.options.flexbox===!1&&e.type==="decl"&&e.prop==="display"&&e.value.includes("flex")||e.type==="decl"&&e.prop==="content"?!0:this.disabled(e,t)}disabledDecl(e,t){if(this.gridStatus(e,t)===!1&&e.type==="decl"&&(e.prop.includes("grid")||e.prop==="justify-items"))return!0;if(this.prefixes.options.flexbox===!1&&e.type==="decl"){let i=["order","justify-content","align-items","align-content"];if(e.prop.includes("flex")||i.includes(e.prop))return!0}return this.disabled(e,t)}disabled(e,t){if(!e)return!1;if(e._autoprefixerDisabled!==void 0)return e._autoprefixerDisabled;if(e.parent){let n=e.prev();if(n&&n.type==="comment"&&DO.test(n.text))return e._autoprefixerDisabled=!0,e._autoprefixerSelfDisabled=!0,!0}let i=null;if(e.nodes){let n;e.each(a=>{a.type==="comment"&&/(!\s*)?autoprefixer:\s*(off|on)/i.test(a.text)&&(typeof n!="undefined"?t.warn("Second Autoprefixer control comment was ignored. Autoprefixer applies control comment to whole block, not to next rules.",{node:a}):n=/on/i.test(a.text))}),n!==void 0&&(i=!n)}if(!e.nodes||i===null)if(e.parent){let n=this.disabled(e.parent,t);e.parent._autoprefixerSelfDisabled===!0?i=!1:i=n}else i=!1;return e._autoprefixerDisabled=i,i}reduceSpaces(e){let t=!1;if(this.prefixes.group(e).up(()=>(t=!0,!0)),t)return;let i=e.raw("before").split(`
+`),n=i[i.length-1].length,a=!1;this.prefixes.group(e).down(s=>{i=s.raw("before").split(`
+`);let o=i.length-1;i[o].length>n&&(a===!1&&(a=i[o].length-n),i[o]=i[o].slice(0,-a),s.raws.before=i.join(`
+`))})}displayType(e){for(let t of e.parent.nodes)if(t.prop==="display"){if(t.value.includes("flex"))return"flex";if(t.value.includes("grid"))return"grid"}return!1}gridStatus(e,t){if(!e)return!1;if(e._autoprefixerGridStatus!==void 0)return e._autoprefixerGridStatus;let i=null;if(e.nodes){let n;e.each(a=>{if(a.type==="comment"&&qO.test(a.text)){let s=/:\s*autoplace/i.test(a.text),o=/no-autoplace/i.test(a.text);typeof n!="undefined"?t.warn("Second Autoprefixer grid control comment was ignored. Autoprefixer applies control comments to the whole block, not to the next rules.",{node:a}):s?n="autoplace":o?n=!0:n=/on/i.test(a.text)}}),n!==void 0&&(i=n)}if(e.type==="atrule"&&e.name==="supports"){let n=e.params;n.includes("grid")&&n.includes("auto")&&(i=!1)}if(!e.nodes||i===null)if(e.parent){let n=this.gridStatus(e.parent,t);e.parent._autoprefixerSelfDisabled===!0?i=!1:i=n}else typeof this.prefixes.options.grid!="undefined"?i=this.prefixes.options.grid:typeof m.env.AUTOPREFIXER_GRID!="undefined"?m.env.AUTOPREFIXER_GRID==="autoplace"?i="autoplace":i=!0:i=!1;return e._autoprefixerGridStatus=i,i}};pb.exports=cb});var hb=x((p$,db)=>{u();db.exports={A:{A:{"2":"K E F G A B JC"},B:{"1":"C L M H N D O P Q R S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I"},C:{"1":"2 3 4 5 6 7 8 9 AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB 0B dB 1B eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R 2B S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I uB 3B 4B","2":"0 1 KC zB J K E F G A B C L M H N D O k l LC MC"},D:{"1":"8 9 AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB 0B dB 1B eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I uB 3B 4B","2":"0 1 2 3 4 5 6 7 J K E F G A B C L M H N D O k l"},E:{"1":"G A B C L M H D RC 6B vB wB 7B SC TC 8B 9B xB AC yB BC CC DC EC FC GC UC","2":"0 J K E F NC 5B OC PC QC"},F:{"1":"1 2 3 4 5 6 7 8 9 H N D O k l AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB dB eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R 2B S T U V W X Y Z a b c d e f g h i j wB","2":"G B C VC WC XC YC vB HC ZC"},G:{"1":"D fC gC hC iC jC kC lC mC nC oC pC qC rC sC tC 8B 9B xB AC yB BC CC DC EC FC GC","2":"F 5B aC IC bC cC dC eC"},H:{"1":"uC"},I:{"1":"I zC 0C","2":"zB J vC wC xC yC IC"},J:{"2":"E A"},K:{"1":"m","2":"A B C vB HC wB"},L:{"1":"I"},M:{"1":"uB"},N:{"2":"A B"},O:{"1":"xB"},P:{"1":"J k l 1C 2C 3C 4C 5C 6B 6C 7C 8C 9C AD yB BD CD DD"},Q:{"1":"7B"},R:{"1":"ED"},S:{"1":"FD GD"}},B:4,C:"CSS Feature Queries"}});var bb=x((d$,yb)=>{u();function mb(r){return r[r.length-1]}var gb={parse(r){let e=[""],t=[e];for(let i of r){if(i==="("){e=[""],mb(t).push(e),t.push(e);continue}if(i===")"){t.pop(),e=mb(t),e.push("");continue}e[e.length-1]+=i}return t[0]},stringify(r){let e="";for(let t of r){if(typeof t=="object"){e+=`(${gb.stringify(t)})`;continue}e+=t}return e}};yb.exports=gb});var Sb=x((h$,kb)=>{u();var MO=hb(),{feature:NO}=(Ts(),Os),{parse:BO}=$e(),FO=Mt(),au=bb(),jO=He(),zO=_e(),wb=NO(MO),vb=[];for(let r in wb.stats){let e=wb.stats[r];for(let t in e){let i=e[t];/y/.test(i)&&vb.push(r+" "+t)}}var xb=class{constructor(e,t){this.Prefixes=e,this.all=t}prefixer(){if(this.prefixerCache)return this.prefixerCache;let e=this.all.browsers.selected.filter(i=>vb.includes(i)),t=new FO(this.all.browsers.data,e,this.all.options);return this.prefixerCache=new this.Prefixes(this.all.data,t,this.all.options),this.prefixerCache}parse(e){let t=e.split(":"),i=t[0],n=t[1];return n||(n=""),[i.trim(),n.trim()]}virtual(e){let[t,i]=this.parse(e),n=BO("a{}").first;return n.append({prop:t,value:i,raws:{before:""}}),n}prefixed(e){let t=this.virtual(e);if(this.disabled(t.first))return t.nodes;let i={warn:()=>null},n=this.prefixer().add[t.first.prop];n&&n.process&&n.process(t.first,i);for(let a of t.nodes){for(let s of this.prefixer().values("add",t.first.prop))s.process(a);jO.save(this.all,a)}return t.nodes}isNot(e){return typeof e=="string"&&/not\s*/i.test(e)}isOr(e){return typeof e=="string"&&/\s*or\s*/i.test(e)}isProp(e){return typeof e=="object"&&e.length===1&&typeof e[0]=="string"}isHack(e,t){return!new RegExp(`(\\(|\\s)${zO.escapeRegexp(t)}:`).test(e)}toRemove(e,t){let[i,n]=this.parse(e),a=this.all.unprefixed(i),s=this.all.cleaner();if(s.remove[i]&&s.remove[i].remove&&!this.isHack(t,a))return!0;for(let o of s.values("remove",a))if(o.check(n))return!0;return!1}remove(e,t){let i=0;for(;i<e.length;){if(!this.isNot(e[i-1])&&this.isProp(e[i])&&this.isOr(e[i+1])){if(this.toRemove(e[i][0],t)){e.splice(i,2);continue}i+=2;continue}typeof e[i]=="object"&&(e[i]=this.remove(e[i],t)),i+=1}return e}cleanBrackets(e){return e.map(t=>typeof t!="object"?t:t.length===1&&typeof t[0]=="object"?this.cleanBrackets(t[0]):this.cleanBrackets(t))}convert(e){let t=[""];for(let i of e)t.push([`${i.prop}: ${i.value}`]),t.push(" or ");return t[t.length-1]="",t}normalize(e){if(typeof e!="object")return e;if(e=e.filter(t=>t!==""),typeof e[0]=="string"){let t=e[0].trim();if(t.includes(":")||t==="selector"||t==="not selector")return[au.stringify(e)]}return e.map(t=>this.normalize(t))}add(e,t){return e.map(i=>{if(this.isProp(i)){let n=this.prefixed(i[0]);return n.length>1?this.convert(n):i}return typeof i=="object"?this.add(i,t):i})}process(e){let t=au.parse(e.params);t=this.normalize(t),t=this.remove(t,e.params),t=this.add(t,e.params),t=this.cleanBrackets(t),e.params=au.stringify(t)}disabled(e){if(!this.all.options.grid&&(e.prop==="display"&&e.value.includes("grid")||e.prop.includes("grid")||e.prop==="justify-items"))return!0;if(this.all.options.flexbox===!1){if(e.prop==="display"&&e.value.includes("flex"))return!0;let t=["order","justify-content","align-items","align-content"];if(e.prop.includes("flex")||t.includes(e.prop))return!0}return!1}};kb.exports=xb});var _b=x((m$,Cb)=>{u();var Ab=class{constructor(e,t){this.prefix=t,this.prefixed=e.prefixed(this.prefix),this.regexp=e.regexp(this.prefix),this.prefixeds=e.possible().map(i=>[e.prefixed(i),e.regexp(i)]),this.unprefixed=e.name,this.nameRegexp=e.regexp()}isHack(e){let t=e.parent.index(e)+1,i=e.parent.nodes;for(;t<i.length;){let n=i[t].selector;if(!n)return!0;if(n.includes(this.unprefixed)&&n.match(this.nameRegexp))return!1;let a=!1;for(let[s,o]of this.prefixeds)if(n.includes(s)&&n.match(o)){a=!0;break}if(!a)return!0;t+=1}return!0}check(e){return!(!e.selector.includes(this.prefixed)||!e.selector.match(this.regexp)||this.isHack(e))}};Cb.exports=Ab});var kr=x((g$,Ob)=>{u();var{list:UO}=$e(),VO=_b(),HO=wr(),WO=Mt(),GO=_e(),Eb=class extends HO{constructor(e,t,i){super(e,t,i);this.regexpCache=new Map}check(e){return e.selector.includes(this.name)?!!e.selector.match(this.regexp()):!1}prefixed(e){return this.name.replace(/^(\W*)/,`$1${e}`)}regexp(e){if(!this.regexpCache.has(e)){let t=e?this.prefixed(e):this.name;this.regexpCache.set(e,new RegExp(`(^|[^:"'=])${GO.escapeRegexp(t)}`,"gi"))}return this.regexpCache.get(e)}possible(){return WO.prefixes()}prefixeds(e){if(e._autoprefixerPrefixeds){if(e._autoprefixerPrefixeds[this.name])return e._autoprefixerPrefixeds}else e._autoprefixerPrefixeds={};let t={};if(e.selector.includes(",")){let n=UO.comma(e.selector).filter(a=>a.includes(this.name));for(let a of this.possible())t[a]=n.map(s=>this.replace(s,a)).join(", ")}else for(let i of this.possible())t[i]=this.replace(e.selector,i);return e._autoprefixerPrefixeds[this.name]=t,e._autoprefixerPrefixeds}already(e,t,i){let n=e.parent.index(e)-1;for(;n>=0;){let a=e.parent.nodes[n];if(a.type!=="rule")return!1;let s=!1;for(let o in t[this.name]){let l=t[this.name][o];if(a.selector===l){if(i===o)return!0;s=!0;break}}if(!s)return!1;n-=1}return!1}replace(e,t){return e.replace(this.regexp(),`$1${this.prefixed(t)}`)}add(e,t){let i=this.prefixeds(e);if(this.already(e,i,t))return;let n=this.clone(e,{selector:i[this.name][t]});e.parent.insertBefore(e,n)}old(e){return new VO(this,e)}};Ob.exports=Eb});var Pb=x((y$,Rb)=>{u();var QO=wr(),Tb=class extends QO{add(e,t){let i=t+e.name;if(e.parent.some(s=>s.name===i&&s.params===e.params))return;let a=this.clone(e,{name:i});return e.parent.insertBefore(e,a)}process(e){let t=this.parentPrefix(e);for(let i of this.prefixes)(!t||t===i)&&this.add(e,i)}};Rb.exports=Tb});var Db=x((b$,Ib)=>{u();var YO=kr(),ou=class extends YO{prefixed(e){return e==="-webkit-"?":-webkit-full-screen":e==="-moz-"?":-moz-full-screen":`:${e}fullscreen`}};ou.names=[":fullscreen"];Ib.exports=ou});var $b=x((w$,qb)=>{u();var KO=kr(),lu=class extends KO{possible(){return super.possible().concat(["-moz- old","-ms- old"])}prefixed(e){return e==="-webkit-"?"::-webkit-input-placeholder":e==="-ms-"?"::-ms-input-placeholder":e==="-ms- old"?":-ms-input-placeholder":e==="-moz- old"?":-moz-placeholder":`::${e}placeholder`}};lu.names=["::placeholder"];qb.exports=lu});var Mb=x((v$,Lb)=>{u();var XO=kr(),uu=class extends XO{prefixed(e){return e==="-ms-"?":-ms-input-placeholder":`:${e}placeholder-shown`}};uu.names=[":placeholder-shown"];Lb.exports=uu});var Bb=x((x$,Nb)=>{u();var JO=kr(),ZO=_e(),fu=class extends JO{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=ZO.uniq(this.prefixes.map(n=>"-webkit-")))}prefixed(e){return e==="-webkit-"?"::-webkit-file-upload-button":`::${e}file-selector-button`}};fu.names=["::file-selector-button"];Nb.exports=fu});var Pe=x((k$,Fb)=>{u();Fb.exports=function(r){let e;return r==="-webkit- 2009"||r==="-moz-"?e=2009:r==="-ms-"?e=2012:r==="-webkit-"&&(e="final"),r==="-webkit- 2009"&&(r="-webkit-"),[e,r]}});var Vb=x((S$,Ub)=>{u();var jb=$e().list,zb=Pe(),eT=j(),Sr=class extends eT{prefixed(e,t){let i;return[i,t]=zb(t),i===2009?t+"box-flex":super.prefixed(e,t)}normalize(){return"flex"}set(e,t){let i=zb(t)[0];if(i===2009)return e.value=jb.space(e.value)[0],e.value=Sr.oldValues[e.value]||e.value,super.set(e,t);if(i===2012){let n=jb.space(e.value);n.length===3&&n[2]==="0"&&(e.value=n.slice(0,2).concat("0px").join(" "))}return super.set(e,t)}};Sr.names=["flex","box-flex"];Sr.oldValues={auto:"1",none:"0"};Ub.exports=Sr});var Gb=x((A$,Wb)=>{u();var Hb=Pe(),tT=j(),cu=class extends tT{prefixed(e,t){let i;return[i,t]=Hb(t),i===2009?t+"box-ordinal-group":i===2012?t+"flex-order":super.prefixed(e,t)}normalize(){return"order"}set(e,t){return Hb(t)[0]===2009&&/\d/.test(e.value)?(e.value=(parseInt(e.value)+1).toString(),super.set(e,t)):super.set(e,t)}};cu.names=["order","flex-order","box-ordinal-group"];Wb.exports=cu});var Yb=x((C$,Qb)=>{u();var rT=j(),pu=class extends rT{check(e){let t=e.value;return!t.toLowerCase().includes("alpha(")&&!t.includes("DXImageTransform.Microsoft")&&!t.includes("data:image/svg+xml")}};pu.names=["filter"];Qb.exports=pu});var Xb=x((_$,Kb)=>{u();var iT=j(),du=class extends iT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=this.clone(e),s=e.prop.replace(/end$/,"start"),o=t+e.prop.replace(/end$/,"span");if(!e.parent.some(l=>l.prop===o)){if(a.prop=o,e.value.includes("span"))a.value=e.value.replace(/span\s/i,"");else{let l;if(e.parent.walkDecls(s,c=>{l=c}),l){let c=Number(e.value)-Number(l.value)+"";a.value=c}else e.warn(n,`Can not prefix ${e.prop} (${s} is not found)`)}e.cloneBefore(a)}}};du.names=["grid-row-end","grid-column-end"];Kb.exports=du});var Zb=x((E$,Jb)=>{u();var nT=j(),hu=class extends nT{check(e){return!e.value.split(/\s+/).some(t=>{let i=t.toLowerCase();return i==="reverse"||i==="alternate-reverse"})}};hu.names=["animation","animation-direction"];Jb.exports=hu});var tw=x((O$,ew)=>{u();var sT=Pe(),aT=j(),mu=class extends aT{insert(e,t,i){let n;if([n,t]=sT(t),n!==2009)return super.insert(e,t,i);let a=e.value.split(/\s+/).filter(d=>d!=="wrap"&&d!=="nowrap"&&"wrap-reverse");if(a.length===0||e.parent.some(d=>d.prop===t+"box-orient"||d.prop===t+"box-direction"))return;let o=a[0],l=o.includes("row")?"horizontal":"vertical",c=o.includes("reverse")?"reverse":"normal",f=this.clone(e);return f.prop=t+"box-orient",f.value=l,this.needCascade(e)&&(f.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,f),f=this.clone(e),f.prop=t+"box-direction",f.value=c,this.needCascade(e)&&(f.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,f)}};mu.names=["flex-flow","box-direction","box-orient"];ew.exports=mu});var iw=x((T$,rw)=>{u();var oT=Pe(),lT=j(),gu=class extends lT{normalize(){return"flex"}prefixed(e,t){let i;return[i,t]=oT(t),i===2009?t+"box-flex":i===2012?t+"flex-positive":super.prefixed(e,t)}};gu.names=["flex-grow","flex-positive"];rw.exports=gu});var sw=x((R$,nw)=>{u();var uT=Pe(),fT=j(),yu=class extends fT{set(e,t){if(uT(t)[0]!==2009)return super.set(e,t)}};yu.names=["flex-wrap"];nw.exports=yu});var ow=x((P$,aw)=>{u();var cT=j(),Ar=Bt(),bu=class extends cT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=Ar.parse(e),[s,o]=Ar.translate(a,0,2),[l,c]=Ar.translate(a,1,3);[["grid-row",s],["grid-row-span",o],["grid-column",l],["grid-column-span",c]].forEach(([f,d])=>{Ar.insertDecl(e,f,d)}),Ar.warnTemplateSelectorNotFound(e,n),Ar.warnIfGridRowColumnExists(e,n)}};bu.names=["grid-area"];aw.exports=bu});var uw=x((I$,lw)=>{u();var pT=j(),Mi=Bt(),wu=class extends pT{insert(e,t,i){if(t!=="-ms-")return super.insert(e,t,i);if(e.parent.some(s=>s.prop==="-ms-grid-row-align"))return;let[[n,a]]=Mi.parse(e);a?(Mi.insertDecl(e,"grid-row-align",n),Mi.insertDecl(e,"grid-column-align",a)):(Mi.insertDecl(e,"grid-row-align",n),Mi.insertDecl(e,"grid-column-align",n))}};wu.names=["place-self"];lw.exports=wu});var cw=x((D$,fw)=>{u();var dT=j(),vu=class extends dT{check(e){let t=e.value;return!t.includes("/")||t.includes("span")}normalize(e){return e.replace("-start","")}prefixed(e,t){let i=super.prefixed(e,t);return t==="-ms-"&&(i=i.replace("-start","")),i}};vu.names=["grid-row-start","grid-column-start"];fw.exports=vu});var hw=x((q$,dw)=>{u();var pw=Pe(),hT=j(),Cr=class extends hT{check(e){return e.parent&&!e.parent.some(t=>t.prop&&t.prop.startsWith("grid-"))}prefixed(e,t){let i;return[i,t]=pw(t),i===2012?t+"flex-item-align":super.prefixed(e,t)}normalize(){return"align-self"}set(e,t){let i=pw(t)[0];if(i===2012)return e.value=Cr.oldValues[e.value]||e.value,super.set(e,t);if(i==="final")return super.set(e,t)}};Cr.names=["align-self","flex-item-align"];Cr.oldValues={"flex-end":"end","flex-start":"start"};dw.exports=Cr});var gw=x(($$,mw)=>{u();var mT=j(),gT=_e(),xu=class extends mT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=gT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}};xu.names=["appearance"];mw.exports=xu});var ww=x((L$,bw)=>{u();var yw=Pe(),yT=j(),ku=class extends yT{normalize(){return"flex-basis"}prefixed(e,t){let i;return[i,t]=yw(t),i===2012?t+"flex-preferred-size":super.prefixed(e,t)}set(e,t){let i;if([i,t]=yw(t),i===2012||i==="final")return super.set(e,t)}};ku.names=["flex-basis","flex-preferred-size"];bw.exports=ku});var xw=x((M$,vw)=>{u();var bT=j(),Su=class extends bT{normalize(){return this.name.replace("box-image","border")}prefixed(e,t){let i=super.prefixed(e,t);return t==="-webkit-"&&(i=i.replace("border","box-image")),i}};Su.names=["mask-border","mask-border-source","mask-border-slice","mask-border-width","mask-border-outset","mask-border-repeat","mask-box-image","mask-box-image-source","mask-box-image-slice","mask-box-image-width","mask-box-image-outset","mask-box-image-repeat"];vw.exports=Su});var Sw=x((N$,kw)=>{u();var wT=j(),lt=class extends wT{insert(e,t,i){let n=e.prop==="mask-composite",a;n?a=e.value.split(","):a=e.value.match(lt.regexp)||[],a=a.map(c=>c.trim()).filter(c=>c);let s=a.length,o;if(s&&(o=this.clone(e),o.value=a.map(c=>lt.oldValues[c]||c).join(", "),a.includes("intersect")&&(o.value+=", xor"),o.prop=t+"mask-composite"),n)return s?(this.needCascade(e)&&(o.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,o)):void 0;let l=this.clone(e);return l.prop=t+l.prop,s&&(l.value=l.value.replace(lt.regexp,"")),this.needCascade(e)&&(l.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,l),s?(this.needCascade(e)&&(o.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,o)):e}};lt.names=["mask","mask-composite"];lt.oldValues={add:"source-over",subtract:"source-out",intersect:"source-in",exclude:"xor"};lt.regexp=new RegExp(`\\s+(${Object.keys(lt.oldValues).join("|")})\\b(?!\\))\\s*(?=[,])`,"ig");kw.exports=lt});var _w=x((B$,Cw)=>{u();var Aw=Pe(),vT=j(),_r=class extends vT{prefixed(e,t){let i;return[i,t]=Aw(t),i===2009?t+"box-align":i===2012?t+"flex-align":super.prefixed(e,t)}normalize(){return"align-items"}set(e,t){let i=Aw(t)[0];return(i===2009||i===2012)&&(e.value=_r.oldValues[e.value]||e.value),super.set(e,t)}};_r.names=["align-items","flex-align","box-align"];_r.oldValues={"flex-end":"end","flex-start":"start"};Cw.exports=_r});var Ow=x((F$,Ew)=>{u();var xT=j(),Au=class extends xT{set(e,t){return t==="-ms-"&&e.value==="contain"&&(e.value="element"),super.set(e,t)}insert(e,t,i){if(!(e.value==="all"&&t==="-ms-"))return super.insert(e,t,i)}};Au.names=["user-select"];Ew.exports=Au});var Pw=x((j$,Rw)=>{u();var Tw=Pe(),kT=j(),Cu=class extends kT{normalize(){return"flex-shrink"}prefixed(e,t){let i;return[i,t]=Tw(t),i===2012?t+"flex-negative":super.prefixed(e,t)}set(e,t){let i;if([i,t]=Tw(t),i===2012||i==="final")return super.set(e,t)}};Cu.names=["flex-shrink","flex-negative"];Rw.exports=Cu});var Dw=x((z$,Iw)=>{u();var ST=j(),_u=class extends ST{prefixed(e,t){return`${t}column-${e}`}normalize(e){return e.includes("inside")?"break-inside":e.includes("before")?"break-before":"break-after"}set(e,t){return(e.prop==="break-inside"&&e.value==="avoid-column"||e.value==="avoid-page")&&(e.value="avoid"),super.set(e,t)}insert(e,t,i){if(e.prop!=="break-inside")return super.insert(e,t,i);if(!(/region/i.test(e.value)||/page/i.test(e.value)))return super.insert(e,t,i)}};_u.names=["break-inside","page-break-inside","column-break-inside","break-before","page-break-before","column-break-before","break-after","page-break-after","column-break-after"];Iw.exports=_u});var $w=x((U$,qw)=>{u();var AT=j(),Eu=class extends AT{prefixed(e,t){return t+"print-color-adjust"}normalize(){return"color-adjust"}};Eu.names=["color-adjust","print-color-adjust"];qw.exports=Eu});var Mw=x((V$,Lw)=>{u();var CT=j(),Er=class extends CT{insert(e,t,i){if(t==="-ms-"){let n=this.set(this.clone(e),t);this.needCascade(e)&&(n.raws.before=this.calcBefore(i,e,t));let a="ltr";return e.parent.nodes.forEach(s=>{s.prop==="direction"&&(s.value==="rtl"||s.value==="ltr")&&(a=s.value)}),n.value=Er.msValues[a][e.value]||e.value,e.parent.insertBefore(e,n)}return super.insert(e,t,i)}};Er.names=["writing-mode"];Er.msValues={ltr:{"horizontal-tb":"lr-tb","vertical-rl":"tb-rl","vertical-lr":"tb-lr"},rtl:{"horizontal-tb":"rl-tb","vertical-rl":"bt-rl","vertical-lr":"bt-lr"}};Lw.exports=Er});var Bw=x((H$,Nw)=>{u();var _T=j(),Ou=class extends _T{set(e,t){return e.value=e.value.replace(/\s+fill(\s)/,"$1"),super.set(e,t)}};Ou.names=["border-image"];Nw.exports=Ou});var zw=x((W$,jw)=>{u();var Fw=Pe(),ET=j(),Or=class extends ET{prefixed(e,t){let i;return[i,t]=Fw(t),i===2012?t+"flex-line-pack":super.prefixed(e,t)}normalize(){return"align-content"}set(e,t){let i=Fw(t)[0];if(i===2012)return e.value=Or.oldValues[e.value]||e.value,super.set(e,t);if(i==="final")return super.set(e,t)}};Or.names=["align-content","flex-line-pack"];Or.oldValues={"flex-end":"end","flex-start":"start","space-between":"justify","space-around":"distribute"};jw.exports=Or});var Vw=x((G$,Uw)=>{u();var OT=j(),We=class extends OT{prefixed(e,t){return t==="-moz-"?t+(We.toMozilla[e]||e):super.prefixed(e,t)}normalize(e){return We.toNormal[e]||e}};We.names=["border-radius"];We.toMozilla={};We.toNormal={};for(let r of["top","bottom"])for(let e of["left","right"]){let t=`border-${r}-${e}-radius`,i=`border-radius-${r}${e}`;We.names.push(t),We.names.push(i),We.toMozilla[t]=i,We.toNormal[i]=t}Uw.exports=We});var Ww=x((Q$,Hw)=>{u();var TT=j(),Tu=class extends TT{prefixed(e,t){return e.includes("-start")?t+e.replace("-block-start","-before"):t+e.replace("-block-end","-after")}normalize(e){return e.includes("-before")?e.replace("-before","-block-start"):e.replace("-after","-block-end")}};Tu.names=["border-block-start","border-block-end","margin-block-start","margin-block-end","padding-block-start","padding-block-end","border-before","border-after","margin-before","margin-after","padding-before","padding-after"];Hw.exports=Tu});var Qw=x((Y$,Gw)=>{u();var RT=j(),{parseTemplate:PT,warnMissedAreas:IT,getGridGap:DT,warnGridGap:qT,inheritGridGap:$T}=Bt(),Ru=class extends RT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);if(e.parent.some(h=>h.prop==="-ms-grid-rows"))return;let a=DT(e),s=$T(e,a),{rows:o,columns:l,areas:c}=PT({decl:e,gap:s||a}),f=Object.keys(c).length>0,d=Boolean(o),p=Boolean(l);return qT({gap:a,hasColumns:p,decl:e,result:n}),IT(c,e,n),(d&&p||f)&&e.cloneBefore({prop:"-ms-grid-rows",value:o,raws:{}}),p&&e.cloneBefore({prop:"-ms-grid-columns",value:l,raws:{}}),e}};Ru.names=["grid-template"];Gw.exports=Ru});var Kw=x((K$,Yw)=>{u();var LT=j(),Pu=class extends LT{prefixed(e,t){return t+e.replace("-inline","")}normalize(e){return e.replace(/(margin|padding|border)-(start|end)/,"$1-inline-$2")}};Pu.names=["border-inline-start","border-inline-end","margin-inline-start","margin-inline-end","padding-inline-start","padding-inline-end","border-start","border-end","margin-start","margin-end","padding-start","padding-end"];Yw.exports=Pu});var Jw=x((X$,Xw)=>{u();var MT=j(),Iu=class extends MT{check(e){return!e.value.includes("flex-")&&e.value!=="baseline"}prefixed(e,t){return t+"grid-row-align"}normalize(){return"align-self"}};Iu.names=["grid-row-align"];Xw.exports=Iu});var e0=x((J$,Zw)=>{u();var NT=j(),Tr=class extends NT{keyframeParents(e){let{parent:t}=e;for(;t;){if(t.type==="atrule"&&t.name==="keyframes")return!0;({parent:t}=t)}return!1}contain3d(e){if(e.prop==="transform-origin")return!1;for(let t of Tr.functions3d)if(e.value.includes(`${t}(`))return!0;return!1}set(e,t){return e=super.set(e,t),t==="-ms-"&&(e.value=e.value.replace(/rotatez/gi,"rotate")),e}insert(e,t,i){if(t==="-ms-"){if(!this.contain3d(e)&&!this.keyframeParents(e))return super.insert(e,t,i)}else if(t==="-o-"){if(!this.contain3d(e))return super.insert(e,t,i)}else return super.insert(e,t,i)}};Tr.names=["transform","transform-origin"];Tr.functions3d=["matrix3d","translate3d","translateZ","scale3d","scaleZ","rotate3d","rotateX","rotateY","perspective"];Zw.exports=Tr});var i0=x((Z$,r0)=>{u();var t0=Pe(),BT=j(),Du=class extends BT{normalize(){return"flex-direction"}insert(e,t,i){let n;if([n,t]=t0(t),n!==2009)return super.insert(e,t,i);if(e.parent.some(f=>f.prop===t+"box-orient"||f.prop===t+"box-direction"))return;let s=e.value,o,l;s==="inherit"||s==="initial"||s==="unset"?(o=s,l=s):(o=s.includes("row")?"horizontal":"vertical",l=s.includes("reverse")?"reverse":"normal");let c=this.clone(e);return c.prop=t+"box-orient",c.value=o,this.needCascade(e)&&(c.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,c),c=this.clone(e),c.prop=t+"box-direction",c.value=l,this.needCascade(e)&&(c.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,c)}old(e,t){let i;return[i,t]=t0(t),i===2009?[t+"box-orient",t+"box-direction"]:super.old(e,t)}};Du.names=["flex-direction","box-direction","box-orient"];r0.exports=Du});var s0=x((eL,n0)=>{u();var FT=j(),qu=class extends FT{check(e){return e.value==="pixelated"}prefixed(e,t){return t==="-ms-"?"-ms-interpolation-mode":super.prefixed(e,t)}set(e,t){return t!=="-ms-"?super.set(e,t):(e.prop="-ms-interpolation-mode",e.value="nearest-neighbor",e)}normalize(){return"image-rendering"}process(e,t){return super.process(e,t)}};qu.names=["image-rendering","interpolation-mode"];n0.exports=qu});var o0=x((tL,a0)=>{u();var jT=j(),zT=_e(),$u=class extends jT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=zT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}};$u.names=["backdrop-filter"];a0.exports=$u});var u0=x((rL,l0)=>{u();var UT=j(),VT=_e(),Lu=class extends UT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=VT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}check(e){return e.value.toLowerCase()==="text"}};Lu.names=["background-clip"];l0.exports=Lu});var c0=x((iL,f0)=>{u();var HT=j(),WT=["none","underline","overline","line-through","blink","inherit","initial","unset"],Mu=class extends HT{check(e){return e.value.split(/\s+/).some(t=>!WT.includes(t))}};Mu.names=["text-decoration"];f0.exports=Mu});var h0=x((nL,d0)=>{u();var p0=Pe(),GT=j(),Rr=class extends GT{prefixed(e,t){let i;return[i,t]=p0(t),i===2009?t+"box-pack":i===2012?t+"flex-pack":super.prefixed(e,t)}normalize(){return"justify-content"}set(e,t){let i=p0(t)[0];if(i===2009||i===2012){let n=Rr.oldValues[e.value]||e.value;if(e.value=n,i!==2009||n!=="distribute")return super.set(e,t)}else if(i==="final")return super.set(e,t)}};Rr.names=["justify-content","flex-pack","box-pack"];Rr.oldValues={"flex-end":"end","flex-start":"start","space-between":"justify","space-around":"distribute"};d0.exports=Rr});var g0=x((sL,m0)=>{u();var QT=j(),Nu=class extends QT{set(e,t){let i=e.value.toLowerCase();return t==="-webkit-"&&!i.includes(" ")&&i!=="contain"&&i!=="cover"&&(e.value=e.value+" "+e.value),super.set(e,t)}};Nu.names=["background-size"];m0.exports=Nu});var b0=x((aL,y0)=>{u();var YT=j(),Bu=Bt(),Fu=class extends YT{insert(e,t,i){if(t!=="-ms-")return super.insert(e,t,i);let n=Bu.parse(e),[a,s]=Bu.translate(n,0,1);n[0]&&n[0].includes("span")&&(s=n[0].join("").replace(/\D/g,"")),[[e.prop,a],[`${e.prop}-span`,s]].forEach(([l,c])=>{Bu.insertDecl(e,l,c)})}};Fu.names=["grid-row","grid-column"];y0.exports=Fu});var x0=x((oL,v0)=>{u();var KT=j(),{prefixTrackProp:w0,prefixTrackValue:XT,autoplaceGridItems:JT,getGridGap:ZT,inheritGridGap:eR}=Bt(),tR=su(),ju=class extends KT{prefixed(e,t){return t==="-ms-"?w0({prop:e,prefix:t}):super.prefixed(e,t)}normalize(e){return e.replace(/^grid-(rows|columns)/,"grid-template-$1")}insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let{parent:a,prop:s,value:o}=e,l=s.includes("rows"),c=s.includes("columns"),f=a.some(k=>k.prop==="grid-template"||k.prop==="grid-template-areas");if(f&&l)return!1;let d=new tR({options:{}}),p=d.gridStatus(a,n),h=ZT(e);h=eR(e,h)||h;let b=l?h.row:h.column;(p==="no-autoplace"||p===!0)&&!f&&(b=null);let v=XT({value:o,gap:b});e.cloneBefore({prop:w0({prop:s,prefix:t}),value:v});let y=a.nodes.find(k=>k.prop==="grid-auto-flow"),w="row";if(y&&!d.disabled(y,n)&&(w=y.value.trim()),p==="autoplace"){let k=a.nodes.find(E=>E.prop==="grid-template-rows");if(!k&&f)return;if(!k&&!f){e.warn(n,"Autoplacement does not work without grid-template-rows property");return}!a.nodes.find(E=>E.prop==="grid-template-columns")&&!f&&e.warn(n,"Autoplacement does not work without grid-template-columns property"),c&&!f&&JT(e,n,h,w)}}};ju.names=["grid-template-rows","grid-template-columns","grid-rows","grid-columns"];v0.exports=ju});var S0=x((lL,k0)=>{u();var rR=j(),zu=class extends rR{check(e){return!e.value.includes("flex-")&&e.value!=="baseline"}prefixed(e,t){return t+"grid-column-align"}normalize(){return"justify-self"}};zu.names=["grid-column-align"];k0.exports=zu});var C0=x((uL,A0)=>{u();var iR=j(),Uu=class extends iR{prefixed(e,t){return t+"scroll-chaining"}normalize(){return"overscroll-behavior"}set(e,t){return e.value==="auto"?e.value="chained":(e.value==="none"||e.value==="contain")&&(e.value="none"),super.set(e,t)}};Uu.names=["overscroll-behavior","scroll-chaining"];A0.exports=Uu});var O0=x((fL,E0)=>{u();var nR=j(),{parseGridAreas:sR,warnMissedAreas:aR,prefixTrackProp:oR,prefixTrackValue:_0,getGridGap:lR,warnGridGap:uR,inheritGridGap:fR}=Bt();function cR(r){return r.trim().slice(1,-1).split(/["']\s*["']?/g)}var Vu=class extends nR{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=!1,s=!1,o=e.parent,l=lR(e);l=fR(e,l)||l,o.walkDecls(/-ms-grid-rows/,d=>d.remove()),o.walkDecls(/grid-template-(rows|columns)/,d=>{if(d.prop==="grid-template-rows"){s=!0;let{prop:p,value:h}=d;d.cloneBefore({prop:oR({prop:p,prefix:t}),value:_0({value:h,gap:l.row})})}else a=!0});let c=cR(e.value);a&&!s&&l.row&&c.length>1&&e.cloneBefore({prop:"-ms-grid-rows",value:_0({value:`repeat(${c.length}, auto)`,gap:l.row}),raws:{}}),uR({gap:l,hasColumns:a,decl:e,result:n});let f=sR({rows:c,gap:l});return aR(f,e,n),e}};Vu.names=["grid-template-areas"];E0.exports=Vu});var R0=x((cL,T0)=>{u();var pR=j(),Hu=class extends pR{set(e,t){return t==="-webkit-"&&(e.value=e.value.replace(/\s*(right|left)\s*/i,"")),super.set(e,t)}};Hu.names=["text-emphasis-position"];T0.exports=Hu});var I0=x((pL,P0)=>{u();var dR=j(),Wu=class extends dR{set(e,t){return e.prop==="text-decoration-skip-ink"&&e.value==="auto"?(e.prop=t+"text-decoration-skip",e.value="ink",e):super.set(e,t)}};Wu.names=["text-decoration-skip-ink","text-decoration-skip"];P0.exports=Wu});var N0=x((dL,M0)=>{u();"use strict";M0.exports={wrap:D0,limit:q0,validate:$0,test:Gu,curry:hR,name:L0};function D0(r,e,t){var i=e-r;return((t-r)%i+i)%i+r}function q0(r,e,t){return Math.max(r,Math.min(e,t))}function $0(r,e,t,i,n){if(!Gu(r,e,t,i,n))throw new Error(t+" is outside of range ["+r+","+e+")");return t}function Gu(r,e,t,i,n){return!(t<r||t>e||n&&t===e||i&&t===r)}function L0(r,e,t,i){return(t?"(":"[")+r+","+e+(i?")":"]")}function hR(r,e,t,i){var n=L0.bind(null,r,e,t,i);return{wrap:D0.bind(null,r,e),limit:q0.bind(null,r,e),validate:function(a){return $0(r,e,a,t,i)},test:function(a){return Gu(r,e,a,t,i)},toString:n,name:n}}});var j0=x((hL,F0)=>{u();var Qu=$s(),mR=N0(),gR=xr(),yR=He(),bR=_e(),B0=/top|left|right|bottom/gi,wt=class extends yR{replace(e,t){let i=Qu(e);for(let n of i.nodes)if(n.type==="function"&&n.value===this.name)if(n.nodes=this.newDirection(n.nodes),n.nodes=this.normalize(n.nodes),t==="-webkit- old"){if(!this.oldWebkit(n))return!1}else n.nodes=this.convertDirection(n.nodes),n.value=t+n.value;return i.toString()}replaceFirst(e,...t){return t.map(n=>n===" "?{type:"space",value:n}:{type:"word",value:n}).concat(e.slice(1))}normalizeUnit(e,t){return`${parseFloat(e)/t*360}deg`}normalize(e){if(!e[0])return e;if(/-?\d+(.\d+)?grad/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,400);else if(/-?\d+(.\d+)?rad/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,2*Math.PI);else if(/-?\d+(.\d+)?turn/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,1);else if(e[0].value.includes("deg")){let t=parseFloat(e[0].value);t=mR.wrap(0,360,t),e[0].value=`${t}deg`}return e[0].value==="0deg"?e=this.replaceFirst(e,"to"," ","top"):e[0].value==="90deg"?e=this.replaceFirst(e,"to"," ","right"):e[0].value==="180deg"?e=this.replaceFirst(e,"to"," ","bottom"):e[0].value==="270deg"&&(e=this.replaceFirst(e,"to"," ","left")),e}newDirection(e){if(e[0].value==="to"||(B0.lastIndex=0,!B0.test(e[0].value)))return e;e.unshift({type:"word",value:"to"},{type:"space",value:" "});for(let t=2;t<e.length&&e[t].type!=="div";t++)e[t].type==="word"&&(e[t].value=this.revertDirection(e[t].value));return e}isRadial(e){let t="before";for(let i of e)if(t==="before"&&i.type==="space")t="at";else if(t==="at"&&i.value==="at")t="after";else{if(t==="after"&&i.type==="space")return!0;if(i.type==="div")break;t="before"}return!1}convertDirection(e){return e.length>0&&(e[0].value==="to"?this.fixDirection(e):e[0].value.includes("deg")?this.fixAngle(e):this.isRadial(e)&&this.fixRadial(e)),e}fixDirection(e){e.splice(0,2);for(let t of e){if(t.type==="div")break;t.type==="word"&&(t.value=this.revertDirection(t.value))}}fixAngle(e){let t=e[0].value;t=parseFloat(t),t=Math.abs(450-t)%360,t=this.roundFloat(t,3),e[0].value=`${t}deg`}fixRadial(e){let t=[],i=[],n,a,s,o,l;for(o=0;o<e.length-2;o++)if(n=e[o],a=e[o+1],s=e[o+2],n.type==="space"&&a.value==="at"&&s.type==="space"){l=o+3;break}else t.push(n);let c;for(o=l;o<e.length;o++)if(e[o].type==="div"){c=e[o];break}else i.push(e[o]);e.splice(0,o,...i,c,...t)}revertDirection(e){return wt.directions[e.toLowerCase()]||e}roundFloat(e,t){return parseFloat(e.toFixed(t))}oldWebkit(e){let{nodes:t}=e,i=Qu.stringify(e.nodes);if(this.name!=="linear-gradient"||t[0]&&t[0].value.includes("deg")||i.includes("px")||i.includes("-corner")||i.includes("-side"))return!1;let n=[[]];for(let a of t)n[n.length-1].push(a),a.type==="div"&&a.value===","&&n.push([]);this.oldDirection(n),this.colorStops(n),e.nodes=[];for(let a of n)e.nodes=e.nodes.concat(a);return e.nodes.unshift({type:"word",value:"linear"},this.cloneDiv(e.nodes)),e.value="-webkit-gradient",!0}oldDirection(e){let t=this.cloneDiv(e[0]);if(e[0][0].value!=="to")return e.unshift([{type:"word",value:wt.oldDirections.bottom},t]);{let i=[];for(let a of e[0].slice(2))a.type==="word"&&i.push(a.value.toLowerCase());i=i.join(" ");let n=wt.oldDirections[i]||i;return e[0]=[{type:"word",value:n},t],e[0]}}cloneDiv(e){for(let t of e)if(t.type==="div"&&t.value===",")return t;return{type:"div",value:",",after:" "}}colorStops(e){let t=[];for(let i=0;i<e.length;i++){let n,a=e[i],s;if(i===0)continue;let o=Qu.stringify(a[0]);a[1]&&a[1].type==="word"?n=a[1].value:a[2]&&a[2].type==="word"&&(n=a[2].value);let l;i===1&&(!n||n==="0%")?l=`from(${o})`:i===e.length-1&&(!n||n==="100%")?l=`to(${o})`:n?l=`color-stop(${n}, ${o})`:l=`color-stop(${o})`;let c=a[a.length-1];e[i]=[{type:"word",value:l}],c.type==="div"&&c.value===","&&(s=e[i].push(c)),t.push(s)}return t}old(e){if(e==="-webkit-"){let t=this.name==="linear-gradient"?"linear":"radial",i="-gradient",n=bR.regexp(`-webkit-(${t}-gradient|gradient\\(\\s*${t})`,!1);return new gR(this.name,e+this.name,i,n)}else return super.old(e)}add(e,t){let i=e.prop;if(i.includes("mask")){if(t==="-webkit-"||t==="-webkit- old")return super.add(e,t)}else if(i==="list-style"||i==="list-style-image"||i==="content"){if(t==="-webkit-"||t==="-webkit- old")return super.add(e,t)}else return super.add(e,t)}};wt.names=["linear-gradient","repeating-linear-gradient","radial-gradient","repeating-radial-gradient"];wt.directions={top:"bottom",left:"right",bottom:"top",right:"left"};wt.oldDirections={top:"left bottom, left top",left:"right top, left top",bottom:"left top, left bottom",right:"left top, right top","top right":"left bottom, right top","top left":"right bottom, left top","right top":"left bottom, right top","right bottom":"left top, right bottom","bottom right":"left top, right bottom","bottom left":"right top, left bottom","left top":"right bottom, left top","left bottom":"right top, left bottom"};F0.exports=wt});var V0=x((mL,U0)=>{u();var wR=xr(),vR=He();function z0(r){return new RegExp(`(^|[\\s,(])(${r}($|[\\s),]))`,"gi")}var Yu=class extends vR{regexp(){return this.regexpCache||(this.regexpCache=z0(this.name)),this.regexpCache}isStretch(){return this.name==="stretch"||this.name==="fill"||this.name==="fill-available"}replace(e,t){return t==="-moz-"&&this.isStretch()?e.replace(this.regexp(),"$1-moz-available$3"):t==="-webkit-"&&this.isStretch()?e.replace(this.regexp(),"$1-webkit-fill-available$3"):super.replace(e,t)}old(e){let t=e+this.name;return this.isStretch()&&(e==="-moz-"?t="-moz-available":e==="-webkit-"&&(t="-webkit-fill-available")),new wR(this.name,t,t,z0(t))}add(e,t){if(!(e.prop.includes("grid")&&t!=="-webkit-"))return super.add(e,t)}};Yu.names=["max-content","min-content","fit-content","fill","fill-available","stretch"];U0.exports=Yu});var G0=x((gL,W0)=>{u();var H0=xr(),xR=He(),Ku=class extends xR{replace(e,t){return t==="-webkit-"?e.replace(this.regexp(),"$1-webkit-optimize-contrast"):t==="-moz-"?e.replace(this.regexp(),"$1-moz-crisp-edges"):super.replace(e,t)}old(e){return e==="-webkit-"?new H0(this.name,"-webkit-optimize-contrast"):e==="-moz-"?new H0(this.name,"-moz-crisp-edges"):super.old(e)}};Ku.names=["pixelated"];W0.exports=Ku});var Y0=x((yL,Q0)=>{u();var kR=He(),Xu=class extends kR{replace(e,t){let i=super.replace(e,t);return t==="-webkit-"&&(i=i.replace(/("[^"]+"|'[^']+')(\s+\d+\w)/gi,"url($1)$2")),i}};Xu.names=["image-set"];Q0.exports=Xu});var X0=x((bL,K0)=>{u();var SR=$e().list,AR=He(),Ju=class extends AR{replace(e,t){return SR.space(e).map(i=>{if(i.slice(0,+this.name.length+1)!==this.name+"(")return i;let n=i.lastIndexOf(")"),a=i.slice(n+1),s=i.slice(this.name.length+1,n);if(t==="-webkit-"){let o=s.match(/\d*.?\d+%?/);o?(s=s.slice(o[0].length).trim(),s+=`, ${o[0]}`):s+=", 0.5"}return t+this.name+"("+s+")"+a}).join(" ")}};Ju.names=["cross-fade"];K0.exports=Ju});var Z0=x((wL,J0)=>{u();var CR=Pe(),_R=xr(),ER=He(),Zu=class extends ER{constructor(e,t){super(e,t);e==="display-flex"&&(this.name="flex")}check(e){return e.prop==="display"&&e.value===this.name}prefixed(e){let t,i;return[t,e]=CR(e),t===2009?this.name==="flex"?i="box":i="inline-box":t===2012?this.name==="flex"?i="flexbox":i="inline-flexbox":t==="final"&&(i=this.name),e+i}replace(e,t){return this.prefixed(t)}old(e){let t=this.prefixed(e);if(!!t)return new _R(this.name,t)}};Zu.names=["display-flex","inline-flex"];J0.exports=Zu});var tv=x((vL,ev)=>{u();var OR=He(),ef=class extends OR{constructor(e,t){super(e,t);e==="display-grid"&&(this.name="grid")}check(e){return e.prop==="display"&&e.value===this.name}};ef.names=["display-grid","inline-grid"];ev.exports=ef});var iv=x((xL,rv)=>{u();var TR=He(),tf=class extends TR{constructor(e,t){super(e,t);e==="filter-function"&&(this.name="filter")}};tf.names=["filter","filter-function"];rv.exports=tf});var ov=x((kL,av)=>{u();var nv=Li(),z=j(),sv=Fy(),RR=nb(),PR=su(),IR=Sb(),rf=Mt(),Pr=kr(),DR=Pb(),ut=He(),Ir=_e(),qR=Db(),$R=$b(),LR=Mb(),MR=Bb(),NR=Vb(),BR=Gb(),FR=Yb(),jR=Xb(),zR=Zb(),UR=tw(),VR=iw(),HR=sw(),WR=ow(),GR=uw(),QR=cw(),YR=hw(),KR=gw(),XR=ww(),JR=xw(),ZR=Sw(),e5=_w(),t5=Ow(),r5=Pw(),i5=Dw(),n5=$w(),s5=Mw(),a5=Bw(),o5=zw(),l5=Vw(),u5=Ww(),f5=Qw(),c5=Kw(),p5=Jw(),d5=e0(),h5=i0(),m5=s0(),g5=o0(),y5=u0(),b5=c0(),w5=h0(),v5=g0(),x5=b0(),k5=x0(),S5=S0(),A5=C0(),C5=O0(),_5=R0(),E5=I0(),O5=j0(),T5=V0(),R5=G0(),P5=Y0(),I5=X0(),D5=Z0(),q5=tv(),$5=iv();Pr.hack(qR);Pr.hack($R);Pr.hack(LR);Pr.hack(MR);z.hack(NR);z.hack(BR);z.hack(FR);z.hack(jR);z.hack(zR);z.hack(UR);z.hack(VR);z.hack(HR);z.hack(WR);z.hack(GR);z.hack(QR);z.hack(YR);z.hack(KR);z.hack(XR);z.hack(JR);z.hack(ZR);z.hack(e5);z.hack(t5);z.hack(r5);z.hack(i5);z.hack(n5);z.hack(s5);z.hack(a5);z.hack(o5);z.hack(l5);z.hack(u5);z.hack(f5);z.hack(c5);z.hack(p5);z.hack(d5);z.hack(h5);z.hack(m5);z.hack(g5);z.hack(y5);z.hack(b5);z.hack(w5);z.hack(v5);z.hack(x5);z.hack(k5);z.hack(S5);z.hack(A5);z.hack(C5);z.hack(_5);z.hack(E5);ut.hack(O5);ut.hack(T5);ut.hack(R5);ut.hack(P5);ut.hack(I5);ut.hack(D5);ut.hack(q5);ut.hack($5);var nf=new Map,Ni=class{constructor(e,t,i={}){this.data=e,this.browsers=t,this.options=i,[this.add,this.remove]=this.preprocess(this.select(this.data)),this.transition=new RR(this),this.processor=new PR(this)}cleaner(){if(this.cleanerCache)return this.cleanerCache;if(this.browsers.selected.length){let e=new rf(this.browsers.data,[]);this.cleanerCache=new Ni(this.data,e,this.options)}else return this;return this.cleanerCache}select(e){let t={add:{},remove:{}};for(let i in e){let n=e[i],a=n.browsers.map(l=>{let c=l.split(" ");return{browser:`${c[0]} ${c[1]}`,note:c[2]}}),s=a.filter(l=>l.note).map(l=>`${this.browsers.prefix(l.browser)} ${l.note}`);s=Ir.uniq(s),a=a.filter(l=>this.browsers.isSelected(l.browser)).map(l=>{let c=this.browsers.prefix(l.browser);return l.note?`${c} ${l.note}`:c}),a=this.sort(Ir.uniq(a)),this.options.flexbox==="no-2009"&&(a=a.filter(l=>!l.includes("2009")));let o=n.browsers.map(l=>this.browsers.prefix(l));n.mistakes&&(o=o.concat(n.mistakes)),o=o.concat(s),o=Ir.uniq(o),a.length?(t.add[i]=a,a.length<o.length&&(t.remove[i]=o.filter(l=>!a.includes(l)))):t.remove[i]=o}return t}sort(e){return e.sort((t,i)=>{let n=Ir.removeNote(t).length,a=Ir.removeNote(i).length;return n===a?i.length-t.length:a-n})}preprocess(e){let t={selectors:[],"@supports":new IR(Ni,this)};for(let n in e.add){let a=e.add[n];if(n==="@keyframes"||n==="@viewport")t[n]=new DR(n,a,this);else if(n==="@resolution")t[n]=new sv(n,a,this);else if(this.data[n].selector)t.selectors.push(Pr.load(n,a,this));else{let s=this.data[n].props;if(s){let o=ut.load(n,a,this);for(let l of s)t[l]||(t[l]={values:[]}),t[l].values.push(o)}else{let o=t[n]&&t[n].values||[];t[n]=z.load(n,a,this),t[n].values=o}}}let i={selectors:[]};for(let n in e.remove){let a=e.remove[n];if(this.data[n].selector){let s=Pr.load(n,a);for(let o of a)i.selectors.push(s.old(o))}else if(n==="@keyframes"||n==="@viewport")for(let s of a){let o=`@${s}${n.slice(1)}`;i[o]={remove:!0}}else if(n==="@resolution")i[n]=new sv(n,a,this);else{let s=this.data[n].props;if(s){let o=ut.load(n,[],this);for(let l of a){let c=o.old(l);if(c)for(let f of s)i[f]||(i[f]={}),i[f].values||(i[f].values=[]),i[f].values.push(c)}}else for(let o of a){let l=this.decl(n).old(n,o);if(n==="align-self"){let c=t[n]&&t[n].prefixes;if(c){if(o==="-webkit- 2009"&&c.includes("-webkit-"))continue;if(o==="-webkit-"&&c.includes("-webkit- 2009"))continue}}for(let c of l)i[c]||(i[c]={}),i[c].remove=!0}}}return[t,i]}decl(e){return nf.has(e)||nf.set(e,z.load(e)),nf.get(e)}unprefixed(e){let t=this.normalize(nv.unprefixed(e));return t==="flex-direction"&&(t="flex-flow"),t}normalize(e){return this.decl(e).normalize(e)}prefixed(e,t){return e=nv.unprefixed(e),this.decl(e).prefixed(e,t)}values(e,t){let i=this[e],n=i["*"]&&i["*"].values,a=i[t]&&i[t].values;return n&&a?Ir.uniq(n.concat(a)):n||a||[]}group(e){let t=e.parent,i=t.index(e),{length:n}=t.nodes,a=this.unprefixed(e.prop),s=(o,l)=>{for(i+=o;i>=0&&i<n;){let c=t.nodes[i];if(c.type==="decl"){if(o===-1&&c.prop===a&&!rf.withPrefix(c.value)||this.unprefixed(c.prop)!==a)break;if(l(c)===!0)return!0;if(o===1&&c.prop===a&&!rf.withPrefix(c.value))break}i+=o}return!1};return{up(o){return s(-1,o)},down(o){return s(1,o)}}}};av.exports=Ni});var uv=x((SL,lv)=>{u();lv.exports={"backdrop-filter":{feature:"css-backdrop-filter",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},element:{props:["background","background-image","border-image","mask","list-style","list-style-image","content","mask-image"],feature:"css-element-function",browsers:["firefox 114"]},"user-select":{mistakes:["-khtml-"],feature:"user-select-none",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},"background-clip":{feature:"background-clip-text",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},hyphens:{feature:"css-hyphens",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},fill:{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"fill-available":{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},stretch:{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["firefox 114"]},"fit-content":{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["firefox 114"]},"text-decoration-style":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-color":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-line":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-skip":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-skip-ink":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-size-adjust":{feature:"text-size-adjust",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"mask-clip":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-composite":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-image":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-origin":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-repeat":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-repeat":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-source":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},mask:{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-position":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-size":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-outset":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-width":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-slice":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"clip-path":{feature:"css-clip-path",browsers:["samsung 21"]},"box-decoration-break":{feature:"css-boxdecorationbreak",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","opera 99","safari 16.5","samsung 21"]},appearance:{feature:"css-appearance",browsers:["samsung 21"]},"image-set":{props:["background","background-image","border-image","cursor","mask","mask-image","list-style","list-style-image","content"],feature:"css-image-set",browsers:["and_uc 15.5","chrome 109","samsung 21"]},"cross-fade":{props:["background","background-image","border-image","mask","list-style","list-style-image","content","mask-image"],feature:"css-cross-fade",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},isolate:{props:["unicode-bidi"],feature:"css-unicode-bidi",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},"color-adjust":{feature:"css-color-adjust",browsers:["chrome 109","chrome 113","chrome 114","edge 114","opera 99"]}}});var cv=x((AL,fv)=>{u();fv.exports={}});var mv=x((CL,hv)=>{u();var L5=Gl(),{agents:M5}=(Ts(),Os),sf=_y(),N5=Mt(),B5=ov(),F5=uv(),j5=cv(),pv={browsers:M5,prefixes:F5},dv=`
+  Replace Autoprefixer \`browsers\` option to Browserslist config.
+  Use \`browserslist\` key in \`package.json\` or \`.browserslistrc\` file.
+
+  Using \`browsers\` option can cause errors. Browserslist config can
+  be used for Babel, Autoprefixer, postcss-normalize and other tools.
+
+  If you really need to use option, rename it to \`overrideBrowserslist\`.
+
+  Learn more at:
+  https://github.com/browserslist/browserslist#readme
+  https://twitter.com/browserslist
+
+`;function z5(r){return Object.prototype.toString.apply(r)==="[object Object]"}var af=new Map;function U5(r,e){e.browsers.selected.length!==0&&(e.add.selectors.length>0||Object.keys(e.add).length>2||r.warn(`Autoprefixer target browsers do not need any prefixes.You do not need Autoprefixer anymore.
+Check your Browserslist config to be sure that your targets are set up correctly.
+
+  Learn more at:
+  https://github.com/postcss/autoprefixer#readme
+  https://github.com/browserslist/browserslist#readme
+
+`))}hv.exports=Dr;function Dr(...r){let e;if(r.length===1&&z5(r[0])?(e=r[0],r=void 0):r.length===0||r.length===1&&!r[0]?r=void 0:r.length<=2&&(Array.isArray(r[0])||!r[0])?(e=r[1],r=r[0]):typeof r[r.length-1]=="object"&&(e=r.pop()),e||(e={}),e.browser)throw new Error("Change `browser` option to `overrideBrowserslist` in Autoprefixer");if(e.browserslist)throw new Error("Change `browserslist` option to `overrideBrowserslist` in Autoprefixer");e.overrideBrowserslist?r=e.overrideBrowserslist:e.browsers&&(typeof console!="undefined"&&console.warn&&(sf.red?console.warn(sf.red(dv.replace(/`[^`]+`/g,n=>sf.yellow(n.slice(1,-1))))):console.warn(dv)),r=e.browsers);let t={ignoreUnknownVersions:e.ignoreUnknownVersions,stats:e.stats,env:e.env};function i(n){let a=pv,s=new N5(a.browsers,r,n,t),o=s.selected.join(", ")+JSON.stringify(e);return af.has(o)||af.set(o,new B5(a.prefixes,s,e)),af.get(o)}return{postcssPlugin:"autoprefixer",prepare(n){let a=i({from:n.opts.from,env:e.env});return{OnceExit(s){U5(n,a),e.remove!==!1&&a.processor.remove(s,n),e.add!==!1&&a.processor.add(s,n)}}},info(n){return n=n||{},n.from=n.from||m.cwd(),j5(i(n))},options:e,browsers:r}}Dr.postcss=!0;Dr.data=pv;Dr.defaults=L5.defaults;Dr.info=()=>Dr().info()});var gv={};Ge(gv,{default:()=>V5});var V5,yv=R(()=>{u();V5=[]});var wv={};Ge(wv,{default:()=>H5});var bv,H5,vv=R(()=>{u();Yi();bv=pe(en()),H5=St(bv.default.theme)});var kv={};Ge(kv,{default:()=>W5});var xv,W5,Sv=R(()=>{u();Yi();xv=pe(en()),W5=St(xv.default)});u();"use strict";var G5=vt(Ay()),Q5=vt($e()),Y5=vt(mv()),K5=vt((yv(),gv)),X5=vt((vv(),wv)),J5=vt((Sv(),kv)),Z5=vt((zs(),Af)),eP=vt((nl(),il)),tP=vt((ia(),ic));function vt(r){return r&&r.__esModule?r:{default:r}}console.warn("cdn.tailwindcss.com should not be used in production. To use Tailwind CSS in production, install it as a PostCSS plugin or use the Tailwind CLI: https://tailwindcss.com/docs/installation");var Ls="tailwind",of="text/tailwindcss",Av="/template.html",Yt,Cv=!0,_v=0,lf=new Set,uf,Ev="",Ov=(r=!1)=>({get(e,t){return(!r||t==="config")&&typeof e[t]=="object"&&e[t]!==null?new Proxy(e[t],Ov()):e[t]},set(e,t,i){return e[t]=i,(!r||t==="config")&&ff(!0),!0}});window[Ls]=new Proxy({config:{},defaultTheme:X5.default,defaultConfig:J5.default,colors:Z5.default,plugin:eP.default,resolveConfig:tP.default},Ov(!0));function Tv(r){uf.observe(r,{attributes:!0,attributeFilter:["type"],characterData:!0,subtree:!0,childList:!0})}new MutationObserver(async r=>{let e=!1;if(!uf){uf=new MutationObserver(async()=>await ff(!0));for(let t of document.querySelectorAll(`style[type="${of}"]`))Tv(t)}for(let t of r)for(let i of t.addedNodes)i.nodeType===1&&i.tagName==="STYLE"&&i.getAttribute("type")===of&&(Tv(i),e=!0);await ff(e)}).observe(document.documentElement,{attributes:!0,attributeFilter:["class"],childList:!0,subtree:!0});async function ff(r=!1){r&&(_v++,lf.clear());let e="";for(let i of document.querySelectorAll(`style[type="${of}"]`))e+=i.textContent;let t=new Set;for(let i of document.querySelectorAll("[class]"))for(let n of i.classList)lf.has(n)||t.add(n);if(document.body&&(Cv||t.size>0||e!==Ev||!Yt||!Yt.isConnected)){for(let n of t)lf.add(n);Cv=!1,Ev=e,self[Av]=Array.from(t).join(" ");let{css:i}=await(0,Q5.default)([(0,G5.default)({...window[Ls].config,_hash:_v,content:{files:[Av],extract:{html:n=>n.split(" ")}},plugins:[...K5.default,...Array.isArray(window[Ls].config.plugins)?window[Ls].config.plugins:[]]}),(0,Y5.default)({remove:!1})]).process(`@tailwind base;@tailwind components;@tailwind utilities;${e}`);(!Yt||!Yt.isConnected)&&(Yt=document.createElement("style"),document.head.append(Yt)),Yt.textContent=i}}})();
+/*!
+ * fill-range <https://github.com/jonschlinkert/fill-range>
+ *
+ * Copyright (c) 2014-present, Jon Schlinkert.
+ * Licensed under the MIT License.
+ */
+/*!
+ * is-number <https://github.com/jonschlinkert/is-number>
+ *
+ * Copyright (c) 2014-present, Jon Schlinkert.
+ * Released under the MIT License.
+ */
+/*!
+ * to-regex-range <https://github.com/micromatch/to-regex-range>
+ *
+ * Copyright (c) 2015-present, Jon Schlinkert.
+ * Released under the MIT License.
+ */
+/*! https://mths.be/cssesc v3.0.0 by @mathias */
+
diff --git a/examples/server/public/deps_vue.esm-browser.js b/examples/server/public/deps_vue.esm-browser.js
new file mode 100644
index 000000000..4679d9614
--- /dev/null
+++ b/examples/server/public/deps_vue.esm-browser.js
@@ -0,0 +1,18160 @@
+/**
+* vue v3.5.12
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/
+/*! #__NO_SIDE_EFFECTS__ */
+// @__NO_SIDE_EFFECTS__
+function makeMap(str) {
+  const map = /* @__PURE__ */ Object.create(null);
+  for (const key of str.split(",")) map[key] = 1;
+  return (val) => val in map;
+}
+
+const EMPTY_OBJ = Object.freeze({}) ;
+const EMPTY_ARR = Object.freeze([]) ;
+const NOOP = () => {
+};
+const NO = () => false;
+const isOn = (key) => key.charCodeAt(0) === 111 && key.charCodeAt(1) === 110 && // uppercase letter
+(key.charCodeAt(2) > 122 || key.charCodeAt(2) < 97);
+const isModelListener = (key) => key.startsWith("onUpdate:");
+const extend = Object.assign;
+const remove = (arr, el) => {
+  const i = arr.indexOf(el);
+  if (i > -1) {
+    arr.splice(i, 1);
+  }
+};
+const hasOwnProperty$1 = Object.prototype.hasOwnProperty;
+const hasOwn = (val, key) => hasOwnProperty$1.call(val, key);
+const isArray = Array.isArray;
+const isMap = (val) => toTypeString(val) === "[object Map]";
+const isSet = (val) => toTypeString(val) === "[object Set]";
+const isDate = (val) => toTypeString(val) === "[object Date]";
+const isRegExp = (val) => toTypeString(val) === "[object RegExp]";
+const isFunction = (val) => typeof val === "function";
+const isString = (val) => typeof val === "string";
+const isSymbol = (val) => typeof val === "symbol";
+const isObject = (val) => val !== null && typeof val === "object";
+const isPromise = (val) => {
+  return (isObject(val) || isFunction(val)) && isFunction(val.then) && isFunction(val.catch);
+};
+const objectToString = Object.prototype.toString;
+const toTypeString = (value) => objectToString.call(value);
+const toRawType = (value) => {
+  return toTypeString(value).slice(8, -1);
+};
+const isPlainObject = (val) => toTypeString(val) === "[object Object]";
+const isIntegerKey = (key) => isString(key) && key !== "NaN" && key[0] !== "-" && "" + parseInt(key, 10) === key;
+const isReservedProp = /* @__PURE__ */ makeMap(
+  // the leading comma is intentional so empty string "" is also included
+  ",key,ref,ref_for,ref_key,onVnodeBeforeMount,onVnodeMounted,onVnodeBeforeUpdate,onVnodeUpdated,onVnodeBeforeUnmount,onVnodeUnmounted"
+);
+const isBuiltInDirective = /* @__PURE__ */ makeMap(
+  "bind,cloak,else-if,else,for,html,if,model,on,once,pre,show,slot,text,memo"
+);
+const cacheStringFunction = (fn) => {
+  const cache = /* @__PURE__ */ Object.create(null);
+  return (str) => {
+    const hit = cache[str];
+    return hit || (cache[str] = fn(str));
+  };
+};
+const camelizeRE = /-(\w)/g;
+const camelize = cacheStringFunction(
+  (str) => {
+    return str.replace(camelizeRE, (_, c) => c ? c.toUpperCase() : "");
+  }
+);
+const hyphenateRE = /\B([A-Z])/g;
+const hyphenate = cacheStringFunction(
+  (str) => str.replace(hyphenateRE, "-$1").toLowerCase()
+);
+const capitalize = cacheStringFunction((str) => {
+  return str.charAt(0).toUpperCase() + str.slice(1);
+});
+const toHandlerKey = cacheStringFunction(
+  (str) => {
+    const s = str ? `on${capitalize(str)}` : ``;
+    return s;
+  }
+);
+const hasChanged = (value, oldValue) => !Object.is(value, oldValue);
+const invokeArrayFns = (fns, ...arg) => {
+  for (let i = 0; i < fns.length; i++) {
+    fns[i](...arg);
+  }
+};
+const def = (obj, key, value, writable = false) => {
+  Object.defineProperty(obj, key, {
+    configurable: true,
+    enumerable: false,
+    writable,
+    value
+  });
+};
+const looseToNumber = (val) => {
+  const n = parseFloat(val);
+  return isNaN(n) ? val : n;
+};
+const toNumber = (val) => {
+  const n = isString(val) ? Number(val) : NaN;
+  return isNaN(n) ? val : n;
+};
+let _globalThis;
+const getGlobalThis = () => {
+  return _globalThis || (_globalThis = typeof globalThis !== "undefined" ? globalThis : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : {});
+};
+function genCacheKey(source, options) {
+  return source + JSON.stringify(
+    options,
+    (_, val) => typeof val === "function" ? val.toString() : val
+  );
+}
+
+const PatchFlagNames = {
+  [1]: `TEXT`,
+  [2]: `CLASS`,
+  [4]: `STYLE`,
+  [8]: `PROPS`,
+  [16]: `FULL_PROPS`,
+  [32]: `NEED_HYDRATION`,
+  [64]: `STABLE_FRAGMENT`,
+  [128]: `KEYED_FRAGMENT`,
+  [256]: `UNKEYED_FRAGMENT`,
+  [512]: `NEED_PATCH`,
+  [1024]: `DYNAMIC_SLOTS`,
+  [2048]: `DEV_ROOT_FRAGMENT`,
+  [-1]: `HOISTED`,
+  [-2]: `BAIL`
+};
+
+const slotFlagsText = {
+  [1]: "STABLE",
+  [2]: "DYNAMIC",
+  [3]: "FORWARDED"
+};
+
+const GLOBALS_ALLOWED = "Infinity,undefined,NaN,isFinite,isNaN,parseFloat,parseInt,decodeURI,decodeURIComponent,encodeURI,encodeURIComponent,Math,Number,Date,Array,Object,Boolean,String,RegExp,Map,Set,JSON,Intl,BigInt,console,Error,Symbol";
+const isGloballyAllowed = /* @__PURE__ */ makeMap(GLOBALS_ALLOWED);
+
+const range = 2;
+function generateCodeFrame(source, start = 0, end = source.length) {
+  start = Math.max(0, Math.min(start, source.length));
+  end = Math.max(0, Math.min(end, source.length));
+  if (start > end) return "";
+  let lines = source.split(/(\r?\n)/);
+  const newlineSequences = lines.filter((_, idx) => idx % 2 === 1);
+  lines = lines.filter((_, idx) => idx % 2 === 0);
+  let count = 0;
+  const res = [];
+  for (let i = 0; i < lines.length; i++) {
+    count += lines[i].length + (newlineSequences[i] && newlineSequences[i].length || 0);
+    if (count >= start) {
+      for (let j = i - range; j <= i + range || end > count; j++) {
+        if (j < 0 || j >= lines.length) continue;
+        const line = j + 1;
+        res.push(
+          `${line}${" ".repeat(Math.max(3 - String(line).length, 0))}|  ${lines[j]}`
+        );
+        const lineLength = lines[j].length;
+        const newLineSeqLength = newlineSequences[j] && newlineSequences[j].length || 0;
+        if (j === i) {
+          const pad = start - (count - (lineLength + newLineSeqLength));
+          const length = Math.max(
+            1,
+            end > count ? lineLength - pad : end - start
+          );
+          res.push(`   |  ` + " ".repeat(pad) + "^".repeat(length));
+        } else if (j > i) {
+          if (end > count) {
+            const length = Math.max(Math.min(end - count, lineLength), 1);
+            res.push(`   |  ` + "^".repeat(length));
+          }
+          count += lineLength + newLineSeqLength;
+        }
+      }
+      break;
+    }
+  }
+  return res.join("\n");
+}
+
+function normalizeStyle(value) {
+  if (isArray(value)) {
+    const res = {};
+    for (let i = 0; i < value.length; i++) {
+      const item = value[i];
+      const normalized = isString(item) ? parseStringStyle(item) : normalizeStyle(item);
+      if (normalized) {
+        for (const key in normalized) {
+          res[key] = normalized[key];
+        }
+      }
+    }
+    return res;
+  } else if (isString(value) || isObject(value)) {
+    return value;
+  }
+}
+const listDelimiterRE = /;(?![^(]*\))/g;
+const propertyDelimiterRE = /:([^]+)/;
+const styleCommentRE = /\/\*[^]*?\*\//g;
+function parseStringStyle(cssText) {
+  const ret = {};
+  cssText.replace(styleCommentRE, "").split(listDelimiterRE).forEach((item) => {
+    if (item) {
+      const tmp = item.split(propertyDelimiterRE);
+      tmp.length > 1 && (ret[tmp[0].trim()] = tmp[1].trim());
+    }
+  });
+  return ret;
+}
+function stringifyStyle(styles) {
+  let ret = "";
+  if (!styles || isString(styles)) {
+    return ret;
+  }
+  for (const key in styles) {
+    const value = styles[key];
+    if (isString(value) || typeof value === "number") {
+      const normalizedKey = key.startsWith(`--`) ? key : hyphenate(key);
+      ret += `${normalizedKey}:${value};`;
+    }
+  }
+  return ret;
+}
+function normalizeClass(value) {
+  let res = "";
+  if (isString(value)) {
+    res = value;
+  } else if (isArray(value)) {
+    for (let i = 0; i < value.length; i++) {
+      const normalized = normalizeClass(value[i]);
+      if (normalized) {
+        res += normalized + " ";
+      }
+    }
+  } else if (isObject(value)) {
+    for (const name in value) {
+      if (value[name]) {
+        res += name + " ";
+      }
+    }
+  }
+  return res.trim();
+}
+function normalizeProps(props) {
+  if (!props) return null;
+  let { class: klass, style } = props;
+  if (klass && !isString(klass)) {
+    props.class = normalizeClass(klass);
+  }
+  if (style) {
+    props.style = normalizeStyle(style);
+  }
+  return props;
+}
+
+const HTML_TAGS = "html,body,base,head,link,meta,style,title,address,article,aside,footer,header,hgroup,h1,h2,h3,h4,h5,h6,nav,section,div,dd,dl,dt,figcaption,figure,picture,hr,img,li,main,ol,p,pre,ul,a,b,abbr,bdi,bdo,br,cite,code,data,dfn,em,i,kbd,mark,q,rp,rt,ruby,s,samp,small,span,strong,sub,sup,time,u,var,wbr,area,audio,map,track,video,embed,object,param,source,canvas,script,noscript,del,ins,caption,col,colgroup,table,thead,tbody,td,th,tr,button,datalist,fieldset,form,input,label,legend,meter,optgroup,option,output,progress,select,textarea,details,dialog,menu,summary,template,blockquote,iframe,tfoot";
+const SVG_TAGS = "svg,animate,animateMotion,animateTransform,circle,clipPath,color-profile,defs,desc,discard,ellipse,feBlend,feColorMatrix,feComponentTransfer,feComposite,feConvolveMatrix,feDiffuseLighting,feDisplacementMap,feDistantLight,feDropShadow,feFlood,feFuncA,feFuncB,feFuncG,feFuncR,feGaussianBlur,feImage,feMerge,feMergeNode,feMorphology,feOffset,fePointLight,feSpecularLighting,feSpotLight,feTile,feTurbulence,filter,foreignObject,g,hatch,hatchpath,image,line,linearGradient,marker,mask,mesh,meshgradient,meshpatch,meshrow,metadata,mpath,path,pattern,polygon,polyline,radialGradient,rect,set,solidcolor,stop,switch,symbol,text,textPath,title,tspan,unknown,use,view";
+const MATH_TAGS = "annotation,annotation-xml,maction,maligngroup,malignmark,math,menclose,merror,mfenced,mfrac,mfraction,mglyph,mi,mlabeledtr,mlongdiv,mmultiscripts,mn,mo,mover,mpadded,mphantom,mprescripts,mroot,mrow,ms,mscarries,mscarry,msgroup,msline,mspace,msqrt,msrow,mstack,mstyle,msub,msubsup,msup,mtable,mtd,mtext,mtr,munder,munderover,none,semantics";
+const VOID_TAGS = "area,base,br,col,embed,hr,img,input,link,meta,param,source,track,wbr";
+const isHTMLTag = /* @__PURE__ */ makeMap(HTML_TAGS);
+const isSVGTag = /* @__PURE__ */ makeMap(SVG_TAGS);
+const isMathMLTag = /* @__PURE__ */ makeMap(MATH_TAGS);
+const isVoidTag = /* @__PURE__ */ makeMap(VOID_TAGS);
+
+const specialBooleanAttrs = `itemscope,allowfullscreen,formnovalidate,ismap,nomodule,novalidate,readonly`;
+const isSpecialBooleanAttr = /* @__PURE__ */ makeMap(specialBooleanAttrs);
+const isBooleanAttr = /* @__PURE__ */ makeMap(
+  specialBooleanAttrs + `,async,autofocus,autoplay,controls,default,defer,disabled,hidden,inert,loop,open,required,reversed,scoped,seamless,checked,muted,multiple,selected`
+);
+function includeBooleanAttr(value) {
+  return !!value || value === "";
+}
+const isKnownHtmlAttr = /* @__PURE__ */ makeMap(
+  `accept,accept-charset,accesskey,action,align,allow,alt,async,autocapitalize,autocomplete,autofocus,autoplay,background,bgcolor,border,buffered,capture,challenge,charset,checked,cite,class,code,codebase,color,cols,colspan,content,contenteditable,contextmenu,controls,coords,crossorigin,csp,data,datetime,decoding,default,defer,dir,dirname,disabled,download,draggable,dropzone,enctype,enterkeyhint,for,form,formaction,formenctype,formmethod,formnovalidate,formtarget,headers,height,hidden,high,href,hreflang,http-equiv,icon,id,importance,inert,integrity,ismap,itemprop,keytype,kind,label,lang,language,loading,list,loop,low,manifest,max,maxlength,minlength,media,min,multiple,muted,name,novalidate,open,optimum,pattern,ping,placeholder,poster,preload,radiogroup,readonly,referrerpolicy,rel,required,reversed,rows,rowspan,sandbox,scope,scoped,selected,shape,size,sizes,slot,span,spellcheck,src,srcdoc,srclang,srcset,start,step,style,summary,tabindex,target,title,translate,type,usemap,value,width,wrap`
+);
+const isKnownSvgAttr = /* @__PURE__ */ makeMap(
+  `xmlns,accent-height,accumulate,additive,alignment-baseline,alphabetic,amplitude,arabic-form,ascent,attributeName,attributeType,azimuth,baseFrequency,baseline-shift,baseProfile,bbox,begin,bias,by,calcMode,cap-height,class,clip,clipPathUnits,clip-path,clip-rule,color,color-interpolation,color-interpolation-filters,color-profile,color-rendering,contentScriptType,contentStyleType,crossorigin,cursor,cx,cy,d,decelerate,descent,diffuseConstant,direction,display,divisor,dominant-baseline,dur,dx,dy,edgeMode,elevation,enable-background,end,exponent,fill,fill-opacity,fill-rule,filter,filterRes,filterUnits,flood-color,flood-opacity,font-family,font-size,font-size-adjust,font-stretch,font-style,font-variant,font-weight,format,from,fr,fx,fy,g1,g2,glyph-name,glyph-orientation-horizontal,glyph-orientation-vertical,glyphRef,gradientTransform,gradientUnits,hanging,height,href,hreflang,horiz-adv-x,horiz-origin-x,id,ideographic,image-rendering,in,in2,intercept,k,k1,k2,k3,k4,kernelMatrix,kernelUnitLength,kerning,keyPoints,keySplines,keyTimes,lang,lengthAdjust,letter-spacing,lighting-color,limitingConeAngle,local,marker-end,marker-mid,marker-start,markerHeight,markerUnits,markerWidth,mask,maskContentUnits,maskUnits,mathematical,max,media,method,min,mode,name,numOctaves,offset,opacity,operator,order,orient,orientation,origin,overflow,overline-position,overline-thickness,panose-1,paint-order,path,pathLength,patternContentUnits,patternTransform,patternUnits,ping,pointer-events,points,pointsAtX,pointsAtY,pointsAtZ,preserveAlpha,preserveAspectRatio,primitiveUnits,r,radius,referrerPolicy,refX,refY,rel,rendering-intent,repeatCount,repeatDur,requiredExtensions,requiredFeatures,restart,result,rotate,rx,ry,scale,seed,shape-rendering,slope,spacing,specularConstant,specularExponent,speed,spreadMethod,startOffset,stdDeviation,stemh,stemv,stitchTiles,stop-color,stop-opacity,strikethrough-position,strikethrough-thickness,string,stroke,stroke-dasharray,stroke-dashoffset,stroke-linecap,stroke-linejoin,stroke-miterlimit,stroke-opacity,stroke-width,style,surfaceScale,systemLanguage,tabindex,tableValues,target,targetX,targetY,text-anchor,text-decoration,text-rendering,textLength,to,transform,transform-origin,type,u1,u2,underline-position,underline-thickness,unicode,unicode-bidi,unicode-range,units-per-em,v-alphabetic,v-hanging,v-ideographic,v-mathematical,values,vector-effect,version,vert-adv-y,vert-origin-x,vert-origin-y,viewBox,viewTarget,visibility,width,widths,word-spacing,writing-mode,x,x-height,x1,x2,xChannelSelector,xlink:actuate,xlink:arcrole,xlink:href,xlink:role,xlink:show,xlink:title,xlink:type,xmlns:xlink,xml:base,xml:lang,xml:space,y,y1,y2,yChannelSelector,z,zoomAndPan`
+);
+function isRenderableAttrValue(value) {
+  if (value == null) {
+    return false;
+  }
+  const type = typeof value;
+  return type === "string" || type === "number" || type === "boolean";
+}
+
+const cssVarNameEscapeSymbolsRE = /[ !"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g;
+function getEscapedCssVarName(key, doubleEscape) {
+  return key.replace(
+    cssVarNameEscapeSymbolsRE,
+    (s) => `\\${s}`
+  );
+}
+
+function looseCompareArrays(a, b) {
+  if (a.length !== b.length) return false;
+  let equal = true;
+  for (let i = 0; equal && i < a.length; i++) {
+    equal = looseEqual(a[i], b[i]);
+  }
+  return equal;
+}
+function looseEqual(a, b) {
+  if (a === b) return true;
+  let aValidType = isDate(a);
+  let bValidType = isDate(b);
+  if (aValidType || bValidType) {
+    return aValidType && bValidType ? a.getTime() === b.getTime() : false;
+  }
+  aValidType = isSymbol(a);
+  bValidType = isSymbol(b);
+  if (aValidType || bValidType) {
+    return a === b;
+  }
+  aValidType = isArray(a);
+  bValidType = isArray(b);
+  if (aValidType || bValidType) {
+    return aValidType && bValidType ? looseCompareArrays(a, b) : false;
+  }
+  aValidType = isObject(a);
+  bValidType = isObject(b);
+  if (aValidType || bValidType) {
+    if (!aValidType || !bValidType) {
+      return false;
+    }
+    const aKeysCount = Object.keys(a).length;
+    const bKeysCount = Object.keys(b).length;
+    if (aKeysCount !== bKeysCount) {
+      return false;
+    }
+    for (const key in a) {
+      const aHasKey = a.hasOwnProperty(key);
+      const bHasKey = b.hasOwnProperty(key);
+      if (aHasKey && !bHasKey || !aHasKey && bHasKey || !looseEqual(a[key], b[key])) {
+        return false;
+      }
+    }
+  }
+  return String(a) === String(b);
+}
+function looseIndexOf(arr, val) {
+  return arr.findIndex((item) => looseEqual(item, val));
+}
+
+const isRef$1 = (val) => {
+  return !!(val && val["__v_isRef"] === true);
+};
+const toDisplayString = (val) => {
+  return isString(val) ? val : val == null ? "" : isArray(val) || isObject(val) && (val.toString === objectToString || !isFunction(val.toString)) ? isRef$1(val) ? toDisplayString(val.value) : JSON.stringify(val, replacer, 2) : String(val);
+};
+const replacer = (_key, val) => {
+  if (isRef$1(val)) {
+    return replacer(_key, val.value);
+  } else if (isMap(val)) {
+    return {
+      [`Map(${val.size})`]: [...val.entries()].reduce(
+        (entries, [key, val2], i) => {
+          entries[stringifySymbol(key, i) + " =>"] = val2;
+          return entries;
+        },
+        {}
+      )
+    };
+  } else if (isSet(val)) {
+    return {
+      [`Set(${val.size})`]: [...val.values()].map((v) => stringifySymbol(v))
+    };
+  } else if (isSymbol(val)) {
+    return stringifySymbol(val);
+  } else if (isObject(val) && !isArray(val) && !isPlainObject(val)) {
+    return String(val);
+  }
+  return val;
+};
+const stringifySymbol = (v, i = "") => {
+  var _a;
+  return (
+    // Symbol.description in es2019+ so we need to cast here to pass
+    // the lib: es2016 check
+    isSymbol(v) ? `Symbol(${(_a = v.description) != null ? _a : i})` : v
+  );
+};
+
+function warn$2(msg, ...args) {
+  console.warn(`[Vue warn] ${msg}`, ...args);
+}
+
+let activeEffectScope;
+class EffectScope {
+  constructor(detached = false) {
+    this.detached = detached;
+    /**
+     * @internal
+     */
+    this._active = true;
+    /**
+     * @internal
+     */
+    this.effects = [];
+    /**
+     * @internal
+     */
+    this.cleanups = [];
+    this._isPaused = false;
+    this.parent = activeEffectScope;
+    if (!detached && activeEffectScope) {
+      this.index = (activeEffectScope.scopes || (activeEffectScope.scopes = [])).push(
+        this
+      ) - 1;
+    }
+  }
+  get active() {
+    return this._active;
+  }
+  pause() {
+    if (this._active) {
+      this._isPaused = true;
+      let i, l;
+      if (this.scopes) {
+        for (i = 0, l = this.scopes.length; i < l; i++) {
+          this.scopes[i].pause();
+        }
+      }
+      for (i = 0, l = this.effects.length; i < l; i++) {
+        this.effects[i].pause();
+      }
+    }
+  }
+  /**
+   * Resumes the effect scope, including all child scopes and effects.
+   */
+  resume() {
+    if (this._active) {
+      if (this._isPaused) {
+        this._isPaused = false;
+        let i, l;
+        if (this.scopes) {
+          for (i = 0, l = this.scopes.length; i < l; i++) {
+            this.scopes[i].resume();
+          }
+        }
+        for (i = 0, l = this.effects.length; i < l; i++) {
+          this.effects[i].resume();
+        }
+      }
+    }
+  }
+  run(fn) {
+    if (this._active) {
+      const currentEffectScope = activeEffectScope;
+      try {
+        activeEffectScope = this;
+        return fn();
+      } finally {
+        activeEffectScope = currentEffectScope;
+      }
+    } else {
+      warn$2(`cannot run an inactive effect scope.`);
+    }
+  }
+  /**
+   * This should only be called on non-detached scopes
+   * @internal
+   */
+  on() {
+    activeEffectScope = this;
+  }
+  /**
+   * This should only be called on non-detached scopes
+   * @internal
+   */
+  off() {
+    activeEffectScope = this.parent;
+  }
+  stop(fromParent) {
+    if (this._active) {
+      let i, l;
+      for (i = 0, l = this.effects.length; i < l; i++) {
+        this.effects[i].stop();
+      }
+      for (i = 0, l = this.cleanups.length; i < l; i++) {
+        this.cleanups[i]();
+      }
+      if (this.scopes) {
+        for (i = 0, l = this.scopes.length; i < l; i++) {
+          this.scopes[i].stop(true);
+        }
+      }
+      if (!this.detached && this.parent && !fromParent) {
+        const last = this.parent.scopes.pop();
+        if (last && last !== this) {
+          this.parent.scopes[this.index] = last;
+          last.index = this.index;
+        }
+      }
+      this.parent = void 0;
+      this._active = false;
+    }
+  }
+}
+function effectScope(detached) {
+  return new EffectScope(detached);
+}
+function getCurrentScope() {
+  return activeEffectScope;
+}
+function onScopeDispose(fn, failSilently = false) {
+  if (activeEffectScope) {
+    activeEffectScope.cleanups.push(fn);
+  } else if (!failSilently) {
+    warn$2(
+      `onScopeDispose() is called when there is no active effect scope to be associated with.`
+    );
+  }
+}
+
+let activeSub;
+const pausedQueueEffects = /* @__PURE__ */ new WeakSet();
+class ReactiveEffect {
+  constructor(fn) {
+    this.fn = fn;
+    /**
+     * @internal
+     */
+    this.deps = void 0;
+    /**
+     * @internal
+     */
+    this.depsTail = void 0;
+    /**
+     * @internal
+     */
+    this.flags = 1 | 4;
+    /**
+     * @internal
+     */
+    this.next = void 0;
+    /**
+     * @internal
+     */
+    this.cleanup = void 0;
+    this.scheduler = void 0;
+    if (activeEffectScope && activeEffectScope.active) {
+      activeEffectScope.effects.push(this);
+    }
+  }
+  pause() {
+    this.flags |= 64;
+  }
+  resume() {
+    if (this.flags & 64) {
+      this.flags &= ~64;
+      if (pausedQueueEffects.has(this)) {
+        pausedQueueEffects.delete(this);
+        this.trigger();
+      }
+    }
+  }
+  /**
+   * @internal
+   */
+  notify() {
+    if (this.flags & 2 && !(this.flags & 32)) {
+      return;
+    }
+    if (!(this.flags & 8)) {
+      batch(this);
+    }
+  }
+  run() {
+    if (!(this.flags & 1)) {
+      return this.fn();
+    }
+    this.flags |= 2;
+    cleanupEffect(this);
+    prepareDeps(this);
+    const prevEffect = activeSub;
+    const prevShouldTrack = shouldTrack;
+    activeSub = this;
+    shouldTrack = true;
+    try {
+      return this.fn();
+    } finally {
+      if (activeSub !== this) {
+        warn$2(
+          "Active effect was not restored correctly - this is likely a Vue internal bug."
+        );
+      }
+      cleanupDeps(this);
+      activeSub = prevEffect;
+      shouldTrack = prevShouldTrack;
+      this.flags &= ~2;
+    }
+  }
+  stop() {
+    if (this.flags & 1) {
+      for (let link = this.deps; link; link = link.nextDep) {
+        removeSub(link);
+      }
+      this.deps = this.depsTail = void 0;
+      cleanupEffect(this);
+      this.onStop && this.onStop();
+      this.flags &= ~1;
+    }
+  }
+  trigger() {
+    if (this.flags & 64) {
+      pausedQueueEffects.add(this);
+    } else if (this.scheduler) {
+      this.scheduler();
+    } else {
+      this.runIfDirty();
+    }
+  }
+  /**
+   * @internal
+   */
+  runIfDirty() {
+    if (isDirty(this)) {
+      this.run();
+    }
+  }
+  get dirty() {
+    return isDirty(this);
+  }
+}
+let batchDepth = 0;
+let batchedSub;
+let batchedComputed;
+function batch(sub, isComputed = false) {
+  sub.flags |= 8;
+  if (isComputed) {
+    sub.next = batchedComputed;
+    batchedComputed = sub;
+    return;
+  }
+  sub.next = batchedSub;
+  batchedSub = sub;
+}
+function startBatch() {
+  batchDepth++;
+}
+function endBatch() {
+  if (--batchDepth > 0) {
+    return;
+  }
+  if (batchedComputed) {
+    let e = batchedComputed;
+    batchedComputed = void 0;
+    while (e) {
+      const next = e.next;
+      e.next = void 0;
+      e.flags &= ~8;
+      e = next;
+    }
+  }
+  let error;
+  while (batchedSub) {
+    let e = batchedSub;
+    batchedSub = void 0;
+    while (e) {
+      const next = e.next;
+      e.next = void 0;
+      e.flags &= ~8;
+      if (e.flags & 1) {
+        try {
+          ;
+          e.trigger();
+        } catch (err) {
+          if (!error) error = err;
+        }
+      }
+      e = next;
+    }
+  }
+  if (error) throw error;
+}
+function prepareDeps(sub) {
+  for (let link = sub.deps; link; link = link.nextDep) {
+    link.version = -1;
+    link.prevActiveLink = link.dep.activeLink;
+    link.dep.activeLink = link;
+  }
+}
+function cleanupDeps(sub) {
+  let head;
+  let tail = sub.depsTail;
+  let link = tail;
+  while (link) {
+    const prev = link.prevDep;
+    if (link.version === -1) {
+      if (link === tail) tail = prev;
+      removeSub(link);
+      removeDep(link);
+    } else {
+      head = link;
+    }
+    link.dep.activeLink = link.prevActiveLink;
+    link.prevActiveLink = void 0;
+    link = prev;
+  }
+  sub.deps = head;
+  sub.depsTail = tail;
+}
+function isDirty(sub) {
+  for (let link = sub.deps; link; link = link.nextDep) {
+    if (link.dep.version !== link.version || link.dep.computed && (refreshComputed(link.dep.computed) || link.dep.version !== link.version)) {
+      return true;
+    }
+  }
+  if (sub._dirty) {
+    return true;
+  }
+  return false;
+}
+function refreshComputed(computed) {
+  if (computed.flags & 4 && !(computed.flags & 16)) {
+    return;
+  }
+  computed.flags &= ~16;
+  if (computed.globalVersion === globalVersion) {
+    return;
+  }
+  computed.globalVersion = globalVersion;
+  const dep = computed.dep;
+  computed.flags |= 2;
+  if (dep.version > 0 && !computed.isSSR && computed.deps && !isDirty(computed)) {
+    computed.flags &= ~2;
+    return;
+  }
+  const prevSub = activeSub;
+  const prevShouldTrack = shouldTrack;
+  activeSub = computed;
+  shouldTrack = true;
+  try {
+    prepareDeps(computed);
+    const value = computed.fn(computed._value);
+    if (dep.version === 0 || hasChanged(value, computed._value)) {
+      computed._value = value;
+      dep.version++;
+    }
+  } catch (err) {
+    dep.version++;
+    throw err;
+  } finally {
+    activeSub = prevSub;
+    shouldTrack = prevShouldTrack;
+    cleanupDeps(computed);
+    computed.flags &= ~2;
+  }
+}
+function removeSub(link, soft = false) {
+  const { dep, prevSub, nextSub } = link;
+  if (prevSub) {
+    prevSub.nextSub = nextSub;
+    link.prevSub = void 0;
+  }
+  if (nextSub) {
+    nextSub.prevSub = prevSub;
+    link.nextSub = void 0;
+  }
+  if (dep.subsHead === link) {
+    dep.subsHead = nextSub;
+  }
+  if (dep.subs === link) {
+    dep.subs = prevSub;
+    if (!prevSub && dep.computed) {
+      dep.computed.flags &= ~4;
+      for (let l = dep.computed.deps; l; l = l.nextDep) {
+        removeSub(l, true);
+      }
+    }
+  }
+  if (!soft && !--dep.sc && dep.map) {
+    dep.map.delete(dep.key);
+  }
+}
+function removeDep(link) {
+  const { prevDep, nextDep } = link;
+  if (prevDep) {
+    prevDep.nextDep = nextDep;
+    link.prevDep = void 0;
+  }
+  if (nextDep) {
+    nextDep.prevDep = prevDep;
+    link.nextDep = void 0;
+  }
+}
+function effect(fn, options) {
+  if (fn.effect instanceof ReactiveEffect) {
+    fn = fn.effect.fn;
+  }
+  const e = new ReactiveEffect(fn);
+  if (options) {
+    extend(e, options);
+  }
+  try {
+    e.run();
+  } catch (err) {
+    e.stop();
+    throw err;
+  }
+  const runner = e.run.bind(e);
+  runner.effect = e;
+  return runner;
+}
+function stop(runner) {
+  runner.effect.stop();
+}
+let shouldTrack = true;
+const trackStack = [];
+function pauseTracking() {
+  trackStack.push(shouldTrack);
+  shouldTrack = false;
+}
+function resetTracking() {
+  const last = trackStack.pop();
+  shouldTrack = last === void 0 ? true : last;
+}
+function cleanupEffect(e) {
+  const { cleanup } = e;
+  e.cleanup = void 0;
+  if (cleanup) {
+    const prevSub = activeSub;
+    activeSub = void 0;
+    try {
+      cleanup();
+    } finally {
+      activeSub = prevSub;
+    }
+  }
+}
+
+let globalVersion = 0;
+class Link {
+  constructor(sub, dep) {
+    this.sub = sub;
+    this.dep = dep;
+    this.version = dep.version;
+    this.nextDep = this.prevDep = this.nextSub = this.prevSub = this.prevActiveLink = void 0;
+  }
+}
+class Dep {
+  constructor(computed) {
+    this.computed = computed;
+    this.version = 0;
+    /**
+     * Link between this dep and the current active effect
+     */
+    this.activeLink = void 0;
+    /**
+     * Doubly linked list representing the subscribing effects (tail)
+     */
+    this.subs = void 0;
+    /**
+     * For object property deps cleanup
+     */
+    this.map = void 0;
+    this.key = void 0;
+    /**
+     * Subscriber counter
+     */
+    this.sc = 0;
+    {
+      this.subsHead = void 0;
+    }
+  }
+  track(debugInfo) {
+    if (!activeSub || !shouldTrack || activeSub === this.computed) {
+      return;
+    }
+    let link = this.activeLink;
+    if (link === void 0 || link.sub !== activeSub) {
+      link = this.activeLink = new Link(activeSub, this);
+      if (!activeSub.deps) {
+        activeSub.deps = activeSub.depsTail = link;
+      } else {
+        link.prevDep = activeSub.depsTail;
+        activeSub.depsTail.nextDep = link;
+        activeSub.depsTail = link;
+      }
+      addSub(link);
+    } else if (link.version === -1) {
+      link.version = this.version;
+      if (link.nextDep) {
+        const next = link.nextDep;
+        next.prevDep = link.prevDep;
+        if (link.prevDep) {
+          link.prevDep.nextDep = next;
+        }
+        link.prevDep = activeSub.depsTail;
+        link.nextDep = void 0;
+        activeSub.depsTail.nextDep = link;
+        activeSub.depsTail = link;
+        if (activeSub.deps === link) {
+          activeSub.deps = next;
+        }
+      }
+    }
+    if (activeSub.onTrack) {
+      activeSub.onTrack(
+        extend(
+          {
+            effect: activeSub
+          },
+          debugInfo
+        )
+      );
+    }
+    return link;
+  }
+  trigger(debugInfo) {
+    this.version++;
+    globalVersion++;
+    this.notify(debugInfo);
+  }
+  notify(debugInfo) {
+    startBatch();
+    try {
+      if (true) {
+        for (let head = this.subsHead; head; head = head.nextSub) {
+          if (head.sub.onTrigger && !(head.sub.flags & 8)) {
+            head.sub.onTrigger(
+              extend(
+                {
+                  effect: head.sub
+                },
+                debugInfo
+              )
+            );
+          }
+        }
+      }
+      for (let link = this.subs; link; link = link.prevSub) {
+        if (link.sub.notify()) {
+          ;
+          link.sub.dep.notify();
+        }
+      }
+    } finally {
+      endBatch();
+    }
+  }
+}
+function addSub(link) {
+  link.dep.sc++;
+  if (link.sub.flags & 4) {
+    const computed = link.dep.computed;
+    if (computed && !link.dep.subs) {
+      computed.flags |= 4 | 16;
+      for (let l = computed.deps; l; l = l.nextDep) {
+        addSub(l);
+      }
+    }
+    const currentTail = link.dep.subs;
+    if (currentTail !== link) {
+      link.prevSub = currentTail;
+      if (currentTail) currentTail.nextSub = link;
+    }
+    if (link.dep.subsHead === void 0) {
+      link.dep.subsHead = link;
+    }
+    link.dep.subs = link;
+  }
+}
+const targetMap = /* @__PURE__ */ new WeakMap();
+const ITERATE_KEY = Symbol(
+  "Object iterate" 
+);
+const MAP_KEY_ITERATE_KEY = Symbol(
+  "Map keys iterate" 
+);
+const ARRAY_ITERATE_KEY = Symbol(
+  "Array iterate" 
+);
+function track(target, type, key) {
+  if (shouldTrack && activeSub) {
+    let depsMap = targetMap.get(target);
+    if (!depsMap) {
+      targetMap.set(target, depsMap = /* @__PURE__ */ new Map());
+    }
+    let dep = depsMap.get(key);
+    if (!dep) {
+      depsMap.set(key, dep = new Dep());
+      dep.map = depsMap;
+      dep.key = key;
+    }
+    {
+      dep.track({
+        target,
+        type,
+        key
+      });
+    }
+  }
+}
+function trigger(target, type, key, newValue, oldValue, oldTarget) {
+  const depsMap = targetMap.get(target);
+  if (!depsMap) {
+    globalVersion++;
+    return;
+  }
+  const run = (dep) => {
+    if (dep) {
+      {
+        dep.trigger({
+          target,
+          type,
+          key,
+          newValue,
+          oldValue,
+          oldTarget
+        });
+      }
+    }
+  };
+  startBatch();
+  if (type === "clear") {
+    depsMap.forEach(run);
+  } else {
+    const targetIsArray = isArray(target);
+    const isArrayIndex = targetIsArray && isIntegerKey(key);
+    if (targetIsArray && key === "length") {
+      const newLength = Number(newValue);
+      depsMap.forEach((dep, key2) => {
+        if (key2 === "length" || key2 === ARRAY_ITERATE_KEY || !isSymbol(key2) && key2 >= newLength) {
+          run(dep);
+        }
+      });
+    } else {
+      if (key !== void 0 || depsMap.has(void 0)) {
+        run(depsMap.get(key));
+      }
+      if (isArrayIndex) {
+        run(depsMap.get(ARRAY_ITERATE_KEY));
+      }
+      switch (type) {
+        case "add":
+          if (!targetIsArray) {
+            run(depsMap.get(ITERATE_KEY));
+            if (isMap(target)) {
+              run(depsMap.get(MAP_KEY_ITERATE_KEY));
+            }
+          } else if (isArrayIndex) {
+            run(depsMap.get("length"));
+          }
+          break;
+        case "delete":
+          if (!targetIsArray) {
+            run(depsMap.get(ITERATE_KEY));
+            if (isMap(target)) {
+              run(depsMap.get(MAP_KEY_ITERATE_KEY));
+            }
+          }
+          break;
+        case "set":
+          if (isMap(target)) {
+            run(depsMap.get(ITERATE_KEY));
+          }
+          break;
+      }
+    }
+  }
+  endBatch();
+}
+function getDepFromReactive(object, key) {
+  const depMap = targetMap.get(object);
+  return depMap && depMap.get(key);
+}
+
+function reactiveReadArray(array) {
+  const raw = toRaw(array);
+  if (raw === array) return raw;
+  track(raw, "iterate", ARRAY_ITERATE_KEY);
+  return isShallow(array) ? raw : raw.map(toReactive);
+}
+function shallowReadArray(arr) {
+  track(arr = toRaw(arr), "iterate", ARRAY_ITERATE_KEY);
+  return arr;
+}
+const arrayInstrumentations = {
+  __proto__: null,
+  [Symbol.iterator]() {
+    return iterator(this, Symbol.iterator, toReactive);
+  },
+  concat(...args) {
+    return reactiveReadArray(this).concat(
+      ...args.map((x) => isArray(x) ? reactiveReadArray(x) : x)
+    );
+  },
+  entries() {
+    return iterator(this, "entries", (value) => {
+      value[1] = toReactive(value[1]);
+      return value;
+    });
+  },
+  every(fn, thisArg) {
+    return apply(this, "every", fn, thisArg, void 0, arguments);
+  },
+  filter(fn, thisArg) {
+    return apply(this, "filter", fn, thisArg, (v) => v.map(toReactive), arguments);
+  },
+  find(fn, thisArg) {
+    return apply(this, "find", fn, thisArg, toReactive, arguments);
+  },
+  findIndex(fn, thisArg) {
+    return apply(this, "findIndex", fn, thisArg, void 0, arguments);
+  },
+  findLast(fn, thisArg) {
+    return apply(this, "findLast", fn, thisArg, toReactive, arguments);
+  },
+  findLastIndex(fn, thisArg) {
+    return apply(this, "findLastIndex", fn, thisArg, void 0, arguments);
+  },
+  // flat, flatMap could benefit from ARRAY_ITERATE but are not straight-forward to implement
+  forEach(fn, thisArg) {
+    return apply(this, "forEach", fn, thisArg, void 0, arguments);
+  },
+  includes(...args) {
+    return searchProxy(this, "includes", args);
+  },
+  indexOf(...args) {
+    return searchProxy(this, "indexOf", args);
+  },
+  join(separator) {
+    return reactiveReadArray(this).join(separator);
+  },
+  // keys() iterator only reads `length`, no optimisation required
+  lastIndexOf(...args) {
+    return searchProxy(this, "lastIndexOf", args);
+  },
+  map(fn, thisArg) {
+    return apply(this, "map", fn, thisArg, void 0, arguments);
+  },
+  pop() {
+    return noTracking(this, "pop");
+  },
+  push(...args) {
+    return noTracking(this, "push", args);
+  },
+  reduce(fn, ...args) {
+    return reduce(this, "reduce", fn, args);
+  },
+  reduceRight(fn, ...args) {
+    return reduce(this, "reduceRight", fn, args);
+  },
+  shift() {
+    return noTracking(this, "shift");
+  },
+  // slice could use ARRAY_ITERATE but also seems to beg for range tracking
+  some(fn, thisArg) {
+    return apply(this, "some", fn, thisArg, void 0, arguments);
+  },
+  splice(...args) {
+    return noTracking(this, "splice", args);
+  },
+  toReversed() {
+    return reactiveReadArray(this).toReversed();
+  },
+  toSorted(comparer) {
+    return reactiveReadArray(this).toSorted(comparer);
+  },
+  toSpliced(...args) {
+    return reactiveReadArray(this).toSpliced(...args);
+  },
+  unshift(...args) {
+    return noTracking(this, "unshift", args);
+  },
+  values() {
+    return iterator(this, "values", toReactive);
+  }
+};
+function iterator(self, method, wrapValue) {
+  const arr = shallowReadArray(self);
+  const iter = arr[method]();
+  if (arr !== self && !isShallow(self)) {
+    iter._next = iter.next;
+    iter.next = () => {
+      const result = iter._next();
+      if (result.value) {
+        result.value = wrapValue(result.value);
+      }
+      return result;
+    };
+  }
+  return iter;
+}
+const arrayProto = Array.prototype;
+function apply(self, method, fn, thisArg, wrappedRetFn, args) {
+  const arr = shallowReadArray(self);
+  const needsWrap = arr !== self && !isShallow(self);
+  const methodFn = arr[method];
+  if (methodFn !== arrayProto[method]) {
+    const result2 = methodFn.apply(self, args);
+    return needsWrap ? toReactive(result2) : result2;
+  }
+  let wrappedFn = fn;
+  if (arr !== self) {
+    if (needsWrap) {
+      wrappedFn = function(item, index) {
+        return fn.call(this, toReactive(item), index, self);
+      };
+    } else if (fn.length > 2) {
+      wrappedFn = function(item, index) {
+        return fn.call(this, item, index, self);
+      };
+    }
+  }
+  const result = methodFn.call(arr, wrappedFn, thisArg);
+  return needsWrap && wrappedRetFn ? wrappedRetFn(result) : result;
+}
+function reduce(self, method, fn, args) {
+  const arr = shallowReadArray(self);
+  let wrappedFn = fn;
+  if (arr !== self) {
+    if (!isShallow(self)) {
+      wrappedFn = function(acc, item, index) {
+        return fn.call(this, acc, toReactive(item), index, self);
+      };
+    } else if (fn.length > 3) {
+      wrappedFn = function(acc, item, index) {
+        return fn.call(this, acc, item, index, self);
+      };
+    }
+  }
+  return arr[method](wrappedFn, ...args);
+}
+function searchProxy(self, method, args) {
+  const arr = toRaw(self);
+  track(arr, "iterate", ARRAY_ITERATE_KEY);
+  const res = arr[method](...args);
+  if ((res === -1 || res === false) && isProxy(args[0])) {
+    args[0] = toRaw(args[0]);
+    return arr[method](...args);
+  }
+  return res;
+}
+function noTracking(self, method, args = []) {
+  pauseTracking();
+  startBatch();
+  const res = toRaw(self)[method].apply(self, args);
+  endBatch();
+  resetTracking();
+  return res;
+}
+
+const isNonTrackableKeys = /* @__PURE__ */ makeMap(`__proto__,__v_isRef,__isVue`);
+const builtInSymbols = new Set(
+  /* @__PURE__ */ Object.getOwnPropertyNames(Symbol).filter((key) => key !== "arguments" && key !== "caller").map((key) => Symbol[key]).filter(isSymbol)
+);
+function hasOwnProperty(key) {
+  if (!isSymbol(key)) key = String(key);
+  const obj = toRaw(this);
+  track(obj, "has", key);
+  return obj.hasOwnProperty(key);
+}
+class BaseReactiveHandler {
+  constructor(_isReadonly = false, _isShallow = false) {
+    this._isReadonly = _isReadonly;
+    this._isShallow = _isShallow;
+  }
+  get(target, key, receiver) {
+    const isReadonly2 = this._isReadonly, isShallow2 = this._isShallow;
+    if (key === "__v_isReactive") {
+      return !isReadonly2;
+    } else if (key === "__v_isReadonly") {
+      return isReadonly2;
+    } else if (key === "__v_isShallow") {
+      return isShallow2;
+    } else if (key === "__v_raw") {
+      if (receiver === (isReadonly2 ? isShallow2 ? shallowReadonlyMap : readonlyMap : isShallow2 ? shallowReactiveMap : reactiveMap).get(target) || // receiver is not the reactive proxy, but has the same prototype
+      // this means the receiver is a user proxy of the reactive proxy
+      Object.getPrototypeOf(target) === Object.getPrototypeOf(receiver)) {
+        return target;
+      }
+      return;
+    }
+    const targetIsArray = isArray(target);
+    if (!isReadonly2) {
+      let fn;
+      if (targetIsArray && (fn = arrayInstrumentations[key])) {
+        return fn;
+      }
+      if (key === "hasOwnProperty") {
+        return hasOwnProperty;
+      }
+    }
+    const res = Reflect.get(
+      target,
+      key,
+      // if this is a proxy wrapping a ref, return methods using the raw ref
+      // as receiver so that we don't have to call `toRaw` on the ref in all
+      // its class methods
+      isRef(target) ? target : receiver
+    );
+    if (isSymbol(key) ? builtInSymbols.has(key) : isNonTrackableKeys(key)) {
+      return res;
+    }
+    if (!isReadonly2) {
+      track(target, "get", key);
+    }
+    if (isShallow2) {
+      return res;
+    }
+    if (isRef(res)) {
+      return targetIsArray && isIntegerKey(key) ? res : res.value;
+    }
+    if (isObject(res)) {
+      return isReadonly2 ? readonly(res) : reactive(res);
+    }
+    return res;
+  }
+}
+class MutableReactiveHandler extends BaseReactiveHandler {
+  constructor(isShallow2 = false) {
+    super(false, isShallow2);
+  }
+  set(target, key, value, receiver) {
+    let oldValue = target[key];
+    if (!this._isShallow) {
+      const isOldValueReadonly = isReadonly(oldValue);
+      if (!isShallow(value) && !isReadonly(value)) {
+        oldValue = toRaw(oldValue);
+        value = toRaw(value);
+      }
+      if (!isArray(target) && isRef(oldValue) && !isRef(value)) {
+        if (isOldValueReadonly) {
+          return false;
+        } else {
+          oldValue.value = value;
+          return true;
+        }
+      }
+    }
+    const hadKey = isArray(target) && isIntegerKey(key) ? Number(key) < target.length : hasOwn(target, key);
+    const result = Reflect.set(
+      target,
+      key,
+      value,
+      isRef(target) ? target : receiver
+    );
+    if (target === toRaw(receiver)) {
+      if (!hadKey) {
+        trigger(target, "add", key, value);
+      } else if (hasChanged(value, oldValue)) {
+        trigger(target, "set", key, value, oldValue);
+      }
+    }
+    return result;
+  }
+  deleteProperty(target, key) {
+    const hadKey = hasOwn(target, key);
+    const oldValue = target[key];
+    const result = Reflect.deleteProperty(target, key);
+    if (result && hadKey) {
+      trigger(target, "delete", key, void 0, oldValue);
+    }
+    return result;
+  }
+  has(target, key) {
+    const result = Reflect.has(target, key);
+    if (!isSymbol(key) || !builtInSymbols.has(key)) {
+      track(target, "has", key);
+    }
+    return result;
+  }
+  ownKeys(target) {
+    track(
+      target,
+      "iterate",
+      isArray(target) ? "length" : ITERATE_KEY
+    );
+    return Reflect.ownKeys(target);
+  }
+}
+class ReadonlyReactiveHandler extends BaseReactiveHandler {
+  constructor(isShallow2 = false) {
+    super(true, isShallow2);
+  }
+  set(target, key) {
+    {
+      warn$2(
+        `Set operation on key "${String(key)}" failed: target is readonly.`,
+        target
+      );
+    }
+    return true;
+  }
+  deleteProperty(target, key) {
+    {
+      warn$2(
+        `Delete operation on key "${String(key)}" failed: target is readonly.`,
+        target
+      );
+    }
+    return true;
+  }
+}
+const mutableHandlers = /* @__PURE__ */ new MutableReactiveHandler();
+const readonlyHandlers = /* @__PURE__ */ new ReadonlyReactiveHandler();
+const shallowReactiveHandlers = /* @__PURE__ */ new MutableReactiveHandler(true);
+const shallowReadonlyHandlers = /* @__PURE__ */ new ReadonlyReactiveHandler(true);
+
+const toShallow = (value) => value;
+const getProto = (v) => Reflect.getPrototypeOf(v);
+function createIterableMethod(method, isReadonly2, isShallow2) {
+  return function(...args) {
+    const target = this["__v_raw"];
+    const rawTarget = toRaw(target);
+    const targetIsMap = isMap(rawTarget);
+    const isPair = method === "entries" || method === Symbol.iterator && targetIsMap;
+    const isKeyOnly = method === "keys" && targetIsMap;
+    const innerIterator = target[method](...args);
+    const wrap = isShallow2 ? toShallow : isReadonly2 ? toReadonly : toReactive;
+    !isReadonly2 && track(
+      rawTarget,
+      "iterate",
+      isKeyOnly ? MAP_KEY_ITERATE_KEY : ITERATE_KEY
+    );
+    return {
+      // iterator protocol
+      next() {
+        const { value, done } = innerIterator.next();
+        return done ? { value, done } : {
+          value: isPair ? [wrap(value[0]), wrap(value[1])] : wrap(value),
+          done
+        };
+      },
+      // iterable protocol
+      [Symbol.iterator]() {
+        return this;
+      }
+    };
+  };
+}
+function createReadonlyMethod(type) {
+  return function(...args) {
+    {
+      const key = args[0] ? `on key "${args[0]}" ` : ``;
+      warn$2(
+        `${capitalize(type)} operation ${key}failed: target is readonly.`,
+        toRaw(this)
+      );
+    }
+    return type === "delete" ? false : type === "clear" ? void 0 : this;
+  };
+}
+function createInstrumentations(readonly, shallow) {
+  const instrumentations = {
+    get(key) {
+      const target = this["__v_raw"];
+      const rawTarget = toRaw(target);
+      const rawKey = toRaw(key);
+      if (!readonly) {
+        if (hasChanged(key, rawKey)) {
+          track(rawTarget, "get", key);
+        }
+        track(rawTarget, "get", rawKey);
+      }
+      const { has } = getProto(rawTarget);
+      const wrap = shallow ? toShallow : readonly ? toReadonly : toReactive;
+      if (has.call(rawTarget, key)) {
+        return wrap(target.get(key));
+      } else if (has.call(rawTarget, rawKey)) {
+        return wrap(target.get(rawKey));
+      } else if (target !== rawTarget) {
+        target.get(key);
+      }
+    },
+    get size() {
+      const target = this["__v_raw"];
+      !readonly && track(toRaw(target), "iterate", ITERATE_KEY);
+      return Reflect.get(target, "size", target);
+    },
+    has(key) {
+      const target = this["__v_raw"];
+      const rawTarget = toRaw(target);
+      const rawKey = toRaw(key);
+      if (!readonly) {
+        if (hasChanged(key, rawKey)) {
+          track(rawTarget, "has", key);
+        }
+        track(rawTarget, "has", rawKey);
+      }
+      return key === rawKey ? target.has(key) : target.has(key) || target.has(rawKey);
+    },
+    forEach(callback, thisArg) {
+      const observed = this;
+      const target = observed["__v_raw"];
+      const rawTarget = toRaw(target);
+      const wrap = shallow ? toShallow : readonly ? toReadonly : toReactive;
+      !readonly && track(rawTarget, "iterate", ITERATE_KEY);
+      return target.forEach((value, key) => {
+        return callback.call(thisArg, wrap(value), wrap(key), observed);
+      });
+    }
+  };
+  extend(
+    instrumentations,
+    readonly ? {
+      add: createReadonlyMethod("add"),
+      set: createReadonlyMethod("set"),
+      delete: createReadonlyMethod("delete"),
+      clear: createReadonlyMethod("clear")
+    } : {
+      add(value) {
+        if (!shallow && !isShallow(value) && !isReadonly(value)) {
+          value = toRaw(value);
+        }
+        const target = toRaw(this);
+        const proto = getProto(target);
+        const hadKey = proto.has.call(target, value);
+        if (!hadKey) {
+          target.add(value);
+          trigger(target, "add", value, value);
+        }
+        return this;
+      },
+      set(key, value) {
+        if (!shallow && !isShallow(value) && !isReadonly(value)) {
+          value = toRaw(value);
+        }
+        const target = toRaw(this);
+        const { has, get } = getProto(target);
+        let hadKey = has.call(target, key);
+        if (!hadKey) {
+          key = toRaw(key);
+          hadKey = has.call(target, key);
+        } else {
+          checkIdentityKeys(target, has, key);
+        }
+        const oldValue = get.call(target, key);
+        target.set(key, value);
+        if (!hadKey) {
+          trigger(target, "add", key, value);
+        } else if (hasChanged(value, oldValue)) {
+          trigger(target, "set", key, value, oldValue);
+        }
+        return this;
+      },
+      delete(key) {
+        const target = toRaw(this);
+        const { has, get } = getProto(target);
+        let hadKey = has.call(target, key);
+        if (!hadKey) {
+          key = toRaw(key);
+          hadKey = has.call(target, key);
+        } else {
+          checkIdentityKeys(target, has, key);
+        }
+        const oldValue = get ? get.call(target, key) : void 0;
+        const result = target.delete(key);
+        if (hadKey) {
+          trigger(target, "delete", key, void 0, oldValue);
+        }
+        return result;
+      },
+      clear() {
+        const target = toRaw(this);
+        const hadItems = target.size !== 0;
+        const oldTarget = isMap(target) ? new Map(target) : new Set(target) ;
+        const result = target.clear();
+        if (hadItems) {
+          trigger(
+            target,
+            "clear",
+            void 0,
+            void 0,
+            oldTarget
+          );
+        }
+        return result;
+      }
+    }
+  );
+  const iteratorMethods = [
+    "keys",
+    "values",
+    "entries",
+    Symbol.iterator
+  ];
+  iteratorMethods.forEach((method) => {
+    instrumentations[method] = createIterableMethod(method, readonly, shallow);
+  });
+  return instrumentations;
+}
+function createInstrumentationGetter(isReadonly2, shallow) {
+  const instrumentations = createInstrumentations(isReadonly2, shallow);
+  return (target, key, receiver) => {
+    if (key === "__v_isReactive") {
+      return !isReadonly2;
+    } else if (key === "__v_isReadonly") {
+      return isReadonly2;
+    } else if (key === "__v_raw") {
+      return target;
+    }
+    return Reflect.get(
+      hasOwn(instrumentations, key) && key in target ? instrumentations : target,
+      key,
+      receiver
+    );
+  };
+}
+const mutableCollectionHandlers = {
+  get: /* @__PURE__ */ createInstrumentationGetter(false, false)
+};
+const shallowCollectionHandlers = {
+  get: /* @__PURE__ */ createInstrumentationGetter(false, true)
+};
+const readonlyCollectionHandlers = {
+  get: /* @__PURE__ */ createInstrumentationGetter(true, false)
+};
+const shallowReadonlyCollectionHandlers = {
+  get: /* @__PURE__ */ createInstrumentationGetter(true, true)
+};
+function checkIdentityKeys(target, has, key) {
+  const rawKey = toRaw(key);
+  if (rawKey !== key && has.call(target, rawKey)) {
+    const type = toRawType(target);
+    warn$2(
+      `Reactive ${type} contains both the raw and reactive versions of the same object${type === `Map` ? ` as keys` : ``}, which can lead to inconsistencies. Avoid differentiating between the raw and reactive versions of an object and only use the reactive version if possible.`
+    );
+  }
+}
+
+const reactiveMap = /* @__PURE__ */ new WeakMap();
+const shallowReactiveMap = /* @__PURE__ */ new WeakMap();
+const readonlyMap = /* @__PURE__ */ new WeakMap();
+const shallowReadonlyMap = /* @__PURE__ */ new WeakMap();
+function targetTypeMap(rawType) {
+  switch (rawType) {
+    case "Object":
+    case "Array":
+      return 1 /* COMMON */;
+    case "Map":
+    case "Set":
+    case "WeakMap":
+    case "WeakSet":
+      return 2 /* COLLECTION */;
+    default:
+      return 0 /* INVALID */;
+  }
+}
+function getTargetType(value) {
+  return value["__v_skip"] || !Object.isExtensible(value) ? 0 /* INVALID */ : targetTypeMap(toRawType(value));
+}
+function reactive(target) {
+  if (isReadonly(target)) {
+    return target;
+  }
+  return createReactiveObject(
+    target,
+    false,
+    mutableHandlers,
+    mutableCollectionHandlers,
+    reactiveMap
+  );
+}
+function shallowReactive(target) {
+  return createReactiveObject(
+    target,
+    false,
+    shallowReactiveHandlers,
+    shallowCollectionHandlers,
+    shallowReactiveMap
+  );
+}
+function readonly(target) {
+  return createReactiveObject(
+    target,
+    true,
+    readonlyHandlers,
+    readonlyCollectionHandlers,
+    readonlyMap
+  );
+}
+function shallowReadonly(target) {
+  return createReactiveObject(
+    target,
+    true,
+    shallowReadonlyHandlers,
+    shallowReadonlyCollectionHandlers,
+    shallowReadonlyMap
+  );
+}
+function createReactiveObject(target, isReadonly2, baseHandlers, collectionHandlers, proxyMap) {
+  if (!isObject(target)) {
+    {
+      warn$2(
+        `value cannot be made ${isReadonly2 ? "readonly" : "reactive"}: ${String(
+          target
+        )}`
+      );
+    }
+    return target;
+  }
+  if (target["__v_raw"] && !(isReadonly2 && target["__v_isReactive"])) {
+    return target;
+  }
+  const existingProxy = proxyMap.get(target);
+  if (existingProxy) {
+    return existingProxy;
+  }
+  const targetType = getTargetType(target);
+  if (targetType === 0 /* INVALID */) {
+    return target;
+  }
+  const proxy = new Proxy(
+    target,
+    targetType === 2 /* COLLECTION */ ? collectionHandlers : baseHandlers
+  );
+  proxyMap.set(target, proxy);
+  return proxy;
+}
+function isReactive(value) {
+  if (isReadonly(value)) {
+    return isReactive(value["__v_raw"]);
+  }
+  return !!(value && value["__v_isReactive"]);
+}
+function isReadonly(value) {
+  return !!(value && value["__v_isReadonly"]);
+}
+function isShallow(value) {
+  return !!(value && value["__v_isShallow"]);
+}
+function isProxy(value) {
+  return value ? !!value["__v_raw"] : false;
+}
+function toRaw(observed) {
+  const raw = observed && observed["__v_raw"];
+  return raw ? toRaw(raw) : observed;
+}
+function markRaw(value) {
+  if (!hasOwn(value, "__v_skip") && Object.isExtensible(value)) {
+    def(value, "__v_skip", true);
+  }
+  return value;
+}
+const toReactive = (value) => isObject(value) ? reactive(value) : value;
+const toReadonly = (value) => isObject(value) ? readonly(value) : value;
+
+function isRef(r) {
+  return r ? r["__v_isRef"] === true : false;
+}
+function ref(value) {
+  return createRef(value, false);
+}
+function shallowRef(value) {
+  return createRef(value, true);
+}
+function createRef(rawValue, shallow) {
+  if (isRef(rawValue)) {
+    return rawValue;
+  }
+  return new RefImpl(rawValue, shallow);
+}
+class RefImpl {
+  constructor(value, isShallow2) {
+    this.dep = new Dep();
+    this["__v_isRef"] = true;
+    this["__v_isShallow"] = false;
+    this._rawValue = isShallow2 ? value : toRaw(value);
+    this._value = isShallow2 ? value : toReactive(value);
+    this["__v_isShallow"] = isShallow2;
+  }
+  get value() {
+    {
+      this.dep.track({
+        target: this,
+        type: "get",
+        key: "value"
+      });
+    }
+    return this._value;
+  }
+  set value(newValue) {
+    const oldValue = this._rawValue;
+    const useDirectValue = this["__v_isShallow"] || isShallow(newValue) || isReadonly(newValue);
+    newValue = useDirectValue ? newValue : toRaw(newValue);
+    if (hasChanged(newValue, oldValue)) {
+      this._rawValue = newValue;
+      this._value = useDirectValue ? newValue : toReactive(newValue);
+      {
+        this.dep.trigger({
+          target: this,
+          type: "set",
+          key: "value",
+          newValue,
+          oldValue
+        });
+      }
+    }
+  }
+}
+function triggerRef(ref2) {
+  if (ref2.dep) {
+    {
+      ref2.dep.trigger({
+        target: ref2,
+        type: "set",
+        key: "value",
+        newValue: ref2._value
+      });
+    }
+  }
+}
+function unref(ref2) {
+  return isRef(ref2) ? ref2.value : ref2;
+}
+function toValue(source) {
+  return isFunction(source) ? source() : unref(source);
+}
+const shallowUnwrapHandlers = {
+  get: (target, key, receiver) => key === "__v_raw" ? target : unref(Reflect.get(target, key, receiver)),
+  set: (target, key, value, receiver) => {
+    const oldValue = target[key];
+    if (isRef(oldValue) && !isRef(value)) {
+      oldValue.value = value;
+      return true;
+    } else {
+      return Reflect.set(target, key, value, receiver);
+    }
+  }
+};
+function proxyRefs(objectWithRefs) {
+  return isReactive(objectWithRefs) ? objectWithRefs : new Proxy(objectWithRefs, shallowUnwrapHandlers);
+}
+class CustomRefImpl {
+  constructor(factory) {
+    this["__v_isRef"] = true;
+    this._value = void 0;
+    const dep = this.dep = new Dep();
+    const { get, set } = factory(dep.track.bind(dep), dep.trigger.bind(dep));
+    this._get = get;
+    this._set = set;
+  }
+  get value() {
+    return this._value = this._get();
+  }
+  set value(newVal) {
+    this._set(newVal);
+  }
+}
+function customRef(factory) {
+  return new CustomRefImpl(factory);
+}
+function toRefs(object) {
+  if (!isProxy(object)) {
+    warn$2(`toRefs() expects a reactive object but received a plain one.`);
+  }
+  const ret = isArray(object) ? new Array(object.length) : {};
+  for (const key in object) {
+    ret[key] = propertyToRef(object, key);
+  }
+  return ret;
+}
+class ObjectRefImpl {
+  constructor(_object, _key, _defaultValue) {
+    this._object = _object;
+    this._key = _key;
+    this._defaultValue = _defaultValue;
+    this["__v_isRef"] = true;
+    this._value = void 0;
+  }
+  get value() {
+    const val = this._object[this._key];
+    return this._value = val === void 0 ? this._defaultValue : val;
+  }
+  set value(newVal) {
+    this._object[this._key] = newVal;
+  }
+  get dep() {
+    return getDepFromReactive(toRaw(this._object), this._key);
+  }
+}
+class GetterRefImpl {
+  constructor(_getter) {
+    this._getter = _getter;
+    this["__v_isRef"] = true;
+    this["__v_isReadonly"] = true;
+    this._value = void 0;
+  }
+  get value() {
+    return this._value = this._getter();
+  }
+}
+function toRef(source, key, defaultValue) {
+  if (isRef(source)) {
+    return source;
+  } else if (isFunction(source)) {
+    return new GetterRefImpl(source);
+  } else if (isObject(source) && arguments.length > 1) {
+    return propertyToRef(source, key, defaultValue);
+  } else {
+    return ref(source);
+  }
+}
+function propertyToRef(source, key, defaultValue) {
+  const val = source[key];
+  return isRef(val) ? val : new ObjectRefImpl(source, key, defaultValue);
+}
+
+class ComputedRefImpl {
+  constructor(fn, setter, isSSR) {
+    this.fn = fn;
+    this.setter = setter;
+    /**
+     * @internal
+     */
+    this._value = void 0;
+    /**
+     * @internal
+     */
+    this.dep = new Dep(this);
+    /**
+     * @internal
+     */
+    this.__v_isRef = true;
+    // TODO isolatedDeclarations "__v_isReadonly"
+    // A computed is also a subscriber that tracks other deps
+    /**
+     * @internal
+     */
+    this.deps = void 0;
+    /**
+     * @internal
+     */
+    this.depsTail = void 0;
+    /**
+     * @internal
+     */
+    this.flags = 16;
+    /**
+     * @internal
+     */
+    this.globalVersion = globalVersion - 1;
+    /**
+     * @internal
+     */
+    this.next = void 0;
+    // for backwards compat
+    this.effect = this;
+    this["__v_isReadonly"] = !setter;
+    this.isSSR = isSSR;
+  }
+  /**
+   * @internal
+   */
+  notify() {
+    this.flags |= 16;
+    if (!(this.flags & 8) && // avoid infinite self recursion
+    activeSub !== this) {
+      batch(this, true);
+      return true;
+    }
+  }
+  get value() {
+    const link = this.dep.track({
+      target: this,
+      type: "get",
+      key: "value"
+    }) ;
+    refreshComputed(this);
+    if (link) {
+      link.version = this.dep.version;
+    }
+    return this._value;
+  }
+  set value(newValue) {
+    if (this.setter) {
+      this.setter(newValue);
+    } else {
+      warn$2("Write operation failed: computed value is readonly");
+    }
+  }
+}
+function computed$1(getterOrOptions, debugOptions, isSSR = false) {
+  let getter;
+  let setter;
+  if (isFunction(getterOrOptions)) {
+    getter = getterOrOptions;
+  } else {
+    getter = getterOrOptions.get;
+    setter = getterOrOptions.set;
+  }
+  const cRef = new ComputedRefImpl(getter, setter, isSSR);
+  if (debugOptions && !isSSR) {
+    cRef.onTrack = debugOptions.onTrack;
+    cRef.onTrigger = debugOptions.onTrigger;
+  }
+  return cRef;
+}
+
+const TrackOpTypes = {
+  "GET": "get",
+  "HAS": "has",
+  "ITERATE": "iterate"
+};
+const TriggerOpTypes = {
+  "SET": "set",
+  "ADD": "add",
+  "DELETE": "delete",
+  "CLEAR": "clear"
+};
+
+const INITIAL_WATCHER_VALUE = {};
+const cleanupMap = /* @__PURE__ */ new WeakMap();
+let activeWatcher = void 0;
+function getCurrentWatcher() {
+  return activeWatcher;
+}
+function onWatcherCleanup(cleanupFn, failSilently = false, owner = activeWatcher) {
+  if (owner) {
+    let cleanups = cleanupMap.get(owner);
+    if (!cleanups) cleanupMap.set(owner, cleanups = []);
+    cleanups.push(cleanupFn);
+  } else if (!failSilently) {
+    warn$2(
+      `onWatcherCleanup() was called when there was no active watcher to associate with.`
+    );
+  }
+}
+function watch$1(source, cb, options = EMPTY_OBJ) {
+  const { immediate, deep, once, scheduler, augmentJob, call } = options;
+  const warnInvalidSource = (s) => {
+    (options.onWarn || warn$2)(
+      `Invalid watch source: `,
+      s,
+      `A watch source can only be a getter/effect function, a ref, a reactive object, or an array of these types.`
+    );
+  };
+  const reactiveGetter = (source2) => {
+    if (deep) return source2;
+    if (isShallow(source2) || deep === false || deep === 0)
+      return traverse(source2, 1);
+    return traverse(source2);
+  };
+  let effect;
+  let getter;
+  let cleanup;
+  let boundCleanup;
+  let forceTrigger = false;
+  let isMultiSource = false;
+  if (isRef(source)) {
+    getter = () => source.value;
+    forceTrigger = isShallow(source);
+  } else if (isReactive(source)) {
+    getter = () => reactiveGetter(source);
+    forceTrigger = true;
+  } else if (isArray(source)) {
+    isMultiSource = true;
+    forceTrigger = source.some((s) => isReactive(s) || isShallow(s));
+    getter = () => source.map((s) => {
+      if (isRef(s)) {
+        return s.value;
+      } else if (isReactive(s)) {
+        return reactiveGetter(s);
+      } else if (isFunction(s)) {
+        return call ? call(s, 2) : s();
+      } else {
+        warnInvalidSource(s);
+      }
+    });
+  } else if (isFunction(source)) {
+    if (cb) {
+      getter = call ? () => call(source, 2) : source;
+    } else {
+      getter = () => {
+        if (cleanup) {
+          pauseTracking();
+          try {
+            cleanup();
+          } finally {
+            resetTracking();
+          }
+        }
+        const currentEffect = activeWatcher;
+        activeWatcher = effect;
+        try {
+          return call ? call(source, 3, [boundCleanup]) : source(boundCleanup);
+        } finally {
+          activeWatcher = currentEffect;
+        }
+      };
+    }
+  } else {
+    getter = NOOP;
+    warnInvalidSource(source);
+  }
+  if (cb && deep) {
+    const baseGetter = getter;
+    const depth = deep === true ? Infinity : deep;
+    getter = () => traverse(baseGetter(), depth);
+  }
+  const scope = getCurrentScope();
+  const watchHandle = () => {
+    effect.stop();
+    if (scope) {
+      remove(scope.effects, effect);
+    }
+  };
+  if (once && cb) {
+    const _cb = cb;
+    cb = (...args) => {
+      _cb(...args);
+      watchHandle();
+    };
+  }
+  let oldValue = isMultiSource ? new Array(source.length).fill(INITIAL_WATCHER_VALUE) : INITIAL_WATCHER_VALUE;
+  const job = (immediateFirstRun) => {
+    if (!(effect.flags & 1) || !effect.dirty && !immediateFirstRun) {
+      return;
+    }
+    if (cb) {
+      const newValue = effect.run();
+      if (deep || forceTrigger || (isMultiSource ? newValue.some((v, i) => hasChanged(v, oldValue[i])) : hasChanged(newValue, oldValue))) {
+        if (cleanup) {
+          cleanup();
+        }
+        const currentWatcher = activeWatcher;
+        activeWatcher = effect;
+        try {
+          const args = [
+            newValue,
+            // pass undefined as the old value when it's changed for the first time
+            oldValue === INITIAL_WATCHER_VALUE ? void 0 : isMultiSource && oldValue[0] === INITIAL_WATCHER_VALUE ? [] : oldValue,
+            boundCleanup
+          ];
+          call ? call(cb, 3, args) : (
+            // @ts-expect-error
+            cb(...args)
+          );
+          oldValue = newValue;
+        } finally {
+          activeWatcher = currentWatcher;
+        }
+      }
+    } else {
+      effect.run();
+    }
+  };
+  if (augmentJob) {
+    augmentJob(job);
+  }
+  effect = new ReactiveEffect(getter);
+  effect.scheduler = scheduler ? () => scheduler(job, false) : job;
+  boundCleanup = (fn) => onWatcherCleanup(fn, false, effect);
+  cleanup = effect.onStop = () => {
+    const cleanups = cleanupMap.get(effect);
+    if (cleanups) {
+      if (call) {
+        call(cleanups, 4);
+      } else {
+        for (const cleanup2 of cleanups) cleanup2();
+      }
+      cleanupMap.delete(effect);
+    }
+  };
+  {
+    effect.onTrack = options.onTrack;
+    effect.onTrigger = options.onTrigger;
+  }
+  if (cb) {
+    if (immediate) {
+      job(true);
+    } else {
+      oldValue = effect.run();
+    }
+  } else if (scheduler) {
+    scheduler(job.bind(null, true), true);
+  } else {
+    effect.run();
+  }
+  watchHandle.pause = effect.pause.bind(effect);
+  watchHandle.resume = effect.resume.bind(effect);
+  watchHandle.stop = watchHandle;
+  return watchHandle;
+}
+function traverse(value, depth = Infinity, seen) {
+  if (depth <= 0 || !isObject(value) || value["__v_skip"]) {
+    return value;
+  }
+  seen = seen || /* @__PURE__ */ new Set();
+  if (seen.has(value)) {
+    return value;
+  }
+  seen.add(value);
+  depth--;
+  if (isRef(value)) {
+    traverse(value.value, depth, seen);
+  } else if (isArray(value)) {
+    for (let i = 0; i < value.length; i++) {
+      traverse(value[i], depth, seen);
+    }
+  } else if (isSet(value) || isMap(value)) {
+    value.forEach((v) => {
+      traverse(v, depth, seen);
+    });
+  } else if (isPlainObject(value)) {
+    for (const key in value) {
+      traverse(value[key], depth, seen);
+    }
+    for (const key of Object.getOwnPropertySymbols(value)) {
+      if (Object.prototype.propertyIsEnumerable.call(value, key)) {
+        traverse(value[key], depth, seen);
+      }
+    }
+  }
+  return value;
+}
+
+const stack$1 = [];
+function pushWarningContext(vnode) {
+  stack$1.push(vnode);
+}
+function popWarningContext() {
+  stack$1.pop();
+}
+let isWarning = false;
+function warn$1(msg, ...args) {
+  if (isWarning) return;
+  isWarning = true;
+  pauseTracking();
+  const instance = stack$1.length ? stack$1[stack$1.length - 1].component : null;
+  const appWarnHandler = instance && instance.appContext.config.warnHandler;
+  const trace = getComponentTrace();
+  if (appWarnHandler) {
+    callWithErrorHandling(
+      appWarnHandler,
+      instance,
+      11,
+      [
+        // eslint-disable-next-line no-restricted-syntax
+        msg + args.map((a) => {
+          var _a, _b;
+          return (_b = (_a = a.toString) == null ? void 0 : _a.call(a)) != null ? _b : JSON.stringify(a);
+        }).join(""),
+        instance && instance.proxy,
+        trace.map(
+          ({ vnode }) => `at <${formatComponentName(instance, vnode.type)}>`
+        ).join("\n"),
+        trace
+      ]
+    );
+  } else {
+    const warnArgs = [`[Vue warn]: ${msg}`, ...args];
+    if (trace.length && // avoid spamming console during tests
+    true) {
+      warnArgs.push(`
+`, ...formatTrace(trace));
+    }
+    console.warn(...warnArgs);
+  }
+  resetTracking();
+  isWarning = false;
+}
+function getComponentTrace() {
+  let currentVNode = stack$1[stack$1.length - 1];
+  if (!currentVNode) {
+    return [];
+  }
+  const normalizedStack = [];
+  while (currentVNode) {
+    const last = normalizedStack[0];
+    if (last && last.vnode === currentVNode) {
+      last.recurseCount++;
+    } else {
+      normalizedStack.push({
+        vnode: currentVNode,
+        recurseCount: 0
+      });
+    }
+    const parentInstance = currentVNode.component && currentVNode.component.parent;
+    currentVNode = parentInstance && parentInstance.vnode;
+  }
+  return normalizedStack;
+}
+function formatTrace(trace) {
+  const logs = [];
+  trace.forEach((entry, i) => {
+    logs.push(...i === 0 ? [] : [`
+`], ...formatTraceEntry(entry));
+  });
+  return logs;
+}
+function formatTraceEntry({ vnode, recurseCount }) {
+  const postfix = recurseCount > 0 ? `... (${recurseCount} recursive calls)` : ``;
+  const isRoot = vnode.component ? vnode.component.parent == null : false;
+  const open = ` at <${formatComponentName(
+    vnode.component,
+    vnode.type,
+    isRoot
+  )}`;
+  const close = `>` + postfix;
+  return vnode.props ? [open, ...formatProps(vnode.props), close] : [open + close];
+}
+function formatProps(props) {
+  const res = [];
+  const keys = Object.keys(props);
+  keys.slice(0, 3).forEach((key) => {
+    res.push(...formatProp(key, props[key]));
+  });
+  if (keys.length > 3) {
+    res.push(` ...`);
+  }
+  return res;
+}
+function formatProp(key, value, raw) {
+  if (isString(value)) {
+    value = JSON.stringify(value);
+    return raw ? value : [`${key}=${value}`];
+  } else if (typeof value === "number" || typeof value === "boolean" || value == null) {
+    return raw ? value : [`${key}=${value}`];
+  } else if (isRef(value)) {
+    value = formatProp(key, toRaw(value.value), true);
+    return raw ? value : [`${key}=Ref<`, value, `>`];
+  } else if (isFunction(value)) {
+    return [`${key}=fn${value.name ? `<${value.name}>` : ``}`];
+  } else {
+    value = toRaw(value);
+    return raw ? value : [`${key}=`, value];
+  }
+}
+function assertNumber(val, type) {
+  if (val === void 0) {
+    return;
+  } else if (typeof val !== "number") {
+    warn$1(`${type} is not a valid number - got ${JSON.stringify(val)}.`);
+  } else if (isNaN(val)) {
+    warn$1(`${type} is NaN - the duration expression might be incorrect.`);
+  }
+}
+
+const ErrorCodes = {
+  "SETUP_FUNCTION": 0,
+  "0": "SETUP_FUNCTION",
+  "RENDER_FUNCTION": 1,
+  "1": "RENDER_FUNCTION",
+  "NATIVE_EVENT_HANDLER": 5,
+  "5": "NATIVE_EVENT_HANDLER",
+  "COMPONENT_EVENT_HANDLER": 6,
+  "6": "COMPONENT_EVENT_HANDLER",
+  "VNODE_HOOK": 7,
+  "7": "VNODE_HOOK",
+  "DIRECTIVE_HOOK": 8,
+  "8": "DIRECTIVE_HOOK",
+  "TRANSITION_HOOK": 9,
+  "9": "TRANSITION_HOOK",
+  "APP_ERROR_HANDLER": 10,
+  "10": "APP_ERROR_HANDLER",
+  "APP_WARN_HANDLER": 11,
+  "11": "APP_WARN_HANDLER",
+  "FUNCTION_REF": 12,
+  "12": "FUNCTION_REF",
+  "ASYNC_COMPONENT_LOADER": 13,
+  "13": "ASYNC_COMPONENT_LOADER",
+  "SCHEDULER": 14,
+  "14": "SCHEDULER",
+  "COMPONENT_UPDATE": 15,
+  "15": "COMPONENT_UPDATE",
+  "APP_UNMOUNT_CLEANUP": 16,
+  "16": "APP_UNMOUNT_CLEANUP"
+};
+const ErrorTypeStrings$1 = {
+  ["sp"]: "serverPrefetch hook",
+  ["bc"]: "beforeCreate hook",
+  ["c"]: "created hook",
+  ["bm"]: "beforeMount hook",
+  ["m"]: "mounted hook",
+  ["bu"]: "beforeUpdate hook",
+  ["u"]: "updated",
+  ["bum"]: "beforeUnmount hook",
+  ["um"]: "unmounted hook",
+  ["a"]: "activated hook",
+  ["da"]: "deactivated hook",
+  ["ec"]: "errorCaptured hook",
+  ["rtc"]: "renderTracked hook",
+  ["rtg"]: "renderTriggered hook",
+  [0]: "setup function",
+  [1]: "render function",
+  [2]: "watcher getter",
+  [3]: "watcher callback",
+  [4]: "watcher cleanup function",
+  [5]: "native event handler",
+  [6]: "component event handler",
+  [7]: "vnode hook",
+  [8]: "directive hook",
+  [9]: "transition hook",
+  [10]: "app errorHandler",
+  [11]: "app warnHandler",
+  [12]: "ref function",
+  [13]: "async component loader",
+  [14]: "scheduler flush",
+  [15]: "component update",
+  [16]: "app unmount cleanup function"
+};
+function callWithErrorHandling(fn, instance, type, args) {
+  try {
+    return args ? fn(...args) : fn();
+  } catch (err) {
+    handleError(err, instance, type);
+  }
+}
+function callWithAsyncErrorHandling(fn, instance, type, args) {
+  if (isFunction(fn)) {
+    const res = callWithErrorHandling(fn, instance, type, args);
+    if (res && isPromise(res)) {
+      res.catch((err) => {
+        handleError(err, instance, type);
+      });
+    }
+    return res;
+  }
+  if (isArray(fn)) {
+    const values = [];
+    for (let i = 0; i < fn.length; i++) {
+      values.push(callWithAsyncErrorHandling(fn[i], instance, type, args));
+    }
+    return values;
+  } else {
+    warn$1(
+      `Invalid value type passed to callWithAsyncErrorHandling(): ${typeof fn}`
+    );
+  }
+}
+function handleError(err, instance, type, throwInDev = true) {
+  const contextVNode = instance ? instance.vnode : null;
+  const { errorHandler, throwUnhandledErrorInProduction } = instance && instance.appContext.config || EMPTY_OBJ;
+  if (instance) {
+    let cur = instance.parent;
+    const exposedInstance = instance.proxy;
+    const errorInfo = ErrorTypeStrings$1[type] ;
+    while (cur) {
+      const errorCapturedHooks = cur.ec;
+      if (errorCapturedHooks) {
+        for (let i = 0; i < errorCapturedHooks.length; i++) {
+          if (errorCapturedHooks[i](err, exposedInstance, errorInfo) === false) {
+            return;
+          }
+        }
+      }
+      cur = cur.parent;
+    }
+    if (errorHandler) {
+      pauseTracking();
+      callWithErrorHandling(errorHandler, null, 10, [
+        err,
+        exposedInstance,
+        errorInfo
+      ]);
+      resetTracking();
+      return;
+    }
+  }
+  logError(err, type, contextVNode, throwInDev, throwUnhandledErrorInProduction);
+}
+function logError(err, type, contextVNode, throwInDev = true, throwInProd = false) {
+  {
+    const info = ErrorTypeStrings$1[type];
+    if (contextVNode) {
+      pushWarningContext(contextVNode);
+    }
+    warn$1(`Unhandled error${info ? ` during execution of ${info}` : ``}`);
+    if (contextVNode) {
+      popWarningContext();
+    }
+    if (throwInDev) {
+      throw err;
+    } else {
+      console.error(err);
+    }
+  }
+}
+
+const queue = [];
+let flushIndex = -1;
+const pendingPostFlushCbs = [];
+let activePostFlushCbs = null;
+let postFlushIndex = 0;
+const resolvedPromise = /* @__PURE__ */ Promise.resolve();
+let currentFlushPromise = null;
+const RECURSION_LIMIT = 100;
+function nextTick(fn) {
+  const p = currentFlushPromise || resolvedPromise;
+  return fn ? p.then(this ? fn.bind(this) : fn) : p;
+}
+function findInsertionIndex(id) {
+  let start = flushIndex + 1;
+  let end = queue.length;
+  while (start < end) {
+    const middle = start + end >>> 1;
+    const middleJob = queue[middle];
+    const middleJobId = getId(middleJob);
+    if (middleJobId < id || middleJobId === id && middleJob.flags & 2) {
+      start = middle + 1;
+    } else {
+      end = middle;
+    }
+  }
+  return start;
+}
+function queueJob(job) {
+  if (!(job.flags & 1)) {
+    const jobId = getId(job);
+    const lastJob = queue[queue.length - 1];
+    if (!lastJob || // fast path when the job id is larger than the tail
+    !(job.flags & 2) && jobId >= getId(lastJob)) {
+      queue.push(job);
+    } else {
+      queue.splice(findInsertionIndex(jobId), 0, job);
+    }
+    job.flags |= 1;
+    queueFlush();
+  }
+}
+function queueFlush() {
+  if (!currentFlushPromise) {
+    currentFlushPromise = resolvedPromise.then(flushJobs);
+  }
+}
+function queuePostFlushCb(cb) {
+  if (!isArray(cb)) {
+    if (activePostFlushCbs && cb.id === -1) {
+      activePostFlushCbs.splice(postFlushIndex + 1, 0, cb);
+    } else if (!(cb.flags & 1)) {
+      pendingPostFlushCbs.push(cb);
+      cb.flags |= 1;
+    }
+  } else {
+    pendingPostFlushCbs.push(...cb);
+  }
+  queueFlush();
+}
+function flushPreFlushCbs(instance, seen, i = flushIndex + 1) {
+  {
+    seen = seen || /* @__PURE__ */ new Map();
+  }
+  for (; i < queue.length; i++) {
+    const cb = queue[i];
+    if (cb && cb.flags & 2) {
+      if (instance && cb.id !== instance.uid) {
+        continue;
+      }
+      if (checkRecursiveUpdates(seen, cb)) {
+        continue;
+      }
+      queue.splice(i, 1);
+      i--;
+      if (cb.flags & 4) {
+        cb.flags &= ~1;
+      }
+      cb();
+      if (!(cb.flags & 4)) {
+        cb.flags &= ~1;
+      }
+    }
+  }
+}
+function flushPostFlushCbs(seen) {
+  if (pendingPostFlushCbs.length) {
+    const deduped = [...new Set(pendingPostFlushCbs)].sort(
+      (a, b) => getId(a) - getId(b)
+    );
+    pendingPostFlushCbs.length = 0;
+    if (activePostFlushCbs) {
+      activePostFlushCbs.push(...deduped);
+      return;
+    }
+    activePostFlushCbs = deduped;
+    {
+      seen = seen || /* @__PURE__ */ new Map();
+    }
+    for (postFlushIndex = 0; postFlushIndex < activePostFlushCbs.length; postFlushIndex++) {
+      const cb = activePostFlushCbs[postFlushIndex];
+      if (checkRecursiveUpdates(seen, cb)) {
+        continue;
+      }
+      if (cb.flags & 4) {
+        cb.flags &= ~1;
+      }
+      if (!(cb.flags & 8)) cb();
+      cb.flags &= ~1;
+    }
+    activePostFlushCbs = null;
+    postFlushIndex = 0;
+  }
+}
+const getId = (job) => job.id == null ? job.flags & 2 ? -1 : Infinity : job.id;
+function flushJobs(seen) {
+  {
+    seen = seen || /* @__PURE__ */ new Map();
+  }
+  const check = (job) => checkRecursiveUpdates(seen, job) ;
+  try {
+    for (flushIndex = 0; flushIndex < queue.length; flushIndex++) {
+      const job = queue[flushIndex];
+      if (job && !(job.flags & 8)) {
+        if (check(job)) {
+          continue;
+        }
+        if (job.flags & 4) {
+          job.flags &= ~1;
+        }
+        callWithErrorHandling(
+          job,
+          job.i,
+          job.i ? 15 : 14
+        );
+        if (!(job.flags & 4)) {
+          job.flags &= ~1;
+        }
+      }
+    }
+  } finally {
+    for (; flushIndex < queue.length; flushIndex++) {
+      const job = queue[flushIndex];
+      if (job) {
+        job.flags &= ~1;
+      }
+    }
+    flushIndex = -1;
+    queue.length = 0;
+    flushPostFlushCbs(seen);
+    currentFlushPromise = null;
+    if (queue.length || pendingPostFlushCbs.length) {
+      flushJobs(seen);
+    }
+  }
+}
+function checkRecursiveUpdates(seen, fn) {
+  const count = seen.get(fn) || 0;
+  if (count > RECURSION_LIMIT) {
+    const instance = fn.i;
+    const componentName = instance && getComponentName(instance.type);
+    handleError(
+      `Maximum recursive updates exceeded${componentName ? ` in component <${componentName}>` : ``}. This means you have a reactive effect that is mutating its own dependencies and thus recursively triggering itself. Possible sources include component template, render function, updated hook or watcher source function.`,
+      null,
+      10
+    );
+    return true;
+  }
+  seen.set(fn, count + 1);
+  return false;
+}
+
+let isHmrUpdating = false;
+const hmrDirtyComponents = /* @__PURE__ */ new Map();
+{
+  getGlobalThis().__VUE_HMR_RUNTIME__ = {
+    createRecord: tryWrap(createRecord),
+    rerender: tryWrap(rerender),
+    reload: tryWrap(reload)
+  };
+}
+const map = /* @__PURE__ */ new Map();
+function registerHMR(instance) {
+  const id = instance.type.__hmrId;
+  let record = map.get(id);
+  if (!record) {
+    createRecord(id, instance.type);
+    record = map.get(id);
+  }
+  record.instances.add(instance);
+}
+function unregisterHMR(instance) {
+  map.get(instance.type.__hmrId).instances.delete(instance);
+}
+function createRecord(id, initialDef) {
+  if (map.has(id)) {
+    return false;
+  }
+  map.set(id, {
+    initialDef: normalizeClassComponent(initialDef),
+    instances: /* @__PURE__ */ new Set()
+  });
+  return true;
+}
+function normalizeClassComponent(component) {
+  return isClassComponent(component) ? component.__vccOpts : component;
+}
+function rerender(id, newRender) {
+  const record = map.get(id);
+  if (!record) {
+    return;
+  }
+  record.initialDef.render = newRender;
+  [...record.instances].forEach((instance) => {
+    if (newRender) {
+      instance.render = newRender;
+      normalizeClassComponent(instance.type).render = newRender;
+    }
+    instance.renderCache = [];
+    isHmrUpdating = true;
+    instance.update();
+    isHmrUpdating = false;
+  });
+}
+function reload(id, newComp) {
+  const record = map.get(id);
+  if (!record) return;
+  newComp = normalizeClassComponent(newComp);
+  updateComponentDef(record.initialDef, newComp);
+  const instances = [...record.instances];
+  for (let i = 0; i < instances.length; i++) {
+    const instance = instances[i];
+    const oldComp = normalizeClassComponent(instance.type);
+    let dirtyInstances = hmrDirtyComponents.get(oldComp);
+    if (!dirtyInstances) {
+      if (oldComp !== record.initialDef) {
+        updateComponentDef(oldComp, newComp);
+      }
+      hmrDirtyComponents.set(oldComp, dirtyInstances = /* @__PURE__ */ new Set());
+    }
+    dirtyInstances.add(instance);
+    instance.appContext.propsCache.delete(instance.type);
+    instance.appContext.emitsCache.delete(instance.type);
+    instance.appContext.optionsCache.delete(instance.type);
+    if (instance.ceReload) {
+      dirtyInstances.add(instance);
+      instance.ceReload(newComp.styles);
+      dirtyInstances.delete(instance);
+    } else if (instance.parent) {
+      queueJob(() => {
+        isHmrUpdating = true;
+        instance.parent.update();
+        isHmrUpdating = false;
+        dirtyInstances.delete(instance);
+      });
+    } else if (instance.appContext.reload) {
+      instance.appContext.reload();
+    } else if (typeof window !== "undefined") {
+      window.location.reload();
+    } else {
+      console.warn(
+        "[HMR] Root or manually mounted instance modified. Full reload required."
+      );
+    }
+    if (instance.root.ce && instance !== instance.root) {
+      instance.root.ce._removeChildStyle(oldComp);
+    }
+  }
+  queuePostFlushCb(() => {
+    hmrDirtyComponents.clear();
+  });
+}
+function updateComponentDef(oldComp, newComp) {
+  extend(oldComp, newComp);
+  for (const key in oldComp) {
+    if (key !== "__file" && !(key in newComp)) {
+      delete oldComp[key];
+    }
+  }
+}
+function tryWrap(fn) {
+  return (id, arg) => {
+    try {
+      return fn(id, arg);
+    } catch (e) {
+      console.error(e);
+      console.warn(
+        `[HMR] Something went wrong during Vue component hot-reload. Full reload required.`
+      );
+    }
+  };
+}
+
+let devtools$1;
+let buffer = [];
+let devtoolsNotInstalled = false;
+function emit$1(event, ...args) {
+  if (devtools$1) {
+    devtools$1.emit(event, ...args);
+  } else if (!devtoolsNotInstalled) {
+    buffer.push({ event, args });
+  }
+}
+function setDevtoolsHook$1(hook, target) {
+  var _a, _b;
+  devtools$1 = hook;
+  if (devtools$1) {
+    devtools$1.enabled = true;
+    buffer.forEach(({ event, args }) => devtools$1.emit(event, ...args));
+    buffer = [];
+  } else if (
+    // handle late devtools injection - only do this if we are in an actual
+    // browser environment to avoid the timer handle stalling test runner exit
+    // (#4815)
+    typeof window !== "undefined" && // some envs mock window but not fully
+    window.HTMLElement && // also exclude jsdom
+    // eslint-disable-next-line no-restricted-syntax
+    !((_b = (_a = window.navigator) == null ? void 0 : _a.userAgent) == null ? void 0 : _b.includes("jsdom"))
+  ) {
+    const replay = target.__VUE_DEVTOOLS_HOOK_REPLAY__ = target.__VUE_DEVTOOLS_HOOK_REPLAY__ || [];
+    replay.push((newHook) => {
+      setDevtoolsHook$1(newHook, target);
+    });
+    setTimeout(() => {
+      if (!devtools$1) {
+        target.__VUE_DEVTOOLS_HOOK_REPLAY__ = null;
+        devtoolsNotInstalled = true;
+        buffer = [];
+      }
+    }, 3e3);
+  } else {
+    devtoolsNotInstalled = true;
+    buffer = [];
+  }
+}
+function devtoolsInitApp(app, version) {
+  emit$1("app:init" /* APP_INIT */, app, version, {
+    Fragment,
+    Text,
+    Comment,
+    Static
+  });
+}
+function devtoolsUnmountApp(app) {
+  emit$1("app:unmount" /* APP_UNMOUNT */, app);
+}
+const devtoolsComponentAdded = /* @__PURE__ */ createDevtoolsComponentHook("component:added" /* COMPONENT_ADDED */);
+const devtoolsComponentUpdated = /* @__PURE__ */ createDevtoolsComponentHook("component:updated" /* COMPONENT_UPDATED */);
+const _devtoolsComponentRemoved = /* @__PURE__ */ createDevtoolsComponentHook(
+  "component:removed" /* COMPONENT_REMOVED */
+);
+const devtoolsComponentRemoved = (component) => {
+  if (devtools$1 && typeof devtools$1.cleanupBuffer === "function" && // remove the component if it wasn't buffered
+  !devtools$1.cleanupBuffer(component)) {
+    _devtoolsComponentRemoved(component);
+  }
+};
+/*! #__NO_SIDE_EFFECTS__ */
+// @__NO_SIDE_EFFECTS__
+function createDevtoolsComponentHook(hook) {
+  return (component) => {
+    emit$1(
+      hook,
+      component.appContext.app,
+      component.uid,
+      component.parent ? component.parent.uid : void 0,
+      component
+    );
+  };
+}
+const devtoolsPerfStart = /* @__PURE__ */ createDevtoolsPerformanceHook("perf:start" /* PERFORMANCE_START */);
+const devtoolsPerfEnd = /* @__PURE__ */ createDevtoolsPerformanceHook("perf:end" /* PERFORMANCE_END */);
+function createDevtoolsPerformanceHook(hook) {
+  return (component, type, time) => {
+    emit$1(hook, component.appContext.app, component.uid, component, type, time);
+  };
+}
+function devtoolsComponentEmit(component, event, params) {
+  emit$1(
+    "component:emit" /* COMPONENT_EMIT */,
+    component.appContext.app,
+    component,
+    event,
+    params
+  );
+}
+
+let currentRenderingInstance = null;
+let currentScopeId = null;
+function setCurrentRenderingInstance(instance) {
+  const prev = currentRenderingInstance;
+  currentRenderingInstance = instance;
+  currentScopeId = instance && instance.type.__scopeId || null;
+  return prev;
+}
+function pushScopeId(id) {
+  currentScopeId = id;
+}
+function popScopeId() {
+  currentScopeId = null;
+}
+const withScopeId = (_id) => withCtx;
+function withCtx(fn, ctx = currentRenderingInstance, isNonScopedSlot) {
+  if (!ctx) return fn;
+  if (fn._n) {
+    return fn;
+  }
+  const renderFnWithContext = (...args) => {
+    if (renderFnWithContext._d) {
+      setBlockTracking(-1);
+    }
+    const prevInstance = setCurrentRenderingInstance(ctx);
+    let res;
+    try {
+      res = fn(...args);
+    } finally {
+      setCurrentRenderingInstance(prevInstance);
+      if (renderFnWithContext._d) {
+        setBlockTracking(1);
+      }
+    }
+    {
+      devtoolsComponentUpdated(ctx);
+    }
+    return res;
+  };
+  renderFnWithContext._n = true;
+  renderFnWithContext._c = true;
+  renderFnWithContext._d = true;
+  return renderFnWithContext;
+}
+
+function validateDirectiveName(name) {
+  if (isBuiltInDirective(name)) {
+    warn$1("Do not use built-in directive ids as custom directive id: " + name);
+  }
+}
+function withDirectives(vnode, directives) {
+  if (currentRenderingInstance === null) {
+    warn$1(`withDirectives can only be used inside render functions.`);
+    return vnode;
+  }
+  const instance = getComponentPublicInstance(currentRenderingInstance);
+  const bindings = vnode.dirs || (vnode.dirs = []);
+  for (let i = 0; i < directives.length; i++) {
+    let [dir, value, arg, modifiers = EMPTY_OBJ] = directives[i];
+    if (dir) {
+      if (isFunction(dir)) {
+        dir = {
+          mounted: dir,
+          updated: dir
+        };
+      }
+      if (dir.deep) {
+        traverse(value);
+      }
+      bindings.push({
+        dir,
+        instance,
+        value,
+        oldValue: void 0,
+        arg,
+        modifiers
+      });
+    }
+  }
+  return vnode;
+}
+function invokeDirectiveHook(vnode, prevVNode, instance, name) {
+  const bindings = vnode.dirs;
+  const oldBindings = prevVNode && prevVNode.dirs;
+  for (let i = 0; i < bindings.length; i++) {
+    const binding = bindings[i];
+    if (oldBindings) {
+      binding.oldValue = oldBindings[i].value;
+    }
+    let hook = binding.dir[name];
+    if (hook) {
+      pauseTracking();
+      callWithAsyncErrorHandling(hook, instance, 8, [
+        vnode.el,
+        binding,
+        vnode,
+        prevVNode
+      ]);
+      resetTracking();
+    }
+  }
+}
+
+const TeleportEndKey = Symbol("_vte");
+const isTeleport = (type) => type.__isTeleport;
+const isTeleportDisabled = (props) => props && (props.disabled || props.disabled === "");
+const isTeleportDeferred = (props) => props && (props.defer || props.defer === "");
+const isTargetSVG = (target) => typeof SVGElement !== "undefined" && target instanceof SVGElement;
+const isTargetMathML = (target) => typeof MathMLElement === "function" && target instanceof MathMLElement;
+const resolveTarget = (props, select) => {
+  const targetSelector = props && props.to;
+  if (isString(targetSelector)) {
+    if (!select) {
+      warn$1(
+        `Current renderer does not support string target for Teleports. (missing querySelector renderer option)`
+      );
+      return null;
+    } else {
+      const target = select(targetSelector);
+      if (!target && !isTeleportDisabled(props)) {
+        warn$1(
+          `Failed to locate Teleport target with selector "${targetSelector}". Note the target element must exist before the component is mounted - i.e. the target cannot be rendered by the component itself, and ideally should be outside of the entire Vue component tree.`
+        );
+      }
+      return target;
+    }
+  } else {
+    if (!targetSelector && !isTeleportDisabled(props)) {
+      warn$1(`Invalid Teleport target: ${targetSelector}`);
+    }
+    return targetSelector;
+  }
+};
+const TeleportImpl = {
+  name: "Teleport",
+  __isTeleport: true,
+  process(n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, internals) {
+    const {
+      mc: mountChildren,
+      pc: patchChildren,
+      pbc: patchBlockChildren,
+      o: { insert, querySelector, createText, createComment }
+    } = internals;
+    const disabled = isTeleportDisabled(n2.props);
+    let { shapeFlag, children, dynamicChildren } = n2;
+    if (isHmrUpdating) {
+      optimized = false;
+      dynamicChildren = null;
+    }
+    if (n1 == null) {
+      const placeholder = n2.el = createComment("teleport start") ;
+      const mainAnchor = n2.anchor = createComment("teleport end") ;
+      insert(placeholder, container, anchor);
+      insert(mainAnchor, container, anchor);
+      const mount = (container2, anchor2) => {
+        if (shapeFlag & 16) {
+          if (parentComponent && parentComponent.isCE) {
+            parentComponent.ce._teleportTarget = container2;
+          }
+          mountChildren(
+            children,
+            container2,
+            anchor2,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+        }
+      };
+      const mountToTarget = () => {
+        const target = n2.target = resolveTarget(n2.props, querySelector);
+        const targetAnchor = prepareAnchor(target, n2, createText, insert);
+        if (target) {
+          if (namespace !== "svg" && isTargetSVG(target)) {
+            namespace = "svg";
+          } else if (namespace !== "mathml" && isTargetMathML(target)) {
+            namespace = "mathml";
+          }
+          if (!disabled) {
+            mount(target, targetAnchor);
+            updateCssVars(n2, false);
+          }
+        } else if (!disabled) {
+          warn$1(
+            "Invalid Teleport target on mount:",
+            target,
+            `(${typeof target})`
+          );
+        }
+      };
+      if (disabled) {
+        mount(container, mainAnchor);
+        updateCssVars(n2, true);
+      }
+      if (isTeleportDeferred(n2.props)) {
+        queuePostRenderEffect(mountToTarget, parentSuspense);
+      } else {
+        mountToTarget();
+      }
+    } else {
+      n2.el = n1.el;
+      n2.targetStart = n1.targetStart;
+      const mainAnchor = n2.anchor = n1.anchor;
+      const target = n2.target = n1.target;
+      const targetAnchor = n2.targetAnchor = n1.targetAnchor;
+      const wasDisabled = isTeleportDisabled(n1.props);
+      const currentContainer = wasDisabled ? container : target;
+      const currentAnchor = wasDisabled ? mainAnchor : targetAnchor;
+      if (namespace === "svg" || isTargetSVG(target)) {
+        namespace = "svg";
+      } else if (namespace === "mathml" || isTargetMathML(target)) {
+        namespace = "mathml";
+      }
+      if (dynamicChildren) {
+        patchBlockChildren(
+          n1.dynamicChildren,
+          dynamicChildren,
+          currentContainer,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds
+        );
+        traverseStaticChildren(n1, n2, true);
+      } else if (!optimized) {
+        patchChildren(
+          n1,
+          n2,
+          currentContainer,
+          currentAnchor,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          false
+        );
+      }
+      if (disabled) {
+        if (!wasDisabled) {
+          moveTeleport(
+            n2,
+            container,
+            mainAnchor,
+            internals,
+            1
+          );
+        } else {
+          if (n2.props && n1.props && n2.props.to !== n1.props.to) {
+            n2.props.to = n1.props.to;
+          }
+        }
+      } else {
+        if ((n2.props && n2.props.to) !== (n1.props && n1.props.to)) {
+          const nextTarget = n2.target = resolveTarget(
+            n2.props,
+            querySelector
+          );
+          if (nextTarget) {
+            moveTeleport(
+              n2,
+              nextTarget,
+              null,
+              internals,
+              0
+            );
+          } else {
+            warn$1(
+              "Invalid Teleport target on update:",
+              target,
+              `(${typeof target})`
+            );
+          }
+        } else if (wasDisabled) {
+          moveTeleport(
+            n2,
+            target,
+            targetAnchor,
+            internals,
+            1
+          );
+        }
+      }
+      updateCssVars(n2, disabled);
+    }
+  },
+  remove(vnode, parentComponent, parentSuspense, { um: unmount, o: { remove: hostRemove } }, doRemove) {
+    const {
+      shapeFlag,
+      children,
+      anchor,
+      targetStart,
+      targetAnchor,
+      target,
+      props
+    } = vnode;
+    if (target) {
+      hostRemove(targetStart);
+      hostRemove(targetAnchor);
+    }
+    doRemove && hostRemove(anchor);
+    if (shapeFlag & 16) {
+      const shouldRemove = doRemove || !isTeleportDisabled(props);
+      for (let i = 0; i < children.length; i++) {
+        const child = children[i];
+        unmount(
+          child,
+          parentComponent,
+          parentSuspense,
+          shouldRemove,
+          !!child.dynamicChildren
+        );
+      }
+    }
+  },
+  move: moveTeleport,
+  hydrate: hydrateTeleport
+};
+function moveTeleport(vnode, container, parentAnchor, { o: { insert }, m: move }, moveType = 2) {
+  if (moveType === 0) {
+    insert(vnode.targetAnchor, container, parentAnchor);
+  }
+  const { el, anchor, shapeFlag, children, props } = vnode;
+  const isReorder = moveType === 2;
+  if (isReorder) {
+    insert(el, container, parentAnchor);
+  }
+  if (!isReorder || isTeleportDisabled(props)) {
+    if (shapeFlag & 16) {
+      for (let i = 0; i < children.length; i++) {
+        move(
+          children[i],
+          container,
+          parentAnchor,
+          2
+        );
+      }
+    }
+  }
+  if (isReorder) {
+    insert(anchor, container, parentAnchor);
+  }
+}
+function hydrateTeleport(node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized, {
+  o: { nextSibling, parentNode, querySelector, insert, createText }
+}, hydrateChildren) {
+  const target = vnode.target = resolveTarget(
+    vnode.props,
+    querySelector
+  );
+  if (target) {
+    const disabled = isTeleportDisabled(vnode.props);
+    const targetNode = target._lpa || target.firstChild;
+    if (vnode.shapeFlag & 16) {
+      if (disabled) {
+        vnode.anchor = hydrateChildren(
+          nextSibling(node),
+          vnode,
+          parentNode(node),
+          parentComponent,
+          parentSuspense,
+          slotScopeIds,
+          optimized
+        );
+        vnode.targetStart = targetNode;
+        vnode.targetAnchor = targetNode && nextSibling(targetNode);
+      } else {
+        vnode.anchor = nextSibling(node);
+        let targetAnchor = targetNode;
+        while (targetAnchor) {
+          if (targetAnchor && targetAnchor.nodeType === 8) {
+            if (targetAnchor.data === "teleport start anchor") {
+              vnode.targetStart = targetAnchor;
+            } else if (targetAnchor.data === "teleport anchor") {
+              vnode.targetAnchor = targetAnchor;
+              target._lpa = vnode.targetAnchor && nextSibling(vnode.targetAnchor);
+              break;
+            }
+          }
+          targetAnchor = nextSibling(targetAnchor);
+        }
+        if (!vnode.targetAnchor) {
+          prepareAnchor(target, vnode, createText, insert);
+        }
+        hydrateChildren(
+          targetNode && nextSibling(targetNode),
+          vnode,
+          target,
+          parentComponent,
+          parentSuspense,
+          slotScopeIds,
+          optimized
+        );
+      }
+    }
+    updateCssVars(vnode, disabled);
+  }
+  return vnode.anchor && nextSibling(vnode.anchor);
+}
+const Teleport = TeleportImpl;
+function updateCssVars(vnode, isDisabled) {
+  const ctx = vnode.ctx;
+  if (ctx && ctx.ut) {
+    let node, anchor;
+    if (isDisabled) {
+      node = vnode.el;
+      anchor = vnode.anchor;
+    } else {
+      node = vnode.targetStart;
+      anchor = vnode.targetAnchor;
+    }
+    while (node && node !== anchor) {
+      if (node.nodeType === 1) node.setAttribute("data-v-owner", ctx.uid);
+      node = node.nextSibling;
+    }
+    ctx.ut();
+  }
+}
+function prepareAnchor(target, vnode, createText, insert) {
+  const targetStart = vnode.targetStart = createText("");
+  const targetAnchor = vnode.targetAnchor = createText("");
+  targetStart[TeleportEndKey] = targetAnchor;
+  if (target) {
+    insert(targetStart, target);
+    insert(targetAnchor, target);
+  }
+  return targetAnchor;
+}
+
+const leaveCbKey = Symbol("_leaveCb");
+const enterCbKey$1 = Symbol("_enterCb");
+function useTransitionState() {
+  const state = {
+    isMounted: false,
+    isLeaving: false,
+    isUnmounting: false,
+    leavingVNodes: /* @__PURE__ */ new Map()
+  };
+  onMounted(() => {
+    state.isMounted = true;
+  });
+  onBeforeUnmount(() => {
+    state.isUnmounting = true;
+  });
+  return state;
+}
+const TransitionHookValidator = [Function, Array];
+const BaseTransitionPropsValidators = {
+  mode: String,
+  appear: Boolean,
+  persisted: Boolean,
+  // enter
+  onBeforeEnter: TransitionHookValidator,
+  onEnter: TransitionHookValidator,
+  onAfterEnter: TransitionHookValidator,
+  onEnterCancelled: TransitionHookValidator,
+  // leave
+  onBeforeLeave: TransitionHookValidator,
+  onLeave: TransitionHookValidator,
+  onAfterLeave: TransitionHookValidator,
+  onLeaveCancelled: TransitionHookValidator,
+  // appear
+  onBeforeAppear: TransitionHookValidator,
+  onAppear: TransitionHookValidator,
+  onAfterAppear: TransitionHookValidator,
+  onAppearCancelled: TransitionHookValidator
+};
+const recursiveGetSubtree = (instance) => {
+  const subTree = instance.subTree;
+  return subTree.component ? recursiveGetSubtree(subTree.component) : subTree;
+};
+const BaseTransitionImpl = {
+  name: `BaseTransition`,
+  props: BaseTransitionPropsValidators,
+  setup(props, { slots }) {
+    const instance = getCurrentInstance();
+    const state = useTransitionState();
+    return () => {
+      const children = slots.default && getTransitionRawChildren(slots.default(), true);
+      if (!children || !children.length) {
+        return;
+      }
+      const child = findNonCommentChild(children);
+      const rawProps = toRaw(props);
+      const { mode } = rawProps;
+      if (mode && mode !== "in-out" && mode !== "out-in" && mode !== "default") {
+        warn$1(`invalid <transition> mode: ${mode}`);
+      }
+      if (state.isLeaving) {
+        return emptyPlaceholder(child);
+      }
+      const innerChild = getInnerChild$1(child);
+      if (!innerChild) {
+        return emptyPlaceholder(child);
+      }
+      let enterHooks = resolveTransitionHooks(
+        innerChild,
+        rawProps,
+        state,
+        instance,
+        // #11061, ensure enterHooks is fresh after clone
+        (hooks) => enterHooks = hooks
+      );
+      if (innerChild.type !== Comment) {
+        setTransitionHooks(innerChild, enterHooks);
+      }
+      const oldChild = instance.subTree;
+      const oldInnerChild = oldChild && getInnerChild$1(oldChild);
+      if (oldInnerChild && oldInnerChild.type !== Comment && !isSameVNodeType(innerChild, oldInnerChild) && recursiveGetSubtree(instance).type !== Comment) {
+        const leavingHooks = resolveTransitionHooks(
+          oldInnerChild,
+          rawProps,
+          state,
+          instance
+        );
+        setTransitionHooks(oldInnerChild, leavingHooks);
+        if (mode === "out-in" && innerChild.type !== Comment) {
+          state.isLeaving = true;
+          leavingHooks.afterLeave = () => {
+            state.isLeaving = false;
+            if (!(instance.job.flags & 8)) {
+              instance.update();
+            }
+            delete leavingHooks.afterLeave;
+          };
+          return emptyPlaceholder(child);
+        } else if (mode === "in-out" && innerChild.type !== Comment) {
+          leavingHooks.delayLeave = (el, earlyRemove, delayedLeave) => {
+            const leavingVNodesCache = getLeavingNodesForType(
+              state,
+              oldInnerChild
+            );
+            leavingVNodesCache[String(oldInnerChild.key)] = oldInnerChild;
+            el[leaveCbKey] = () => {
+              earlyRemove();
+              el[leaveCbKey] = void 0;
+              delete enterHooks.delayedLeave;
+            };
+            enterHooks.delayedLeave = delayedLeave;
+          };
+        }
+      }
+      return child;
+    };
+  }
+};
+function findNonCommentChild(children) {
+  let child = children[0];
+  if (children.length > 1) {
+    let hasFound = false;
+    for (const c of children) {
+      if (c.type !== Comment) {
+        if (hasFound) {
+          warn$1(
+            "<transition> can only be used on a single element or component. Use <transition-group> for lists."
+          );
+          break;
+        }
+        child = c;
+        hasFound = true;
+      }
+    }
+  }
+  return child;
+}
+const BaseTransition = BaseTransitionImpl;
+function getLeavingNodesForType(state, vnode) {
+  const { leavingVNodes } = state;
+  let leavingVNodesCache = leavingVNodes.get(vnode.type);
+  if (!leavingVNodesCache) {
+    leavingVNodesCache = /* @__PURE__ */ Object.create(null);
+    leavingVNodes.set(vnode.type, leavingVNodesCache);
+  }
+  return leavingVNodesCache;
+}
+function resolveTransitionHooks(vnode, props, state, instance, postClone) {
+  const {
+    appear,
+    mode,
+    persisted = false,
+    onBeforeEnter,
+    onEnter,
+    onAfterEnter,
+    onEnterCancelled,
+    onBeforeLeave,
+    onLeave,
+    onAfterLeave,
+    onLeaveCancelled,
+    onBeforeAppear,
+    onAppear,
+    onAfterAppear,
+    onAppearCancelled
+  } = props;
+  const key = String(vnode.key);
+  const leavingVNodesCache = getLeavingNodesForType(state, vnode);
+  const callHook = (hook, args) => {
+    hook && callWithAsyncErrorHandling(
+      hook,
+      instance,
+      9,
+      args
+    );
+  };
+  const callAsyncHook = (hook, args) => {
+    const done = args[1];
+    callHook(hook, args);
+    if (isArray(hook)) {
+      if (hook.every((hook2) => hook2.length <= 1)) done();
+    } else if (hook.length <= 1) {
+      done();
+    }
+  };
+  const hooks = {
+    mode,
+    persisted,
+    beforeEnter(el) {
+      let hook = onBeforeEnter;
+      if (!state.isMounted) {
+        if (appear) {
+          hook = onBeforeAppear || onBeforeEnter;
+        } else {
+          return;
+        }
+      }
+      if (el[leaveCbKey]) {
+        el[leaveCbKey](
+          true
+          /* cancelled */
+        );
+      }
+      const leavingVNode = leavingVNodesCache[key];
+      if (leavingVNode && isSameVNodeType(vnode, leavingVNode) && leavingVNode.el[leaveCbKey]) {
+        leavingVNode.el[leaveCbKey]();
+      }
+      callHook(hook, [el]);
+    },
+    enter(el) {
+      let hook = onEnter;
+      let afterHook = onAfterEnter;
+      let cancelHook = onEnterCancelled;
+      if (!state.isMounted) {
+        if (appear) {
+          hook = onAppear || onEnter;
+          afterHook = onAfterAppear || onAfterEnter;
+          cancelHook = onAppearCancelled || onEnterCancelled;
+        } else {
+          return;
+        }
+      }
+      let called = false;
+      const done = el[enterCbKey$1] = (cancelled) => {
+        if (called) return;
+        called = true;
+        if (cancelled) {
+          callHook(cancelHook, [el]);
+        } else {
+          callHook(afterHook, [el]);
+        }
+        if (hooks.delayedLeave) {
+          hooks.delayedLeave();
+        }
+        el[enterCbKey$1] = void 0;
+      };
+      if (hook) {
+        callAsyncHook(hook, [el, done]);
+      } else {
+        done();
+      }
+    },
+    leave(el, remove) {
+      const key2 = String(vnode.key);
+      if (el[enterCbKey$1]) {
+        el[enterCbKey$1](
+          true
+          /* cancelled */
+        );
+      }
+      if (state.isUnmounting) {
+        return remove();
+      }
+      callHook(onBeforeLeave, [el]);
+      let called = false;
+      const done = el[leaveCbKey] = (cancelled) => {
+        if (called) return;
+        called = true;
+        remove();
+        if (cancelled) {
+          callHook(onLeaveCancelled, [el]);
+        } else {
+          callHook(onAfterLeave, [el]);
+        }
+        el[leaveCbKey] = void 0;
+        if (leavingVNodesCache[key2] === vnode) {
+          delete leavingVNodesCache[key2];
+        }
+      };
+      leavingVNodesCache[key2] = vnode;
+      if (onLeave) {
+        callAsyncHook(onLeave, [el, done]);
+      } else {
+        done();
+      }
+    },
+    clone(vnode2) {
+      const hooks2 = resolveTransitionHooks(
+        vnode2,
+        props,
+        state,
+        instance,
+        postClone
+      );
+      if (postClone) postClone(hooks2);
+      return hooks2;
+    }
+  };
+  return hooks;
+}
+function emptyPlaceholder(vnode) {
+  if (isKeepAlive(vnode)) {
+    vnode = cloneVNode(vnode);
+    vnode.children = null;
+    return vnode;
+  }
+}
+function getInnerChild$1(vnode) {
+  if (!isKeepAlive(vnode)) {
+    if (isTeleport(vnode.type) && vnode.children) {
+      return findNonCommentChild(vnode.children);
+    }
+    return vnode;
+  }
+  if (vnode.component) {
+    return vnode.component.subTree;
+  }
+  const { shapeFlag, children } = vnode;
+  if (children) {
+    if (shapeFlag & 16) {
+      return children[0];
+    }
+    if (shapeFlag & 32 && isFunction(children.default)) {
+      return children.default();
+    }
+  }
+}
+function setTransitionHooks(vnode, hooks) {
+  if (vnode.shapeFlag & 6 && vnode.component) {
+    vnode.transition = hooks;
+    setTransitionHooks(vnode.component.subTree, hooks);
+  } else if (vnode.shapeFlag & 128) {
+    vnode.ssContent.transition = hooks.clone(vnode.ssContent);
+    vnode.ssFallback.transition = hooks.clone(vnode.ssFallback);
+  } else {
+    vnode.transition = hooks;
+  }
+}
+function getTransitionRawChildren(children, keepComment = false, parentKey) {
+  let ret = [];
+  let keyedFragmentCount = 0;
+  for (let i = 0; i < children.length; i++) {
+    let child = children[i];
+    const key = parentKey == null ? child.key : String(parentKey) + String(child.key != null ? child.key : i);
+    if (child.type === Fragment) {
+      if (child.patchFlag & 128) keyedFragmentCount++;
+      ret = ret.concat(
+        getTransitionRawChildren(child.children, keepComment, key)
+      );
+    } else if (keepComment || child.type !== Comment) {
+      ret.push(key != null ? cloneVNode(child, { key }) : child);
+    }
+  }
+  if (keyedFragmentCount > 1) {
+    for (let i = 0; i < ret.length; i++) {
+      ret[i].patchFlag = -2;
+    }
+  }
+  return ret;
+}
+
+/*! #__NO_SIDE_EFFECTS__ */
+// @__NO_SIDE_EFFECTS__
+function defineComponent(options, extraOptions) {
+  return isFunction(options) ? (
+    // #8236: extend call and options.name access are considered side-effects
+    // by Rollup, so we have to wrap it in a pure-annotated IIFE.
+    /* @__PURE__ */ (() => extend({ name: options.name }, extraOptions, { setup: options }))()
+  ) : options;
+}
+
+function useId() {
+  const i = getCurrentInstance();
+  if (i) {
+    return (i.appContext.config.idPrefix || "v") + "-" + i.ids[0] + i.ids[1]++;
+  } else {
+    warn$1(
+      `useId() is called when there is no active component instance to be associated with.`
+    );
+  }
+  return "";
+}
+function markAsyncBoundary(instance) {
+  instance.ids = [instance.ids[0] + instance.ids[2]++ + "-", 0, 0];
+}
+
+const knownTemplateRefs = /* @__PURE__ */ new WeakSet();
+function useTemplateRef(key) {
+  const i = getCurrentInstance();
+  const r = shallowRef(null);
+  if (i) {
+    const refs = i.refs === EMPTY_OBJ ? i.refs = {} : i.refs;
+    let desc;
+    if ((desc = Object.getOwnPropertyDescriptor(refs, key)) && !desc.configurable) {
+      warn$1(`useTemplateRef('${key}') already exists.`);
+    } else {
+      Object.defineProperty(refs, key, {
+        enumerable: true,
+        get: () => r.value,
+        set: (val) => r.value = val
+      });
+    }
+  } else {
+    warn$1(
+      `useTemplateRef() is called when there is no active component instance to be associated with.`
+    );
+  }
+  const ret = readonly(r) ;
+  {
+    knownTemplateRefs.add(ret);
+  }
+  return ret;
+}
+
+function setRef(rawRef, oldRawRef, parentSuspense, vnode, isUnmount = false) {
+  if (isArray(rawRef)) {
+    rawRef.forEach(
+      (r, i) => setRef(
+        r,
+        oldRawRef && (isArray(oldRawRef) ? oldRawRef[i] : oldRawRef),
+        parentSuspense,
+        vnode,
+        isUnmount
+      )
+    );
+    return;
+  }
+  if (isAsyncWrapper(vnode) && !isUnmount) {
+    return;
+  }
+  const refValue = vnode.shapeFlag & 4 ? getComponentPublicInstance(vnode.component) : vnode.el;
+  const value = isUnmount ? null : refValue;
+  const { i: owner, r: ref } = rawRef;
+  if (!owner) {
+    warn$1(
+      `Missing ref owner context. ref cannot be used on hoisted vnodes. A vnode with ref must be created inside the render function.`
+    );
+    return;
+  }
+  const oldRef = oldRawRef && oldRawRef.r;
+  const refs = owner.refs === EMPTY_OBJ ? owner.refs = {} : owner.refs;
+  const setupState = owner.setupState;
+  const rawSetupState = toRaw(setupState);
+  const canSetSetupRef = setupState === EMPTY_OBJ ? () => false : (key) => {
+    {
+      if (hasOwn(rawSetupState, key) && !isRef(rawSetupState[key])) {
+        warn$1(
+          `Template ref "${key}" used on a non-ref value. It will not work in the production build.`
+        );
+      }
+      if (knownTemplateRefs.has(rawSetupState[key])) {
+        return false;
+      }
+    }
+    return hasOwn(rawSetupState, key);
+  };
+  if (oldRef != null && oldRef !== ref) {
+    if (isString(oldRef)) {
+      refs[oldRef] = null;
+      if (canSetSetupRef(oldRef)) {
+        setupState[oldRef] = null;
+      }
+    } else if (isRef(oldRef)) {
+      oldRef.value = null;
+    }
+  }
+  if (isFunction(ref)) {
+    callWithErrorHandling(ref, owner, 12, [value, refs]);
+  } else {
+    const _isString = isString(ref);
+    const _isRef = isRef(ref);
+    if (_isString || _isRef) {
+      const doSet = () => {
+        if (rawRef.f) {
+          const existing = _isString ? canSetSetupRef(ref) ? setupState[ref] : refs[ref] : ref.value;
+          if (isUnmount) {
+            isArray(existing) && remove(existing, refValue);
+          } else {
+            if (!isArray(existing)) {
+              if (_isString) {
+                refs[ref] = [refValue];
+                if (canSetSetupRef(ref)) {
+                  setupState[ref] = refs[ref];
+                }
+              } else {
+                ref.value = [refValue];
+                if (rawRef.k) refs[rawRef.k] = ref.value;
+              }
+            } else if (!existing.includes(refValue)) {
+              existing.push(refValue);
+            }
+          }
+        } else if (_isString) {
+          refs[ref] = value;
+          if (canSetSetupRef(ref)) {
+            setupState[ref] = value;
+          }
+        } else if (_isRef) {
+          ref.value = value;
+          if (rawRef.k) refs[rawRef.k] = value;
+        } else {
+          warn$1("Invalid template ref type:", ref, `(${typeof ref})`);
+        }
+      };
+      if (value) {
+        doSet.id = -1;
+        queuePostRenderEffect(doSet, parentSuspense);
+      } else {
+        doSet();
+      }
+    } else {
+      warn$1("Invalid template ref type:", ref, `(${typeof ref})`);
+    }
+  }
+}
+
+let hasLoggedMismatchError = false;
+const logMismatchError = () => {
+  if (hasLoggedMismatchError) {
+    return;
+  }
+  console.error("Hydration completed but contains mismatches.");
+  hasLoggedMismatchError = true;
+};
+const isSVGContainer = (container) => container.namespaceURI.includes("svg") && container.tagName !== "foreignObject";
+const isMathMLContainer = (container) => container.namespaceURI.includes("MathML");
+const getContainerType = (container) => {
+  if (container.nodeType !== 1) return void 0;
+  if (isSVGContainer(container)) return "svg";
+  if (isMathMLContainer(container)) return "mathml";
+  return void 0;
+};
+const isComment = (node) => node.nodeType === 8;
+function createHydrationFunctions(rendererInternals) {
+  const {
+    mt: mountComponent,
+    p: patch,
+    o: {
+      patchProp,
+      createText,
+      nextSibling,
+      parentNode,
+      remove,
+      insert,
+      createComment
+    }
+  } = rendererInternals;
+  const hydrate = (vnode, container) => {
+    if (!container.hasChildNodes()) {
+      warn$1(
+        `Attempting to hydrate existing markup but container is empty. Performing full mount instead.`
+      );
+      patch(null, vnode, container);
+      flushPostFlushCbs();
+      container._vnode = vnode;
+      return;
+    }
+    hydrateNode(container.firstChild, vnode, null, null, null);
+    flushPostFlushCbs();
+    container._vnode = vnode;
+  };
+  const hydrateNode = (node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized = false) => {
+    optimized = optimized || !!vnode.dynamicChildren;
+    const isFragmentStart = isComment(node) && node.data === "[";
+    const onMismatch = () => handleMismatch(
+      node,
+      vnode,
+      parentComponent,
+      parentSuspense,
+      slotScopeIds,
+      isFragmentStart
+    );
+    const { type, ref, shapeFlag, patchFlag } = vnode;
+    let domType = node.nodeType;
+    vnode.el = node;
+    {
+      def(node, "__vnode", vnode, true);
+      def(node, "__vueParentComponent", parentComponent, true);
+    }
+    if (patchFlag === -2) {
+      optimized = false;
+      vnode.dynamicChildren = null;
+    }
+    let nextNode = null;
+    switch (type) {
+      case Text:
+        if (domType !== 3) {
+          if (vnode.children === "") {
+            insert(vnode.el = createText(""), parentNode(node), node);
+            nextNode = node;
+          } else {
+            nextNode = onMismatch();
+          }
+        } else {
+          if (node.data !== vnode.children) {
+            warn$1(
+              `Hydration text mismatch in`,
+              node.parentNode,
+              `
+  - rendered on server: ${JSON.stringify(
+                node.data
+              )}
+  - expected on client: ${JSON.stringify(vnode.children)}`
+            );
+            logMismatchError();
+            node.data = vnode.children;
+          }
+          nextNode = nextSibling(node);
+        }
+        break;
+      case Comment:
+        if (isTemplateNode(node)) {
+          nextNode = nextSibling(node);
+          replaceNode(
+            vnode.el = node.content.firstChild,
+            node,
+            parentComponent
+          );
+        } else if (domType !== 8 || isFragmentStart) {
+          nextNode = onMismatch();
+        } else {
+          nextNode = nextSibling(node);
+        }
+        break;
+      case Static:
+        if (isFragmentStart) {
+          node = nextSibling(node);
+          domType = node.nodeType;
+        }
+        if (domType === 1 || domType === 3) {
+          nextNode = node;
+          const needToAdoptContent = !vnode.children.length;
+          for (let i = 0; i < vnode.staticCount; i++) {
+            if (needToAdoptContent)
+              vnode.children += nextNode.nodeType === 1 ? nextNode.outerHTML : nextNode.data;
+            if (i === vnode.staticCount - 1) {
+              vnode.anchor = nextNode;
+            }
+            nextNode = nextSibling(nextNode);
+          }
+          return isFragmentStart ? nextSibling(nextNode) : nextNode;
+        } else {
+          onMismatch();
+        }
+        break;
+      case Fragment:
+        if (!isFragmentStart) {
+          nextNode = onMismatch();
+        } else {
+          nextNode = hydrateFragment(
+            node,
+            vnode,
+            parentComponent,
+            parentSuspense,
+            slotScopeIds,
+            optimized
+          );
+        }
+        break;
+      default:
+        if (shapeFlag & 1) {
+          if ((domType !== 1 || vnode.type.toLowerCase() !== node.tagName.toLowerCase()) && !isTemplateNode(node)) {
+            nextNode = onMismatch();
+          } else {
+            nextNode = hydrateElement(
+              node,
+              vnode,
+              parentComponent,
+              parentSuspense,
+              slotScopeIds,
+              optimized
+            );
+          }
+        } else if (shapeFlag & 6) {
+          vnode.slotScopeIds = slotScopeIds;
+          const container = parentNode(node);
+          if (isFragmentStart) {
+            nextNode = locateClosingAnchor(node);
+          } else if (isComment(node) && node.data === "teleport start") {
+            nextNode = locateClosingAnchor(node, node.data, "teleport end");
+          } else {
+            nextNode = nextSibling(node);
+          }
+          mountComponent(
+            vnode,
+            container,
+            null,
+            parentComponent,
+            parentSuspense,
+            getContainerType(container),
+            optimized
+          );
+          if (isAsyncWrapper(vnode)) {
+            let subTree;
+            if (isFragmentStart) {
+              subTree = createVNode(Fragment);
+              subTree.anchor = nextNode ? nextNode.previousSibling : container.lastChild;
+            } else {
+              subTree = node.nodeType === 3 ? createTextVNode("") : createVNode("div");
+            }
+            subTree.el = node;
+            vnode.component.subTree = subTree;
+          }
+        } else if (shapeFlag & 64) {
+          if (domType !== 8) {
+            nextNode = onMismatch();
+          } else {
+            nextNode = vnode.type.hydrate(
+              node,
+              vnode,
+              parentComponent,
+              parentSuspense,
+              slotScopeIds,
+              optimized,
+              rendererInternals,
+              hydrateChildren
+            );
+          }
+        } else if (shapeFlag & 128) {
+          nextNode = vnode.type.hydrate(
+            node,
+            vnode,
+            parentComponent,
+            parentSuspense,
+            getContainerType(parentNode(node)),
+            slotScopeIds,
+            optimized,
+            rendererInternals,
+            hydrateNode
+          );
+        } else {
+          warn$1("Invalid HostVNode type:", type, `(${typeof type})`);
+        }
+    }
+    if (ref != null) {
+      setRef(ref, null, parentSuspense, vnode);
+    }
+    return nextNode;
+  };
+  const hydrateElement = (el, vnode, parentComponent, parentSuspense, slotScopeIds, optimized) => {
+    optimized = optimized || !!vnode.dynamicChildren;
+    const { type, props, patchFlag, shapeFlag, dirs, transition } = vnode;
+    const forcePatch = type === "input" || type === "option";
+    {
+      if (dirs) {
+        invokeDirectiveHook(vnode, null, parentComponent, "created");
+      }
+      let needCallTransitionHooks = false;
+      if (isTemplateNode(el)) {
+        needCallTransitionHooks = needTransition(
+          null,
+          // no need check parentSuspense in hydration
+          transition
+        ) && parentComponent && parentComponent.vnode.props && parentComponent.vnode.props.appear;
+        const content = el.content.firstChild;
+        if (needCallTransitionHooks) {
+          transition.beforeEnter(content);
+        }
+        replaceNode(content, el, parentComponent);
+        vnode.el = el = content;
+      }
+      if (shapeFlag & 16 && // skip if element has innerHTML / textContent
+      !(props && (props.innerHTML || props.textContent))) {
+        let next = hydrateChildren(
+          el.firstChild,
+          vnode,
+          el,
+          parentComponent,
+          parentSuspense,
+          slotScopeIds,
+          optimized
+        );
+        let hasWarned = false;
+        while (next) {
+          if (!isMismatchAllowed(el, 1 /* CHILDREN */)) {
+            if (!hasWarned) {
+              warn$1(
+                `Hydration children mismatch on`,
+                el,
+                `
+Server rendered element contains more child nodes than client vdom.`
+              );
+              hasWarned = true;
+            }
+            logMismatchError();
+          }
+          const cur = next;
+          next = next.nextSibling;
+          remove(cur);
+        }
+      } else if (shapeFlag & 8) {
+        let clientText = vnode.children;
+        if (clientText[0] === "\n" && (el.tagName === "PRE" || el.tagName === "TEXTAREA")) {
+          clientText = clientText.slice(1);
+        }
+        if (el.textContent !== clientText) {
+          if (!isMismatchAllowed(el, 0 /* TEXT */)) {
+            warn$1(
+              `Hydration text content mismatch on`,
+              el,
+              `
+  - rendered on server: ${el.textContent}
+  - expected on client: ${vnode.children}`
+            );
+            logMismatchError();
+          }
+          el.textContent = vnode.children;
+        }
+      }
+      if (props) {
+        {
+          const isCustomElement = el.tagName.includes("-");
+          for (const key in props) {
+            if (// #11189 skip if this node has directives that have created hooks
+            // as it could have mutated the DOM in any possible way
+            !(dirs && dirs.some((d) => d.dir.created)) && propHasMismatch(el, key, props[key], vnode, parentComponent)) {
+              logMismatchError();
+            }
+            if (forcePatch && (key.endsWith("value") || key === "indeterminate") || isOn(key) && !isReservedProp(key) || // force hydrate v-bind with .prop modifiers
+            key[0] === "." || isCustomElement) {
+              patchProp(el, key, null, props[key], void 0, parentComponent);
+            }
+          }
+        }
+      }
+      let vnodeHooks;
+      if (vnodeHooks = props && props.onVnodeBeforeMount) {
+        invokeVNodeHook(vnodeHooks, parentComponent, vnode);
+      }
+      if (dirs) {
+        invokeDirectiveHook(vnode, null, parentComponent, "beforeMount");
+      }
+      if ((vnodeHooks = props && props.onVnodeMounted) || dirs || needCallTransitionHooks) {
+        queueEffectWithSuspense(() => {
+          vnodeHooks && invokeVNodeHook(vnodeHooks, parentComponent, vnode);
+          needCallTransitionHooks && transition.enter(el);
+          dirs && invokeDirectiveHook(vnode, null, parentComponent, "mounted");
+        }, parentSuspense);
+      }
+    }
+    return el.nextSibling;
+  };
+  const hydrateChildren = (node, parentVNode, container, parentComponent, parentSuspense, slotScopeIds, optimized) => {
+    optimized = optimized || !!parentVNode.dynamicChildren;
+    const children = parentVNode.children;
+    const l = children.length;
+    let hasWarned = false;
+    for (let i = 0; i < l; i++) {
+      const vnode = optimized ? children[i] : children[i] = normalizeVNode(children[i]);
+      const isText = vnode.type === Text;
+      if (node) {
+        if (isText && !optimized) {
+          if (i + 1 < l && normalizeVNode(children[i + 1]).type === Text) {
+            insert(
+              createText(
+                node.data.slice(vnode.children.length)
+              ),
+              container,
+              nextSibling(node)
+            );
+            node.data = vnode.children;
+          }
+        }
+        node = hydrateNode(
+          node,
+          vnode,
+          parentComponent,
+          parentSuspense,
+          slotScopeIds,
+          optimized
+        );
+      } else if (isText && !vnode.children) {
+        insert(vnode.el = createText(""), container);
+      } else {
+        if (!isMismatchAllowed(container, 1 /* CHILDREN */)) {
+          if (!hasWarned) {
+            warn$1(
+              `Hydration children mismatch on`,
+              container,
+              `
+Server rendered element contains fewer child nodes than client vdom.`
+            );
+            hasWarned = true;
+          }
+          logMismatchError();
+        }
+        patch(
+          null,
+          vnode,
+          container,
+          null,
+          parentComponent,
+          parentSuspense,
+          getContainerType(container),
+          slotScopeIds
+        );
+      }
+    }
+    return node;
+  };
+  const hydrateFragment = (node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized) => {
+    const { slotScopeIds: fragmentSlotScopeIds } = vnode;
+    if (fragmentSlotScopeIds) {
+      slotScopeIds = slotScopeIds ? slotScopeIds.concat(fragmentSlotScopeIds) : fragmentSlotScopeIds;
+    }
+    const container = parentNode(node);
+    const next = hydrateChildren(
+      nextSibling(node),
+      vnode,
+      container,
+      parentComponent,
+      parentSuspense,
+      slotScopeIds,
+      optimized
+    );
+    if (next && isComment(next) && next.data === "]") {
+      return nextSibling(vnode.anchor = next);
+    } else {
+      logMismatchError();
+      insert(vnode.anchor = createComment(`]`), container, next);
+      return next;
+    }
+  };
+  const handleMismatch = (node, vnode, parentComponent, parentSuspense, slotScopeIds, isFragment) => {
+    if (!isMismatchAllowed(node.parentElement, 1 /* CHILDREN */)) {
+      warn$1(
+        `Hydration node mismatch:
+- rendered on server:`,
+        node,
+        node.nodeType === 3 ? `(text)` : isComment(node) && node.data === "[" ? `(start of fragment)` : ``,
+        `
+- expected on client:`,
+        vnode.type
+      );
+      logMismatchError();
+    }
+    vnode.el = null;
+    if (isFragment) {
+      const end = locateClosingAnchor(node);
+      while (true) {
+        const next2 = nextSibling(node);
+        if (next2 && next2 !== end) {
+          remove(next2);
+        } else {
+          break;
+        }
+      }
+    }
+    const next = nextSibling(node);
+    const container = parentNode(node);
+    remove(node);
+    patch(
+      null,
+      vnode,
+      container,
+      next,
+      parentComponent,
+      parentSuspense,
+      getContainerType(container),
+      slotScopeIds
+    );
+    return next;
+  };
+  const locateClosingAnchor = (node, open = "[", close = "]") => {
+    let match = 0;
+    while (node) {
+      node = nextSibling(node);
+      if (node && isComment(node)) {
+        if (node.data === open) match++;
+        if (node.data === close) {
+          if (match === 0) {
+            return nextSibling(node);
+          } else {
+            match--;
+          }
+        }
+      }
+    }
+    return node;
+  };
+  const replaceNode = (newNode, oldNode, parentComponent) => {
+    const parentNode2 = oldNode.parentNode;
+    if (parentNode2) {
+      parentNode2.replaceChild(newNode, oldNode);
+    }
+    let parent = parentComponent;
+    while (parent) {
+      if (parent.vnode.el === oldNode) {
+        parent.vnode.el = parent.subTree.el = newNode;
+      }
+      parent = parent.parent;
+    }
+  };
+  const isTemplateNode = (node) => {
+    return node.nodeType === 1 && node.tagName === "TEMPLATE";
+  };
+  return [hydrate, hydrateNode];
+}
+function propHasMismatch(el, key, clientValue, vnode, instance) {
+  let mismatchType;
+  let mismatchKey;
+  let actual;
+  let expected;
+  if (key === "class") {
+    actual = el.getAttribute("class");
+    expected = normalizeClass(clientValue);
+    if (!isSetEqual(toClassSet(actual || ""), toClassSet(expected))) {
+      mismatchType = 2 /* CLASS */;
+      mismatchKey = `class`;
+    }
+  } else if (key === "style") {
+    actual = el.getAttribute("style") || "";
+    expected = isString(clientValue) ? clientValue : stringifyStyle(normalizeStyle(clientValue));
+    const actualMap = toStyleMap(actual);
+    const expectedMap = toStyleMap(expected);
+    if (vnode.dirs) {
+      for (const { dir, value } of vnode.dirs) {
+        if (dir.name === "show" && !value) {
+          expectedMap.set("display", "none");
+        }
+      }
+    }
+    if (instance) {
+      resolveCssVars(instance, vnode, expectedMap);
+    }
+    if (!isMapEqual(actualMap, expectedMap)) {
+      mismatchType = 3 /* STYLE */;
+      mismatchKey = "style";
+    }
+  } else if (el instanceof SVGElement && isKnownSvgAttr(key) || el instanceof HTMLElement && (isBooleanAttr(key) || isKnownHtmlAttr(key))) {
+    if (isBooleanAttr(key)) {
+      actual = el.hasAttribute(key);
+      expected = includeBooleanAttr(clientValue);
+    } else if (clientValue == null) {
+      actual = el.hasAttribute(key);
+      expected = false;
+    } else {
+      if (el.hasAttribute(key)) {
+        actual = el.getAttribute(key);
+      } else if (key === "value" && el.tagName === "TEXTAREA") {
+        actual = el.value;
+      } else {
+        actual = false;
+      }
+      expected = isRenderableAttrValue(clientValue) ? String(clientValue) : false;
+    }
+    if (actual !== expected) {
+      mismatchType = 4 /* ATTRIBUTE */;
+      mismatchKey = key;
+    }
+  }
+  if (mismatchType != null && !isMismatchAllowed(el, mismatchType)) {
+    const format = (v) => v === false ? `(not rendered)` : `${mismatchKey}="${v}"`;
+    const preSegment = `Hydration ${MismatchTypeString[mismatchType]} mismatch on`;
+    const postSegment = `
+  - rendered on server: ${format(actual)}
+  - expected on client: ${format(expected)}
+  Note: this mismatch is check-only. The DOM will not be rectified in production due to performance overhead.
+  You should fix the source of the mismatch.`;
+    {
+      warn$1(preSegment, el, postSegment);
+    }
+    return true;
+  }
+  return false;
+}
+function toClassSet(str) {
+  return new Set(str.trim().split(/\s+/));
+}
+function isSetEqual(a, b) {
+  if (a.size !== b.size) {
+    return false;
+  }
+  for (const s of a) {
+    if (!b.has(s)) {
+      return false;
+    }
+  }
+  return true;
+}
+function toStyleMap(str) {
+  const styleMap = /* @__PURE__ */ new Map();
+  for (const item of str.split(";")) {
+    let [key, value] = item.split(":");
+    key = key.trim();
+    value = value && value.trim();
+    if (key && value) {
+      styleMap.set(key, value);
+    }
+  }
+  return styleMap;
+}
+function isMapEqual(a, b) {
+  if (a.size !== b.size) {
+    return false;
+  }
+  for (const [key, value] of a) {
+    if (value !== b.get(key)) {
+      return false;
+    }
+  }
+  return true;
+}
+function resolveCssVars(instance, vnode, expectedMap) {
+  const root = instance.subTree;
+  if (instance.getCssVars && (vnode === root || root && root.type === Fragment && root.children.includes(vnode))) {
+    const cssVars = instance.getCssVars();
+    for (const key in cssVars) {
+      expectedMap.set(
+        `--${getEscapedCssVarName(key)}`,
+        String(cssVars[key])
+      );
+    }
+  }
+  if (vnode === root && instance.parent) {
+    resolveCssVars(instance.parent, instance.vnode, expectedMap);
+  }
+}
+const allowMismatchAttr = "data-allow-mismatch";
+const MismatchTypeString = {
+  [0 /* TEXT */]: "text",
+  [1 /* CHILDREN */]: "children",
+  [2 /* CLASS */]: "class",
+  [3 /* STYLE */]: "style",
+  [4 /* ATTRIBUTE */]: "attribute"
+};
+function isMismatchAllowed(el, allowedType) {
+  if (allowedType === 0 /* TEXT */ || allowedType === 1 /* CHILDREN */) {
+    while (el && !el.hasAttribute(allowMismatchAttr)) {
+      el = el.parentElement;
+    }
+  }
+  const allowedAttr = el && el.getAttribute(allowMismatchAttr);
+  if (allowedAttr == null) {
+    return false;
+  } else if (allowedAttr === "") {
+    return true;
+  } else {
+    const list = allowedAttr.split(",");
+    if (allowedType === 0 /* TEXT */ && list.includes("children")) {
+      return true;
+    }
+    return allowedAttr.split(",").includes(MismatchTypeString[allowedType]);
+  }
+}
+
+const requestIdleCallback = getGlobalThis().requestIdleCallback || ((cb) => setTimeout(cb, 1));
+const cancelIdleCallback = getGlobalThis().cancelIdleCallback || ((id) => clearTimeout(id));
+const hydrateOnIdle = (timeout = 1e4) => (hydrate) => {
+  const id = requestIdleCallback(hydrate, { timeout });
+  return () => cancelIdleCallback(id);
+};
+function elementIsVisibleInViewport(el) {
+  const { top, left, bottom, right } = el.getBoundingClientRect();
+  const { innerHeight, innerWidth } = window;
+  return (top > 0 && top < innerHeight || bottom > 0 && bottom < innerHeight) && (left > 0 && left < innerWidth || right > 0 && right < innerWidth);
+}
+const hydrateOnVisible = (opts) => (hydrate, forEach) => {
+  const ob = new IntersectionObserver((entries) => {
+    for (const e of entries) {
+      if (!e.isIntersecting) continue;
+      ob.disconnect();
+      hydrate();
+      break;
+    }
+  }, opts);
+  forEach((el) => {
+    if (!(el instanceof Element)) return;
+    if (elementIsVisibleInViewport(el)) {
+      hydrate();
+      ob.disconnect();
+      return false;
+    }
+    ob.observe(el);
+  });
+  return () => ob.disconnect();
+};
+const hydrateOnMediaQuery = (query) => (hydrate) => {
+  if (query) {
+    const mql = matchMedia(query);
+    if (mql.matches) {
+      hydrate();
+    } else {
+      mql.addEventListener("change", hydrate, { once: true });
+      return () => mql.removeEventListener("change", hydrate);
+    }
+  }
+};
+const hydrateOnInteraction = (interactions = []) => (hydrate, forEach) => {
+  if (isString(interactions)) interactions = [interactions];
+  let hasHydrated = false;
+  const doHydrate = (e) => {
+    if (!hasHydrated) {
+      hasHydrated = true;
+      teardown();
+      hydrate();
+      e.target.dispatchEvent(new e.constructor(e.type, e));
+    }
+  };
+  const teardown = () => {
+    forEach((el) => {
+      for (const i of interactions) {
+        el.removeEventListener(i, doHydrate);
+      }
+    });
+  };
+  forEach((el) => {
+    for (const i of interactions) {
+      el.addEventListener(i, doHydrate, { once: true });
+    }
+  });
+  return teardown;
+};
+function forEachElement(node, cb) {
+  if (isComment(node) && node.data === "[") {
+    let depth = 1;
+    let next = node.nextSibling;
+    while (next) {
+      if (next.nodeType === 1) {
+        const result = cb(next);
+        if (result === false) {
+          break;
+        }
+      } else if (isComment(next)) {
+        if (next.data === "]") {
+          if (--depth === 0) break;
+        } else if (next.data === "[") {
+          depth++;
+        }
+      }
+      next = next.nextSibling;
+    }
+  } else {
+    cb(node);
+  }
+}
+
+const isAsyncWrapper = (i) => !!i.type.__asyncLoader;
+/*! #__NO_SIDE_EFFECTS__ */
+// @__NO_SIDE_EFFECTS__
+function defineAsyncComponent(source) {
+  if (isFunction(source)) {
+    source = { loader: source };
+  }
+  const {
+    loader,
+    loadingComponent,
+    errorComponent,
+    delay = 200,
+    hydrate: hydrateStrategy,
+    timeout,
+    // undefined = never times out
+    suspensible = true,
+    onError: userOnError
+  } = source;
+  let pendingRequest = null;
+  let resolvedComp;
+  let retries = 0;
+  const retry = () => {
+    retries++;
+    pendingRequest = null;
+    return load();
+  };
+  const load = () => {
+    let thisRequest;
+    return pendingRequest || (thisRequest = pendingRequest = loader().catch((err) => {
+      err = err instanceof Error ? err : new Error(String(err));
+      if (userOnError) {
+        return new Promise((resolve, reject) => {
+          const userRetry = () => resolve(retry());
+          const userFail = () => reject(err);
+          userOnError(err, userRetry, userFail, retries + 1);
+        });
+      } else {
+        throw err;
+      }
+    }).then((comp) => {
+      if (thisRequest !== pendingRequest && pendingRequest) {
+        return pendingRequest;
+      }
+      if (!comp) {
+        warn$1(
+          `Async component loader resolved to undefined. If you are using retry(), make sure to return its return value.`
+        );
+      }
+      if (comp && (comp.__esModule || comp[Symbol.toStringTag] === "Module")) {
+        comp = comp.default;
+      }
+      if (comp && !isObject(comp) && !isFunction(comp)) {
+        throw new Error(`Invalid async component load result: ${comp}`);
+      }
+      resolvedComp = comp;
+      return comp;
+    }));
+  };
+  return defineComponent({
+    name: "AsyncComponentWrapper",
+    __asyncLoader: load,
+    __asyncHydrate(el, instance, hydrate) {
+      const doHydrate = hydrateStrategy ? () => {
+        const teardown = hydrateStrategy(
+          hydrate,
+          (cb) => forEachElement(el, cb)
+        );
+        if (teardown) {
+          (instance.bum || (instance.bum = [])).push(teardown);
+        }
+      } : hydrate;
+      if (resolvedComp) {
+        doHydrate();
+      } else {
+        load().then(() => !instance.isUnmounted && doHydrate());
+      }
+    },
+    get __asyncResolved() {
+      return resolvedComp;
+    },
+    setup() {
+      const instance = currentInstance;
+      markAsyncBoundary(instance);
+      if (resolvedComp) {
+        return () => createInnerComp(resolvedComp, instance);
+      }
+      const onError = (err) => {
+        pendingRequest = null;
+        handleError(
+          err,
+          instance,
+          13,
+          !errorComponent
+        );
+      };
+      if (suspensible && instance.suspense || isInSSRComponentSetup) {
+        return load().then((comp) => {
+          return () => createInnerComp(comp, instance);
+        }).catch((err) => {
+          onError(err);
+          return () => errorComponent ? createVNode(errorComponent, {
+            error: err
+          }) : null;
+        });
+      }
+      const loaded = ref(false);
+      const error = ref();
+      const delayed = ref(!!delay);
+      if (delay) {
+        setTimeout(() => {
+          delayed.value = false;
+        }, delay);
+      }
+      if (timeout != null) {
+        setTimeout(() => {
+          if (!loaded.value && !error.value) {
+            const err = new Error(
+              `Async component timed out after ${timeout}ms.`
+            );
+            onError(err);
+            error.value = err;
+          }
+        }, timeout);
+      }
+      load().then(() => {
+        loaded.value = true;
+        if (instance.parent && isKeepAlive(instance.parent.vnode)) {
+          instance.parent.update();
+        }
+      }).catch((err) => {
+        onError(err);
+        error.value = err;
+      });
+      return () => {
+        if (loaded.value && resolvedComp) {
+          return createInnerComp(resolvedComp, instance);
+        } else if (error.value && errorComponent) {
+          return createVNode(errorComponent, {
+            error: error.value
+          });
+        } else if (loadingComponent && !delayed.value) {
+          return createVNode(loadingComponent);
+        }
+      };
+    }
+  });
+}
+function createInnerComp(comp, parent) {
+  const { ref: ref2, props, children, ce } = parent.vnode;
+  const vnode = createVNode(comp, props, children);
+  vnode.ref = ref2;
+  vnode.ce = ce;
+  delete parent.vnode.ce;
+  return vnode;
+}
+
+const isKeepAlive = (vnode) => vnode.type.__isKeepAlive;
+const KeepAliveImpl = {
+  name: `KeepAlive`,
+  // Marker for special handling inside the renderer. We are not using a ===
+  // check directly on KeepAlive in the renderer, because importing it directly
+  // would prevent it from being tree-shaken.
+  __isKeepAlive: true,
+  props: {
+    include: [String, RegExp, Array],
+    exclude: [String, RegExp, Array],
+    max: [String, Number]
+  },
+  setup(props, { slots }) {
+    const instance = getCurrentInstance();
+    const sharedContext = instance.ctx;
+    if (!sharedContext.renderer) {
+      return () => {
+        const children = slots.default && slots.default();
+        return children && children.length === 1 ? children[0] : children;
+      };
+    }
+    const cache = /* @__PURE__ */ new Map();
+    const keys = /* @__PURE__ */ new Set();
+    let current = null;
+    {
+      instance.__v_cache = cache;
+    }
+    const parentSuspense = instance.suspense;
+    const {
+      renderer: {
+        p: patch,
+        m: move,
+        um: _unmount,
+        o: { createElement }
+      }
+    } = sharedContext;
+    const storageContainer = createElement("div");
+    sharedContext.activate = (vnode, container, anchor, namespace, optimized) => {
+      const instance2 = vnode.component;
+      move(vnode, container, anchor, 0, parentSuspense);
+      patch(
+        instance2.vnode,
+        vnode,
+        container,
+        anchor,
+        instance2,
+        parentSuspense,
+        namespace,
+        vnode.slotScopeIds,
+        optimized
+      );
+      queuePostRenderEffect(() => {
+        instance2.isDeactivated = false;
+        if (instance2.a) {
+          invokeArrayFns(instance2.a);
+        }
+        const vnodeHook = vnode.props && vnode.props.onVnodeMounted;
+        if (vnodeHook) {
+          invokeVNodeHook(vnodeHook, instance2.parent, vnode);
+        }
+      }, parentSuspense);
+      {
+        devtoolsComponentAdded(instance2);
+      }
+    };
+    sharedContext.deactivate = (vnode) => {
+      const instance2 = vnode.component;
+      invalidateMount(instance2.m);
+      invalidateMount(instance2.a);
+      move(vnode, storageContainer, null, 1, parentSuspense);
+      queuePostRenderEffect(() => {
+        if (instance2.da) {
+          invokeArrayFns(instance2.da);
+        }
+        const vnodeHook = vnode.props && vnode.props.onVnodeUnmounted;
+        if (vnodeHook) {
+          invokeVNodeHook(vnodeHook, instance2.parent, vnode);
+        }
+        instance2.isDeactivated = true;
+      }, parentSuspense);
+      {
+        devtoolsComponentAdded(instance2);
+      }
+    };
+    function unmount(vnode) {
+      resetShapeFlag(vnode);
+      _unmount(vnode, instance, parentSuspense, true);
+    }
+    function pruneCache(filter) {
+      cache.forEach((vnode, key) => {
+        const name = getComponentName(vnode.type);
+        if (name && !filter(name)) {
+          pruneCacheEntry(key);
+        }
+      });
+    }
+    function pruneCacheEntry(key) {
+      const cached = cache.get(key);
+      if (cached && (!current || !isSameVNodeType(cached, current))) {
+        unmount(cached);
+      } else if (current) {
+        resetShapeFlag(current);
+      }
+      cache.delete(key);
+      keys.delete(key);
+    }
+    watch(
+      () => [props.include, props.exclude],
+      ([include, exclude]) => {
+        include && pruneCache((name) => matches(include, name));
+        exclude && pruneCache((name) => !matches(exclude, name));
+      },
+      // prune post-render after `current` has been updated
+      { flush: "post", deep: true }
+    );
+    let pendingCacheKey = null;
+    const cacheSubtree = () => {
+      if (pendingCacheKey != null) {
+        if (isSuspense(instance.subTree.type)) {
+          queuePostRenderEffect(() => {
+            cache.set(pendingCacheKey, getInnerChild(instance.subTree));
+          }, instance.subTree.suspense);
+        } else {
+          cache.set(pendingCacheKey, getInnerChild(instance.subTree));
+        }
+      }
+    };
+    onMounted(cacheSubtree);
+    onUpdated(cacheSubtree);
+    onBeforeUnmount(() => {
+      cache.forEach((cached) => {
+        const { subTree, suspense } = instance;
+        const vnode = getInnerChild(subTree);
+        if (cached.type === vnode.type && cached.key === vnode.key) {
+          resetShapeFlag(vnode);
+          const da = vnode.component.da;
+          da && queuePostRenderEffect(da, suspense);
+          return;
+        }
+        unmount(cached);
+      });
+    });
+    return () => {
+      pendingCacheKey = null;
+      if (!slots.default) {
+        return current = null;
+      }
+      const children = slots.default();
+      const rawVNode = children[0];
+      if (children.length > 1) {
+        {
+          warn$1(`KeepAlive should contain exactly one component child.`);
+        }
+        current = null;
+        return children;
+      } else if (!isVNode(rawVNode) || !(rawVNode.shapeFlag & 4) && !(rawVNode.shapeFlag & 128)) {
+        current = null;
+        return rawVNode;
+      }
+      let vnode = getInnerChild(rawVNode);
+      if (vnode.type === Comment) {
+        current = null;
+        return vnode;
+      }
+      const comp = vnode.type;
+      const name = getComponentName(
+        isAsyncWrapper(vnode) ? vnode.type.__asyncResolved || {} : comp
+      );
+      const { include, exclude, max } = props;
+      if (include && (!name || !matches(include, name)) || exclude && name && matches(exclude, name)) {
+        vnode.shapeFlag &= ~256;
+        current = vnode;
+        return rawVNode;
+      }
+      const key = vnode.key == null ? comp : vnode.key;
+      const cachedVNode = cache.get(key);
+      if (vnode.el) {
+        vnode = cloneVNode(vnode);
+        if (rawVNode.shapeFlag & 128) {
+          rawVNode.ssContent = vnode;
+        }
+      }
+      pendingCacheKey = key;
+      if (cachedVNode) {
+        vnode.el = cachedVNode.el;
+        vnode.component = cachedVNode.component;
+        if (vnode.transition) {
+          setTransitionHooks(vnode, vnode.transition);
+        }
+        vnode.shapeFlag |= 512;
+        keys.delete(key);
+        keys.add(key);
+      } else {
+        keys.add(key);
+        if (max && keys.size > parseInt(max, 10)) {
+          pruneCacheEntry(keys.values().next().value);
+        }
+      }
+      vnode.shapeFlag |= 256;
+      current = vnode;
+      return isSuspense(rawVNode.type) ? rawVNode : vnode;
+    };
+  }
+};
+const KeepAlive = KeepAliveImpl;
+function matches(pattern, name) {
+  if (isArray(pattern)) {
+    return pattern.some((p) => matches(p, name));
+  } else if (isString(pattern)) {
+    return pattern.split(",").includes(name);
+  } else if (isRegExp(pattern)) {
+    pattern.lastIndex = 0;
+    return pattern.test(name);
+  }
+  return false;
+}
+function onActivated(hook, target) {
+  registerKeepAliveHook(hook, "a", target);
+}
+function onDeactivated(hook, target) {
+  registerKeepAliveHook(hook, "da", target);
+}
+function registerKeepAliveHook(hook, type, target = currentInstance) {
+  const wrappedHook = hook.__wdc || (hook.__wdc = () => {
+    let current = target;
+    while (current) {
+      if (current.isDeactivated) {
+        return;
+      }
+      current = current.parent;
+    }
+    return hook();
+  });
+  injectHook(type, wrappedHook, target);
+  if (target) {
+    let current = target.parent;
+    while (current && current.parent) {
+      if (isKeepAlive(current.parent.vnode)) {
+        injectToKeepAliveRoot(wrappedHook, type, target, current);
+      }
+      current = current.parent;
+    }
+  }
+}
+function injectToKeepAliveRoot(hook, type, target, keepAliveRoot) {
+  const injected = injectHook(
+    type,
+    hook,
+    keepAliveRoot,
+    true
+    /* prepend */
+  );
+  onUnmounted(() => {
+    remove(keepAliveRoot[type], injected);
+  }, target);
+}
+function resetShapeFlag(vnode) {
+  vnode.shapeFlag &= ~256;
+  vnode.shapeFlag &= ~512;
+}
+function getInnerChild(vnode) {
+  return vnode.shapeFlag & 128 ? vnode.ssContent : vnode;
+}
+
+function injectHook(type, hook, target = currentInstance, prepend = false) {
+  if (target) {
+    const hooks = target[type] || (target[type] = []);
+    const wrappedHook = hook.__weh || (hook.__weh = (...args) => {
+      pauseTracking();
+      const reset = setCurrentInstance(target);
+      const res = callWithAsyncErrorHandling(hook, target, type, args);
+      reset();
+      resetTracking();
+      return res;
+    });
+    if (prepend) {
+      hooks.unshift(wrappedHook);
+    } else {
+      hooks.push(wrappedHook);
+    }
+    return wrappedHook;
+  } else {
+    const apiName = toHandlerKey(ErrorTypeStrings$1[type].replace(/ hook$/, ""));
+    warn$1(
+      `${apiName} is called when there is no active component instance to be associated with. Lifecycle injection APIs can only be used during execution of setup().` + (` If you are using async setup(), make sure to register lifecycle hooks before the first await statement.` )
+    );
+  }
+}
+const createHook = (lifecycle) => (hook, target = currentInstance) => {
+  if (!isInSSRComponentSetup || lifecycle === "sp") {
+    injectHook(lifecycle, (...args) => hook(...args), target);
+  }
+};
+const onBeforeMount = createHook("bm");
+const onMounted = createHook("m");
+const onBeforeUpdate = createHook(
+  "bu"
+);
+const onUpdated = createHook("u");
+const onBeforeUnmount = createHook(
+  "bum"
+);
+const onUnmounted = createHook("um");
+const onServerPrefetch = createHook(
+  "sp"
+);
+const onRenderTriggered = createHook("rtg");
+const onRenderTracked = createHook("rtc");
+function onErrorCaptured(hook, target = currentInstance) {
+  injectHook("ec", hook, target);
+}
+
+const COMPONENTS = "components";
+const DIRECTIVES = "directives";
+function resolveComponent(name, maybeSelfReference) {
+  return resolveAsset(COMPONENTS, name, true, maybeSelfReference) || name;
+}
+const NULL_DYNAMIC_COMPONENT = Symbol.for("v-ndc");
+function resolveDynamicComponent(component) {
+  if (isString(component)) {
+    return resolveAsset(COMPONENTS, component, false) || component;
+  } else {
+    return component || NULL_DYNAMIC_COMPONENT;
+  }
+}
+function resolveDirective(name) {
+  return resolveAsset(DIRECTIVES, name);
+}
+function resolveAsset(type, name, warnMissing = true, maybeSelfReference = false) {
+  const instance = currentRenderingInstance || currentInstance;
+  if (instance) {
+    const Component = instance.type;
+    if (type === COMPONENTS) {
+      const selfName = getComponentName(
+        Component,
+        false
+      );
+      if (selfName && (selfName === name || selfName === camelize(name) || selfName === capitalize(camelize(name)))) {
+        return Component;
+      }
+    }
+    const res = (
+      // local registration
+      // check instance[type] first which is resolved for options API
+      resolve(instance[type] || Component[type], name) || // global registration
+      resolve(instance.appContext[type], name)
+    );
+    if (!res && maybeSelfReference) {
+      return Component;
+    }
+    if (warnMissing && !res) {
+      const extra = type === COMPONENTS ? `
+If this is a native custom element, make sure to exclude it from component resolution via compilerOptions.isCustomElement.` : ``;
+      warn$1(`Failed to resolve ${type.slice(0, -1)}: ${name}${extra}`);
+    }
+    return res;
+  } else {
+    warn$1(
+      `resolve${capitalize(type.slice(0, -1))} can only be used in render() or setup().`
+    );
+  }
+}
+function resolve(registry, name) {
+  return registry && (registry[name] || registry[camelize(name)] || registry[capitalize(camelize(name))]);
+}
+
+function renderList(source, renderItem, cache, index) {
+  let ret;
+  const cached = cache && cache[index];
+  const sourceIsArray = isArray(source);
+  if (sourceIsArray || isString(source)) {
+    const sourceIsReactiveArray = sourceIsArray && isReactive(source);
+    let needsWrap = false;
+    if (sourceIsReactiveArray) {
+      needsWrap = !isShallow(source);
+      source = shallowReadArray(source);
+    }
+    ret = new Array(source.length);
+    for (let i = 0, l = source.length; i < l; i++) {
+      ret[i] = renderItem(
+        needsWrap ? toReactive(source[i]) : source[i],
+        i,
+        void 0,
+        cached && cached[i]
+      );
+    }
+  } else if (typeof source === "number") {
+    if (!Number.isInteger(source)) {
+      warn$1(`The v-for range expect an integer value but got ${source}.`);
+    }
+    ret = new Array(source);
+    for (let i = 0; i < source; i++) {
+      ret[i] = renderItem(i + 1, i, void 0, cached && cached[i]);
+    }
+  } else if (isObject(source)) {
+    if (source[Symbol.iterator]) {
+      ret = Array.from(
+        source,
+        (item, i) => renderItem(item, i, void 0, cached && cached[i])
+      );
+    } else {
+      const keys = Object.keys(source);
+      ret = new Array(keys.length);
+      for (let i = 0, l = keys.length; i < l; i++) {
+        const key = keys[i];
+        ret[i] = renderItem(source[key], key, i, cached && cached[i]);
+      }
+    }
+  } else {
+    ret = [];
+  }
+  if (cache) {
+    cache[index] = ret;
+  }
+  return ret;
+}
+
+function createSlots(slots, dynamicSlots) {
+  for (let i = 0; i < dynamicSlots.length; i++) {
+    const slot = dynamicSlots[i];
+    if (isArray(slot)) {
+      for (let j = 0; j < slot.length; j++) {
+        slots[slot[j].name] = slot[j].fn;
+      }
+    } else if (slot) {
+      slots[slot.name] = slot.key ? (...args) => {
+        const res = slot.fn(...args);
+        if (res) res.key = slot.key;
+        return res;
+      } : slot.fn;
+    }
+  }
+  return slots;
+}
+
+function renderSlot(slots, name, props = {}, fallback, noSlotted) {
+  if (currentRenderingInstance.ce || currentRenderingInstance.parent && isAsyncWrapper(currentRenderingInstance.parent) && currentRenderingInstance.parent.ce) {
+    if (name !== "default") props.name = name;
+    return openBlock(), createBlock(
+      Fragment,
+      null,
+      [createVNode("slot", props, fallback && fallback())],
+      64
+    );
+  }
+  let slot = slots[name];
+  if (slot && slot.length > 1) {
+    warn$1(
+      `SSR-optimized slot function detected in a non-SSR-optimized render function. You need to mark this component with $dynamic-slots in the parent template.`
+    );
+    slot = () => [];
+  }
+  if (slot && slot._c) {
+    slot._d = false;
+  }
+  openBlock();
+  const validSlotContent = slot && ensureValidVNode(slot(props));
+  const slotKey = props.key || // slot content array of a dynamic conditional slot may have a branch
+  // key attached in the `createSlots` helper, respect that
+  validSlotContent && validSlotContent.key;
+  const rendered = createBlock(
+    Fragment,
+    {
+      key: (slotKey && !isSymbol(slotKey) ? slotKey : `_${name}`) + // #7256 force differentiate fallback content from actual content
+      (!validSlotContent && fallback ? "_fb" : "")
+    },
+    validSlotContent || (fallback ? fallback() : []),
+    validSlotContent && slots._ === 1 ? 64 : -2
+  );
+  if (!noSlotted && rendered.scopeId) {
+    rendered.slotScopeIds = [rendered.scopeId + "-s"];
+  }
+  if (slot && slot._c) {
+    slot._d = true;
+  }
+  return rendered;
+}
+function ensureValidVNode(vnodes) {
+  return vnodes.some((child) => {
+    if (!isVNode(child)) return true;
+    if (child.type === Comment) return false;
+    if (child.type === Fragment && !ensureValidVNode(child.children))
+      return false;
+    return true;
+  }) ? vnodes : null;
+}
+
+function toHandlers(obj, preserveCaseIfNecessary) {
+  const ret = {};
+  if (!isObject(obj)) {
+    warn$1(`v-on with no argument expects an object value.`);
+    return ret;
+  }
+  for (const key in obj) {
+    ret[preserveCaseIfNecessary && /[A-Z]/.test(key) ? `on:${key}` : toHandlerKey(key)] = obj[key];
+  }
+  return ret;
+}
+
+const getPublicInstance = (i) => {
+  if (!i) return null;
+  if (isStatefulComponent(i)) return getComponentPublicInstance(i);
+  return getPublicInstance(i.parent);
+};
+const publicPropertiesMap = (
+  // Move PURE marker to new line to workaround compiler discarding it
+  // due to type annotation
+  /* @__PURE__ */ extend(/* @__PURE__ */ Object.create(null), {
+    $: (i) => i,
+    $el: (i) => i.vnode.el,
+    $data: (i) => i.data,
+    $props: (i) => shallowReadonly(i.props) ,
+    $attrs: (i) => shallowReadonly(i.attrs) ,
+    $slots: (i) => shallowReadonly(i.slots) ,
+    $refs: (i) => shallowReadonly(i.refs) ,
+    $parent: (i) => getPublicInstance(i.parent),
+    $root: (i) => getPublicInstance(i.root),
+    $host: (i) => i.ce,
+    $emit: (i) => i.emit,
+    $options: (i) => resolveMergedOptions(i) ,
+    $forceUpdate: (i) => i.f || (i.f = () => {
+      queueJob(i.update);
+    }),
+    $nextTick: (i) => i.n || (i.n = nextTick.bind(i.proxy)),
+    $watch: (i) => instanceWatch.bind(i) 
+  })
+);
+const isReservedPrefix = (key) => key === "_" || key === "$";
+const hasSetupBinding = (state, key) => state !== EMPTY_OBJ && !state.__isScriptSetup && hasOwn(state, key);
+const PublicInstanceProxyHandlers = {
+  get({ _: instance }, key) {
+    if (key === "__v_skip") {
+      return true;
+    }
+    const { ctx, setupState, data, props, accessCache, type, appContext } = instance;
+    if (key === "__isVue") {
+      return true;
+    }
+    let normalizedProps;
+    if (key[0] !== "$") {
+      const n = accessCache[key];
+      if (n !== void 0) {
+        switch (n) {
+          case 1 /* SETUP */:
+            return setupState[key];
+          case 2 /* DATA */:
+            return data[key];
+          case 4 /* CONTEXT */:
+            return ctx[key];
+          case 3 /* PROPS */:
+            return props[key];
+        }
+      } else if (hasSetupBinding(setupState, key)) {
+        accessCache[key] = 1 /* SETUP */;
+        return setupState[key];
+      } else if (data !== EMPTY_OBJ && hasOwn(data, key)) {
+        accessCache[key] = 2 /* DATA */;
+        return data[key];
+      } else if (
+        // only cache other properties when instance has declared (thus stable)
+        // props
+        (normalizedProps = instance.propsOptions[0]) && hasOwn(normalizedProps, key)
+      ) {
+        accessCache[key] = 3 /* PROPS */;
+        return props[key];
+      } else if (ctx !== EMPTY_OBJ && hasOwn(ctx, key)) {
+        accessCache[key] = 4 /* CONTEXT */;
+        return ctx[key];
+      } else if (shouldCacheAccess) {
+        accessCache[key] = 0 /* OTHER */;
+      }
+    }
+    const publicGetter = publicPropertiesMap[key];
+    let cssModule, globalProperties;
+    if (publicGetter) {
+      if (key === "$attrs") {
+        track(instance.attrs, "get", "");
+        markAttrsAccessed();
+      } else if (key === "$slots") {
+        track(instance, "get", key);
+      }
+      return publicGetter(instance);
+    } else if (
+      // css module (injected by vue-loader)
+      (cssModule = type.__cssModules) && (cssModule = cssModule[key])
+    ) {
+      return cssModule;
+    } else if (ctx !== EMPTY_OBJ && hasOwn(ctx, key)) {
+      accessCache[key] = 4 /* CONTEXT */;
+      return ctx[key];
+    } else if (
+      // global properties
+      globalProperties = appContext.config.globalProperties, hasOwn(globalProperties, key)
+    ) {
+      {
+        return globalProperties[key];
+      }
+    } else if (currentRenderingInstance && (!isString(key) || // #1091 avoid internal isRef/isVNode checks on component instance leading
+    // to infinite warning loop
+    key.indexOf("__v") !== 0)) {
+      if (data !== EMPTY_OBJ && isReservedPrefix(key[0]) && hasOwn(data, key)) {
+        warn$1(
+          `Property ${JSON.stringify(
+            key
+          )} must be accessed via $data because it starts with a reserved character ("$" or "_") and is not proxied on the render context.`
+        );
+      } else if (instance === currentRenderingInstance) {
+        warn$1(
+          `Property ${JSON.stringify(key)} was accessed during render but is not defined on instance.`
+        );
+      }
+    }
+  },
+  set({ _: instance }, key, value) {
+    const { data, setupState, ctx } = instance;
+    if (hasSetupBinding(setupState, key)) {
+      setupState[key] = value;
+      return true;
+    } else if (setupState.__isScriptSetup && hasOwn(setupState, key)) {
+      warn$1(`Cannot mutate <script setup> binding "${key}" from Options API.`);
+      return false;
+    } else if (data !== EMPTY_OBJ && hasOwn(data, key)) {
+      data[key] = value;
+      return true;
+    } else if (hasOwn(instance.props, key)) {
+      warn$1(`Attempting to mutate prop "${key}". Props are readonly.`);
+      return false;
+    }
+    if (key[0] === "$" && key.slice(1) in instance) {
+      warn$1(
+        `Attempting to mutate public property "${key}". Properties starting with $ are reserved and readonly.`
+      );
+      return false;
+    } else {
+      if (key in instance.appContext.config.globalProperties) {
+        Object.defineProperty(ctx, key, {
+          enumerable: true,
+          configurable: true,
+          value
+        });
+      } else {
+        ctx[key] = value;
+      }
+    }
+    return true;
+  },
+  has({
+    _: { data, setupState, accessCache, ctx, appContext, propsOptions }
+  }, key) {
+    let normalizedProps;
+    return !!accessCache[key] || data !== EMPTY_OBJ && hasOwn(data, key) || hasSetupBinding(setupState, key) || (normalizedProps = propsOptions[0]) && hasOwn(normalizedProps, key) || hasOwn(ctx, key) || hasOwn(publicPropertiesMap, key) || hasOwn(appContext.config.globalProperties, key);
+  },
+  defineProperty(target, key, descriptor) {
+    if (descriptor.get != null) {
+      target._.accessCache[key] = 0;
+    } else if (hasOwn(descriptor, "value")) {
+      this.set(target, key, descriptor.value, null);
+    }
+    return Reflect.defineProperty(target, key, descriptor);
+  }
+};
+{
+  PublicInstanceProxyHandlers.ownKeys = (target) => {
+    warn$1(
+      `Avoid app logic that relies on enumerating keys on a component instance. The keys will be empty in production mode to avoid performance overhead.`
+    );
+    return Reflect.ownKeys(target);
+  };
+}
+const RuntimeCompiledPublicInstanceProxyHandlers = /* @__PURE__ */ extend({}, PublicInstanceProxyHandlers, {
+  get(target, key) {
+    if (key === Symbol.unscopables) {
+      return;
+    }
+    return PublicInstanceProxyHandlers.get(target, key, target);
+  },
+  has(_, key) {
+    const has = key[0] !== "_" && !isGloballyAllowed(key);
+    if (!has && PublicInstanceProxyHandlers.has(_, key)) {
+      warn$1(
+        `Property ${JSON.stringify(
+          key
+        )} should not start with _ which is a reserved prefix for Vue internals.`
+      );
+    }
+    return has;
+  }
+});
+function createDevRenderContext(instance) {
+  const target = {};
+  Object.defineProperty(target, `_`, {
+    configurable: true,
+    enumerable: false,
+    get: () => instance
+  });
+  Object.keys(publicPropertiesMap).forEach((key) => {
+    Object.defineProperty(target, key, {
+      configurable: true,
+      enumerable: false,
+      get: () => publicPropertiesMap[key](instance),
+      // intercepted by the proxy so no need for implementation,
+      // but needed to prevent set errors
+      set: NOOP
+    });
+  });
+  return target;
+}
+function exposePropsOnRenderContext(instance) {
+  const {
+    ctx,
+    propsOptions: [propsOptions]
+  } = instance;
+  if (propsOptions) {
+    Object.keys(propsOptions).forEach((key) => {
+      Object.defineProperty(ctx, key, {
+        enumerable: true,
+        configurable: true,
+        get: () => instance.props[key],
+        set: NOOP
+      });
+    });
+  }
+}
+function exposeSetupStateOnRenderContext(instance) {
+  const { ctx, setupState } = instance;
+  Object.keys(toRaw(setupState)).forEach((key) => {
+    if (!setupState.__isScriptSetup) {
+      if (isReservedPrefix(key[0])) {
+        warn$1(
+          `setup() return property ${JSON.stringify(
+            key
+          )} should not start with "$" or "_" which are reserved prefixes for Vue internals.`
+        );
+        return;
+      }
+      Object.defineProperty(ctx, key, {
+        enumerable: true,
+        configurable: true,
+        get: () => setupState[key],
+        set: NOOP
+      });
+    }
+  });
+}
+
+const warnRuntimeUsage = (method) => warn$1(
+  `${method}() is a compiler-hint helper that is only usable inside <script setup> of a single file component. Its arguments should be compiled away and passing it at runtime has no effect.`
+);
+function defineProps() {
+  {
+    warnRuntimeUsage(`defineProps`);
+  }
+  return null;
+}
+function defineEmits() {
+  {
+    warnRuntimeUsage(`defineEmits`);
+  }
+  return null;
+}
+function defineExpose(exposed) {
+  {
+    warnRuntimeUsage(`defineExpose`);
+  }
+}
+function defineOptions(options) {
+  {
+    warnRuntimeUsage(`defineOptions`);
+  }
+}
+function defineSlots() {
+  {
+    warnRuntimeUsage(`defineSlots`);
+  }
+  return null;
+}
+function defineModel() {
+  {
+    warnRuntimeUsage("defineModel");
+  }
+}
+function withDefaults(props, defaults) {
+  {
+    warnRuntimeUsage(`withDefaults`);
+  }
+  return null;
+}
+function useSlots() {
+  return getContext().slots;
+}
+function useAttrs() {
+  return getContext().attrs;
+}
+function getContext() {
+  const i = getCurrentInstance();
+  if (!i) {
+    warn$1(`useContext() called without active instance.`);
+  }
+  return i.setupContext || (i.setupContext = createSetupContext(i));
+}
+function normalizePropsOrEmits(props) {
+  return isArray(props) ? props.reduce(
+    (normalized, p) => (normalized[p] = null, normalized),
+    {}
+  ) : props;
+}
+function mergeDefaults(raw, defaults) {
+  const props = normalizePropsOrEmits(raw);
+  for (const key in defaults) {
+    if (key.startsWith("__skip")) continue;
+    let opt = props[key];
+    if (opt) {
+      if (isArray(opt) || isFunction(opt)) {
+        opt = props[key] = { type: opt, default: defaults[key] };
+      } else {
+        opt.default = defaults[key];
+      }
+    } else if (opt === null) {
+      opt = props[key] = { default: defaults[key] };
+    } else {
+      warn$1(`props default key "${key}" has no corresponding declaration.`);
+    }
+    if (opt && defaults[`__skip_${key}`]) {
+      opt.skipFactory = true;
+    }
+  }
+  return props;
+}
+function mergeModels(a, b) {
+  if (!a || !b) return a || b;
+  if (isArray(a) && isArray(b)) return a.concat(b);
+  return extend({}, normalizePropsOrEmits(a), normalizePropsOrEmits(b));
+}
+function createPropsRestProxy(props, excludedKeys) {
+  const ret = {};
+  for (const key in props) {
+    if (!excludedKeys.includes(key)) {
+      Object.defineProperty(ret, key, {
+        enumerable: true,
+        get: () => props[key]
+      });
+    }
+  }
+  return ret;
+}
+function withAsyncContext(getAwaitable) {
+  const ctx = getCurrentInstance();
+  if (!ctx) {
+    warn$1(
+      `withAsyncContext called without active current instance. This is likely a bug.`
+    );
+  }
+  let awaitable = getAwaitable();
+  unsetCurrentInstance();
+  if (isPromise(awaitable)) {
+    awaitable = awaitable.catch((e) => {
+      setCurrentInstance(ctx);
+      throw e;
+    });
+  }
+  return [awaitable, () => setCurrentInstance(ctx)];
+}
+
+function createDuplicateChecker() {
+  const cache = /* @__PURE__ */ Object.create(null);
+  return (type, key) => {
+    if (cache[key]) {
+      warn$1(`${type} property "${key}" is already defined in ${cache[key]}.`);
+    } else {
+      cache[key] = type;
+    }
+  };
+}
+let shouldCacheAccess = true;
+function applyOptions(instance) {
+  const options = resolveMergedOptions(instance);
+  const publicThis = instance.proxy;
+  const ctx = instance.ctx;
+  shouldCacheAccess = false;
+  if (options.beforeCreate) {
+    callHook$1(options.beforeCreate, instance, "bc");
+  }
+  const {
+    // state
+    data: dataOptions,
+    computed: computedOptions,
+    methods,
+    watch: watchOptions,
+    provide: provideOptions,
+    inject: injectOptions,
+    // lifecycle
+    created,
+    beforeMount,
+    mounted,
+    beforeUpdate,
+    updated,
+    activated,
+    deactivated,
+    beforeDestroy,
+    beforeUnmount,
+    destroyed,
+    unmounted,
+    render,
+    renderTracked,
+    renderTriggered,
+    errorCaptured,
+    serverPrefetch,
+    // public API
+    expose,
+    inheritAttrs,
+    // assets
+    components,
+    directives,
+    filters
+  } = options;
+  const checkDuplicateProperties = createDuplicateChecker() ;
+  {
+    const [propsOptions] = instance.propsOptions;
+    if (propsOptions) {
+      for (const key in propsOptions) {
+        checkDuplicateProperties("Props" /* PROPS */, key);
+      }
+    }
+  }
+  if (injectOptions) {
+    resolveInjections(injectOptions, ctx, checkDuplicateProperties);
+  }
+  if (methods) {
+    for (const key in methods) {
+      const methodHandler = methods[key];
+      if (isFunction(methodHandler)) {
+        {
+          Object.defineProperty(ctx, key, {
+            value: methodHandler.bind(publicThis),
+            configurable: true,
+            enumerable: true,
+            writable: true
+          });
+        }
+        {
+          checkDuplicateProperties("Methods" /* METHODS */, key);
+        }
+      } else {
+        warn$1(
+          `Method "${key}" has type "${typeof methodHandler}" in the component definition. Did you reference the function correctly?`
+        );
+      }
+    }
+  }
+  if (dataOptions) {
+    if (!isFunction(dataOptions)) {
+      warn$1(
+        `The data option must be a function. Plain object usage is no longer supported.`
+      );
+    }
+    const data = dataOptions.call(publicThis, publicThis);
+    if (isPromise(data)) {
+      warn$1(
+        `data() returned a Promise - note data() cannot be async; If you intend to perform data fetching before component renders, use async setup() + <Suspense>.`
+      );
+    }
+    if (!isObject(data)) {
+      warn$1(`data() should return an object.`);
+    } else {
+      instance.data = reactive(data);
+      {
+        for (const key in data) {
+          checkDuplicateProperties("Data" /* DATA */, key);
+          if (!isReservedPrefix(key[0])) {
+            Object.defineProperty(ctx, key, {
+              configurable: true,
+              enumerable: true,
+              get: () => data[key],
+              set: NOOP
+            });
+          }
+        }
+      }
+    }
+  }
+  shouldCacheAccess = true;
+  if (computedOptions) {
+    for (const key in computedOptions) {
+      const opt = computedOptions[key];
+      const get = isFunction(opt) ? opt.bind(publicThis, publicThis) : isFunction(opt.get) ? opt.get.bind(publicThis, publicThis) : NOOP;
+      if (get === NOOP) {
+        warn$1(`Computed property "${key}" has no getter.`);
+      }
+      const set = !isFunction(opt) && isFunction(opt.set) ? opt.set.bind(publicThis) : () => {
+        warn$1(
+          `Write operation failed: computed property "${key}" is readonly.`
+        );
+      } ;
+      const c = computed({
+        get,
+        set
+      });
+      Object.defineProperty(ctx, key, {
+        enumerable: true,
+        configurable: true,
+        get: () => c.value,
+        set: (v) => c.value = v
+      });
+      {
+        checkDuplicateProperties("Computed" /* COMPUTED */, key);
+      }
+    }
+  }
+  if (watchOptions) {
+    for (const key in watchOptions) {
+      createWatcher(watchOptions[key], ctx, publicThis, key);
+    }
+  }
+  if (provideOptions) {
+    const provides = isFunction(provideOptions) ? provideOptions.call(publicThis) : provideOptions;
+    Reflect.ownKeys(provides).forEach((key) => {
+      provide(key, provides[key]);
+    });
+  }
+  if (created) {
+    callHook$1(created, instance, "c");
+  }
+  function registerLifecycleHook(register, hook) {
+    if (isArray(hook)) {
+      hook.forEach((_hook) => register(_hook.bind(publicThis)));
+    } else if (hook) {
+      register(hook.bind(publicThis));
+    }
+  }
+  registerLifecycleHook(onBeforeMount, beforeMount);
+  registerLifecycleHook(onMounted, mounted);
+  registerLifecycleHook(onBeforeUpdate, beforeUpdate);
+  registerLifecycleHook(onUpdated, updated);
+  registerLifecycleHook(onActivated, activated);
+  registerLifecycleHook(onDeactivated, deactivated);
+  registerLifecycleHook(onErrorCaptured, errorCaptured);
+  registerLifecycleHook(onRenderTracked, renderTracked);
+  registerLifecycleHook(onRenderTriggered, renderTriggered);
+  registerLifecycleHook(onBeforeUnmount, beforeUnmount);
+  registerLifecycleHook(onUnmounted, unmounted);
+  registerLifecycleHook(onServerPrefetch, serverPrefetch);
+  if (isArray(expose)) {
+    if (expose.length) {
+      const exposed = instance.exposed || (instance.exposed = {});
+      expose.forEach((key) => {
+        Object.defineProperty(exposed, key, {
+          get: () => publicThis[key],
+          set: (val) => publicThis[key] = val
+        });
+      });
+    } else if (!instance.exposed) {
+      instance.exposed = {};
+    }
+  }
+  if (render && instance.render === NOOP) {
+    instance.render = render;
+  }
+  if (inheritAttrs != null) {
+    instance.inheritAttrs = inheritAttrs;
+  }
+  if (components) instance.components = components;
+  if (directives) instance.directives = directives;
+  if (serverPrefetch) {
+    markAsyncBoundary(instance);
+  }
+}
+function resolveInjections(injectOptions, ctx, checkDuplicateProperties = NOOP) {
+  if (isArray(injectOptions)) {
+    injectOptions = normalizeInject(injectOptions);
+  }
+  for (const key in injectOptions) {
+    const opt = injectOptions[key];
+    let injected;
+    if (isObject(opt)) {
+      if ("default" in opt) {
+        injected = inject(
+          opt.from || key,
+          opt.default,
+          true
+        );
+      } else {
+        injected = inject(opt.from || key);
+      }
+    } else {
+      injected = inject(opt);
+    }
+    if (isRef(injected)) {
+      Object.defineProperty(ctx, key, {
+        enumerable: true,
+        configurable: true,
+        get: () => injected.value,
+        set: (v) => injected.value = v
+      });
+    } else {
+      ctx[key] = injected;
+    }
+    {
+      checkDuplicateProperties("Inject" /* INJECT */, key);
+    }
+  }
+}
+function callHook$1(hook, instance, type) {
+  callWithAsyncErrorHandling(
+    isArray(hook) ? hook.map((h) => h.bind(instance.proxy)) : hook.bind(instance.proxy),
+    instance,
+    type
+  );
+}
+function createWatcher(raw, ctx, publicThis, key) {
+  let getter = key.includes(".") ? createPathGetter(publicThis, key) : () => publicThis[key];
+  if (isString(raw)) {
+    const handler = ctx[raw];
+    if (isFunction(handler)) {
+      {
+        watch(getter, handler);
+      }
+    } else {
+      warn$1(`Invalid watch handler specified by key "${raw}"`, handler);
+    }
+  } else if (isFunction(raw)) {
+    {
+      watch(getter, raw.bind(publicThis));
+    }
+  } else if (isObject(raw)) {
+    if (isArray(raw)) {
+      raw.forEach((r) => createWatcher(r, ctx, publicThis, key));
+    } else {
+      const handler = isFunction(raw.handler) ? raw.handler.bind(publicThis) : ctx[raw.handler];
+      if (isFunction(handler)) {
+        watch(getter, handler, raw);
+      } else {
+        warn$1(`Invalid watch handler specified by key "${raw.handler}"`, handler);
+      }
+    }
+  } else {
+    warn$1(`Invalid watch option: "${key}"`, raw);
+  }
+}
+function resolveMergedOptions(instance) {
+  const base = instance.type;
+  const { mixins, extends: extendsOptions } = base;
+  const {
+    mixins: globalMixins,
+    optionsCache: cache,
+    config: { optionMergeStrategies }
+  } = instance.appContext;
+  const cached = cache.get(base);
+  let resolved;
+  if (cached) {
+    resolved = cached;
+  } else if (!globalMixins.length && !mixins && !extendsOptions) {
+    {
+      resolved = base;
+    }
+  } else {
+    resolved = {};
+    if (globalMixins.length) {
+      globalMixins.forEach(
+        (m) => mergeOptions(resolved, m, optionMergeStrategies, true)
+      );
+    }
+    mergeOptions(resolved, base, optionMergeStrategies);
+  }
+  if (isObject(base)) {
+    cache.set(base, resolved);
+  }
+  return resolved;
+}
+function mergeOptions(to, from, strats, asMixin = false) {
+  const { mixins, extends: extendsOptions } = from;
+  if (extendsOptions) {
+    mergeOptions(to, extendsOptions, strats, true);
+  }
+  if (mixins) {
+    mixins.forEach(
+      (m) => mergeOptions(to, m, strats, true)
+    );
+  }
+  for (const key in from) {
+    if (asMixin && key === "expose") {
+      warn$1(
+        `"expose" option is ignored when declared in mixins or extends. It should only be declared in the base component itself.`
+      );
+    } else {
+      const strat = internalOptionMergeStrats[key] || strats && strats[key];
+      to[key] = strat ? strat(to[key], from[key]) : from[key];
+    }
+  }
+  return to;
+}
+const internalOptionMergeStrats = {
+  data: mergeDataFn,
+  props: mergeEmitsOrPropsOptions,
+  emits: mergeEmitsOrPropsOptions,
+  // objects
+  methods: mergeObjectOptions,
+  computed: mergeObjectOptions,
+  // lifecycle
+  beforeCreate: mergeAsArray$1,
+  created: mergeAsArray$1,
+  beforeMount: mergeAsArray$1,
+  mounted: mergeAsArray$1,
+  beforeUpdate: mergeAsArray$1,
+  updated: mergeAsArray$1,
+  beforeDestroy: mergeAsArray$1,
+  beforeUnmount: mergeAsArray$1,
+  destroyed: mergeAsArray$1,
+  unmounted: mergeAsArray$1,
+  activated: mergeAsArray$1,
+  deactivated: mergeAsArray$1,
+  errorCaptured: mergeAsArray$1,
+  serverPrefetch: mergeAsArray$1,
+  // assets
+  components: mergeObjectOptions,
+  directives: mergeObjectOptions,
+  // watch
+  watch: mergeWatchOptions,
+  // provide / inject
+  provide: mergeDataFn,
+  inject: mergeInject
+};
+function mergeDataFn(to, from) {
+  if (!from) {
+    return to;
+  }
+  if (!to) {
+    return from;
+  }
+  return function mergedDataFn() {
+    return (extend)(
+      isFunction(to) ? to.call(this, this) : to,
+      isFunction(from) ? from.call(this, this) : from
+    );
+  };
+}
+function mergeInject(to, from) {
+  return mergeObjectOptions(normalizeInject(to), normalizeInject(from));
+}
+function normalizeInject(raw) {
+  if (isArray(raw)) {
+    const res = {};
+    for (let i = 0; i < raw.length; i++) {
+      res[raw[i]] = raw[i];
+    }
+    return res;
+  }
+  return raw;
+}
+function mergeAsArray$1(to, from) {
+  return to ? [...new Set([].concat(to, from))] : from;
+}
+function mergeObjectOptions(to, from) {
+  return to ? extend(/* @__PURE__ */ Object.create(null), to, from) : from;
+}
+function mergeEmitsOrPropsOptions(to, from) {
+  if (to) {
+    if (isArray(to) && isArray(from)) {
+      return [.../* @__PURE__ */ new Set([...to, ...from])];
+    }
+    return extend(
+      /* @__PURE__ */ Object.create(null),
+      normalizePropsOrEmits(to),
+      normalizePropsOrEmits(from != null ? from : {})
+    );
+  } else {
+    return from;
+  }
+}
+function mergeWatchOptions(to, from) {
+  if (!to) return from;
+  if (!from) return to;
+  const merged = extend(/* @__PURE__ */ Object.create(null), to);
+  for (const key in from) {
+    merged[key] = mergeAsArray$1(to[key], from[key]);
+  }
+  return merged;
+}
+
+function createAppContext() {
+  return {
+    app: null,
+    config: {
+      isNativeTag: NO,
+      performance: false,
+      globalProperties: {},
+      optionMergeStrategies: {},
+      errorHandler: void 0,
+      warnHandler: void 0,
+      compilerOptions: {}
+    },
+    mixins: [],
+    components: {},
+    directives: {},
+    provides: /* @__PURE__ */ Object.create(null),
+    optionsCache: /* @__PURE__ */ new WeakMap(),
+    propsCache: /* @__PURE__ */ new WeakMap(),
+    emitsCache: /* @__PURE__ */ new WeakMap()
+  };
+}
+let uid$1 = 0;
+function createAppAPI(render, hydrate) {
+  return function createApp(rootComponent, rootProps = null) {
+    if (!isFunction(rootComponent)) {
+      rootComponent = extend({}, rootComponent);
+    }
+    if (rootProps != null && !isObject(rootProps)) {
+      warn$1(`root props passed to app.mount() must be an object.`);
+      rootProps = null;
+    }
+    const context = createAppContext();
+    const installedPlugins = /* @__PURE__ */ new WeakSet();
+    const pluginCleanupFns = [];
+    let isMounted = false;
+    const app = context.app = {
+      _uid: uid$1++,
+      _component: rootComponent,
+      _props: rootProps,
+      _container: null,
+      _context: context,
+      _instance: null,
+      version,
+      get config() {
+        return context.config;
+      },
+      set config(v) {
+        {
+          warn$1(
+            `app.config cannot be replaced. Modify individual options instead.`
+          );
+        }
+      },
+      use(plugin, ...options) {
+        if (installedPlugins.has(plugin)) {
+          warn$1(`Plugin has already been applied to target app.`);
+        } else if (plugin && isFunction(plugin.install)) {
+          installedPlugins.add(plugin);
+          plugin.install(app, ...options);
+        } else if (isFunction(plugin)) {
+          installedPlugins.add(plugin);
+          plugin(app, ...options);
+        } else {
+          warn$1(
+            `A plugin must either be a function or an object with an "install" function.`
+          );
+        }
+        return app;
+      },
+      mixin(mixin) {
+        {
+          if (!context.mixins.includes(mixin)) {
+            context.mixins.push(mixin);
+          } else {
+            warn$1(
+              "Mixin has already been applied to target app" + (mixin.name ? `: ${mixin.name}` : "")
+            );
+          }
+        }
+        return app;
+      },
+      component(name, component) {
+        {
+          validateComponentName(name, context.config);
+        }
+        if (!component) {
+          return context.components[name];
+        }
+        if (context.components[name]) {
+          warn$1(`Component "${name}" has already been registered in target app.`);
+        }
+        context.components[name] = component;
+        return app;
+      },
+      directive(name, directive) {
+        {
+          validateDirectiveName(name);
+        }
+        if (!directive) {
+          return context.directives[name];
+        }
+        if (context.directives[name]) {
+          warn$1(`Directive "${name}" has already been registered in target app.`);
+        }
+        context.directives[name] = directive;
+        return app;
+      },
+      mount(rootContainer, isHydrate, namespace) {
+        if (!isMounted) {
+          if (rootContainer.__vue_app__) {
+            warn$1(
+              `There is already an app instance mounted on the host container.
+ If you want to mount another app on the same host container, you need to unmount the previous app by calling \`app.unmount()\` first.`
+            );
+          }
+          const vnode = app._ceVNode || createVNode(rootComponent, rootProps);
+          vnode.appContext = context;
+          if (namespace === true) {
+            namespace = "svg";
+          } else if (namespace === false) {
+            namespace = void 0;
+          }
+          {
+            context.reload = () => {
+              render(
+                cloneVNode(vnode),
+                rootContainer,
+                namespace
+              );
+            };
+          }
+          if (isHydrate && hydrate) {
+            hydrate(vnode, rootContainer);
+          } else {
+            render(vnode, rootContainer, namespace);
+          }
+          isMounted = true;
+          app._container = rootContainer;
+          rootContainer.__vue_app__ = app;
+          {
+            app._instance = vnode.component;
+            devtoolsInitApp(app, version);
+          }
+          return getComponentPublicInstance(vnode.component);
+        } else {
+          warn$1(
+            `App has already been mounted.
+If you want to remount the same app, move your app creation logic into a factory function and create fresh app instances for each mount - e.g. \`const createMyApp = () => createApp(App)\``
+          );
+        }
+      },
+      onUnmount(cleanupFn) {
+        if (typeof cleanupFn !== "function") {
+          warn$1(
+            `Expected function as first argument to app.onUnmount(), but got ${typeof cleanupFn}`
+          );
+        }
+        pluginCleanupFns.push(cleanupFn);
+      },
+      unmount() {
+        if (isMounted) {
+          callWithAsyncErrorHandling(
+            pluginCleanupFns,
+            app._instance,
+            16
+          );
+          render(null, app._container);
+          {
+            app._instance = null;
+            devtoolsUnmountApp(app);
+          }
+          delete app._container.__vue_app__;
+        } else {
+          warn$1(`Cannot unmount an app that is not mounted.`);
+        }
+      },
+      provide(key, value) {
+        if (key in context.provides) {
+          warn$1(
+            `App already provides property with key "${String(key)}". It will be overwritten with the new value.`
+          );
+        }
+        context.provides[key] = value;
+        return app;
+      },
+      runWithContext(fn) {
+        const lastApp = currentApp;
+        currentApp = app;
+        try {
+          return fn();
+        } finally {
+          currentApp = lastApp;
+        }
+      }
+    };
+    return app;
+  };
+}
+let currentApp = null;
+
+function provide(key, value) {
+  if (!currentInstance) {
+    {
+      warn$1(`provide() can only be used inside setup().`);
+    }
+  } else {
+    let provides = currentInstance.provides;
+    const parentProvides = currentInstance.parent && currentInstance.parent.provides;
+    if (parentProvides === provides) {
+      provides = currentInstance.provides = Object.create(parentProvides);
+    }
+    provides[key] = value;
+  }
+}
+function inject(key, defaultValue, treatDefaultAsFactory = false) {
+  const instance = currentInstance || currentRenderingInstance;
+  if (instance || currentApp) {
+    const provides = currentApp ? currentApp._context.provides : instance ? instance.parent == null ? instance.vnode.appContext && instance.vnode.appContext.provides : instance.parent.provides : void 0;
+    if (provides && key in provides) {
+      return provides[key];
+    } else if (arguments.length > 1) {
+      return treatDefaultAsFactory && isFunction(defaultValue) ? defaultValue.call(instance && instance.proxy) : defaultValue;
+    } else {
+      warn$1(`injection "${String(key)}" not found.`);
+    }
+  } else {
+    warn$1(`inject() can only be used inside setup() or functional components.`);
+  }
+}
+function hasInjectionContext() {
+  return !!(currentInstance || currentRenderingInstance || currentApp);
+}
+
+const internalObjectProto = {};
+const createInternalObject = () => Object.create(internalObjectProto);
+const isInternalObject = (obj) => Object.getPrototypeOf(obj) === internalObjectProto;
+
+function initProps(instance, rawProps, isStateful, isSSR = false) {
+  const props = {};
+  const attrs = createInternalObject();
+  instance.propsDefaults = /* @__PURE__ */ Object.create(null);
+  setFullProps(instance, rawProps, props, attrs);
+  for (const key in instance.propsOptions[0]) {
+    if (!(key in props)) {
+      props[key] = void 0;
+    }
+  }
+  {
+    validateProps(rawProps || {}, props, instance);
+  }
+  if (isStateful) {
+    instance.props = isSSR ? props : shallowReactive(props);
+  } else {
+    if (!instance.type.props) {
+      instance.props = attrs;
+    } else {
+      instance.props = props;
+    }
+  }
+  instance.attrs = attrs;
+}
+function isInHmrContext(instance) {
+  while (instance) {
+    if (instance.type.__hmrId) return true;
+    instance = instance.parent;
+  }
+}
+function updateProps(instance, rawProps, rawPrevProps, optimized) {
+  const {
+    props,
+    attrs,
+    vnode: { patchFlag }
+  } = instance;
+  const rawCurrentProps = toRaw(props);
+  const [options] = instance.propsOptions;
+  let hasAttrsChanged = false;
+  if (
+    // always force full diff in dev
+    // - #1942 if hmr is enabled with sfc component
+    // - vite#872 non-sfc component used by sfc component
+    !isInHmrContext(instance) && (optimized || patchFlag > 0) && !(patchFlag & 16)
+  ) {
+    if (patchFlag & 8) {
+      const propsToUpdate = instance.vnode.dynamicProps;
+      for (let i = 0; i < propsToUpdate.length; i++) {
+        let key = propsToUpdate[i];
+        if (isEmitListener(instance.emitsOptions, key)) {
+          continue;
+        }
+        const value = rawProps[key];
+        if (options) {
+          if (hasOwn(attrs, key)) {
+            if (value !== attrs[key]) {
+              attrs[key] = value;
+              hasAttrsChanged = true;
+            }
+          } else {
+            const camelizedKey = camelize(key);
+            props[camelizedKey] = resolvePropValue(
+              options,
+              rawCurrentProps,
+              camelizedKey,
+              value,
+              instance,
+              false
+            );
+          }
+        } else {
+          if (value !== attrs[key]) {
+            attrs[key] = value;
+            hasAttrsChanged = true;
+          }
+        }
+      }
+    }
+  } else {
+    if (setFullProps(instance, rawProps, props, attrs)) {
+      hasAttrsChanged = true;
+    }
+    let kebabKey;
+    for (const key in rawCurrentProps) {
+      if (!rawProps || // for camelCase
+      !hasOwn(rawProps, key) && // it's possible the original props was passed in as kebab-case
+      // and converted to camelCase (#955)
+      ((kebabKey = hyphenate(key)) === key || !hasOwn(rawProps, kebabKey))) {
+        if (options) {
+          if (rawPrevProps && // for camelCase
+          (rawPrevProps[key] !== void 0 || // for kebab-case
+          rawPrevProps[kebabKey] !== void 0)) {
+            props[key] = resolvePropValue(
+              options,
+              rawCurrentProps,
+              key,
+              void 0,
+              instance,
+              true
+            );
+          }
+        } else {
+          delete props[key];
+        }
+      }
+    }
+    if (attrs !== rawCurrentProps) {
+      for (const key in attrs) {
+        if (!rawProps || !hasOwn(rawProps, key) && true) {
+          delete attrs[key];
+          hasAttrsChanged = true;
+        }
+      }
+    }
+  }
+  if (hasAttrsChanged) {
+    trigger(instance.attrs, "set", "");
+  }
+  {
+    validateProps(rawProps || {}, props, instance);
+  }
+}
+function setFullProps(instance, rawProps, props, attrs) {
+  const [options, needCastKeys] = instance.propsOptions;
+  let hasAttrsChanged = false;
+  let rawCastValues;
+  if (rawProps) {
+    for (let key in rawProps) {
+      if (isReservedProp(key)) {
+        continue;
+      }
+      const value = rawProps[key];
+      let camelKey;
+      if (options && hasOwn(options, camelKey = camelize(key))) {
+        if (!needCastKeys || !needCastKeys.includes(camelKey)) {
+          props[camelKey] = value;
+        } else {
+          (rawCastValues || (rawCastValues = {}))[camelKey] = value;
+        }
+      } else if (!isEmitListener(instance.emitsOptions, key)) {
+        if (!(key in attrs) || value !== attrs[key]) {
+          attrs[key] = value;
+          hasAttrsChanged = true;
+        }
+      }
+    }
+  }
+  if (needCastKeys) {
+    const rawCurrentProps = toRaw(props);
+    const castValues = rawCastValues || EMPTY_OBJ;
+    for (let i = 0; i < needCastKeys.length; i++) {
+      const key = needCastKeys[i];
+      props[key] = resolvePropValue(
+        options,
+        rawCurrentProps,
+        key,
+        castValues[key],
+        instance,
+        !hasOwn(castValues, key)
+      );
+    }
+  }
+  return hasAttrsChanged;
+}
+function resolvePropValue(options, props, key, value, instance, isAbsent) {
+  const opt = options[key];
+  if (opt != null) {
+    const hasDefault = hasOwn(opt, "default");
+    if (hasDefault && value === void 0) {
+      const defaultValue = opt.default;
+      if (opt.type !== Function && !opt.skipFactory && isFunction(defaultValue)) {
+        const { propsDefaults } = instance;
+        if (key in propsDefaults) {
+          value = propsDefaults[key];
+        } else {
+          const reset = setCurrentInstance(instance);
+          value = propsDefaults[key] = defaultValue.call(
+            null,
+            props
+          );
+          reset();
+        }
+      } else {
+        value = defaultValue;
+      }
+      if (instance.ce) {
+        instance.ce._setProp(key, value);
+      }
+    }
+    if (opt[0 /* shouldCast */]) {
+      if (isAbsent && !hasDefault) {
+        value = false;
+      } else if (opt[1 /* shouldCastTrue */] && (value === "" || value === hyphenate(key))) {
+        value = true;
+      }
+    }
+  }
+  return value;
+}
+const mixinPropsCache = /* @__PURE__ */ new WeakMap();
+function normalizePropsOptions(comp, appContext, asMixin = false) {
+  const cache = asMixin ? mixinPropsCache : appContext.propsCache;
+  const cached = cache.get(comp);
+  if (cached) {
+    return cached;
+  }
+  const raw = comp.props;
+  const normalized = {};
+  const needCastKeys = [];
+  let hasExtends = false;
+  if (!isFunction(comp)) {
+    const extendProps = (raw2) => {
+      hasExtends = true;
+      const [props, keys] = normalizePropsOptions(raw2, appContext, true);
+      extend(normalized, props);
+      if (keys) needCastKeys.push(...keys);
+    };
+    if (!asMixin && appContext.mixins.length) {
+      appContext.mixins.forEach(extendProps);
+    }
+    if (comp.extends) {
+      extendProps(comp.extends);
+    }
+    if (comp.mixins) {
+      comp.mixins.forEach(extendProps);
+    }
+  }
+  if (!raw && !hasExtends) {
+    if (isObject(comp)) {
+      cache.set(comp, EMPTY_ARR);
+    }
+    return EMPTY_ARR;
+  }
+  if (isArray(raw)) {
+    for (let i = 0; i < raw.length; i++) {
+      if (!isString(raw[i])) {
+        warn$1(`props must be strings when using array syntax.`, raw[i]);
+      }
+      const normalizedKey = camelize(raw[i]);
+      if (validatePropName(normalizedKey)) {
+        normalized[normalizedKey] = EMPTY_OBJ;
+      }
+    }
+  } else if (raw) {
+    if (!isObject(raw)) {
+      warn$1(`invalid props options`, raw);
+    }
+    for (const key in raw) {
+      const normalizedKey = camelize(key);
+      if (validatePropName(normalizedKey)) {
+        const opt = raw[key];
+        const prop = normalized[normalizedKey] = isArray(opt) || isFunction(opt) ? { type: opt } : extend({}, opt);
+        const propType = prop.type;
+        let shouldCast = false;
+        let shouldCastTrue = true;
+        if (isArray(propType)) {
+          for (let index = 0; index < propType.length; ++index) {
+            const type = propType[index];
+            const typeName = isFunction(type) && type.name;
+            if (typeName === "Boolean") {
+              shouldCast = true;
+              break;
+            } else if (typeName === "String") {
+              shouldCastTrue = false;
+            }
+          }
+        } else {
+          shouldCast = isFunction(propType) && propType.name === "Boolean";
+        }
+        prop[0 /* shouldCast */] = shouldCast;
+        prop[1 /* shouldCastTrue */] = shouldCastTrue;
+        if (shouldCast || hasOwn(prop, "default")) {
+          needCastKeys.push(normalizedKey);
+        }
+      }
+    }
+  }
+  const res = [normalized, needCastKeys];
+  if (isObject(comp)) {
+    cache.set(comp, res);
+  }
+  return res;
+}
+function validatePropName(key) {
+  if (key[0] !== "$" && !isReservedProp(key)) {
+    return true;
+  } else {
+    warn$1(`Invalid prop name: "${key}" is a reserved property.`);
+  }
+  return false;
+}
+function getType(ctor) {
+  if (ctor === null) {
+    return "null";
+  }
+  if (typeof ctor === "function") {
+    return ctor.name || "";
+  } else if (typeof ctor === "object") {
+    const name = ctor.constructor && ctor.constructor.name;
+    return name || "";
+  }
+  return "";
+}
+function validateProps(rawProps, props, instance) {
+  const resolvedValues = toRaw(props);
+  const options = instance.propsOptions[0];
+  const camelizePropsKey = Object.keys(rawProps).map((key) => camelize(key));
+  for (const key in options) {
+    let opt = options[key];
+    if (opt == null) continue;
+    validateProp(
+      key,
+      resolvedValues[key],
+      opt,
+      shallowReadonly(resolvedValues) ,
+      !camelizePropsKey.includes(key)
+    );
+  }
+}
+function validateProp(name, value, prop, props, isAbsent) {
+  const { type, required, validator, skipCheck } = prop;
+  if (required && isAbsent) {
+    warn$1('Missing required prop: "' + name + '"');
+    return;
+  }
+  if (value == null && !required) {
+    return;
+  }
+  if (type != null && type !== true && !skipCheck) {
+    let isValid = false;
+    const types = isArray(type) ? type : [type];
+    const expectedTypes = [];
+    for (let i = 0; i < types.length && !isValid; i++) {
+      const { valid, expectedType } = assertType(value, types[i]);
+      expectedTypes.push(expectedType || "");
+      isValid = valid;
+    }
+    if (!isValid) {
+      warn$1(getInvalidTypeMessage(name, value, expectedTypes));
+      return;
+    }
+  }
+  if (validator && !validator(value, props)) {
+    warn$1('Invalid prop: custom validator check failed for prop "' + name + '".');
+  }
+}
+const isSimpleType = /* @__PURE__ */ makeMap(
+  "String,Number,Boolean,Function,Symbol,BigInt"
+);
+function assertType(value, type) {
+  let valid;
+  const expectedType = getType(type);
+  if (expectedType === "null") {
+    valid = value === null;
+  } else if (isSimpleType(expectedType)) {
+    const t = typeof value;
+    valid = t === expectedType.toLowerCase();
+    if (!valid && t === "object") {
+      valid = value instanceof type;
+    }
+  } else if (expectedType === "Object") {
+    valid = isObject(value);
+  } else if (expectedType === "Array") {
+    valid = isArray(value);
+  } else {
+    valid = value instanceof type;
+  }
+  return {
+    valid,
+    expectedType
+  };
+}
+function getInvalidTypeMessage(name, value, expectedTypes) {
+  if (expectedTypes.length === 0) {
+    return `Prop type [] for prop "${name}" won't match anything. Did you mean to use type Array instead?`;
+  }
+  let message = `Invalid prop: type check failed for prop "${name}". Expected ${expectedTypes.map(capitalize).join(" | ")}`;
+  const expectedType = expectedTypes[0];
+  const receivedType = toRawType(value);
+  const expectedValue = styleValue(value, expectedType);
+  const receivedValue = styleValue(value, receivedType);
+  if (expectedTypes.length === 1 && isExplicable(expectedType) && !isBoolean(expectedType, receivedType)) {
+    message += ` with value ${expectedValue}`;
+  }
+  message += `, got ${receivedType} `;
+  if (isExplicable(receivedType)) {
+    message += `with value ${receivedValue}.`;
+  }
+  return message;
+}
+function styleValue(value, type) {
+  if (type === "String") {
+    return `"${value}"`;
+  } else if (type === "Number") {
+    return `${Number(value)}`;
+  } else {
+    return `${value}`;
+  }
+}
+function isExplicable(type) {
+  const explicitTypes = ["string", "number", "boolean"];
+  return explicitTypes.some((elem) => type.toLowerCase() === elem);
+}
+function isBoolean(...args) {
+  return args.some((elem) => elem.toLowerCase() === "boolean");
+}
+
+const isInternalKey = (key) => key[0] === "_" || key === "$stable";
+const normalizeSlotValue = (value) => isArray(value) ? value.map(normalizeVNode) : [normalizeVNode(value)];
+const normalizeSlot = (key, rawSlot, ctx) => {
+  if (rawSlot._n) {
+    return rawSlot;
+  }
+  const normalized = withCtx((...args) => {
+    if (currentInstance && (!ctx || ctx.root === currentInstance.root)) {
+      warn$1(
+        `Slot "${key}" invoked outside of the render function: this will not track dependencies used in the slot. Invoke the slot function inside the render function instead.`
+      );
+    }
+    return normalizeSlotValue(rawSlot(...args));
+  }, ctx);
+  normalized._c = false;
+  return normalized;
+};
+const normalizeObjectSlots = (rawSlots, slots, instance) => {
+  const ctx = rawSlots._ctx;
+  for (const key in rawSlots) {
+    if (isInternalKey(key)) continue;
+    const value = rawSlots[key];
+    if (isFunction(value)) {
+      slots[key] = normalizeSlot(key, value, ctx);
+    } else if (value != null) {
+      {
+        warn$1(
+          `Non-function value encountered for slot "${key}". Prefer function slots for better performance.`
+        );
+      }
+      const normalized = normalizeSlotValue(value);
+      slots[key] = () => normalized;
+    }
+  }
+};
+const normalizeVNodeSlots = (instance, children) => {
+  if (!isKeepAlive(instance.vnode) && true) {
+    warn$1(
+      `Non-function value encountered for default slot. Prefer function slots for better performance.`
+    );
+  }
+  const normalized = normalizeSlotValue(children);
+  instance.slots.default = () => normalized;
+};
+const assignSlots = (slots, children, optimized) => {
+  for (const key in children) {
+    if (optimized || key !== "_") {
+      slots[key] = children[key];
+    }
+  }
+};
+const initSlots = (instance, children, optimized) => {
+  const slots = instance.slots = createInternalObject();
+  if (instance.vnode.shapeFlag & 32) {
+    const type = children._;
+    if (type) {
+      assignSlots(slots, children, optimized);
+      if (optimized) {
+        def(slots, "_", type, true);
+      }
+    } else {
+      normalizeObjectSlots(children, slots);
+    }
+  } else if (children) {
+    normalizeVNodeSlots(instance, children);
+  }
+};
+const updateSlots = (instance, children, optimized) => {
+  const { vnode, slots } = instance;
+  let needDeletionCheck = true;
+  let deletionComparisonTarget = EMPTY_OBJ;
+  if (vnode.shapeFlag & 32) {
+    const type = children._;
+    if (type) {
+      if (isHmrUpdating) {
+        assignSlots(slots, children, optimized);
+        trigger(instance, "set", "$slots");
+      } else if (optimized && type === 1) {
+        needDeletionCheck = false;
+      } else {
+        assignSlots(slots, children, optimized);
+      }
+    } else {
+      needDeletionCheck = !children.$stable;
+      normalizeObjectSlots(children, slots);
+    }
+    deletionComparisonTarget = children;
+  } else if (children) {
+    normalizeVNodeSlots(instance, children);
+    deletionComparisonTarget = { default: 1 };
+  }
+  if (needDeletionCheck) {
+    for (const key in slots) {
+      if (!isInternalKey(key) && deletionComparisonTarget[key] == null) {
+        delete slots[key];
+      }
+    }
+  }
+};
+
+let supported;
+let perf;
+function startMeasure(instance, type) {
+  if (instance.appContext.config.performance && isSupported()) {
+    perf.mark(`vue-${type}-${instance.uid}`);
+  }
+  {
+    devtoolsPerfStart(instance, type, isSupported() ? perf.now() : Date.now());
+  }
+}
+function endMeasure(instance, type) {
+  if (instance.appContext.config.performance && isSupported()) {
+    const startTag = `vue-${type}-${instance.uid}`;
+    const endTag = startTag + `:end`;
+    perf.mark(endTag);
+    perf.measure(
+      `<${formatComponentName(instance, instance.type)}> ${type}`,
+      startTag,
+      endTag
+    );
+    perf.clearMarks(startTag);
+    perf.clearMarks(endTag);
+  }
+  {
+    devtoolsPerfEnd(instance, type, isSupported() ? perf.now() : Date.now());
+  }
+}
+function isSupported() {
+  if (supported !== void 0) {
+    return supported;
+  }
+  if (typeof window !== "undefined" && window.performance) {
+    supported = true;
+    perf = window.performance;
+  } else {
+    supported = false;
+  }
+  return supported;
+}
+
+const queuePostRenderEffect = queueEffectWithSuspense ;
+function createRenderer(options) {
+  return baseCreateRenderer(options);
+}
+function createHydrationRenderer(options) {
+  return baseCreateRenderer(options, createHydrationFunctions);
+}
+function baseCreateRenderer(options, createHydrationFns) {
+  const target = getGlobalThis();
+  target.__VUE__ = true;
+  {
+    setDevtoolsHook$1(target.__VUE_DEVTOOLS_GLOBAL_HOOK__, target);
+  }
+  const {
+    insert: hostInsert,
+    remove: hostRemove,
+    patchProp: hostPatchProp,
+    createElement: hostCreateElement,
+    createText: hostCreateText,
+    createComment: hostCreateComment,
+    setText: hostSetText,
+    setElementText: hostSetElementText,
+    parentNode: hostParentNode,
+    nextSibling: hostNextSibling,
+    setScopeId: hostSetScopeId = NOOP,
+    insertStaticContent: hostInsertStaticContent
+  } = options;
+  const patch = (n1, n2, container, anchor = null, parentComponent = null, parentSuspense = null, namespace = void 0, slotScopeIds = null, optimized = isHmrUpdating ? false : !!n2.dynamicChildren) => {
+    if (n1 === n2) {
+      return;
+    }
+    if (n1 && !isSameVNodeType(n1, n2)) {
+      anchor = getNextHostNode(n1);
+      unmount(n1, parentComponent, parentSuspense, true);
+      n1 = null;
+    }
+    if (n2.patchFlag === -2) {
+      optimized = false;
+      n2.dynamicChildren = null;
+    }
+    const { type, ref, shapeFlag } = n2;
+    switch (type) {
+      case Text:
+        processText(n1, n2, container, anchor);
+        break;
+      case Comment:
+        processCommentNode(n1, n2, container, anchor);
+        break;
+      case Static:
+        if (n1 == null) {
+          mountStaticNode(n2, container, anchor, namespace);
+        } else {
+          patchStaticNode(n1, n2, container, namespace);
+        }
+        break;
+      case Fragment:
+        processFragment(
+          n1,
+          n2,
+          container,
+          anchor,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+        break;
+      default:
+        if (shapeFlag & 1) {
+          processElement(
+            n1,
+            n2,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+        } else if (shapeFlag & 6) {
+          processComponent(
+            n1,
+            n2,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+        } else if (shapeFlag & 64) {
+          type.process(
+            n1,
+            n2,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized,
+            internals
+          );
+        } else if (shapeFlag & 128) {
+          type.process(
+            n1,
+            n2,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized,
+            internals
+          );
+        } else {
+          warn$1("Invalid VNode type:", type, `(${typeof type})`);
+        }
+    }
+    if (ref != null && parentComponent) {
+      setRef(ref, n1 && n1.ref, parentSuspense, n2 || n1, !n2);
+    }
+  };
+  const processText = (n1, n2, container, anchor) => {
+    if (n1 == null) {
+      hostInsert(
+        n2.el = hostCreateText(n2.children),
+        container,
+        anchor
+      );
+    } else {
+      const el = n2.el = n1.el;
+      if (n2.children !== n1.children) {
+        hostSetText(el, n2.children);
+      }
+    }
+  };
+  const processCommentNode = (n1, n2, container, anchor) => {
+    if (n1 == null) {
+      hostInsert(
+        n2.el = hostCreateComment(n2.children || ""),
+        container,
+        anchor
+      );
+    } else {
+      n2.el = n1.el;
+    }
+  };
+  const mountStaticNode = (n2, container, anchor, namespace) => {
+    [n2.el, n2.anchor] = hostInsertStaticContent(
+      n2.children,
+      container,
+      anchor,
+      namespace,
+      n2.el,
+      n2.anchor
+    );
+  };
+  const patchStaticNode = (n1, n2, container, namespace) => {
+    if (n2.children !== n1.children) {
+      const anchor = hostNextSibling(n1.anchor);
+      removeStaticNode(n1);
+      [n2.el, n2.anchor] = hostInsertStaticContent(
+        n2.children,
+        container,
+        anchor,
+        namespace
+      );
+    } else {
+      n2.el = n1.el;
+      n2.anchor = n1.anchor;
+    }
+  };
+  const moveStaticNode = ({ el, anchor }, container, nextSibling) => {
+    let next;
+    while (el && el !== anchor) {
+      next = hostNextSibling(el);
+      hostInsert(el, container, nextSibling);
+      el = next;
+    }
+    hostInsert(anchor, container, nextSibling);
+  };
+  const removeStaticNode = ({ el, anchor }) => {
+    let next;
+    while (el && el !== anchor) {
+      next = hostNextSibling(el);
+      hostRemove(el);
+      el = next;
+    }
+    hostRemove(anchor);
+  };
+  const processElement = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    if (n2.type === "svg") {
+      namespace = "svg";
+    } else if (n2.type === "math") {
+      namespace = "mathml";
+    }
+    if (n1 == null) {
+      mountElement(
+        n2,
+        container,
+        anchor,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+    } else {
+      patchElement(
+        n1,
+        n2,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+    }
+  };
+  const mountElement = (vnode, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    let el;
+    let vnodeHook;
+    const { props, shapeFlag, transition, dirs } = vnode;
+    el = vnode.el = hostCreateElement(
+      vnode.type,
+      namespace,
+      props && props.is,
+      props
+    );
+    if (shapeFlag & 8) {
+      hostSetElementText(el, vnode.children);
+    } else if (shapeFlag & 16) {
+      mountChildren(
+        vnode.children,
+        el,
+        null,
+        parentComponent,
+        parentSuspense,
+        resolveChildrenNamespace(vnode, namespace),
+        slotScopeIds,
+        optimized
+      );
+    }
+    if (dirs) {
+      invokeDirectiveHook(vnode, null, parentComponent, "created");
+    }
+    setScopeId(el, vnode, vnode.scopeId, slotScopeIds, parentComponent);
+    if (props) {
+      for (const key in props) {
+        if (key !== "value" && !isReservedProp(key)) {
+          hostPatchProp(el, key, null, props[key], namespace, parentComponent);
+        }
+      }
+      if ("value" in props) {
+        hostPatchProp(el, "value", null, props.value, namespace);
+      }
+      if (vnodeHook = props.onVnodeBeforeMount) {
+        invokeVNodeHook(vnodeHook, parentComponent, vnode);
+      }
+    }
+    {
+      def(el, "__vnode", vnode, true);
+      def(el, "__vueParentComponent", parentComponent, true);
+    }
+    if (dirs) {
+      invokeDirectiveHook(vnode, null, parentComponent, "beforeMount");
+    }
+    const needCallTransitionHooks = needTransition(parentSuspense, transition);
+    if (needCallTransitionHooks) {
+      transition.beforeEnter(el);
+    }
+    hostInsert(el, container, anchor);
+    if ((vnodeHook = props && props.onVnodeMounted) || needCallTransitionHooks || dirs) {
+      queuePostRenderEffect(() => {
+        vnodeHook && invokeVNodeHook(vnodeHook, parentComponent, vnode);
+        needCallTransitionHooks && transition.enter(el);
+        dirs && invokeDirectiveHook(vnode, null, parentComponent, "mounted");
+      }, parentSuspense);
+    }
+  };
+  const setScopeId = (el, vnode, scopeId, slotScopeIds, parentComponent) => {
+    if (scopeId) {
+      hostSetScopeId(el, scopeId);
+    }
+    if (slotScopeIds) {
+      for (let i = 0; i < slotScopeIds.length; i++) {
+        hostSetScopeId(el, slotScopeIds[i]);
+      }
+    }
+    if (parentComponent) {
+      let subTree = parentComponent.subTree;
+      if (subTree.patchFlag > 0 && subTree.patchFlag & 2048) {
+        subTree = filterSingleRoot(subTree.children) || subTree;
+      }
+      if (vnode === subTree || isSuspense(subTree.type) && (subTree.ssContent === vnode || subTree.ssFallback === vnode)) {
+        const parentVNode = parentComponent.vnode;
+        setScopeId(
+          el,
+          parentVNode,
+          parentVNode.scopeId,
+          parentVNode.slotScopeIds,
+          parentComponent.parent
+        );
+      }
+    }
+  };
+  const mountChildren = (children, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, start = 0) => {
+    for (let i = start; i < children.length; i++) {
+      const child = children[i] = optimized ? cloneIfMounted(children[i]) : normalizeVNode(children[i]);
+      patch(
+        null,
+        child,
+        container,
+        anchor,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+    }
+  };
+  const patchElement = (n1, n2, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    const el = n2.el = n1.el;
+    {
+      el.__vnode = n2;
+    }
+    let { patchFlag, dynamicChildren, dirs } = n2;
+    patchFlag |= n1.patchFlag & 16;
+    const oldProps = n1.props || EMPTY_OBJ;
+    const newProps = n2.props || EMPTY_OBJ;
+    let vnodeHook;
+    parentComponent && toggleRecurse(parentComponent, false);
+    if (vnodeHook = newProps.onVnodeBeforeUpdate) {
+      invokeVNodeHook(vnodeHook, parentComponent, n2, n1);
+    }
+    if (dirs) {
+      invokeDirectiveHook(n2, n1, parentComponent, "beforeUpdate");
+    }
+    parentComponent && toggleRecurse(parentComponent, true);
+    if (isHmrUpdating) {
+      patchFlag = 0;
+      optimized = false;
+      dynamicChildren = null;
+    }
+    if (oldProps.innerHTML && newProps.innerHTML == null || oldProps.textContent && newProps.textContent == null) {
+      hostSetElementText(el, "");
+    }
+    if (dynamicChildren) {
+      patchBlockChildren(
+        n1.dynamicChildren,
+        dynamicChildren,
+        el,
+        parentComponent,
+        parentSuspense,
+        resolveChildrenNamespace(n2, namespace),
+        slotScopeIds
+      );
+      {
+        traverseStaticChildren(n1, n2);
+      }
+    } else if (!optimized) {
+      patchChildren(
+        n1,
+        n2,
+        el,
+        null,
+        parentComponent,
+        parentSuspense,
+        resolveChildrenNamespace(n2, namespace),
+        slotScopeIds,
+        false
+      );
+    }
+    if (patchFlag > 0) {
+      if (patchFlag & 16) {
+        patchProps(el, oldProps, newProps, parentComponent, namespace);
+      } else {
+        if (patchFlag & 2) {
+          if (oldProps.class !== newProps.class) {
+            hostPatchProp(el, "class", null, newProps.class, namespace);
+          }
+        }
+        if (patchFlag & 4) {
+          hostPatchProp(el, "style", oldProps.style, newProps.style, namespace);
+        }
+        if (patchFlag & 8) {
+          const propsToUpdate = n2.dynamicProps;
+          for (let i = 0; i < propsToUpdate.length; i++) {
+            const key = propsToUpdate[i];
+            const prev = oldProps[key];
+            const next = newProps[key];
+            if (next !== prev || key === "value") {
+              hostPatchProp(el, key, prev, next, namespace, parentComponent);
+            }
+          }
+        }
+      }
+      if (patchFlag & 1) {
+        if (n1.children !== n2.children) {
+          hostSetElementText(el, n2.children);
+        }
+      }
+    } else if (!optimized && dynamicChildren == null) {
+      patchProps(el, oldProps, newProps, parentComponent, namespace);
+    }
+    if ((vnodeHook = newProps.onVnodeUpdated) || dirs) {
+      queuePostRenderEffect(() => {
+        vnodeHook && invokeVNodeHook(vnodeHook, parentComponent, n2, n1);
+        dirs && invokeDirectiveHook(n2, n1, parentComponent, "updated");
+      }, parentSuspense);
+    }
+  };
+  const patchBlockChildren = (oldChildren, newChildren, fallbackContainer, parentComponent, parentSuspense, namespace, slotScopeIds) => {
+    for (let i = 0; i < newChildren.length; i++) {
+      const oldVNode = oldChildren[i];
+      const newVNode = newChildren[i];
+      const container = (
+        // oldVNode may be an errored async setup() component inside Suspense
+        // which will not have a mounted element
+        oldVNode.el && // - In the case of a Fragment, we need to provide the actual parent
+        // of the Fragment itself so it can move its children.
+        (oldVNode.type === Fragment || // - In the case of different nodes, there is going to be a replacement
+        // which also requires the correct parent container
+        !isSameVNodeType(oldVNode, newVNode) || // - In the case of a component, it could contain anything.
+        oldVNode.shapeFlag & (6 | 64)) ? hostParentNode(oldVNode.el) : (
+          // In other cases, the parent container is not actually used so we
+          // just pass the block element here to avoid a DOM parentNode call.
+          fallbackContainer
+        )
+      );
+      patch(
+        oldVNode,
+        newVNode,
+        container,
+        null,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        true
+      );
+    }
+  };
+  const patchProps = (el, oldProps, newProps, parentComponent, namespace) => {
+    if (oldProps !== newProps) {
+      if (oldProps !== EMPTY_OBJ) {
+        for (const key in oldProps) {
+          if (!isReservedProp(key) && !(key in newProps)) {
+            hostPatchProp(
+              el,
+              key,
+              oldProps[key],
+              null,
+              namespace,
+              parentComponent
+            );
+          }
+        }
+      }
+      for (const key in newProps) {
+        if (isReservedProp(key)) continue;
+        const next = newProps[key];
+        const prev = oldProps[key];
+        if (next !== prev && key !== "value") {
+          hostPatchProp(el, key, prev, next, namespace, parentComponent);
+        }
+      }
+      if ("value" in newProps) {
+        hostPatchProp(el, "value", oldProps.value, newProps.value, namespace);
+      }
+    }
+  };
+  const processFragment = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    const fragmentStartAnchor = n2.el = n1 ? n1.el : hostCreateText("");
+    const fragmentEndAnchor = n2.anchor = n1 ? n1.anchor : hostCreateText("");
+    let { patchFlag, dynamicChildren, slotScopeIds: fragmentSlotScopeIds } = n2;
+    if (
+      // #5523 dev root fragment may inherit directives
+      isHmrUpdating || patchFlag & 2048
+    ) {
+      patchFlag = 0;
+      optimized = false;
+      dynamicChildren = null;
+    }
+    if (fragmentSlotScopeIds) {
+      slotScopeIds = slotScopeIds ? slotScopeIds.concat(fragmentSlotScopeIds) : fragmentSlotScopeIds;
+    }
+    if (n1 == null) {
+      hostInsert(fragmentStartAnchor, container, anchor);
+      hostInsert(fragmentEndAnchor, container, anchor);
+      mountChildren(
+        // #10007
+        // such fragment like `<></>` will be compiled into
+        // a fragment which doesn't have a children.
+        // In this case fallback to an empty array
+        n2.children || [],
+        container,
+        fragmentEndAnchor,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+    } else {
+      if (patchFlag > 0 && patchFlag & 64 && dynamicChildren && // #2715 the previous fragment could've been a BAILed one as a result
+      // of renderSlot() with no valid children
+      n1.dynamicChildren) {
+        patchBlockChildren(
+          n1.dynamicChildren,
+          dynamicChildren,
+          container,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds
+        );
+        {
+          traverseStaticChildren(n1, n2);
+        }
+      } else {
+        patchChildren(
+          n1,
+          n2,
+          container,
+          fragmentEndAnchor,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+      }
+    }
+  };
+  const processComponent = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    n2.slotScopeIds = slotScopeIds;
+    if (n1 == null) {
+      if (n2.shapeFlag & 512) {
+        parentComponent.ctx.activate(
+          n2,
+          container,
+          anchor,
+          namespace,
+          optimized
+        );
+      } else {
+        mountComponent(
+          n2,
+          container,
+          anchor,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          optimized
+        );
+      }
+    } else {
+      updateComponent(n1, n2, optimized);
+    }
+  };
+  const mountComponent = (initialVNode, container, anchor, parentComponent, parentSuspense, namespace, optimized) => {
+    const instance = (initialVNode.component = createComponentInstance(
+      initialVNode,
+      parentComponent,
+      parentSuspense
+    ));
+    if (instance.type.__hmrId) {
+      registerHMR(instance);
+    }
+    {
+      pushWarningContext(initialVNode);
+      startMeasure(instance, `mount`);
+    }
+    if (isKeepAlive(initialVNode)) {
+      instance.ctx.renderer = internals;
+    }
+    {
+      {
+        startMeasure(instance, `init`);
+      }
+      setupComponent(instance, false, optimized);
+      {
+        endMeasure(instance, `init`);
+      }
+    }
+    if (instance.asyncDep) {
+      if (isHmrUpdating) initialVNode.el = null;
+      parentSuspense && parentSuspense.registerDep(instance, setupRenderEffect, optimized);
+      if (!initialVNode.el) {
+        const placeholder = instance.subTree = createVNode(Comment);
+        processCommentNode(null, placeholder, container, anchor);
+      }
+    } else {
+      setupRenderEffect(
+        instance,
+        initialVNode,
+        container,
+        anchor,
+        parentSuspense,
+        namespace,
+        optimized
+      );
+    }
+    {
+      popWarningContext();
+      endMeasure(instance, `mount`);
+    }
+  };
+  const updateComponent = (n1, n2, optimized) => {
+    const instance = n2.component = n1.component;
+    if (shouldUpdateComponent(n1, n2, optimized)) {
+      if (instance.asyncDep && !instance.asyncResolved) {
+        {
+          pushWarningContext(n2);
+        }
+        updateComponentPreRender(instance, n2, optimized);
+        {
+          popWarningContext();
+        }
+        return;
+      } else {
+        instance.next = n2;
+        instance.update();
+      }
+    } else {
+      n2.el = n1.el;
+      instance.vnode = n2;
+    }
+  };
+  const setupRenderEffect = (instance, initialVNode, container, anchor, parentSuspense, namespace, optimized) => {
+    const componentUpdateFn = () => {
+      if (!instance.isMounted) {
+        let vnodeHook;
+        const { el, props } = initialVNode;
+        const { bm, m, parent, root, type } = instance;
+        const isAsyncWrapperVNode = isAsyncWrapper(initialVNode);
+        toggleRecurse(instance, false);
+        if (bm) {
+          invokeArrayFns(bm);
+        }
+        if (!isAsyncWrapperVNode && (vnodeHook = props && props.onVnodeBeforeMount)) {
+          invokeVNodeHook(vnodeHook, parent, initialVNode);
+        }
+        toggleRecurse(instance, true);
+        if (el && hydrateNode) {
+          const hydrateSubTree = () => {
+            {
+              startMeasure(instance, `render`);
+            }
+            instance.subTree = renderComponentRoot(instance);
+            {
+              endMeasure(instance, `render`);
+            }
+            {
+              startMeasure(instance, `hydrate`);
+            }
+            hydrateNode(
+              el,
+              instance.subTree,
+              instance,
+              parentSuspense,
+              null
+            );
+            {
+              endMeasure(instance, `hydrate`);
+            }
+          };
+          if (isAsyncWrapperVNode && type.__asyncHydrate) {
+            type.__asyncHydrate(
+              el,
+              instance,
+              hydrateSubTree
+            );
+          } else {
+            hydrateSubTree();
+          }
+        } else {
+          if (root.ce) {
+            root.ce._injectChildStyle(type);
+          }
+          {
+            startMeasure(instance, `render`);
+          }
+          const subTree = instance.subTree = renderComponentRoot(instance);
+          {
+            endMeasure(instance, `render`);
+          }
+          {
+            startMeasure(instance, `patch`);
+          }
+          patch(
+            null,
+            subTree,
+            container,
+            anchor,
+            instance,
+            parentSuspense,
+            namespace
+          );
+          {
+            endMeasure(instance, `patch`);
+          }
+          initialVNode.el = subTree.el;
+        }
+        if (m) {
+          queuePostRenderEffect(m, parentSuspense);
+        }
+        if (!isAsyncWrapperVNode && (vnodeHook = props && props.onVnodeMounted)) {
+          const scopedInitialVNode = initialVNode;
+          queuePostRenderEffect(
+            () => invokeVNodeHook(vnodeHook, parent, scopedInitialVNode),
+            parentSuspense
+          );
+        }
+        if (initialVNode.shapeFlag & 256 || parent && isAsyncWrapper(parent.vnode) && parent.vnode.shapeFlag & 256) {
+          instance.a && queuePostRenderEffect(instance.a, parentSuspense);
+        }
+        instance.isMounted = true;
+        {
+          devtoolsComponentAdded(instance);
+        }
+        initialVNode = container = anchor = null;
+      } else {
+        let { next, bu, u, parent, vnode } = instance;
+        {
+          const nonHydratedAsyncRoot = locateNonHydratedAsyncRoot(instance);
+          if (nonHydratedAsyncRoot) {
+            if (next) {
+              next.el = vnode.el;
+              updateComponentPreRender(instance, next, optimized);
+            }
+            nonHydratedAsyncRoot.asyncDep.then(() => {
+              if (!instance.isUnmounted) {
+                componentUpdateFn();
+              }
+            });
+            return;
+          }
+        }
+        let originNext = next;
+        let vnodeHook;
+        {
+          pushWarningContext(next || instance.vnode);
+        }
+        toggleRecurse(instance, false);
+        if (next) {
+          next.el = vnode.el;
+          updateComponentPreRender(instance, next, optimized);
+        } else {
+          next = vnode;
+        }
+        if (bu) {
+          invokeArrayFns(bu);
+        }
+        if (vnodeHook = next.props && next.props.onVnodeBeforeUpdate) {
+          invokeVNodeHook(vnodeHook, parent, next, vnode);
+        }
+        toggleRecurse(instance, true);
+        {
+          startMeasure(instance, `render`);
+        }
+        const nextTree = renderComponentRoot(instance);
+        {
+          endMeasure(instance, `render`);
+        }
+        const prevTree = instance.subTree;
+        instance.subTree = nextTree;
+        {
+          startMeasure(instance, `patch`);
+        }
+        patch(
+          prevTree,
+          nextTree,
+          // parent may have changed if it's in a teleport
+          hostParentNode(prevTree.el),
+          // anchor may have changed if it's in a fragment
+          getNextHostNode(prevTree),
+          instance,
+          parentSuspense,
+          namespace
+        );
+        {
+          endMeasure(instance, `patch`);
+        }
+        next.el = nextTree.el;
+        if (originNext === null) {
+          updateHOCHostEl(instance, nextTree.el);
+        }
+        if (u) {
+          queuePostRenderEffect(u, parentSuspense);
+        }
+        if (vnodeHook = next.props && next.props.onVnodeUpdated) {
+          queuePostRenderEffect(
+            () => invokeVNodeHook(vnodeHook, parent, next, vnode),
+            parentSuspense
+          );
+        }
+        {
+          devtoolsComponentUpdated(instance);
+        }
+        {
+          popWarningContext();
+        }
+      }
+    };
+    instance.scope.on();
+    const effect = instance.effect = new ReactiveEffect(componentUpdateFn);
+    instance.scope.off();
+    const update = instance.update = effect.run.bind(effect);
+    const job = instance.job = effect.runIfDirty.bind(effect);
+    job.i = instance;
+    job.id = instance.uid;
+    effect.scheduler = () => queueJob(job);
+    toggleRecurse(instance, true);
+    {
+      effect.onTrack = instance.rtc ? (e) => invokeArrayFns(instance.rtc, e) : void 0;
+      effect.onTrigger = instance.rtg ? (e) => invokeArrayFns(instance.rtg, e) : void 0;
+    }
+    update();
+  };
+  const updateComponentPreRender = (instance, nextVNode, optimized) => {
+    nextVNode.component = instance;
+    const prevProps = instance.vnode.props;
+    instance.vnode = nextVNode;
+    instance.next = null;
+    updateProps(instance, nextVNode.props, prevProps, optimized);
+    updateSlots(instance, nextVNode.children, optimized);
+    pauseTracking();
+    flushPreFlushCbs(instance);
+    resetTracking();
+  };
+  const patchChildren = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized = false) => {
+    const c1 = n1 && n1.children;
+    const prevShapeFlag = n1 ? n1.shapeFlag : 0;
+    const c2 = n2.children;
+    const { patchFlag, shapeFlag } = n2;
+    if (patchFlag > 0) {
+      if (patchFlag & 128) {
+        patchKeyedChildren(
+          c1,
+          c2,
+          container,
+          anchor,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+        return;
+      } else if (patchFlag & 256) {
+        patchUnkeyedChildren(
+          c1,
+          c2,
+          container,
+          anchor,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+        return;
+      }
+    }
+    if (shapeFlag & 8) {
+      if (prevShapeFlag & 16) {
+        unmountChildren(c1, parentComponent, parentSuspense);
+      }
+      if (c2 !== c1) {
+        hostSetElementText(container, c2);
+      }
+    } else {
+      if (prevShapeFlag & 16) {
+        if (shapeFlag & 16) {
+          patchKeyedChildren(
+            c1,
+            c2,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+        } else {
+          unmountChildren(c1, parentComponent, parentSuspense, true);
+        }
+      } else {
+        if (prevShapeFlag & 8) {
+          hostSetElementText(container, "");
+        }
+        if (shapeFlag & 16) {
+          mountChildren(
+            c2,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+        }
+      }
+    }
+  };
+  const patchUnkeyedChildren = (c1, c2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    c1 = c1 || EMPTY_ARR;
+    c2 = c2 || EMPTY_ARR;
+    const oldLength = c1.length;
+    const newLength = c2.length;
+    const commonLength = Math.min(oldLength, newLength);
+    let i;
+    for (i = 0; i < commonLength; i++) {
+      const nextChild = c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]);
+      patch(
+        c1[i],
+        nextChild,
+        container,
+        null,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+    }
+    if (oldLength > newLength) {
+      unmountChildren(
+        c1,
+        parentComponent,
+        parentSuspense,
+        true,
+        false,
+        commonLength
+      );
+    } else {
+      mountChildren(
+        c2,
+        container,
+        anchor,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized,
+        commonLength
+      );
+    }
+  };
+  const patchKeyedChildren = (c1, c2, container, parentAnchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
+    let i = 0;
+    const l2 = c2.length;
+    let e1 = c1.length - 1;
+    let e2 = l2 - 1;
+    while (i <= e1 && i <= e2) {
+      const n1 = c1[i];
+      const n2 = c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]);
+      if (isSameVNodeType(n1, n2)) {
+        patch(
+          n1,
+          n2,
+          container,
+          null,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+      } else {
+        break;
+      }
+      i++;
+    }
+    while (i <= e1 && i <= e2) {
+      const n1 = c1[e1];
+      const n2 = c2[e2] = optimized ? cloneIfMounted(c2[e2]) : normalizeVNode(c2[e2]);
+      if (isSameVNodeType(n1, n2)) {
+        patch(
+          n1,
+          n2,
+          container,
+          null,
+          parentComponent,
+          parentSuspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+      } else {
+        break;
+      }
+      e1--;
+      e2--;
+    }
+    if (i > e1) {
+      if (i <= e2) {
+        const nextPos = e2 + 1;
+        const anchor = nextPos < l2 ? c2[nextPos].el : parentAnchor;
+        while (i <= e2) {
+          patch(
+            null,
+            c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]),
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+          i++;
+        }
+      }
+    } else if (i > e2) {
+      while (i <= e1) {
+        unmount(c1[i], parentComponent, parentSuspense, true);
+        i++;
+      }
+    } else {
+      const s1 = i;
+      const s2 = i;
+      const keyToNewIndexMap = /* @__PURE__ */ new Map();
+      for (i = s2; i <= e2; i++) {
+        const nextChild = c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]);
+        if (nextChild.key != null) {
+          if (keyToNewIndexMap.has(nextChild.key)) {
+            warn$1(
+              `Duplicate keys found during update:`,
+              JSON.stringify(nextChild.key),
+              `Make sure keys are unique.`
+            );
+          }
+          keyToNewIndexMap.set(nextChild.key, i);
+        }
+      }
+      let j;
+      let patched = 0;
+      const toBePatched = e2 - s2 + 1;
+      let moved = false;
+      let maxNewIndexSoFar = 0;
+      const newIndexToOldIndexMap = new Array(toBePatched);
+      for (i = 0; i < toBePatched; i++) newIndexToOldIndexMap[i] = 0;
+      for (i = s1; i <= e1; i++) {
+        const prevChild = c1[i];
+        if (patched >= toBePatched) {
+          unmount(prevChild, parentComponent, parentSuspense, true);
+          continue;
+        }
+        let newIndex;
+        if (prevChild.key != null) {
+          newIndex = keyToNewIndexMap.get(prevChild.key);
+        } else {
+          for (j = s2; j <= e2; j++) {
+            if (newIndexToOldIndexMap[j - s2] === 0 && isSameVNodeType(prevChild, c2[j])) {
+              newIndex = j;
+              break;
+            }
+          }
+        }
+        if (newIndex === void 0) {
+          unmount(prevChild, parentComponent, parentSuspense, true);
+        } else {
+          newIndexToOldIndexMap[newIndex - s2] = i + 1;
+          if (newIndex >= maxNewIndexSoFar) {
+            maxNewIndexSoFar = newIndex;
+          } else {
+            moved = true;
+          }
+          patch(
+            prevChild,
+            c2[newIndex],
+            container,
+            null,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+          patched++;
+        }
+      }
+      const increasingNewIndexSequence = moved ? getSequence(newIndexToOldIndexMap) : EMPTY_ARR;
+      j = increasingNewIndexSequence.length - 1;
+      for (i = toBePatched - 1; i >= 0; i--) {
+        const nextIndex = s2 + i;
+        const nextChild = c2[nextIndex];
+        const anchor = nextIndex + 1 < l2 ? c2[nextIndex + 1].el : parentAnchor;
+        if (newIndexToOldIndexMap[i] === 0) {
+          patch(
+            null,
+            nextChild,
+            container,
+            anchor,
+            parentComponent,
+            parentSuspense,
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+        } else if (moved) {
+          if (j < 0 || i !== increasingNewIndexSequence[j]) {
+            move(nextChild, container, anchor, 2);
+          } else {
+            j--;
+          }
+        }
+      }
+    }
+  };
+  const move = (vnode, container, anchor, moveType, parentSuspense = null) => {
+    const { el, type, transition, children, shapeFlag } = vnode;
+    if (shapeFlag & 6) {
+      move(vnode.component.subTree, container, anchor, moveType);
+      return;
+    }
+    if (shapeFlag & 128) {
+      vnode.suspense.move(container, anchor, moveType);
+      return;
+    }
+    if (shapeFlag & 64) {
+      type.move(vnode, container, anchor, internals);
+      return;
+    }
+    if (type === Fragment) {
+      hostInsert(el, container, anchor);
+      for (let i = 0; i < children.length; i++) {
+        move(children[i], container, anchor, moveType);
+      }
+      hostInsert(vnode.anchor, container, anchor);
+      return;
+    }
+    if (type === Static) {
+      moveStaticNode(vnode, container, anchor);
+      return;
+    }
+    const needTransition2 = moveType !== 2 && shapeFlag & 1 && transition;
+    if (needTransition2) {
+      if (moveType === 0) {
+        transition.beforeEnter(el);
+        hostInsert(el, container, anchor);
+        queuePostRenderEffect(() => transition.enter(el), parentSuspense);
+      } else {
+        const { leave, delayLeave, afterLeave } = transition;
+        const remove2 = () => hostInsert(el, container, anchor);
+        const performLeave = () => {
+          leave(el, () => {
+            remove2();
+            afterLeave && afterLeave();
+          });
+        };
+        if (delayLeave) {
+          delayLeave(el, remove2, performLeave);
+        } else {
+          performLeave();
+        }
+      }
+    } else {
+      hostInsert(el, container, anchor);
+    }
+  };
+  const unmount = (vnode, parentComponent, parentSuspense, doRemove = false, optimized = false) => {
+    const {
+      type,
+      props,
+      ref,
+      children,
+      dynamicChildren,
+      shapeFlag,
+      patchFlag,
+      dirs,
+      cacheIndex
+    } = vnode;
+    if (patchFlag === -2) {
+      optimized = false;
+    }
+    if (ref != null) {
+      setRef(ref, null, parentSuspense, vnode, true);
+    }
+    if (cacheIndex != null) {
+      parentComponent.renderCache[cacheIndex] = void 0;
+    }
+    if (shapeFlag & 256) {
+      parentComponent.ctx.deactivate(vnode);
+      return;
+    }
+    const shouldInvokeDirs = shapeFlag & 1 && dirs;
+    const shouldInvokeVnodeHook = !isAsyncWrapper(vnode);
+    let vnodeHook;
+    if (shouldInvokeVnodeHook && (vnodeHook = props && props.onVnodeBeforeUnmount)) {
+      invokeVNodeHook(vnodeHook, parentComponent, vnode);
+    }
+    if (shapeFlag & 6) {
+      unmountComponent(vnode.component, parentSuspense, doRemove);
+    } else {
+      if (shapeFlag & 128) {
+        vnode.suspense.unmount(parentSuspense, doRemove);
+        return;
+      }
+      if (shouldInvokeDirs) {
+        invokeDirectiveHook(vnode, null, parentComponent, "beforeUnmount");
+      }
+      if (shapeFlag & 64) {
+        vnode.type.remove(
+          vnode,
+          parentComponent,
+          parentSuspense,
+          internals,
+          doRemove
+        );
+      } else if (dynamicChildren && // #5154
+      // when v-once is used inside a block, setBlockTracking(-1) marks the
+      // parent block with hasOnce: true
+      // so that it doesn't take the fast path during unmount - otherwise
+      // components nested in v-once are never unmounted.
+      !dynamicChildren.hasOnce && // #1153: fast path should not be taken for non-stable (v-for) fragments
+      (type !== Fragment || patchFlag > 0 && patchFlag & 64)) {
+        unmountChildren(
+          dynamicChildren,
+          parentComponent,
+          parentSuspense,
+          false,
+          true
+        );
+      } else if (type === Fragment && patchFlag & (128 | 256) || !optimized && shapeFlag & 16) {
+        unmountChildren(children, parentComponent, parentSuspense);
+      }
+      if (doRemove) {
+        remove(vnode);
+      }
+    }
+    if (shouldInvokeVnodeHook && (vnodeHook = props && props.onVnodeUnmounted) || shouldInvokeDirs) {
+      queuePostRenderEffect(() => {
+        vnodeHook && invokeVNodeHook(vnodeHook, parentComponent, vnode);
+        shouldInvokeDirs && invokeDirectiveHook(vnode, null, parentComponent, "unmounted");
+      }, parentSuspense);
+    }
+  };
+  const remove = (vnode) => {
+    const { type, el, anchor, transition } = vnode;
+    if (type === Fragment) {
+      if (vnode.patchFlag > 0 && vnode.patchFlag & 2048 && transition && !transition.persisted) {
+        vnode.children.forEach((child) => {
+          if (child.type === Comment) {
+            hostRemove(child.el);
+          } else {
+            remove(child);
+          }
+        });
+      } else {
+        removeFragment(el, anchor);
+      }
+      return;
+    }
+    if (type === Static) {
+      removeStaticNode(vnode);
+      return;
+    }
+    const performRemove = () => {
+      hostRemove(el);
+      if (transition && !transition.persisted && transition.afterLeave) {
+        transition.afterLeave();
+      }
+    };
+    if (vnode.shapeFlag & 1 && transition && !transition.persisted) {
+      const { leave, delayLeave } = transition;
+      const performLeave = () => leave(el, performRemove);
+      if (delayLeave) {
+        delayLeave(vnode.el, performRemove, performLeave);
+      } else {
+        performLeave();
+      }
+    } else {
+      performRemove();
+    }
+  };
+  const removeFragment = (cur, end) => {
+    let next;
+    while (cur !== end) {
+      next = hostNextSibling(cur);
+      hostRemove(cur);
+      cur = next;
+    }
+    hostRemove(end);
+  };
+  const unmountComponent = (instance, parentSuspense, doRemove) => {
+    if (instance.type.__hmrId) {
+      unregisterHMR(instance);
+    }
+    const { bum, scope, job, subTree, um, m, a } = instance;
+    invalidateMount(m);
+    invalidateMount(a);
+    if (bum) {
+      invokeArrayFns(bum);
+    }
+    scope.stop();
+    if (job) {
+      job.flags |= 8;
+      unmount(subTree, instance, parentSuspense, doRemove);
+    }
+    if (um) {
+      queuePostRenderEffect(um, parentSuspense);
+    }
+    queuePostRenderEffect(() => {
+      instance.isUnmounted = true;
+    }, parentSuspense);
+    if (parentSuspense && parentSuspense.pendingBranch && !parentSuspense.isUnmounted && instance.asyncDep && !instance.asyncResolved && instance.suspenseId === parentSuspense.pendingId) {
+      parentSuspense.deps--;
+      if (parentSuspense.deps === 0) {
+        parentSuspense.resolve();
+      }
+    }
+    {
+      devtoolsComponentRemoved(instance);
+    }
+  };
+  const unmountChildren = (children, parentComponent, parentSuspense, doRemove = false, optimized = false, start = 0) => {
+    for (let i = start; i < children.length; i++) {
+      unmount(children[i], parentComponent, parentSuspense, doRemove, optimized);
+    }
+  };
+  const getNextHostNode = (vnode) => {
+    if (vnode.shapeFlag & 6) {
+      return getNextHostNode(vnode.component.subTree);
+    }
+    if (vnode.shapeFlag & 128) {
+      return vnode.suspense.next();
+    }
+    const el = hostNextSibling(vnode.anchor || vnode.el);
+    const teleportEnd = el && el[TeleportEndKey];
+    return teleportEnd ? hostNextSibling(teleportEnd) : el;
+  };
+  let isFlushing = false;
+  const render = (vnode, container, namespace) => {
+    if (vnode == null) {
+      if (container._vnode) {
+        unmount(container._vnode, null, null, true);
+      }
+    } else {
+      patch(
+        container._vnode || null,
+        vnode,
+        container,
+        null,
+        null,
+        null,
+        namespace
+      );
+    }
+    container._vnode = vnode;
+    if (!isFlushing) {
+      isFlushing = true;
+      flushPreFlushCbs();
+      flushPostFlushCbs();
+      isFlushing = false;
+    }
+  };
+  const internals = {
+    p: patch,
+    um: unmount,
+    m: move,
+    r: remove,
+    mt: mountComponent,
+    mc: mountChildren,
+    pc: patchChildren,
+    pbc: patchBlockChildren,
+    n: getNextHostNode,
+    o: options
+  };
+  let hydrate;
+  let hydrateNode;
+  if (createHydrationFns) {
+    [hydrate, hydrateNode] = createHydrationFns(
+      internals
+    );
+  }
+  return {
+    render,
+    hydrate,
+    createApp: createAppAPI(render, hydrate)
+  };
+}
+function resolveChildrenNamespace({ type, props }, currentNamespace) {
+  return currentNamespace === "svg" && type === "foreignObject" || currentNamespace === "mathml" && type === "annotation-xml" && props && props.encoding && props.encoding.includes("html") ? void 0 : currentNamespace;
+}
+function toggleRecurse({ effect, job }, allowed) {
+  if (allowed) {
+    effect.flags |= 32;
+    job.flags |= 4;
+  } else {
+    effect.flags &= ~32;
+    job.flags &= ~4;
+  }
+}
+function needTransition(parentSuspense, transition) {
+  return (!parentSuspense || parentSuspense && !parentSuspense.pendingBranch) && transition && !transition.persisted;
+}
+function traverseStaticChildren(n1, n2, shallow = false) {
+  const ch1 = n1.children;
+  const ch2 = n2.children;
+  if (isArray(ch1) && isArray(ch2)) {
+    for (let i = 0; i < ch1.length; i++) {
+      const c1 = ch1[i];
+      let c2 = ch2[i];
+      if (c2.shapeFlag & 1 && !c2.dynamicChildren) {
+        if (c2.patchFlag <= 0 || c2.patchFlag === 32) {
+          c2 = ch2[i] = cloneIfMounted(ch2[i]);
+          c2.el = c1.el;
+        }
+        if (!shallow && c2.patchFlag !== -2)
+          traverseStaticChildren(c1, c2);
+      }
+      if (c2.type === Text) {
+        c2.el = c1.el;
+      }
+      if (c2.type === Comment && !c2.el) {
+        c2.el = c1.el;
+      }
+    }
+  }
+}
+function getSequence(arr) {
+  const p = arr.slice();
+  const result = [0];
+  let i, j, u, v, c;
+  const len = arr.length;
+  for (i = 0; i < len; i++) {
+    const arrI = arr[i];
+    if (arrI !== 0) {
+      j = result[result.length - 1];
+      if (arr[j] < arrI) {
+        p[i] = j;
+        result.push(i);
+        continue;
+      }
+      u = 0;
+      v = result.length - 1;
+      while (u < v) {
+        c = u + v >> 1;
+        if (arr[result[c]] < arrI) {
+          u = c + 1;
+        } else {
+          v = c;
+        }
+      }
+      if (arrI < arr[result[u]]) {
+        if (u > 0) {
+          p[i] = result[u - 1];
+        }
+        result[u] = i;
+      }
+    }
+  }
+  u = result.length;
+  v = result[u - 1];
+  while (u-- > 0) {
+    result[u] = v;
+    v = p[v];
+  }
+  return result;
+}
+function locateNonHydratedAsyncRoot(instance) {
+  const subComponent = instance.subTree.component;
+  if (subComponent) {
+    if (subComponent.asyncDep && !subComponent.asyncResolved) {
+      return subComponent;
+    } else {
+      return locateNonHydratedAsyncRoot(subComponent);
+    }
+  }
+}
+function invalidateMount(hooks) {
+  if (hooks) {
+    for (let i = 0; i < hooks.length; i++)
+      hooks[i].flags |= 8;
+  }
+}
+
+const ssrContextKey = Symbol.for("v-scx");
+const useSSRContext = () => {
+  {
+    const ctx = inject(ssrContextKey);
+    if (!ctx) {
+      warn$1(
+        `Server rendering context not provided. Make sure to only call useSSRContext() conditionally in the server build.`
+      );
+    }
+    return ctx;
+  }
+};
+
+function watchEffect(effect, options) {
+  return doWatch(effect, null, options);
+}
+function watchPostEffect(effect, options) {
+  return doWatch(
+    effect,
+    null,
+    extend({}, options, { flush: "post" }) 
+  );
+}
+function watchSyncEffect(effect, options) {
+  return doWatch(
+    effect,
+    null,
+    extend({}, options, { flush: "sync" }) 
+  );
+}
+function watch(source, cb, options) {
+  if (!isFunction(cb)) {
+    warn$1(
+      `\`watch(fn, options?)\` signature has been moved to a separate API. Use \`watchEffect(fn, options?)\` instead. \`watch\` now only supports \`watch(source, cb, options?) signature.`
+    );
+  }
+  return doWatch(source, cb, options);
+}
+function doWatch(source, cb, options = EMPTY_OBJ) {
+  const { immediate, deep, flush, once } = options;
+  if (!cb) {
+    if (immediate !== void 0) {
+      warn$1(
+        `watch() "immediate" option is only respected when using the watch(source, callback, options?) signature.`
+      );
+    }
+    if (deep !== void 0) {
+      warn$1(
+        `watch() "deep" option is only respected when using the watch(source, callback, options?) signature.`
+      );
+    }
+    if (once !== void 0) {
+      warn$1(
+        `watch() "once" option is only respected when using the watch(source, callback, options?) signature.`
+      );
+    }
+  }
+  const baseWatchOptions = extend({}, options);
+  baseWatchOptions.onWarn = warn$1;
+  const runsImmediately = cb && immediate || !cb && flush !== "post";
+  let ssrCleanup;
+  if (isInSSRComponentSetup) {
+    if (flush === "sync") {
+      const ctx = useSSRContext();
+      ssrCleanup = ctx.__watcherHandles || (ctx.__watcherHandles = []);
+    } else if (!runsImmediately) {
+      const watchStopHandle = () => {
+      };
+      watchStopHandle.stop = NOOP;
+      watchStopHandle.resume = NOOP;
+      watchStopHandle.pause = NOOP;
+      return watchStopHandle;
+    }
+  }
+  const instance = currentInstance;
+  baseWatchOptions.call = (fn, type, args) => callWithAsyncErrorHandling(fn, instance, type, args);
+  let isPre = false;
+  if (flush === "post") {
+    baseWatchOptions.scheduler = (job) => {
+      queuePostRenderEffect(job, instance && instance.suspense);
+    };
+  } else if (flush !== "sync") {
+    isPre = true;
+    baseWatchOptions.scheduler = (job, isFirstRun) => {
+      if (isFirstRun) {
+        job();
+      } else {
+        queueJob(job);
+      }
+    };
+  }
+  baseWatchOptions.augmentJob = (job) => {
+    if (cb) {
+      job.flags |= 4;
+    }
+    if (isPre) {
+      job.flags |= 2;
+      if (instance) {
+        job.id = instance.uid;
+        job.i = instance;
+      }
+    }
+  };
+  const watchHandle = watch$1(source, cb, baseWatchOptions);
+  if (isInSSRComponentSetup) {
+    if (ssrCleanup) {
+      ssrCleanup.push(watchHandle);
+    } else if (runsImmediately) {
+      watchHandle();
+    }
+  }
+  return watchHandle;
+}
+function instanceWatch(source, value, options) {
+  const publicThis = this.proxy;
+  const getter = isString(source) ? source.includes(".") ? createPathGetter(publicThis, source) : () => publicThis[source] : source.bind(publicThis, publicThis);
+  let cb;
+  if (isFunction(value)) {
+    cb = value;
+  } else {
+    cb = value.handler;
+    options = value;
+  }
+  const reset = setCurrentInstance(this);
+  const res = doWatch(getter, cb.bind(publicThis), options);
+  reset();
+  return res;
+}
+function createPathGetter(ctx, path) {
+  const segments = path.split(".");
+  return () => {
+    let cur = ctx;
+    for (let i = 0; i < segments.length && cur; i++) {
+      cur = cur[segments[i]];
+    }
+    return cur;
+  };
+}
+
+function useModel(props, name, options = EMPTY_OBJ) {
+  const i = getCurrentInstance();
+  if (!i) {
+    warn$1(`useModel() called without active instance.`);
+    return ref();
+  }
+  const camelizedName = camelize(name);
+  if (!i.propsOptions[0][camelizedName]) {
+    warn$1(`useModel() called with prop "${name}" which is not declared.`);
+    return ref();
+  }
+  const hyphenatedName = hyphenate(name);
+  const modifiers = getModelModifiers(props, camelizedName);
+  const res = customRef((track, trigger) => {
+    let localValue;
+    let prevSetValue = EMPTY_OBJ;
+    let prevEmittedValue;
+    watchSyncEffect(() => {
+      const propValue = props[camelizedName];
+      if (hasChanged(localValue, propValue)) {
+        localValue = propValue;
+        trigger();
+      }
+    });
+    return {
+      get() {
+        track();
+        return options.get ? options.get(localValue) : localValue;
+      },
+      set(value) {
+        const emittedValue = options.set ? options.set(value) : value;
+        if (!hasChanged(emittedValue, localValue) && !(prevSetValue !== EMPTY_OBJ && hasChanged(value, prevSetValue))) {
+          return;
+        }
+        const rawProps = i.vnode.props;
+        if (!(rawProps && // check if parent has passed v-model
+        (name in rawProps || camelizedName in rawProps || hyphenatedName in rawProps) && (`onUpdate:${name}` in rawProps || `onUpdate:${camelizedName}` in rawProps || `onUpdate:${hyphenatedName}` in rawProps))) {
+          localValue = value;
+          trigger();
+        }
+        i.emit(`update:${name}`, emittedValue);
+        if (hasChanged(value, emittedValue) && hasChanged(value, prevSetValue) && !hasChanged(emittedValue, prevEmittedValue)) {
+          trigger();
+        }
+        prevSetValue = value;
+        prevEmittedValue = emittedValue;
+      }
+    };
+  });
+  res[Symbol.iterator] = () => {
+    let i2 = 0;
+    return {
+      next() {
+        if (i2 < 2) {
+          return { value: i2++ ? modifiers || EMPTY_OBJ : res, done: false };
+        } else {
+          return { done: true };
+        }
+      }
+    };
+  };
+  return res;
+}
+const getModelModifiers = (props, modelName) => {
+  return modelName === "modelValue" || modelName === "model-value" ? props.modelModifiers : props[`${modelName}Modifiers`] || props[`${camelize(modelName)}Modifiers`] || props[`${hyphenate(modelName)}Modifiers`];
+};
+
+function emit(instance, event, ...rawArgs) {
+  if (instance.isUnmounted) return;
+  const props = instance.vnode.props || EMPTY_OBJ;
+  {
+    const {
+      emitsOptions,
+      propsOptions: [propsOptions]
+    } = instance;
+    if (emitsOptions) {
+      if (!(event in emitsOptions) && true) {
+        if (!propsOptions || !(toHandlerKey(camelize(event)) in propsOptions)) {
+          warn$1(
+            `Component emitted event "${event}" but it is neither declared in the emits option nor as an "${toHandlerKey(camelize(event))}" prop.`
+          );
+        }
+      } else {
+        const validator = emitsOptions[event];
+        if (isFunction(validator)) {
+          const isValid = validator(...rawArgs);
+          if (!isValid) {
+            warn$1(
+              `Invalid event arguments: event validation failed for event "${event}".`
+            );
+          }
+        }
+      }
+    }
+  }
+  let args = rawArgs;
+  const isModelListener = event.startsWith("update:");
+  const modifiers = isModelListener && getModelModifiers(props, event.slice(7));
+  if (modifiers) {
+    if (modifiers.trim) {
+      args = rawArgs.map((a) => isString(a) ? a.trim() : a);
+    }
+    if (modifiers.number) {
+      args = rawArgs.map(looseToNumber);
+    }
+  }
+  {
+    devtoolsComponentEmit(instance, event, args);
+  }
+  {
+    const lowerCaseEvent = event.toLowerCase();
+    if (lowerCaseEvent !== event && props[toHandlerKey(lowerCaseEvent)]) {
+      warn$1(
+        `Event "${lowerCaseEvent}" is emitted in component ${formatComponentName(
+          instance,
+          instance.type
+        )} but the handler is registered for "${event}". Note that HTML attributes are case-insensitive and you cannot use v-on to listen to camelCase events when using in-DOM templates. You should probably use "${hyphenate(
+          event
+        )}" instead of "${event}".`
+      );
+    }
+  }
+  let handlerName;
+  let handler = props[handlerName = toHandlerKey(event)] || // also try camelCase event handler (#2249)
+  props[handlerName = toHandlerKey(camelize(event))];
+  if (!handler && isModelListener) {
+    handler = props[handlerName = toHandlerKey(hyphenate(event))];
+  }
+  if (handler) {
+    callWithAsyncErrorHandling(
+      handler,
+      instance,
+      6,
+      args
+    );
+  }
+  const onceHandler = props[handlerName + `Once`];
+  if (onceHandler) {
+    if (!instance.emitted) {
+      instance.emitted = {};
+    } else if (instance.emitted[handlerName]) {
+      return;
+    }
+    instance.emitted[handlerName] = true;
+    callWithAsyncErrorHandling(
+      onceHandler,
+      instance,
+      6,
+      args
+    );
+  }
+}
+function normalizeEmitsOptions(comp, appContext, asMixin = false) {
+  const cache = appContext.emitsCache;
+  const cached = cache.get(comp);
+  if (cached !== void 0) {
+    return cached;
+  }
+  const raw = comp.emits;
+  let normalized = {};
+  let hasExtends = false;
+  if (!isFunction(comp)) {
+    const extendEmits = (raw2) => {
+      const normalizedFromExtend = normalizeEmitsOptions(raw2, appContext, true);
+      if (normalizedFromExtend) {
+        hasExtends = true;
+        extend(normalized, normalizedFromExtend);
+      }
+    };
+    if (!asMixin && appContext.mixins.length) {
+      appContext.mixins.forEach(extendEmits);
+    }
+    if (comp.extends) {
+      extendEmits(comp.extends);
+    }
+    if (comp.mixins) {
+      comp.mixins.forEach(extendEmits);
+    }
+  }
+  if (!raw && !hasExtends) {
+    if (isObject(comp)) {
+      cache.set(comp, null);
+    }
+    return null;
+  }
+  if (isArray(raw)) {
+    raw.forEach((key) => normalized[key] = null);
+  } else {
+    extend(normalized, raw);
+  }
+  if (isObject(comp)) {
+    cache.set(comp, normalized);
+  }
+  return normalized;
+}
+function isEmitListener(options, key) {
+  if (!options || !isOn(key)) {
+    return false;
+  }
+  key = key.slice(2).replace(/Once$/, "");
+  return hasOwn(options, key[0].toLowerCase() + key.slice(1)) || hasOwn(options, hyphenate(key)) || hasOwn(options, key);
+}
+
+let accessedAttrs = false;
+function markAttrsAccessed() {
+  accessedAttrs = true;
+}
+function renderComponentRoot(instance) {
+  const {
+    type: Component,
+    vnode,
+    proxy,
+    withProxy,
+    propsOptions: [propsOptions],
+    slots,
+    attrs,
+    emit,
+    render,
+    renderCache,
+    props,
+    data,
+    setupState,
+    ctx,
+    inheritAttrs
+  } = instance;
+  const prev = setCurrentRenderingInstance(instance);
+  let result;
+  let fallthroughAttrs;
+  {
+    accessedAttrs = false;
+  }
+  try {
+    if (vnode.shapeFlag & 4) {
+      const proxyToUse = withProxy || proxy;
+      const thisProxy = setupState.__isScriptSetup ? new Proxy(proxyToUse, {
+        get(target, key, receiver) {
+          warn$1(
+            `Property '${String(
+              key
+            )}' was accessed via 'this'. Avoid using 'this' in templates.`
+          );
+          return Reflect.get(target, key, receiver);
+        }
+      }) : proxyToUse;
+      result = normalizeVNode(
+        render.call(
+          thisProxy,
+          proxyToUse,
+          renderCache,
+          true ? shallowReadonly(props) : props,
+          setupState,
+          data,
+          ctx
+        )
+      );
+      fallthroughAttrs = attrs;
+    } else {
+      const render2 = Component;
+      if (attrs === props) {
+        markAttrsAccessed();
+      }
+      result = normalizeVNode(
+        render2.length > 1 ? render2(
+          true ? shallowReadonly(props) : props,
+          true ? {
+            get attrs() {
+              markAttrsAccessed();
+              return shallowReadonly(attrs);
+            },
+            slots,
+            emit
+          } : { attrs, slots, emit }
+        ) : render2(
+          true ? shallowReadonly(props) : props,
+          null
+        )
+      );
+      fallthroughAttrs = Component.props ? attrs : getFunctionalFallthrough(attrs);
+    }
+  } catch (err) {
+    blockStack.length = 0;
+    handleError(err, instance, 1);
+    result = createVNode(Comment);
+  }
+  let root = result;
+  let setRoot = void 0;
+  if (result.patchFlag > 0 && result.patchFlag & 2048) {
+    [root, setRoot] = getChildRoot(result);
+  }
+  if (fallthroughAttrs && inheritAttrs !== false) {
+    const keys = Object.keys(fallthroughAttrs);
+    const { shapeFlag } = root;
+    if (keys.length) {
+      if (shapeFlag & (1 | 6)) {
+        if (propsOptions && keys.some(isModelListener)) {
+          fallthroughAttrs = filterModelListeners(
+            fallthroughAttrs,
+            propsOptions
+          );
+        }
+        root = cloneVNode(root, fallthroughAttrs, false, true);
+      } else if (!accessedAttrs && root.type !== Comment) {
+        const allAttrs = Object.keys(attrs);
+        const eventAttrs = [];
+        const extraAttrs = [];
+        for (let i = 0, l = allAttrs.length; i < l; i++) {
+          const key = allAttrs[i];
+          if (isOn(key)) {
+            if (!isModelListener(key)) {
+              eventAttrs.push(key[2].toLowerCase() + key.slice(3));
+            }
+          } else {
+            extraAttrs.push(key);
+          }
+        }
+        if (extraAttrs.length) {
+          warn$1(
+            `Extraneous non-props attributes (${extraAttrs.join(", ")}) were passed to component but could not be automatically inherited because component renders fragment or text root nodes.`
+          );
+        }
+        if (eventAttrs.length) {
+          warn$1(
+            `Extraneous non-emits event listeners (${eventAttrs.join(", ")}) were passed to component but could not be automatically inherited because component renders fragment or text root nodes. If the listener is intended to be a component custom event listener only, declare it using the "emits" option.`
+          );
+        }
+      }
+    }
+  }
+  if (vnode.dirs) {
+    if (!isElementRoot(root)) {
+      warn$1(
+        `Runtime directive used on component with non-element root node. The directives will not function as intended.`
+      );
+    }
+    root = cloneVNode(root, null, false, true);
+    root.dirs = root.dirs ? root.dirs.concat(vnode.dirs) : vnode.dirs;
+  }
+  if (vnode.transition) {
+    if (!isElementRoot(root)) {
+      warn$1(
+        `Component inside <Transition> renders non-element root node that cannot be animated.`
+      );
+    }
+    setTransitionHooks(root, vnode.transition);
+  }
+  if (setRoot) {
+    setRoot(root);
+  } else {
+    result = root;
+  }
+  setCurrentRenderingInstance(prev);
+  return result;
+}
+const getChildRoot = (vnode) => {
+  const rawChildren = vnode.children;
+  const dynamicChildren = vnode.dynamicChildren;
+  const childRoot = filterSingleRoot(rawChildren, false);
+  if (!childRoot) {
+    return [vnode, void 0];
+  } else if (childRoot.patchFlag > 0 && childRoot.patchFlag & 2048) {
+    return getChildRoot(childRoot);
+  }
+  const index = rawChildren.indexOf(childRoot);
+  const dynamicIndex = dynamicChildren ? dynamicChildren.indexOf(childRoot) : -1;
+  const setRoot = (updatedRoot) => {
+    rawChildren[index] = updatedRoot;
+    if (dynamicChildren) {
+      if (dynamicIndex > -1) {
+        dynamicChildren[dynamicIndex] = updatedRoot;
+      } else if (updatedRoot.patchFlag > 0) {
+        vnode.dynamicChildren = [...dynamicChildren, updatedRoot];
+      }
+    }
+  };
+  return [normalizeVNode(childRoot), setRoot];
+};
+function filterSingleRoot(children, recurse = true) {
+  let singleRoot;
+  for (let i = 0; i < children.length; i++) {
+    const child = children[i];
+    if (isVNode(child)) {
+      if (child.type !== Comment || child.children === "v-if") {
+        if (singleRoot) {
+          return;
+        } else {
+          singleRoot = child;
+          if (recurse && singleRoot.patchFlag > 0 && singleRoot.patchFlag & 2048) {
+            return filterSingleRoot(singleRoot.children);
+          }
+        }
+      }
+    } else {
+      return;
+    }
+  }
+  return singleRoot;
+}
+const getFunctionalFallthrough = (attrs) => {
+  let res;
+  for (const key in attrs) {
+    if (key === "class" || key === "style" || isOn(key)) {
+      (res || (res = {}))[key] = attrs[key];
+    }
+  }
+  return res;
+};
+const filterModelListeners = (attrs, props) => {
+  const res = {};
+  for (const key in attrs) {
+    if (!isModelListener(key) || !(key.slice(9) in props)) {
+      res[key] = attrs[key];
+    }
+  }
+  return res;
+};
+const isElementRoot = (vnode) => {
+  return vnode.shapeFlag & (6 | 1) || vnode.type === Comment;
+};
+function shouldUpdateComponent(prevVNode, nextVNode, optimized) {
+  const { props: prevProps, children: prevChildren, component } = prevVNode;
+  const { props: nextProps, children: nextChildren, patchFlag } = nextVNode;
+  const emits = component.emitsOptions;
+  if ((prevChildren || nextChildren) && isHmrUpdating) {
+    return true;
+  }
+  if (nextVNode.dirs || nextVNode.transition) {
+    return true;
+  }
+  if (optimized && patchFlag >= 0) {
+    if (patchFlag & 1024) {
+      return true;
+    }
+    if (patchFlag & 16) {
+      if (!prevProps) {
+        return !!nextProps;
+      }
+      return hasPropsChanged(prevProps, nextProps, emits);
+    } else if (patchFlag & 8) {
+      const dynamicProps = nextVNode.dynamicProps;
+      for (let i = 0; i < dynamicProps.length; i++) {
+        const key = dynamicProps[i];
+        if (nextProps[key] !== prevProps[key] && !isEmitListener(emits, key)) {
+          return true;
+        }
+      }
+    }
+  } else {
+    if (prevChildren || nextChildren) {
+      if (!nextChildren || !nextChildren.$stable) {
+        return true;
+      }
+    }
+    if (prevProps === nextProps) {
+      return false;
+    }
+    if (!prevProps) {
+      return !!nextProps;
+    }
+    if (!nextProps) {
+      return true;
+    }
+    return hasPropsChanged(prevProps, nextProps, emits);
+  }
+  return false;
+}
+function hasPropsChanged(prevProps, nextProps, emitsOptions) {
+  const nextKeys = Object.keys(nextProps);
+  if (nextKeys.length !== Object.keys(prevProps).length) {
+    return true;
+  }
+  for (let i = 0; i < nextKeys.length; i++) {
+    const key = nextKeys[i];
+    if (nextProps[key] !== prevProps[key] && !isEmitListener(emitsOptions, key)) {
+      return true;
+    }
+  }
+  return false;
+}
+function updateHOCHostEl({ vnode, parent }, el) {
+  while (parent) {
+    const root = parent.subTree;
+    if (root.suspense && root.suspense.activeBranch === vnode) {
+      root.el = vnode.el;
+    }
+    if (root === vnode) {
+      (vnode = parent.vnode).el = el;
+      parent = parent.parent;
+    } else {
+      break;
+    }
+  }
+}
+
+const isSuspense = (type) => type.__isSuspense;
+let suspenseId = 0;
+const SuspenseImpl = {
+  name: "Suspense",
+  // In order to make Suspense tree-shakable, we need to avoid importing it
+  // directly in the renderer. The renderer checks for the __isSuspense flag
+  // on a vnode's type and calls the `process` method, passing in renderer
+  // internals.
+  __isSuspense: true,
+  process(n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, rendererInternals) {
+    if (n1 == null) {
+      mountSuspense(
+        n2,
+        container,
+        anchor,
+        parentComponent,
+        parentSuspense,
+        namespace,
+        slotScopeIds,
+        optimized,
+        rendererInternals
+      );
+    } else {
+      if (parentSuspense && parentSuspense.deps > 0 && !n1.suspense.isInFallback) {
+        n2.suspense = n1.suspense;
+        n2.suspense.vnode = n2;
+        n2.el = n1.el;
+        return;
+      }
+      patchSuspense(
+        n1,
+        n2,
+        container,
+        anchor,
+        parentComponent,
+        namespace,
+        slotScopeIds,
+        optimized,
+        rendererInternals
+      );
+    }
+  },
+  hydrate: hydrateSuspense,
+  normalize: normalizeSuspenseChildren
+};
+const Suspense = SuspenseImpl ;
+function triggerEvent(vnode, name) {
+  const eventListener = vnode.props && vnode.props[name];
+  if (isFunction(eventListener)) {
+    eventListener();
+  }
+}
+function mountSuspense(vnode, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, rendererInternals) {
+  const {
+    p: patch,
+    o: { createElement }
+  } = rendererInternals;
+  const hiddenContainer = createElement("div");
+  const suspense = vnode.suspense = createSuspenseBoundary(
+    vnode,
+    parentSuspense,
+    parentComponent,
+    container,
+    hiddenContainer,
+    anchor,
+    namespace,
+    slotScopeIds,
+    optimized,
+    rendererInternals
+  );
+  patch(
+    null,
+    suspense.pendingBranch = vnode.ssContent,
+    hiddenContainer,
+    null,
+    parentComponent,
+    suspense,
+    namespace,
+    slotScopeIds
+  );
+  if (suspense.deps > 0) {
+    triggerEvent(vnode, "onPending");
+    triggerEvent(vnode, "onFallback");
+    patch(
+      null,
+      vnode.ssFallback,
+      container,
+      anchor,
+      parentComponent,
+      null,
+      // fallback tree will not have suspense context
+      namespace,
+      slotScopeIds
+    );
+    setActiveBranch(suspense, vnode.ssFallback);
+  } else {
+    suspense.resolve(false, true);
+  }
+}
+function patchSuspense(n1, n2, container, anchor, parentComponent, namespace, slotScopeIds, optimized, { p: patch, um: unmount, o: { createElement } }) {
+  const suspense = n2.suspense = n1.suspense;
+  suspense.vnode = n2;
+  n2.el = n1.el;
+  const newBranch = n2.ssContent;
+  const newFallback = n2.ssFallback;
+  const { activeBranch, pendingBranch, isInFallback, isHydrating } = suspense;
+  if (pendingBranch) {
+    suspense.pendingBranch = newBranch;
+    if (isSameVNodeType(newBranch, pendingBranch)) {
+      patch(
+        pendingBranch,
+        newBranch,
+        suspense.hiddenContainer,
+        null,
+        parentComponent,
+        suspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+      if (suspense.deps <= 0) {
+        suspense.resolve();
+      } else if (isInFallback) {
+        if (!isHydrating) {
+          patch(
+            activeBranch,
+            newFallback,
+            container,
+            anchor,
+            parentComponent,
+            null,
+            // fallback tree will not have suspense context
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+          setActiveBranch(suspense, newFallback);
+        }
+      }
+    } else {
+      suspense.pendingId = suspenseId++;
+      if (isHydrating) {
+        suspense.isHydrating = false;
+        suspense.activeBranch = pendingBranch;
+      } else {
+        unmount(pendingBranch, parentComponent, suspense);
+      }
+      suspense.deps = 0;
+      suspense.effects.length = 0;
+      suspense.hiddenContainer = createElement("div");
+      if (isInFallback) {
+        patch(
+          null,
+          newBranch,
+          suspense.hiddenContainer,
+          null,
+          parentComponent,
+          suspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+        if (suspense.deps <= 0) {
+          suspense.resolve();
+        } else {
+          patch(
+            activeBranch,
+            newFallback,
+            container,
+            anchor,
+            parentComponent,
+            null,
+            // fallback tree will not have suspense context
+            namespace,
+            slotScopeIds,
+            optimized
+          );
+          setActiveBranch(suspense, newFallback);
+        }
+      } else if (activeBranch && isSameVNodeType(newBranch, activeBranch)) {
+        patch(
+          activeBranch,
+          newBranch,
+          container,
+          anchor,
+          parentComponent,
+          suspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+        suspense.resolve(true);
+      } else {
+        patch(
+          null,
+          newBranch,
+          suspense.hiddenContainer,
+          null,
+          parentComponent,
+          suspense,
+          namespace,
+          slotScopeIds,
+          optimized
+        );
+        if (suspense.deps <= 0) {
+          suspense.resolve();
+        }
+      }
+    }
+  } else {
+    if (activeBranch && isSameVNodeType(newBranch, activeBranch)) {
+      patch(
+        activeBranch,
+        newBranch,
+        container,
+        anchor,
+        parentComponent,
+        suspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+      setActiveBranch(suspense, newBranch);
+    } else {
+      triggerEvent(n2, "onPending");
+      suspense.pendingBranch = newBranch;
+      if (newBranch.shapeFlag & 512) {
+        suspense.pendingId = newBranch.component.suspenseId;
+      } else {
+        suspense.pendingId = suspenseId++;
+      }
+      patch(
+        null,
+        newBranch,
+        suspense.hiddenContainer,
+        null,
+        parentComponent,
+        suspense,
+        namespace,
+        slotScopeIds,
+        optimized
+      );
+      if (suspense.deps <= 0) {
+        suspense.resolve();
+      } else {
+        const { timeout, pendingId } = suspense;
+        if (timeout > 0) {
+          setTimeout(() => {
+            if (suspense.pendingId === pendingId) {
+              suspense.fallback(newFallback);
+            }
+          }, timeout);
+        } else if (timeout === 0) {
+          suspense.fallback(newFallback);
+        }
+      }
+    }
+  }
+}
+let hasWarned = false;
+function createSuspenseBoundary(vnode, parentSuspense, parentComponent, container, hiddenContainer, anchor, namespace, slotScopeIds, optimized, rendererInternals, isHydrating = false) {
+  if (!hasWarned) {
+    hasWarned = true;
+    console[console.info ? "info" : "log"](
+      `<Suspense> is an experimental feature and its API will likely change.`
+    );
+  }
+  const {
+    p: patch,
+    m: move,
+    um: unmount,
+    n: next,
+    o: { parentNode, remove }
+  } = rendererInternals;
+  let parentSuspenseId;
+  const isSuspensible = isVNodeSuspensible(vnode);
+  if (isSuspensible) {
+    if (parentSuspense && parentSuspense.pendingBranch) {
+      parentSuspenseId = parentSuspense.pendingId;
+      parentSuspense.deps++;
+    }
+  }
+  const timeout = vnode.props ? toNumber(vnode.props.timeout) : void 0;
+  {
+    assertNumber(timeout, `Suspense timeout`);
+  }
+  const initialAnchor = anchor;
+  const suspense = {
+    vnode,
+    parent: parentSuspense,
+    parentComponent,
+    namespace,
+    container,
+    hiddenContainer,
+    deps: 0,
+    pendingId: suspenseId++,
+    timeout: typeof timeout === "number" ? timeout : -1,
+    activeBranch: null,
+    pendingBranch: null,
+    isInFallback: !isHydrating,
+    isHydrating,
+    isUnmounted: false,
+    effects: [],
+    resolve(resume = false, sync = false) {
+      {
+        if (!resume && !suspense.pendingBranch) {
+          throw new Error(
+            `suspense.resolve() is called without a pending branch.`
+          );
+        }
+        if (suspense.isUnmounted) {
+          throw new Error(
+            `suspense.resolve() is called on an already unmounted suspense boundary.`
+          );
+        }
+      }
+      const {
+        vnode: vnode2,
+        activeBranch,
+        pendingBranch,
+        pendingId,
+        effects,
+        parentComponent: parentComponent2,
+        container: container2
+      } = suspense;
+      let delayEnter = false;
+      if (suspense.isHydrating) {
+        suspense.isHydrating = false;
+      } else if (!resume) {
+        delayEnter = activeBranch && pendingBranch.transition && pendingBranch.transition.mode === "out-in";
+        if (delayEnter) {
+          activeBranch.transition.afterLeave = () => {
+            if (pendingId === suspense.pendingId) {
+              move(
+                pendingBranch,
+                container2,
+                anchor === initialAnchor ? next(activeBranch) : anchor,
+                0
+              );
+              queuePostFlushCb(effects);
+            }
+          };
+        }
+        if (activeBranch) {
+          if (parentNode(activeBranch.el) === container2) {
+            anchor = next(activeBranch);
+          }
+          unmount(activeBranch, parentComponent2, suspense, true);
+        }
+        if (!delayEnter) {
+          move(pendingBranch, container2, anchor, 0);
+        }
+      }
+      setActiveBranch(suspense, pendingBranch);
+      suspense.pendingBranch = null;
+      suspense.isInFallback = false;
+      let parent = suspense.parent;
+      let hasUnresolvedAncestor = false;
+      while (parent) {
+        if (parent.pendingBranch) {
+          parent.effects.push(...effects);
+          hasUnresolvedAncestor = true;
+          break;
+        }
+        parent = parent.parent;
+      }
+      if (!hasUnresolvedAncestor && !delayEnter) {
+        queuePostFlushCb(effects);
+      }
+      suspense.effects = [];
+      if (isSuspensible) {
+        if (parentSuspense && parentSuspense.pendingBranch && parentSuspenseId === parentSuspense.pendingId) {
+          parentSuspense.deps--;
+          if (parentSuspense.deps === 0 && !sync) {
+            parentSuspense.resolve();
+          }
+        }
+      }
+      triggerEvent(vnode2, "onResolve");
+    },
+    fallback(fallbackVNode) {
+      if (!suspense.pendingBranch) {
+        return;
+      }
+      const { vnode: vnode2, activeBranch, parentComponent: parentComponent2, container: container2, namespace: namespace2 } = suspense;
+      triggerEvent(vnode2, "onFallback");
+      const anchor2 = next(activeBranch);
+      const mountFallback = () => {
+        if (!suspense.isInFallback) {
+          return;
+        }
+        patch(
+          null,
+          fallbackVNode,
+          container2,
+          anchor2,
+          parentComponent2,
+          null,
+          // fallback tree will not have suspense context
+          namespace2,
+          slotScopeIds,
+          optimized
+        );
+        setActiveBranch(suspense, fallbackVNode);
+      };
+      const delayEnter = fallbackVNode.transition && fallbackVNode.transition.mode === "out-in";
+      if (delayEnter) {
+        activeBranch.transition.afterLeave = mountFallback;
+      }
+      suspense.isInFallback = true;
+      unmount(
+        activeBranch,
+        parentComponent2,
+        null,
+        // no suspense so unmount hooks fire now
+        true
+        // shouldRemove
+      );
+      if (!delayEnter) {
+        mountFallback();
+      }
+    },
+    move(container2, anchor2, type) {
+      suspense.activeBranch && move(suspense.activeBranch, container2, anchor2, type);
+      suspense.container = container2;
+    },
+    next() {
+      return suspense.activeBranch && next(suspense.activeBranch);
+    },
+    registerDep(instance, setupRenderEffect, optimized2) {
+      const isInPendingSuspense = !!suspense.pendingBranch;
+      if (isInPendingSuspense) {
+        suspense.deps++;
+      }
+      const hydratedEl = instance.vnode.el;
+      instance.asyncDep.catch((err) => {
+        handleError(err, instance, 0);
+      }).then((asyncSetupResult) => {
+        if (instance.isUnmounted || suspense.isUnmounted || suspense.pendingId !== instance.suspenseId) {
+          return;
+        }
+        instance.asyncResolved = true;
+        const { vnode: vnode2 } = instance;
+        {
+          pushWarningContext(vnode2);
+        }
+        handleSetupResult(instance, asyncSetupResult, false);
+        if (hydratedEl) {
+          vnode2.el = hydratedEl;
+        }
+        const placeholder = !hydratedEl && instance.subTree.el;
+        setupRenderEffect(
+          instance,
+          vnode2,
+          // component may have been moved before resolve.
+          // if this is not a hydration, instance.subTree will be the comment
+          // placeholder.
+          parentNode(hydratedEl || instance.subTree.el),
+          // anchor will not be used if this is hydration, so only need to
+          // consider the comment placeholder case.
+          hydratedEl ? null : next(instance.subTree),
+          suspense,
+          namespace,
+          optimized2
+        );
+        if (placeholder) {
+          remove(placeholder);
+        }
+        updateHOCHostEl(instance, vnode2.el);
+        {
+          popWarningContext();
+        }
+        if (isInPendingSuspense && --suspense.deps === 0) {
+          suspense.resolve();
+        }
+      });
+    },
+    unmount(parentSuspense2, doRemove) {
+      suspense.isUnmounted = true;
+      if (suspense.activeBranch) {
+        unmount(
+          suspense.activeBranch,
+          parentComponent,
+          parentSuspense2,
+          doRemove
+        );
+      }
+      if (suspense.pendingBranch) {
+        unmount(
+          suspense.pendingBranch,
+          parentComponent,
+          parentSuspense2,
+          doRemove
+        );
+      }
+    }
+  };
+  return suspense;
+}
+function hydrateSuspense(node, vnode, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, rendererInternals, hydrateNode) {
+  const suspense = vnode.suspense = createSuspenseBoundary(
+    vnode,
+    parentSuspense,
+    parentComponent,
+    node.parentNode,
+    // eslint-disable-next-line no-restricted-globals
+    document.createElement("div"),
+    null,
+    namespace,
+    slotScopeIds,
+    optimized,
+    rendererInternals,
+    true
+  );
+  const result = hydrateNode(
+    node,
+    suspense.pendingBranch = vnode.ssContent,
+    parentComponent,
+    suspense,
+    slotScopeIds,
+    optimized
+  );
+  if (suspense.deps === 0) {
+    suspense.resolve(false, true);
+  }
+  return result;
+}
+function normalizeSuspenseChildren(vnode) {
+  const { shapeFlag, children } = vnode;
+  const isSlotChildren = shapeFlag & 32;
+  vnode.ssContent = normalizeSuspenseSlot(
+    isSlotChildren ? children.default : children
+  );
+  vnode.ssFallback = isSlotChildren ? normalizeSuspenseSlot(children.fallback) : createVNode(Comment);
+}
+function normalizeSuspenseSlot(s) {
+  let block;
+  if (isFunction(s)) {
+    const trackBlock = isBlockTreeEnabled && s._c;
+    if (trackBlock) {
+      s._d = false;
+      openBlock();
+    }
+    s = s();
+    if (trackBlock) {
+      s._d = true;
+      block = currentBlock;
+      closeBlock();
+    }
+  }
+  if (isArray(s)) {
+    const singleChild = filterSingleRoot(s);
+    if (!singleChild && s.filter((child) => child !== NULL_DYNAMIC_COMPONENT).length > 0) {
+      warn$1(`<Suspense> slots expect a single root node.`);
+    }
+    s = singleChild;
+  }
+  s = normalizeVNode(s);
+  if (block && !s.dynamicChildren) {
+    s.dynamicChildren = block.filter((c) => c !== s);
+  }
+  return s;
+}
+function queueEffectWithSuspense(fn, suspense) {
+  if (suspense && suspense.pendingBranch) {
+    if (isArray(fn)) {
+      suspense.effects.push(...fn);
+    } else {
+      suspense.effects.push(fn);
+    }
+  } else {
+    queuePostFlushCb(fn);
+  }
+}
+function setActiveBranch(suspense, branch) {
+  suspense.activeBranch = branch;
+  const { vnode, parentComponent } = suspense;
+  let el = branch.el;
+  while (!el && branch.component) {
+    branch = branch.component.subTree;
+    el = branch.el;
+  }
+  vnode.el = el;
+  if (parentComponent && parentComponent.subTree === vnode) {
+    parentComponent.vnode.el = el;
+    updateHOCHostEl(parentComponent, el);
+  }
+}
+function isVNodeSuspensible(vnode) {
+  const suspensible = vnode.props && vnode.props.suspensible;
+  return suspensible != null && suspensible !== false;
+}
+
+const Fragment = Symbol.for("v-fgt");
+const Text = Symbol.for("v-txt");
+const Comment = Symbol.for("v-cmt");
+const Static = Symbol.for("v-stc");
+const blockStack = [];
+let currentBlock = null;
+function openBlock(disableTracking = false) {
+  blockStack.push(currentBlock = disableTracking ? null : []);
+}
+function closeBlock() {
+  blockStack.pop();
+  currentBlock = blockStack[blockStack.length - 1] || null;
+}
+let isBlockTreeEnabled = 1;
+function setBlockTracking(value) {
+  isBlockTreeEnabled += value;
+  if (value < 0 && currentBlock) {
+    currentBlock.hasOnce = true;
+  }
+}
+function setupBlock(vnode) {
+  vnode.dynamicChildren = isBlockTreeEnabled > 0 ? currentBlock || EMPTY_ARR : null;
+  closeBlock();
+  if (isBlockTreeEnabled > 0 && currentBlock) {
+    currentBlock.push(vnode);
+  }
+  return vnode;
+}
+function createElementBlock(type, props, children, patchFlag, dynamicProps, shapeFlag) {
+  return setupBlock(
+    createBaseVNode(
+      type,
+      props,
+      children,
+      patchFlag,
+      dynamicProps,
+      shapeFlag,
+      true
+    )
+  );
+}
+function createBlock(type, props, children, patchFlag, dynamicProps) {
+  return setupBlock(
+    createVNode(
+      type,
+      props,
+      children,
+      patchFlag,
+      dynamicProps,
+      true
+    )
+  );
+}
+function isVNode(value) {
+  return value ? value.__v_isVNode === true : false;
+}
+function isSameVNodeType(n1, n2) {
+  if (n2.shapeFlag & 6 && n1.component) {
+    const dirtyInstances = hmrDirtyComponents.get(n2.type);
+    if (dirtyInstances && dirtyInstances.has(n1.component)) {
+      n1.shapeFlag &= ~256;
+      n2.shapeFlag &= ~512;
+      return false;
+    }
+  }
+  return n1.type === n2.type && n1.key === n2.key;
+}
+let vnodeArgsTransformer;
+function transformVNodeArgs(transformer) {
+  vnodeArgsTransformer = transformer;
+}
+const createVNodeWithArgsTransform = (...args) => {
+  return _createVNode(
+    ...vnodeArgsTransformer ? vnodeArgsTransformer(args, currentRenderingInstance) : args
+  );
+};
+const normalizeKey = ({ key }) => key != null ? key : null;
+const normalizeRef = ({
+  ref,
+  ref_key,
+  ref_for
+}) => {
+  if (typeof ref === "number") {
+    ref = "" + ref;
+  }
+  return ref != null ? isString(ref) || isRef(ref) || isFunction(ref) ? { i: currentRenderingInstance, r: ref, k: ref_key, f: !!ref_for } : ref : null;
+};
+function createBaseVNode(type, props = null, children = null, patchFlag = 0, dynamicProps = null, shapeFlag = type === Fragment ? 0 : 1, isBlockNode = false, needFullChildrenNormalization = false) {
+  const vnode = {
+    __v_isVNode: true,
+    __v_skip: true,
+    type,
+    props,
+    key: props && normalizeKey(props),
+    ref: props && normalizeRef(props),
+    scopeId: currentScopeId,
+    slotScopeIds: null,
+    children,
+    component: null,
+    suspense: null,
+    ssContent: null,
+    ssFallback: null,
+    dirs: null,
+    transition: null,
+    el: null,
+    anchor: null,
+    target: null,
+    targetStart: null,
+    targetAnchor: null,
+    staticCount: 0,
+    shapeFlag,
+    patchFlag,
+    dynamicProps,
+    dynamicChildren: null,
+    appContext: null,
+    ctx: currentRenderingInstance
+  };
+  if (needFullChildrenNormalization) {
+    normalizeChildren(vnode, children);
+    if (shapeFlag & 128) {
+      type.normalize(vnode);
+    }
+  } else if (children) {
+    vnode.shapeFlag |= isString(children) ? 8 : 16;
+  }
+  if (vnode.key !== vnode.key) {
+    warn$1(`VNode created with invalid key (NaN). VNode type:`, vnode.type);
+  }
+  if (isBlockTreeEnabled > 0 && // avoid a block node from tracking itself
+  !isBlockNode && // has current parent block
+  currentBlock && // presence of a patch flag indicates this node needs patching on updates.
+  // component nodes also should always be patched, because even if the
+  // component doesn't need to update, it needs to persist the instance on to
+  // the next vnode so that it can be properly unmounted later.
+  (vnode.patchFlag > 0 || shapeFlag & 6) && // the EVENTS flag is only for hydration and if it is the only flag, the
+  // vnode should not be considered dynamic due to handler caching.
+  vnode.patchFlag !== 32) {
+    currentBlock.push(vnode);
+  }
+  return vnode;
+}
+const createVNode = createVNodeWithArgsTransform ;
+function _createVNode(type, props = null, children = null, patchFlag = 0, dynamicProps = null, isBlockNode = false) {
+  if (!type || type === NULL_DYNAMIC_COMPONENT) {
+    if (!type) {
+      warn$1(`Invalid vnode type when creating vnode: ${type}.`);
+    }
+    type = Comment;
+  }
+  if (isVNode(type)) {
+    const cloned = cloneVNode(
+      type,
+      props,
+      true
+      /* mergeRef: true */
+    );
+    if (children) {
+      normalizeChildren(cloned, children);
+    }
+    if (isBlockTreeEnabled > 0 && !isBlockNode && currentBlock) {
+      if (cloned.shapeFlag & 6) {
+        currentBlock[currentBlock.indexOf(type)] = cloned;
+      } else {
+        currentBlock.push(cloned);
+      }
+    }
+    cloned.patchFlag = -2;
+    return cloned;
+  }
+  if (isClassComponent(type)) {
+    type = type.__vccOpts;
+  }
+  if (props) {
+    props = guardReactiveProps(props);
+    let { class: klass, style } = props;
+    if (klass && !isString(klass)) {
+      props.class = normalizeClass(klass);
+    }
+    if (isObject(style)) {
+      if (isProxy(style) && !isArray(style)) {
+        style = extend({}, style);
+      }
+      props.style = normalizeStyle(style);
+    }
+  }
+  const shapeFlag = isString(type) ? 1 : isSuspense(type) ? 128 : isTeleport(type) ? 64 : isObject(type) ? 4 : isFunction(type) ? 2 : 0;
+  if (shapeFlag & 4 && isProxy(type)) {
+    type = toRaw(type);
+    warn$1(
+      `Vue received a Component that was made a reactive object. This can lead to unnecessary performance overhead and should be avoided by marking the component with \`markRaw\` or using \`shallowRef\` instead of \`ref\`.`,
+      `
+Component that was made reactive: `,
+      type
+    );
+  }
+  return createBaseVNode(
+    type,
+    props,
+    children,
+    patchFlag,
+    dynamicProps,
+    shapeFlag,
+    isBlockNode,
+    true
+  );
+}
+function guardReactiveProps(props) {
+  if (!props) return null;
+  return isProxy(props) || isInternalObject(props) ? extend({}, props) : props;
+}
+function cloneVNode(vnode, extraProps, mergeRef = false, cloneTransition = false) {
+  const { props, ref, patchFlag, children, transition } = vnode;
+  const mergedProps = extraProps ? mergeProps(props || {}, extraProps) : props;
+  const cloned = {
+    __v_isVNode: true,
+    __v_skip: true,
+    type: vnode.type,
+    props: mergedProps,
+    key: mergedProps && normalizeKey(mergedProps),
+    ref: extraProps && extraProps.ref ? (
+      // #2078 in the case of <component :is="vnode" ref="extra"/>
+      // if the vnode itself already has a ref, cloneVNode will need to merge
+      // the refs so the single vnode can be set on multiple refs
+      mergeRef && ref ? isArray(ref) ? ref.concat(normalizeRef(extraProps)) : [ref, normalizeRef(extraProps)] : normalizeRef(extraProps)
+    ) : ref,
+    scopeId: vnode.scopeId,
+    slotScopeIds: vnode.slotScopeIds,
+    children: patchFlag === -1 && isArray(children) ? children.map(deepCloneVNode) : children,
+    target: vnode.target,
+    targetStart: vnode.targetStart,
+    targetAnchor: vnode.targetAnchor,
+    staticCount: vnode.staticCount,
+    shapeFlag: vnode.shapeFlag,
+    // if the vnode is cloned with extra props, we can no longer assume its
+    // existing patch flag to be reliable and need to add the FULL_PROPS flag.
+    // note: preserve flag for fragments since they use the flag for children
+    // fast paths only.
+    patchFlag: extraProps && vnode.type !== Fragment ? patchFlag === -1 ? 16 : patchFlag | 16 : patchFlag,
+    dynamicProps: vnode.dynamicProps,
+    dynamicChildren: vnode.dynamicChildren,
+    appContext: vnode.appContext,
+    dirs: vnode.dirs,
+    transition,
+    // These should technically only be non-null on mounted VNodes. However,
+    // they *should* be copied for kept-alive vnodes. So we just always copy
+    // them since them being non-null during a mount doesn't affect the logic as
+    // they will simply be overwritten.
+    component: vnode.component,
+    suspense: vnode.suspense,
+    ssContent: vnode.ssContent && cloneVNode(vnode.ssContent),
+    ssFallback: vnode.ssFallback && cloneVNode(vnode.ssFallback),
+    el: vnode.el,
+    anchor: vnode.anchor,
+    ctx: vnode.ctx,
+    ce: vnode.ce
+  };
+  if (transition && cloneTransition) {
+    setTransitionHooks(
+      cloned,
+      transition.clone(cloned)
+    );
+  }
+  return cloned;
+}
+function deepCloneVNode(vnode) {
+  const cloned = cloneVNode(vnode);
+  if (isArray(vnode.children)) {
+    cloned.children = vnode.children.map(deepCloneVNode);
+  }
+  return cloned;
+}
+function createTextVNode(text = " ", flag = 0) {
+  return createVNode(Text, null, text, flag);
+}
+function createStaticVNode(content, numberOfNodes) {
+  const vnode = createVNode(Static, null, content);
+  vnode.staticCount = numberOfNodes;
+  return vnode;
+}
+function createCommentVNode(text = "", asBlock = false) {
+  return asBlock ? (openBlock(), createBlock(Comment, null, text)) : createVNode(Comment, null, text);
+}
+function normalizeVNode(child) {
+  if (child == null || typeof child === "boolean") {
+    return createVNode(Comment);
+  } else if (isArray(child)) {
+    return createVNode(
+      Fragment,
+      null,
+      // #3666, avoid reference pollution when reusing vnode
+      child.slice()
+    );
+  } else if (isVNode(child)) {
+    return cloneIfMounted(child);
+  } else {
+    return createVNode(Text, null, String(child));
+  }
+}
+function cloneIfMounted(child) {
+  return child.el === null && child.patchFlag !== -1 || child.memo ? child : cloneVNode(child);
+}
+function normalizeChildren(vnode, children) {
+  let type = 0;
+  const { shapeFlag } = vnode;
+  if (children == null) {
+    children = null;
+  } else if (isArray(children)) {
+    type = 16;
+  } else if (typeof children === "object") {
+    if (shapeFlag & (1 | 64)) {
+      const slot = children.default;
+      if (slot) {
+        slot._c && (slot._d = false);
+        normalizeChildren(vnode, slot());
+        slot._c && (slot._d = true);
+      }
+      return;
+    } else {
+      type = 32;
+      const slotFlag = children._;
+      if (!slotFlag && !isInternalObject(children)) {
+        children._ctx = currentRenderingInstance;
+      } else if (slotFlag === 3 && currentRenderingInstance) {
+        if (currentRenderingInstance.slots._ === 1) {
+          children._ = 1;
+        } else {
+          children._ = 2;
+          vnode.patchFlag |= 1024;
+        }
+      }
+    }
+  } else if (isFunction(children)) {
+    children = { default: children, _ctx: currentRenderingInstance };
+    type = 32;
+  } else {
+    children = String(children);
+    if (shapeFlag & 64) {
+      type = 16;
+      children = [createTextVNode(children)];
+    } else {
+      type = 8;
+    }
+  }
+  vnode.children = children;
+  vnode.shapeFlag |= type;
+}
+function mergeProps(...args) {
+  const ret = {};
+  for (let i = 0; i < args.length; i++) {
+    const toMerge = args[i];
+    for (const key in toMerge) {
+      if (key === "class") {
+        if (ret.class !== toMerge.class) {
+          ret.class = normalizeClass([ret.class, toMerge.class]);
+        }
+      } else if (key === "style") {
+        ret.style = normalizeStyle([ret.style, toMerge.style]);
+      } else if (isOn(key)) {
+        const existing = ret[key];
+        const incoming = toMerge[key];
+        if (incoming && existing !== incoming && !(isArray(existing) && existing.includes(incoming))) {
+          ret[key] = existing ? [].concat(existing, incoming) : incoming;
+        }
+      } else if (key !== "") {
+        ret[key] = toMerge[key];
+      }
+    }
+  }
+  return ret;
+}
+function invokeVNodeHook(hook, instance, vnode, prevVNode = null) {
+  callWithAsyncErrorHandling(hook, instance, 7, [
+    vnode,
+    prevVNode
+  ]);
+}
+
+const emptyAppContext = createAppContext();
+let uid = 0;
+function createComponentInstance(vnode, parent, suspense) {
+  const type = vnode.type;
+  const appContext = (parent ? parent.appContext : vnode.appContext) || emptyAppContext;
+  const instance = {
+    uid: uid++,
+    vnode,
+    type,
+    parent,
+    appContext,
+    root: null,
+    // to be immediately set
+    next: null,
+    subTree: null,
+    // will be set synchronously right after creation
+    effect: null,
+    update: null,
+    // will be set synchronously right after creation
+    job: null,
+    scope: new EffectScope(
+      true
+      /* detached */
+    ),
+    render: null,
+    proxy: null,
+    exposed: null,
+    exposeProxy: null,
+    withProxy: null,
+    provides: parent ? parent.provides : Object.create(appContext.provides),
+    ids: parent ? parent.ids : ["", 0, 0],
+    accessCache: null,
+    renderCache: [],
+    // local resolved assets
+    components: null,
+    directives: null,
+    // resolved props and emits options
+    propsOptions: normalizePropsOptions(type, appContext),
+    emitsOptions: normalizeEmitsOptions(type, appContext),
+    // emit
+    emit: null,
+    // to be set immediately
+    emitted: null,
+    // props default value
+    propsDefaults: EMPTY_OBJ,
+    // inheritAttrs
+    inheritAttrs: type.inheritAttrs,
+    // state
+    ctx: EMPTY_OBJ,
+    data: EMPTY_OBJ,
+    props: EMPTY_OBJ,
+    attrs: EMPTY_OBJ,
+    slots: EMPTY_OBJ,
+    refs: EMPTY_OBJ,
+    setupState: EMPTY_OBJ,
+    setupContext: null,
+    // suspense related
+    suspense,
+    suspenseId: suspense ? suspense.pendingId : 0,
+    asyncDep: null,
+    asyncResolved: false,
+    // lifecycle hooks
+    // not using enums here because it results in computed properties
+    isMounted: false,
+    isUnmounted: false,
+    isDeactivated: false,
+    bc: null,
+    c: null,
+    bm: null,
+    m: null,
+    bu: null,
+    u: null,
+    um: null,
+    bum: null,
+    da: null,
+    a: null,
+    rtg: null,
+    rtc: null,
+    ec: null,
+    sp: null
+  };
+  {
+    instance.ctx = createDevRenderContext(instance);
+  }
+  instance.root = parent ? parent.root : instance;
+  instance.emit = emit.bind(null, instance);
+  if (vnode.ce) {
+    vnode.ce(instance);
+  }
+  return instance;
+}
+let currentInstance = null;
+const getCurrentInstance = () => currentInstance || currentRenderingInstance;
+let internalSetCurrentInstance;
+let setInSSRSetupState;
+{
+  const g = getGlobalThis();
+  const registerGlobalSetter = (key, setter) => {
+    let setters;
+    if (!(setters = g[key])) setters = g[key] = [];
+    setters.push(setter);
+    return (v) => {
+      if (setters.length > 1) setters.forEach((set) => set(v));
+      else setters[0](v);
+    };
+  };
+  internalSetCurrentInstance = registerGlobalSetter(
+    `__VUE_INSTANCE_SETTERS__`,
+    (v) => currentInstance = v
+  );
+  setInSSRSetupState = registerGlobalSetter(
+    `__VUE_SSR_SETTERS__`,
+    (v) => isInSSRComponentSetup = v
+  );
+}
+const setCurrentInstance = (instance) => {
+  const prev = currentInstance;
+  internalSetCurrentInstance(instance);
+  instance.scope.on();
+  return () => {
+    instance.scope.off();
+    internalSetCurrentInstance(prev);
+  };
+};
+const unsetCurrentInstance = () => {
+  currentInstance && currentInstance.scope.off();
+  internalSetCurrentInstance(null);
+};
+const isBuiltInTag = /* @__PURE__ */ makeMap("slot,component");
+function validateComponentName(name, { isNativeTag }) {
+  if (isBuiltInTag(name) || isNativeTag(name)) {
+    warn$1(
+      "Do not use built-in or reserved HTML elements as component id: " + name
+    );
+  }
+}
+function isStatefulComponent(instance) {
+  return instance.vnode.shapeFlag & 4;
+}
+let isInSSRComponentSetup = false;
+function setupComponent(instance, isSSR = false, optimized = false) {
+  isSSR && setInSSRSetupState(isSSR);
+  const { props, children } = instance.vnode;
+  const isStateful = isStatefulComponent(instance);
+  initProps(instance, props, isStateful, isSSR);
+  initSlots(instance, children, optimized);
+  const setupResult = isStateful ? setupStatefulComponent(instance, isSSR) : void 0;
+  isSSR && setInSSRSetupState(false);
+  return setupResult;
+}
+function setupStatefulComponent(instance, isSSR) {
+  var _a;
+  const Component = instance.type;
+  {
+    if (Component.name) {
+      validateComponentName(Component.name, instance.appContext.config);
+    }
+    if (Component.components) {
+      const names = Object.keys(Component.components);
+      for (let i = 0; i < names.length; i++) {
+        validateComponentName(names[i], instance.appContext.config);
+      }
+    }
+    if (Component.directives) {
+      const names = Object.keys(Component.directives);
+      for (let i = 0; i < names.length; i++) {
+        validateDirectiveName(names[i]);
+      }
+    }
+    if (Component.compilerOptions && isRuntimeOnly()) {
+      warn$1(
+        `"compilerOptions" is only supported when using a build of Vue that includes the runtime compiler. Since you are using a runtime-only build, the options should be passed via your build tool config instead.`
+      );
+    }
+  }
+  instance.accessCache = /* @__PURE__ */ Object.create(null);
+  instance.proxy = new Proxy(instance.ctx, PublicInstanceProxyHandlers);
+  {
+    exposePropsOnRenderContext(instance);
+  }
+  const { setup } = Component;
+  if (setup) {
+    pauseTracking();
+    const setupContext = instance.setupContext = setup.length > 1 ? createSetupContext(instance) : null;
+    const reset = setCurrentInstance(instance);
+    const setupResult = callWithErrorHandling(
+      setup,
+      instance,
+      0,
+      [
+        shallowReadonly(instance.props) ,
+        setupContext
+      ]
+    );
+    const isAsyncSetup = isPromise(setupResult);
+    resetTracking();
+    reset();
+    if ((isAsyncSetup || instance.sp) && !isAsyncWrapper(instance)) {
+      markAsyncBoundary(instance);
+    }
+    if (isAsyncSetup) {
+      setupResult.then(unsetCurrentInstance, unsetCurrentInstance);
+      if (isSSR) {
+        return setupResult.then((resolvedResult) => {
+          handleSetupResult(instance, resolvedResult, isSSR);
+        }).catch((e) => {
+          handleError(e, instance, 0);
+        });
+      } else {
+        instance.asyncDep = setupResult;
+        if (!instance.suspense) {
+          const name = (_a = Component.name) != null ? _a : "Anonymous";
+          warn$1(
+            `Component <${name}>: setup function returned a promise, but no <Suspense> boundary was found in the parent component tree. A component with async setup() must be nested in a <Suspense> in order to be rendered.`
+          );
+        }
+      }
+    } else {
+      handleSetupResult(instance, setupResult, isSSR);
+    }
+  } else {
+    finishComponentSetup(instance, isSSR);
+  }
+}
+function handleSetupResult(instance, setupResult, isSSR) {
+  if (isFunction(setupResult)) {
+    if (instance.type.__ssrInlineRender) {
+      instance.ssrRender = setupResult;
+    } else {
+      instance.render = setupResult;
+    }
+  } else if (isObject(setupResult)) {
+    if (isVNode(setupResult)) {
+      warn$1(
+        `setup() should not return VNodes directly - return a render function instead.`
+      );
+    }
+    {
+      instance.devtoolsRawSetupState = setupResult;
+    }
+    instance.setupState = proxyRefs(setupResult);
+    {
+      exposeSetupStateOnRenderContext(instance);
+    }
+  } else if (setupResult !== void 0) {
+    warn$1(
+      `setup() should return an object. Received: ${setupResult === null ? "null" : typeof setupResult}`
+    );
+  }
+  finishComponentSetup(instance, isSSR);
+}
+let compile$1;
+let installWithProxy;
+function registerRuntimeCompiler(_compile) {
+  compile$1 = _compile;
+  installWithProxy = (i) => {
+    if (i.render._rc) {
+      i.withProxy = new Proxy(i.ctx, RuntimeCompiledPublicInstanceProxyHandlers);
+    }
+  };
+}
+const isRuntimeOnly = () => !compile$1;
+function finishComponentSetup(instance, isSSR, skipOptions) {
+  const Component = instance.type;
+  if (!instance.render) {
+    if (!isSSR && compile$1 && !Component.render) {
+      const template = Component.template || resolveMergedOptions(instance).template;
+      if (template) {
+        {
+          startMeasure(instance, `compile`);
+        }
+        const { isCustomElement, compilerOptions } = instance.appContext.config;
+        const { delimiters, compilerOptions: componentCompilerOptions } = Component;
+        const finalCompilerOptions = extend(
+          extend(
+            {
+              isCustomElement,
+              delimiters
+            },
+            compilerOptions
+          ),
+          componentCompilerOptions
+        );
+        Component.render = compile$1(template, finalCompilerOptions);
+        {
+          endMeasure(instance, `compile`);
+        }
+      }
+    }
+    instance.render = Component.render || NOOP;
+    if (installWithProxy) {
+      installWithProxy(instance);
+    }
+  }
+  {
+    const reset = setCurrentInstance(instance);
+    pauseTracking();
+    try {
+      applyOptions(instance);
+    } finally {
+      resetTracking();
+      reset();
+    }
+  }
+  if (!Component.render && instance.render === NOOP && !isSSR) {
+    if (!compile$1 && Component.template) {
+      warn$1(
+        `Component provided template option but runtime compilation is not supported in this build of Vue.` + (` Use "vue.esm-browser.js" instead.` )
+      );
+    } else {
+      warn$1(`Component is missing template or render function: `, Component);
+    }
+  }
+}
+const attrsProxyHandlers = {
+  get(target, key) {
+    markAttrsAccessed();
+    track(target, "get", "");
+    return target[key];
+  },
+  set() {
+    warn$1(`setupContext.attrs is readonly.`);
+    return false;
+  },
+  deleteProperty() {
+    warn$1(`setupContext.attrs is readonly.`);
+    return false;
+  }
+} ;
+function getSlotsProxy(instance) {
+  return new Proxy(instance.slots, {
+    get(target, key) {
+      track(instance, "get", "$slots");
+      return target[key];
+    }
+  });
+}
+function createSetupContext(instance) {
+  const expose = (exposed) => {
+    {
+      if (instance.exposed) {
+        warn$1(`expose() should be called only once per setup().`);
+      }
+      if (exposed != null) {
+        let exposedType = typeof exposed;
+        if (exposedType === "object") {
+          if (isArray(exposed)) {
+            exposedType = "array";
+          } else if (isRef(exposed)) {
+            exposedType = "ref";
+          }
+        }
+        if (exposedType !== "object") {
+          warn$1(
+            `expose() should be passed a plain object, received ${exposedType}.`
+          );
+        }
+      }
+    }
+    instance.exposed = exposed || {};
+  };
+  {
+    let attrsProxy;
+    let slotsProxy;
+    return Object.freeze({
+      get attrs() {
+        return attrsProxy || (attrsProxy = new Proxy(instance.attrs, attrsProxyHandlers));
+      },
+      get slots() {
+        return slotsProxy || (slotsProxy = getSlotsProxy(instance));
+      },
+      get emit() {
+        return (event, ...args) => instance.emit(event, ...args);
+      },
+      expose
+    });
+  }
+}
+function getComponentPublicInstance(instance) {
+  if (instance.exposed) {
+    return instance.exposeProxy || (instance.exposeProxy = new Proxy(proxyRefs(markRaw(instance.exposed)), {
+      get(target, key) {
+        if (key in target) {
+          return target[key];
+        } else if (key in publicPropertiesMap) {
+          return publicPropertiesMap[key](instance);
+        }
+      },
+      has(target, key) {
+        return key in target || key in publicPropertiesMap;
+      }
+    }));
+  } else {
+    return instance.proxy;
+  }
+}
+const classifyRE = /(?:^|[-_])(\w)/g;
+const classify = (str) => str.replace(classifyRE, (c) => c.toUpperCase()).replace(/[-_]/g, "");
+function getComponentName(Component, includeInferred = true) {
+  return isFunction(Component) ? Component.displayName || Component.name : Component.name || includeInferred && Component.__name;
+}
+function formatComponentName(instance, Component, isRoot = false) {
+  let name = getComponentName(Component);
+  if (!name && Component.__file) {
+    const match = Component.__file.match(/([^/\\]+)\.\w+$/);
+    if (match) {
+      name = match[1];
+    }
+  }
+  if (!name && instance && instance.parent) {
+    const inferFromRegistry = (registry) => {
+      for (const key in registry) {
+        if (registry[key] === Component) {
+          return key;
+        }
+      }
+    };
+    name = inferFromRegistry(
+      instance.components || instance.parent.type.components
+    ) || inferFromRegistry(instance.appContext.components);
+  }
+  return name ? classify(name) : isRoot ? `App` : `Anonymous`;
+}
+function isClassComponent(value) {
+  return isFunction(value) && "__vccOpts" in value;
+}
+
+const computed = (getterOrOptions, debugOptions) => {
+  const c = computed$1(getterOrOptions, debugOptions, isInSSRComponentSetup);
+  {
+    const i = getCurrentInstance();
+    if (i && i.appContext.config.warnRecursiveComputed) {
+      c._warnRecursive = true;
+    }
+  }
+  return c;
+};
+
+function h(type, propsOrChildren, children) {
+  const l = arguments.length;
+  if (l === 2) {
+    if (isObject(propsOrChildren) && !isArray(propsOrChildren)) {
+      if (isVNode(propsOrChildren)) {
+        return createVNode(type, null, [propsOrChildren]);
+      }
+      return createVNode(type, propsOrChildren);
+    } else {
+      return createVNode(type, null, propsOrChildren);
+    }
+  } else {
+    if (l > 3) {
+      children = Array.prototype.slice.call(arguments, 2);
+    } else if (l === 3 && isVNode(children)) {
+      children = [children];
+    }
+    return createVNode(type, propsOrChildren, children);
+  }
+}
+
+function initCustomFormatter() {
+  if (typeof window === "undefined") {
+    return;
+  }
+  const vueStyle = { style: "color:#3ba776" };
+  const numberStyle = { style: "color:#1677ff" };
+  const stringStyle = { style: "color:#f5222d" };
+  const keywordStyle = { style: "color:#eb2f96" };
+  const formatter = {
+    __vue_custom_formatter: true,
+    header(obj) {
+      if (!isObject(obj)) {
+        return null;
+      }
+      if (obj.__isVue) {
+        return ["div", vueStyle, `VueInstance`];
+      } else if (isRef(obj)) {
+        return [
+          "div",
+          {},
+          ["span", vueStyle, genRefFlag(obj)],
+          "<",
+          // avoid debugger accessing value affecting behavior
+          formatValue("_value" in obj ? obj._value : obj),
+          `>`
+        ];
+      } else if (isReactive(obj)) {
+        return [
+          "div",
+          {},
+          ["span", vueStyle, isShallow(obj) ? "ShallowReactive" : "Reactive"],
+          "<",
+          formatValue(obj),
+          `>${isReadonly(obj) ? ` (readonly)` : ``}`
+        ];
+      } else if (isReadonly(obj)) {
+        return [
+          "div",
+          {},
+          ["span", vueStyle, isShallow(obj) ? "ShallowReadonly" : "Readonly"],
+          "<",
+          formatValue(obj),
+          ">"
+        ];
+      }
+      return null;
+    },
+    hasBody(obj) {
+      return obj && obj.__isVue;
+    },
+    body(obj) {
+      if (obj && obj.__isVue) {
+        return [
+          "div",
+          {},
+          ...formatInstance(obj.$)
+        ];
+      }
+    }
+  };
+  function formatInstance(instance) {
+    const blocks = [];
+    if (instance.type.props && instance.props) {
+      blocks.push(createInstanceBlock("props", toRaw(instance.props)));
+    }
+    if (instance.setupState !== EMPTY_OBJ) {
+      blocks.push(createInstanceBlock("setup", instance.setupState));
+    }
+    if (instance.data !== EMPTY_OBJ) {
+      blocks.push(createInstanceBlock("data", toRaw(instance.data)));
+    }
+    const computed = extractKeys(instance, "computed");
+    if (computed) {
+      blocks.push(createInstanceBlock("computed", computed));
+    }
+    const injected = extractKeys(instance, "inject");
+    if (injected) {
+      blocks.push(createInstanceBlock("injected", injected));
+    }
+    blocks.push([
+      "div",
+      {},
+      [
+        "span",
+        {
+          style: keywordStyle.style + ";opacity:0.66"
+        },
+        "$ (internal): "
+      ],
+      ["object", { object: instance }]
+    ]);
+    return blocks;
+  }
+  function createInstanceBlock(type, target) {
+    target = extend({}, target);
+    if (!Object.keys(target).length) {
+      return ["span", {}];
+    }
+    return [
+      "div",
+      { style: "line-height:1.25em;margin-bottom:0.6em" },
+      [
+        "div",
+        {
+          style: "color:#476582"
+        },
+        type
+      ],
+      [
+        "div",
+        {
+          style: "padding-left:1.25em"
+        },
+        ...Object.keys(target).map((key) => {
+          return [
+            "div",
+            {},
+            ["span", keywordStyle, key + ": "],
+            formatValue(target[key], false)
+          ];
+        })
+      ]
+    ];
+  }
+  function formatValue(v, asRaw = true) {
+    if (typeof v === "number") {
+      return ["span", numberStyle, v];
+    } else if (typeof v === "string") {
+      return ["span", stringStyle, JSON.stringify(v)];
+    } else if (typeof v === "boolean") {
+      return ["span", keywordStyle, v];
+    } else if (isObject(v)) {
+      return ["object", { object: asRaw ? toRaw(v) : v }];
+    } else {
+      return ["span", stringStyle, String(v)];
+    }
+  }
+  function extractKeys(instance, type) {
+    const Comp = instance.type;
+    if (isFunction(Comp)) {
+      return;
+    }
+    const extracted = {};
+    for (const key in instance.ctx) {
+      if (isKeyOfType(Comp, key, type)) {
+        extracted[key] = instance.ctx[key];
+      }
+    }
+    return extracted;
+  }
+  function isKeyOfType(Comp, key, type) {
+    const opts = Comp[type];
+    if (isArray(opts) && opts.includes(key) || isObject(opts) && key in opts) {
+      return true;
+    }
+    if (Comp.extends && isKeyOfType(Comp.extends, key, type)) {
+      return true;
+    }
+    if (Comp.mixins && Comp.mixins.some((m) => isKeyOfType(m, key, type))) {
+      return true;
+    }
+  }
+  function genRefFlag(v) {
+    if (isShallow(v)) {
+      return `ShallowRef`;
+    }
+    if (v.effect) {
+      return `ComputedRef`;
+    }
+    return `Ref`;
+  }
+  if (window.devtoolsFormatters) {
+    window.devtoolsFormatters.push(formatter);
+  } else {
+    window.devtoolsFormatters = [formatter];
+  }
+}
+
+function withMemo(memo, render, cache, index) {
+  const cached = cache[index];
+  if (cached && isMemoSame(cached, memo)) {
+    return cached;
+  }
+  const ret = render();
+  ret.memo = memo.slice();
+  ret.cacheIndex = index;
+  return cache[index] = ret;
+}
+function isMemoSame(cached, memo) {
+  const prev = cached.memo;
+  if (prev.length != memo.length) {
+    return false;
+  }
+  for (let i = 0; i < prev.length; i++) {
+    if (hasChanged(prev[i], memo[i])) {
+      return false;
+    }
+  }
+  if (isBlockTreeEnabled > 0 && currentBlock) {
+    currentBlock.push(cached);
+  }
+  return true;
+}
+
+const version = "3.5.12";
+const warn = warn$1 ;
+const ErrorTypeStrings = ErrorTypeStrings$1 ;
+const devtools = devtools$1 ;
+const setDevtoolsHook = setDevtoolsHook$1 ;
+const _ssrUtils = {
+  createComponentInstance,
+  setupComponent,
+  renderComponentRoot,
+  setCurrentRenderingInstance,
+  isVNode: isVNode,
+  normalizeVNode,
+  getComponentPublicInstance,
+  ensureValidVNode,
+  pushWarningContext,
+  popWarningContext
+};
+const ssrUtils = _ssrUtils ;
+const resolveFilter = null;
+const compatUtils = null;
+const DeprecationTypes = null;
+
+let policy = void 0;
+const tt = typeof window !== "undefined" && window.trustedTypes;
+if (tt) {
+  try {
+    policy = /* @__PURE__ */ tt.createPolicy("vue", {
+      createHTML: (val) => val
+    });
+  } catch (e) {
+    warn(`Error creating trusted types policy: ${e}`);
+  }
+}
+const unsafeToTrustedHTML = policy ? (val) => policy.createHTML(val) : (val) => val;
+const svgNS = "http://www.w3.org/2000/svg";
+const mathmlNS = "http://www.w3.org/1998/Math/MathML";
+const doc = typeof document !== "undefined" ? document : null;
+const templateContainer = doc && /* @__PURE__ */ doc.createElement("template");
+const nodeOps = {
+  insert: (child, parent, anchor) => {
+    parent.insertBefore(child, anchor || null);
+  },
+  remove: (child) => {
+    const parent = child.parentNode;
+    if (parent) {
+      parent.removeChild(child);
+    }
+  },
+  createElement: (tag, namespace, is, props) => {
+    const el = namespace === "svg" ? doc.createElementNS(svgNS, tag) : namespace === "mathml" ? doc.createElementNS(mathmlNS, tag) : is ? doc.createElement(tag, { is }) : doc.createElement(tag);
+    if (tag === "select" && props && props.multiple != null) {
+      el.setAttribute("multiple", props.multiple);
+    }
+    return el;
+  },
+  createText: (text) => doc.createTextNode(text),
+  createComment: (text) => doc.createComment(text),
+  setText: (node, text) => {
+    node.nodeValue = text;
+  },
+  setElementText: (el, text) => {
+    el.textContent = text;
+  },
+  parentNode: (node) => node.parentNode,
+  nextSibling: (node) => node.nextSibling,
+  querySelector: (selector) => doc.querySelector(selector),
+  setScopeId(el, id) {
+    el.setAttribute(id, "");
+  },
+  // __UNSAFE__
+  // Reason: innerHTML.
+  // Static content here can only come from compiled templates.
+  // As long as the user only uses trusted templates, this is safe.
+  insertStaticContent(content, parent, anchor, namespace, start, end) {
+    const before = anchor ? anchor.previousSibling : parent.lastChild;
+    if (start && (start === end || start.nextSibling)) {
+      while (true) {
+        parent.insertBefore(start.cloneNode(true), anchor);
+        if (start === end || !(start = start.nextSibling)) break;
+      }
+    } else {
+      templateContainer.innerHTML = unsafeToTrustedHTML(
+        namespace === "svg" ? `<svg>${content}</svg>` : namespace === "mathml" ? `<math>${content}</math>` : content
+      );
+      const template = templateContainer.content;
+      if (namespace === "svg" || namespace === "mathml") {
+        const wrapper = template.firstChild;
+        while (wrapper.firstChild) {
+          template.appendChild(wrapper.firstChild);
+        }
+        template.removeChild(wrapper);
+      }
+      parent.insertBefore(template, anchor);
+    }
+    return [
+      // first
+      before ? before.nextSibling : parent.firstChild,
+      // last
+      anchor ? anchor.previousSibling : parent.lastChild
+    ];
+  }
+};
+
+const TRANSITION$1 = "transition";
+const ANIMATION = "animation";
+const vtcKey = Symbol("_vtc");
+const DOMTransitionPropsValidators = {
+  name: String,
+  type: String,
+  css: {
+    type: Boolean,
+    default: true
+  },
+  duration: [String, Number, Object],
+  enterFromClass: String,
+  enterActiveClass: String,
+  enterToClass: String,
+  appearFromClass: String,
+  appearActiveClass: String,
+  appearToClass: String,
+  leaveFromClass: String,
+  leaveActiveClass: String,
+  leaveToClass: String
+};
+const TransitionPropsValidators = /* @__PURE__ */ extend(
+  {},
+  BaseTransitionPropsValidators,
+  DOMTransitionPropsValidators
+);
+const decorate$1 = (t) => {
+  t.displayName = "Transition";
+  t.props = TransitionPropsValidators;
+  return t;
+};
+const Transition = /* @__PURE__ */ decorate$1(
+  (props, { slots }) => h(BaseTransition, resolveTransitionProps(props), slots)
+);
+const callHook = (hook, args = []) => {
+  if (isArray(hook)) {
+    hook.forEach((h2) => h2(...args));
+  } else if (hook) {
+    hook(...args);
+  }
+};
+const hasExplicitCallback = (hook) => {
+  return hook ? isArray(hook) ? hook.some((h2) => h2.length > 1) : hook.length > 1 : false;
+};
+function resolveTransitionProps(rawProps) {
+  const baseProps = {};
+  for (const key in rawProps) {
+    if (!(key in DOMTransitionPropsValidators)) {
+      baseProps[key] = rawProps[key];
+    }
+  }
+  if (rawProps.css === false) {
+    return baseProps;
+  }
+  const {
+    name = "v",
+    type,
+    duration,
+    enterFromClass = `${name}-enter-from`,
+    enterActiveClass = `${name}-enter-active`,
+    enterToClass = `${name}-enter-to`,
+    appearFromClass = enterFromClass,
+    appearActiveClass = enterActiveClass,
+    appearToClass = enterToClass,
+    leaveFromClass = `${name}-leave-from`,
+    leaveActiveClass = `${name}-leave-active`,
+    leaveToClass = `${name}-leave-to`
+  } = rawProps;
+  const durations = normalizeDuration(duration);
+  const enterDuration = durations && durations[0];
+  const leaveDuration = durations && durations[1];
+  const {
+    onBeforeEnter,
+    onEnter,
+    onEnterCancelled,
+    onLeave,
+    onLeaveCancelled,
+    onBeforeAppear = onBeforeEnter,
+    onAppear = onEnter,
+    onAppearCancelled = onEnterCancelled
+  } = baseProps;
+  const finishEnter = (el, isAppear, done) => {
+    removeTransitionClass(el, isAppear ? appearToClass : enterToClass);
+    removeTransitionClass(el, isAppear ? appearActiveClass : enterActiveClass);
+    done && done();
+  };
+  const finishLeave = (el, done) => {
+    el._isLeaving = false;
+    removeTransitionClass(el, leaveFromClass);
+    removeTransitionClass(el, leaveToClass);
+    removeTransitionClass(el, leaveActiveClass);
+    done && done();
+  };
+  const makeEnterHook = (isAppear) => {
+    return (el, done) => {
+      const hook = isAppear ? onAppear : onEnter;
+      const resolve = () => finishEnter(el, isAppear, done);
+      callHook(hook, [el, resolve]);
+      nextFrame(() => {
+        removeTransitionClass(el, isAppear ? appearFromClass : enterFromClass);
+        addTransitionClass(el, isAppear ? appearToClass : enterToClass);
+        if (!hasExplicitCallback(hook)) {
+          whenTransitionEnds(el, type, enterDuration, resolve);
+        }
+      });
+    };
+  };
+  return extend(baseProps, {
+    onBeforeEnter(el) {
+      callHook(onBeforeEnter, [el]);
+      addTransitionClass(el, enterFromClass);
+      addTransitionClass(el, enterActiveClass);
+    },
+    onBeforeAppear(el) {
+      callHook(onBeforeAppear, [el]);
+      addTransitionClass(el, appearFromClass);
+      addTransitionClass(el, appearActiveClass);
+    },
+    onEnter: makeEnterHook(false),
+    onAppear: makeEnterHook(true),
+    onLeave(el, done) {
+      el._isLeaving = true;
+      const resolve = () => finishLeave(el, done);
+      addTransitionClass(el, leaveFromClass);
+      addTransitionClass(el, leaveActiveClass);
+      forceReflow();
+      nextFrame(() => {
+        if (!el._isLeaving) {
+          return;
+        }
+        removeTransitionClass(el, leaveFromClass);
+        addTransitionClass(el, leaveToClass);
+        if (!hasExplicitCallback(onLeave)) {
+          whenTransitionEnds(el, type, leaveDuration, resolve);
+        }
+      });
+      callHook(onLeave, [el, resolve]);
+    },
+    onEnterCancelled(el) {
+      finishEnter(el, false);
+      callHook(onEnterCancelled, [el]);
+    },
+    onAppearCancelled(el) {
+      finishEnter(el, true);
+      callHook(onAppearCancelled, [el]);
+    },
+    onLeaveCancelled(el) {
+      finishLeave(el);
+      callHook(onLeaveCancelled, [el]);
+    }
+  });
+}
+function normalizeDuration(duration) {
+  if (duration == null) {
+    return null;
+  } else if (isObject(duration)) {
+    return [NumberOf(duration.enter), NumberOf(duration.leave)];
+  } else {
+    const n = NumberOf(duration);
+    return [n, n];
+  }
+}
+function NumberOf(val) {
+  const res = toNumber(val);
+  {
+    assertNumber(res, "<transition> explicit duration");
+  }
+  return res;
+}
+function addTransitionClass(el, cls) {
+  cls.split(/\s+/).forEach((c) => c && el.classList.add(c));
+  (el[vtcKey] || (el[vtcKey] = /* @__PURE__ */ new Set())).add(cls);
+}
+function removeTransitionClass(el, cls) {
+  cls.split(/\s+/).forEach((c) => c && el.classList.remove(c));
+  const _vtc = el[vtcKey];
+  if (_vtc) {
+    _vtc.delete(cls);
+    if (!_vtc.size) {
+      el[vtcKey] = void 0;
+    }
+  }
+}
+function nextFrame(cb) {
+  requestAnimationFrame(() => {
+    requestAnimationFrame(cb);
+  });
+}
+let endId = 0;
+function whenTransitionEnds(el, expectedType, explicitTimeout, resolve) {
+  const id = el._endId = ++endId;
+  const resolveIfNotStale = () => {
+    if (id === el._endId) {
+      resolve();
+    }
+  };
+  if (explicitTimeout != null) {
+    return setTimeout(resolveIfNotStale, explicitTimeout);
+  }
+  const { type, timeout, propCount } = getTransitionInfo(el, expectedType);
+  if (!type) {
+    return resolve();
+  }
+  const endEvent = type + "end";
+  let ended = 0;
+  const end = () => {
+    el.removeEventListener(endEvent, onEnd);
+    resolveIfNotStale();
+  };
+  const onEnd = (e) => {
+    if (e.target === el && ++ended >= propCount) {
+      end();
+    }
+  };
+  setTimeout(() => {
+    if (ended < propCount) {
+      end();
+    }
+  }, timeout + 1);
+  el.addEventListener(endEvent, onEnd);
+}
+function getTransitionInfo(el, expectedType) {
+  const styles = window.getComputedStyle(el);
+  const getStyleProperties = (key) => (styles[key] || "").split(", ");
+  const transitionDelays = getStyleProperties(`${TRANSITION$1}Delay`);
+  const transitionDurations = getStyleProperties(`${TRANSITION$1}Duration`);
+  const transitionTimeout = getTimeout(transitionDelays, transitionDurations);
+  const animationDelays = getStyleProperties(`${ANIMATION}Delay`);
+  const animationDurations = getStyleProperties(`${ANIMATION}Duration`);
+  const animationTimeout = getTimeout(animationDelays, animationDurations);
+  let type = null;
+  let timeout = 0;
+  let propCount = 0;
+  if (expectedType === TRANSITION$1) {
+    if (transitionTimeout > 0) {
+      type = TRANSITION$1;
+      timeout = transitionTimeout;
+      propCount = transitionDurations.length;
+    }
+  } else if (expectedType === ANIMATION) {
+    if (animationTimeout > 0) {
+      type = ANIMATION;
+      timeout = animationTimeout;
+      propCount = animationDurations.length;
+    }
+  } else {
+    timeout = Math.max(transitionTimeout, animationTimeout);
+    type = timeout > 0 ? transitionTimeout > animationTimeout ? TRANSITION$1 : ANIMATION : null;
+    propCount = type ? type === TRANSITION$1 ? transitionDurations.length : animationDurations.length : 0;
+  }
+  const hasTransform = type === TRANSITION$1 && /\b(transform|all)(,|$)/.test(
+    getStyleProperties(`${TRANSITION$1}Property`).toString()
+  );
+  return {
+    type,
+    timeout,
+    propCount,
+    hasTransform
+  };
+}
+function getTimeout(delays, durations) {
+  while (delays.length < durations.length) {
+    delays = delays.concat(delays);
+  }
+  return Math.max(...durations.map((d, i) => toMs(d) + toMs(delays[i])));
+}
+function toMs(s) {
+  if (s === "auto") return 0;
+  return Number(s.slice(0, -1).replace(",", ".")) * 1e3;
+}
+function forceReflow() {
+  return document.body.offsetHeight;
+}
+
+function patchClass(el, value, isSVG) {
+  const transitionClasses = el[vtcKey];
+  if (transitionClasses) {
+    value = (value ? [value, ...transitionClasses] : [...transitionClasses]).join(" ");
+  }
+  if (value == null) {
+    el.removeAttribute("class");
+  } else if (isSVG) {
+    el.setAttribute("class", value);
+  } else {
+    el.className = value;
+  }
+}
+
+const vShowOriginalDisplay = Symbol("_vod");
+const vShowHidden = Symbol("_vsh");
+const vShow = {
+  beforeMount(el, { value }, { transition }) {
+    el[vShowOriginalDisplay] = el.style.display === "none" ? "" : el.style.display;
+    if (transition && value) {
+      transition.beforeEnter(el);
+    } else {
+      setDisplay(el, value);
+    }
+  },
+  mounted(el, { value }, { transition }) {
+    if (transition && value) {
+      transition.enter(el);
+    }
+  },
+  updated(el, { value, oldValue }, { transition }) {
+    if (!value === !oldValue) return;
+    if (transition) {
+      if (value) {
+        transition.beforeEnter(el);
+        setDisplay(el, true);
+        transition.enter(el);
+      } else {
+        transition.leave(el, () => {
+          setDisplay(el, false);
+        });
+      }
+    } else {
+      setDisplay(el, value);
+    }
+  },
+  beforeUnmount(el, { value }) {
+    setDisplay(el, value);
+  }
+};
+{
+  vShow.name = "show";
+}
+function setDisplay(el, value) {
+  el.style.display = value ? el[vShowOriginalDisplay] : "none";
+  el[vShowHidden] = !value;
+}
+function initVShowForSSR() {
+  vShow.getSSRProps = ({ value }) => {
+    if (!value) {
+      return { style: { display: "none" } };
+    }
+  };
+}
+
+const CSS_VAR_TEXT = Symbol("CSS_VAR_TEXT" );
+function useCssVars(getter) {
+  const instance = getCurrentInstance();
+  if (!instance) {
+    warn(`useCssVars is called without current active component instance.`);
+    return;
+  }
+  const updateTeleports = instance.ut = (vars = getter(instance.proxy)) => {
+    Array.from(
+      document.querySelectorAll(`[data-v-owner="${instance.uid}"]`)
+    ).forEach((node) => setVarsOnNode(node, vars));
+  };
+  {
+    instance.getCssVars = () => getter(instance.proxy);
+  }
+  const setVars = () => {
+    const vars = getter(instance.proxy);
+    if (instance.ce) {
+      setVarsOnNode(instance.ce, vars);
+    } else {
+      setVarsOnVNode(instance.subTree, vars);
+    }
+    updateTeleports(vars);
+  };
+  onBeforeMount(() => {
+    watchPostEffect(setVars);
+  });
+  onMounted(() => {
+    const ob = new MutationObserver(setVars);
+    ob.observe(instance.subTree.el.parentNode, { childList: true });
+    onUnmounted(() => ob.disconnect());
+  });
+}
+function setVarsOnVNode(vnode, vars) {
+  if (vnode.shapeFlag & 128) {
+    const suspense = vnode.suspense;
+    vnode = suspense.activeBranch;
+    if (suspense.pendingBranch && !suspense.isHydrating) {
+      suspense.effects.push(() => {
+        setVarsOnVNode(suspense.activeBranch, vars);
+      });
+    }
+  }
+  while (vnode.component) {
+    vnode = vnode.component.subTree;
+  }
+  if (vnode.shapeFlag & 1 && vnode.el) {
+    setVarsOnNode(vnode.el, vars);
+  } else if (vnode.type === Fragment) {
+    vnode.children.forEach((c) => setVarsOnVNode(c, vars));
+  } else if (vnode.type === Static) {
+    let { el, anchor } = vnode;
+    while (el) {
+      setVarsOnNode(el, vars);
+      if (el === anchor) break;
+      el = el.nextSibling;
+    }
+  }
+}
+function setVarsOnNode(el, vars) {
+  if (el.nodeType === 1) {
+    const style = el.style;
+    let cssText = "";
+    for (const key in vars) {
+      style.setProperty(`--${key}`, vars[key]);
+      cssText += `--${key}: ${vars[key]};`;
+    }
+    style[CSS_VAR_TEXT] = cssText;
+  }
+}
+
+const displayRE = /(^|;)\s*display\s*:/;
+function patchStyle(el, prev, next) {
+  const style = el.style;
+  const isCssString = isString(next);
+  let hasControlledDisplay = false;
+  if (next && !isCssString) {
+    if (prev) {
+      if (!isString(prev)) {
+        for (const key in prev) {
+          if (next[key] == null) {
+            setStyle(style, key, "");
+          }
+        }
+      } else {
+        for (const prevStyle of prev.split(";")) {
+          const key = prevStyle.slice(0, prevStyle.indexOf(":")).trim();
+          if (next[key] == null) {
+            setStyle(style, key, "");
+          }
+        }
+      }
+    }
+    for (const key in next) {
+      if (key === "display") {
+        hasControlledDisplay = true;
+      }
+      setStyle(style, key, next[key]);
+    }
+  } else {
+    if (isCssString) {
+      if (prev !== next) {
+        const cssVarText = style[CSS_VAR_TEXT];
+        if (cssVarText) {
+          next += ";" + cssVarText;
+        }
+        style.cssText = next;
+        hasControlledDisplay = displayRE.test(next);
+      }
+    } else if (prev) {
+      el.removeAttribute("style");
+    }
+  }
+  if (vShowOriginalDisplay in el) {
+    el[vShowOriginalDisplay] = hasControlledDisplay ? style.display : "";
+    if (el[vShowHidden]) {
+      style.display = "none";
+    }
+  }
+}
+const semicolonRE = /[^\\];\s*$/;
+const importantRE = /\s*!important$/;
+function setStyle(style, name, val) {
+  if (isArray(val)) {
+    val.forEach((v) => setStyle(style, name, v));
+  } else {
+    if (val == null) val = "";
+    {
+      if (semicolonRE.test(val)) {
+        warn(
+          `Unexpected semicolon at the end of '${name}' style value: '${val}'`
+        );
+      }
+    }
+    if (name.startsWith("--")) {
+      style.setProperty(name, val);
+    } else {
+      const prefixed = autoPrefix(style, name);
+      if (importantRE.test(val)) {
+        style.setProperty(
+          hyphenate(prefixed),
+          val.replace(importantRE, ""),
+          "important"
+        );
+      } else {
+        style[prefixed] = val;
+      }
+    }
+  }
+}
+const prefixes = ["Webkit", "Moz", "ms"];
+const prefixCache = {};
+function autoPrefix(style, rawName) {
+  const cached = prefixCache[rawName];
+  if (cached) {
+    return cached;
+  }
+  let name = camelize(rawName);
+  if (name !== "filter" && name in style) {
+    return prefixCache[rawName] = name;
+  }
+  name = capitalize(name);
+  for (let i = 0; i < prefixes.length; i++) {
+    const prefixed = prefixes[i] + name;
+    if (prefixed in style) {
+      return prefixCache[rawName] = prefixed;
+    }
+  }
+  return rawName;
+}
+
+const xlinkNS = "http://www.w3.org/1999/xlink";
+function patchAttr(el, key, value, isSVG, instance, isBoolean = isSpecialBooleanAttr(key)) {
+  if (isSVG && key.startsWith("xlink:")) {
+    if (value == null) {
+      el.removeAttributeNS(xlinkNS, key.slice(6, key.length));
+    } else {
+      el.setAttributeNS(xlinkNS, key, value);
+    }
+  } else {
+    if (value == null || isBoolean && !includeBooleanAttr(value)) {
+      el.removeAttribute(key);
+    } else {
+      el.setAttribute(
+        key,
+        isBoolean ? "" : isSymbol(value) ? String(value) : value
+      );
+    }
+  }
+}
+
+function patchDOMProp(el, key, value, parentComponent, attrName) {
+  if (key === "innerHTML" || key === "textContent") {
+    if (value != null) {
+      el[key] = key === "innerHTML" ? unsafeToTrustedHTML(value) : value;
+    }
+    return;
+  }
+  const tag = el.tagName;
+  if (key === "value" && tag !== "PROGRESS" && // custom elements may use _value internally
+  !tag.includes("-")) {
+    const oldValue = tag === "OPTION" ? el.getAttribute("value") || "" : el.value;
+    const newValue = value == null ? (
+      // #11647: value should be set as empty string for null and undefined,
+      // but <input type="checkbox"> should be set as 'on'.
+      el.type === "checkbox" ? "on" : ""
+    ) : String(value);
+    if (oldValue !== newValue || !("_value" in el)) {
+      el.value = newValue;
+    }
+    if (value == null) {
+      el.removeAttribute(key);
+    }
+    el._value = value;
+    return;
+  }
+  let needRemove = false;
+  if (value === "" || value == null) {
+    const type = typeof el[key];
+    if (type === "boolean") {
+      value = includeBooleanAttr(value);
+    } else if (value == null && type === "string") {
+      value = "";
+      needRemove = true;
+    } else if (type === "number") {
+      value = 0;
+      needRemove = true;
+    }
+  }
+  try {
+    el[key] = value;
+  } catch (e) {
+    if (!needRemove) {
+      warn(
+        `Failed setting prop "${key}" on <${tag.toLowerCase()}>: value ${value} is invalid.`,
+        e
+      );
+    }
+  }
+  needRemove && el.removeAttribute(attrName || key);
+}
+
+function addEventListener(el, event, handler, options) {
+  el.addEventListener(event, handler, options);
+}
+function removeEventListener(el, event, handler, options) {
+  el.removeEventListener(event, handler, options);
+}
+const veiKey = Symbol("_vei");
+function patchEvent(el, rawName, prevValue, nextValue, instance = null) {
+  const invokers = el[veiKey] || (el[veiKey] = {});
+  const existingInvoker = invokers[rawName];
+  if (nextValue && existingInvoker) {
+    existingInvoker.value = sanitizeEventValue(nextValue, rawName) ;
+  } else {
+    const [name, options] = parseName(rawName);
+    if (nextValue) {
+      const invoker = invokers[rawName] = createInvoker(
+        sanitizeEventValue(nextValue, rawName) ,
+        instance
+      );
+      addEventListener(el, name, invoker, options);
+    } else if (existingInvoker) {
+      removeEventListener(el, name, existingInvoker, options);
+      invokers[rawName] = void 0;
+    }
+  }
+}
+const optionsModifierRE = /(?:Once|Passive|Capture)$/;
+function parseName(name) {
+  let options;
+  if (optionsModifierRE.test(name)) {
+    options = {};
+    let m;
+    while (m = name.match(optionsModifierRE)) {
+      name = name.slice(0, name.length - m[0].length);
+      options[m[0].toLowerCase()] = true;
+    }
+  }
+  const event = name[2] === ":" ? name.slice(3) : hyphenate(name.slice(2));
+  return [event, options];
+}
+let cachedNow = 0;
+const p = /* @__PURE__ */ Promise.resolve();
+const getNow = () => cachedNow || (p.then(() => cachedNow = 0), cachedNow = Date.now());
+function createInvoker(initialValue, instance) {
+  const invoker = (e) => {
+    if (!e._vts) {
+      e._vts = Date.now();
+    } else if (e._vts <= invoker.attached) {
+      return;
+    }
+    callWithAsyncErrorHandling(
+      patchStopImmediatePropagation(e, invoker.value),
+      instance,
+      5,
+      [e]
+    );
+  };
+  invoker.value = initialValue;
+  invoker.attached = getNow();
+  return invoker;
+}
+function sanitizeEventValue(value, propName) {
+  if (isFunction(value) || isArray(value)) {
+    return value;
+  }
+  warn(
+    `Wrong type passed as event handler to ${propName} - did you forget @ or : in front of your prop?
+Expected function or array of functions, received type ${typeof value}.`
+  );
+  return NOOP;
+}
+function patchStopImmediatePropagation(e, value) {
+  if (isArray(value)) {
+    const originalStop = e.stopImmediatePropagation;
+    e.stopImmediatePropagation = () => {
+      originalStop.call(e);
+      e._stopped = true;
+    };
+    return value.map(
+      (fn) => (e2) => !e2._stopped && fn && fn(e2)
+    );
+  } else {
+    return value;
+  }
+}
+
+const isNativeOn = (key) => key.charCodeAt(0) === 111 && key.charCodeAt(1) === 110 && // lowercase letter
+key.charCodeAt(2) > 96 && key.charCodeAt(2) < 123;
+const patchProp = (el, key, prevValue, nextValue, namespace, parentComponent) => {
+  const isSVG = namespace === "svg";
+  if (key === "class") {
+    patchClass(el, nextValue, isSVG);
+  } else if (key === "style") {
+    patchStyle(el, prevValue, nextValue);
+  } else if (isOn(key)) {
+    if (!isModelListener(key)) {
+      patchEvent(el, key, prevValue, nextValue, parentComponent);
+    }
+  } else if (key[0] === "." ? (key = key.slice(1), true) : key[0] === "^" ? (key = key.slice(1), false) : shouldSetAsProp(el, key, nextValue, isSVG)) {
+    patchDOMProp(el, key, nextValue);
+    if (!el.tagName.includes("-") && (key === "value" || key === "checked" || key === "selected")) {
+      patchAttr(el, key, nextValue, isSVG, parentComponent, key !== "value");
+    }
+  } else if (
+    // #11081 force set props for possible async custom element
+    el._isVueCE && (/[A-Z]/.test(key) || !isString(nextValue))
+  ) {
+    patchDOMProp(el, camelize(key), nextValue, parentComponent, key);
+  } else {
+    if (key === "true-value") {
+      el._trueValue = nextValue;
+    } else if (key === "false-value") {
+      el._falseValue = nextValue;
+    }
+    patchAttr(el, key, nextValue, isSVG);
+  }
+};
+function shouldSetAsProp(el, key, value, isSVG) {
+  if (isSVG) {
+    if (key === "innerHTML" || key === "textContent") {
+      return true;
+    }
+    if (key in el && isNativeOn(key) && isFunction(value)) {
+      return true;
+    }
+    return false;
+  }
+  if (key === "spellcheck" || key === "draggable" || key === "translate") {
+    return false;
+  }
+  if (key === "form") {
+    return false;
+  }
+  if (key === "list" && el.tagName === "INPUT") {
+    return false;
+  }
+  if (key === "type" && el.tagName === "TEXTAREA") {
+    return false;
+  }
+  if (key === "width" || key === "height") {
+    const tag = el.tagName;
+    if (tag === "IMG" || tag === "VIDEO" || tag === "CANVAS" || tag === "SOURCE") {
+      return false;
+    }
+  }
+  if (isNativeOn(key) && isString(value)) {
+    return false;
+  }
+  return key in el;
+}
+
+const REMOVAL = {};
+/*! #__NO_SIDE_EFFECTS__ */
+// @__NO_SIDE_EFFECTS__
+function defineCustomElement(options, extraOptions, _createApp) {
+  const Comp = defineComponent(options, extraOptions);
+  if (isPlainObject(Comp)) extend(Comp, extraOptions);
+  class VueCustomElement extends VueElement {
+    constructor(initialProps) {
+      super(Comp, initialProps, _createApp);
+    }
+  }
+  VueCustomElement.def = Comp;
+  return VueCustomElement;
+}
+/*! #__NO_SIDE_EFFECTS__ */
+const defineSSRCustomElement = /* @__NO_SIDE_EFFECTS__ */ (options, extraOptions) => {
+  return /* @__PURE__ */ defineCustomElement(options, extraOptions, createSSRApp);
+};
+const BaseClass = typeof HTMLElement !== "undefined" ? HTMLElement : class {
+};
+class VueElement extends BaseClass {
+  constructor(_def, _props = {}, _createApp = createApp) {
+    super();
+    this._def = _def;
+    this._props = _props;
+    this._createApp = _createApp;
+    this._isVueCE = true;
+    /**
+     * @internal
+     */
+    this._instance = null;
+    /**
+     * @internal
+     */
+    this._app = null;
+    /**
+     * @internal
+     */
+    this._nonce = this._def.nonce;
+    this._connected = false;
+    this._resolved = false;
+    this._numberProps = null;
+    this._styleChildren = /* @__PURE__ */ new WeakSet();
+    this._ob = null;
+    if (this.shadowRoot && _createApp !== createApp) {
+      this._root = this.shadowRoot;
+    } else {
+      if (this.shadowRoot) {
+        warn(
+          `Custom element has pre-rendered declarative shadow root but is not defined as hydratable. Use \`defineSSRCustomElement\`.`
+        );
+      }
+      if (_def.shadowRoot !== false) {
+        this.attachShadow({ mode: "open" });
+        this._root = this.shadowRoot;
+      } else {
+        this._root = this;
+      }
+    }
+    if (!this._def.__asyncLoader) {
+      this._resolveProps(this._def);
+    }
+  }
+  connectedCallback() {
+    if (!this.isConnected) return;
+    if (!this.shadowRoot) {
+      this._parseSlots();
+    }
+    this._connected = true;
+    let parent = this;
+    while (parent = parent && (parent.parentNode || parent.host)) {
+      if (parent instanceof VueElement) {
+        this._parent = parent;
+        break;
+      }
+    }
+    if (!this._instance) {
+      if (this._resolved) {
+        this._setParent();
+        this._update();
+      } else {
+        if (parent && parent._pendingResolve) {
+          this._pendingResolve = parent._pendingResolve.then(() => {
+            this._pendingResolve = void 0;
+            this._resolveDef();
+          });
+        } else {
+          this._resolveDef();
+        }
+      }
+    }
+  }
+  _setParent(parent = this._parent) {
+    if (parent) {
+      this._instance.parent = parent._instance;
+      this._instance.provides = parent._instance.provides;
+    }
+  }
+  disconnectedCallback() {
+    this._connected = false;
+    nextTick(() => {
+      if (!this._connected) {
+        if (this._ob) {
+          this._ob.disconnect();
+          this._ob = null;
+        }
+        this._app && this._app.unmount();
+        if (this._instance) this._instance.ce = void 0;
+        this._app = this._instance = null;
+      }
+    });
+  }
+  /**
+   * resolve inner component definition (handle possible async component)
+   */
+  _resolveDef() {
+    if (this._pendingResolve) {
+      return;
+    }
+    for (let i = 0; i < this.attributes.length; i++) {
+      this._setAttr(this.attributes[i].name);
+    }
+    this._ob = new MutationObserver((mutations) => {
+      for (const m of mutations) {
+        this._setAttr(m.attributeName);
+      }
+    });
+    this._ob.observe(this, { attributes: true });
+    const resolve = (def, isAsync = false) => {
+      this._resolved = true;
+      this._pendingResolve = void 0;
+      const { props, styles } = def;
+      let numberProps;
+      if (props && !isArray(props)) {
+        for (const key in props) {
+          const opt = props[key];
+          if (opt === Number || opt && opt.type === Number) {
+            if (key in this._props) {
+              this._props[key] = toNumber(this._props[key]);
+            }
+            (numberProps || (numberProps = /* @__PURE__ */ Object.create(null)))[camelize(key)] = true;
+          }
+        }
+      }
+      this._numberProps = numberProps;
+      if (isAsync) {
+        this._resolveProps(def);
+      }
+      if (this.shadowRoot) {
+        this._applyStyles(styles);
+      } else if (styles) {
+        warn(
+          "Custom element style injection is not supported when using shadowRoot: false"
+        );
+      }
+      this._mount(def);
+    };
+    const asyncDef = this._def.__asyncLoader;
+    if (asyncDef) {
+      this._pendingResolve = asyncDef().then(
+        (def) => resolve(this._def = def, true)
+      );
+    } else {
+      resolve(this._def);
+    }
+  }
+  _mount(def) {
+    if (!def.name) {
+      def.name = "VueElement";
+    }
+    this._app = this._createApp(def);
+    if (def.configureApp) {
+      def.configureApp(this._app);
+    }
+    this._app._ceVNode = this._createVNode();
+    this._app.mount(this._root);
+    const exposed = this._instance && this._instance.exposed;
+    if (!exposed) return;
+    for (const key in exposed) {
+      if (!hasOwn(this, key)) {
+        Object.defineProperty(this, key, {
+          // unwrap ref to be consistent with public instance behavior
+          get: () => unref(exposed[key])
+        });
+      } else {
+        warn(`Exposed property "${key}" already exists on custom element.`);
+      }
+    }
+  }
+  _resolveProps(def) {
+    const { props } = def;
+    const declaredPropKeys = isArray(props) ? props : Object.keys(props || {});
+    for (const key of Object.keys(this)) {
+      if (key[0] !== "_" && declaredPropKeys.includes(key)) {
+        this._setProp(key, this[key]);
+      }
+    }
+    for (const key of declaredPropKeys.map(camelize)) {
+      Object.defineProperty(this, key, {
+        get() {
+          return this._getProp(key);
+        },
+        set(val) {
+          this._setProp(key, val, true, true);
+        }
+      });
+    }
+  }
+  _setAttr(key) {
+    if (key.startsWith("data-v-")) return;
+    const has = this.hasAttribute(key);
+    let value = has ? this.getAttribute(key) : REMOVAL;
+    const camelKey = camelize(key);
+    if (has && this._numberProps && this._numberProps[camelKey]) {
+      value = toNumber(value);
+    }
+    this._setProp(camelKey, value, false, true);
+  }
+  /**
+   * @internal
+   */
+  _getProp(key) {
+    return this._props[key];
+  }
+  /**
+   * @internal
+   */
+  _setProp(key, val, shouldReflect = true, shouldUpdate = false) {
+    if (val !== this._props[key]) {
+      if (val === REMOVAL) {
+        delete this._props[key];
+      } else {
+        this._props[key] = val;
+        if (key === "key" && this._app) {
+          this._app._ceVNode.key = val;
+        }
+      }
+      if (shouldUpdate && this._instance) {
+        this._update();
+      }
+      if (shouldReflect) {
+        if (val === true) {
+          this.setAttribute(hyphenate(key), "");
+        } else if (typeof val === "string" || typeof val === "number") {
+          this.setAttribute(hyphenate(key), val + "");
+        } else if (!val) {
+          this.removeAttribute(hyphenate(key));
+        }
+      }
+    }
+  }
+  _update() {
+    render(this._createVNode(), this._root);
+  }
+  _createVNode() {
+    const baseProps = {};
+    if (!this.shadowRoot) {
+      baseProps.onVnodeMounted = baseProps.onVnodeUpdated = this._renderSlots.bind(this);
+    }
+    const vnode = createVNode(this._def, extend(baseProps, this._props));
+    if (!this._instance) {
+      vnode.ce = (instance) => {
+        this._instance = instance;
+        instance.ce = this;
+        instance.isCE = true;
+        {
+          instance.ceReload = (newStyles) => {
+            if (this._styles) {
+              this._styles.forEach((s) => this._root.removeChild(s));
+              this._styles.length = 0;
+            }
+            this._applyStyles(newStyles);
+            this._instance = null;
+            this._update();
+          };
+        }
+        const dispatch = (event, args) => {
+          this.dispatchEvent(
+            new CustomEvent(
+              event,
+              isPlainObject(args[0]) ? extend({ detail: args }, args[0]) : { detail: args }
+            )
+          );
+        };
+        instance.emit = (event, ...args) => {
+          dispatch(event, args);
+          if (hyphenate(event) !== event) {
+            dispatch(hyphenate(event), args);
+          }
+        };
+        this._setParent();
+      };
+    }
+    return vnode;
+  }
+  _applyStyles(styles, owner) {
+    if (!styles) return;
+    if (owner) {
+      if (owner === this._def || this._styleChildren.has(owner)) {
+        return;
+      }
+      this._styleChildren.add(owner);
+    }
+    const nonce = this._nonce;
+    for (let i = styles.length - 1; i >= 0; i--) {
+      const s = document.createElement("style");
+      if (nonce) s.setAttribute("nonce", nonce);
+      s.textContent = styles[i];
+      this.shadowRoot.prepend(s);
+      {
+        if (owner) {
+          if (owner.__hmrId) {
+            if (!this._childStyles) this._childStyles = /* @__PURE__ */ new Map();
+            let entry = this._childStyles.get(owner.__hmrId);
+            if (!entry) {
+              this._childStyles.set(owner.__hmrId, entry = []);
+            }
+            entry.push(s);
+          }
+        } else {
+          (this._styles || (this._styles = [])).push(s);
+        }
+      }
+    }
+  }
+  /**
+   * Only called when shadowRoot is false
+   */
+  _parseSlots() {
+    const slots = this._slots = {};
+    let n;
+    while (n = this.firstChild) {
+      const slotName = n.nodeType === 1 && n.getAttribute("slot") || "default";
+      (slots[slotName] || (slots[slotName] = [])).push(n);
+      this.removeChild(n);
+    }
+  }
+  /**
+   * Only called when shadowRoot is false
+   */
+  _renderSlots() {
+    const outlets = (this._teleportTarget || this).querySelectorAll("slot");
+    const scopeId = this._instance.type.__scopeId;
+    for (let i = 0; i < outlets.length; i++) {
+      const o = outlets[i];
+      const slotName = o.getAttribute("name") || "default";
+      const content = this._slots[slotName];
+      const parent = o.parentNode;
+      if (content) {
+        for (const n of content) {
+          if (scopeId && n.nodeType === 1) {
+            const id = scopeId + "-s";
+            const walker = document.createTreeWalker(n, 1);
+            n.setAttribute(id, "");
+            let child;
+            while (child = walker.nextNode()) {
+              child.setAttribute(id, "");
+            }
+          }
+          parent.insertBefore(n, o);
+        }
+      } else {
+        while (o.firstChild) parent.insertBefore(o.firstChild, o);
+      }
+      parent.removeChild(o);
+    }
+  }
+  /**
+   * @internal
+   */
+  _injectChildStyle(comp) {
+    this._applyStyles(comp.styles, comp);
+  }
+  /**
+   * @internal
+   */
+  _removeChildStyle(comp) {
+    {
+      this._styleChildren.delete(comp);
+      if (this._childStyles && comp.__hmrId) {
+        const oldStyles = this._childStyles.get(comp.__hmrId);
+        if (oldStyles) {
+          oldStyles.forEach((s) => this._root.removeChild(s));
+          oldStyles.length = 0;
+        }
+      }
+    }
+  }
+}
+function useHost(caller) {
+  const instance = getCurrentInstance();
+  const el = instance && instance.ce;
+  if (el) {
+    return el;
+  } else {
+    if (!instance) {
+      warn(
+        `${caller || "useHost"} called without an active component instance.`
+      );
+    } else {
+      warn(
+        `${caller || "useHost"} can only be used in components defined via defineCustomElement.`
+      );
+    }
+  }
+  return null;
+}
+function useShadowRoot() {
+  const el = useHost("useShadowRoot") ;
+  return el && el.shadowRoot;
+}
+
+function useCssModule(name = "$style") {
+  {
+    const instance = getCurrentInstance();
+    if (!instance) {
+      warn(`useCssModule must be called inside setup()`);
+      return EMPTY_OBJ;
+    }
+    const modules = instance.type.__cssModules;
+    if (!modules) {
+      warn(`Current instance does not have CSS modules injected.`);
+      return EMPTY_OBJ;
+    }
+    const mod = modules[name];
+    if (!mod) {
+      warn(`Current instance does not have CSS module named "${name}".`);
+      return EMPTY_OBJ;
+    }
+    return mod;
+  }
+}
+
+const positionMap = /* @__PURE__ */ new WeakMap();
+const newPositionMap = /* @__PURE__ */ new WeakMap();
+const moveCbKey = Symbol("_moveCb");
+const enterCbKey = Symbol("_enterCb");
+const decorate = (t) => {
+  delete t.props.mode;
+  return t;
+};
+const TransitionGroupImpl = /* @__PURE__ */ decorate({
+  name: "TransitionGroup",
+  props: /* @__PURE__ */ extend({}, TransitionPropsValidators, {
+    tag: String,
+    moveClass: String
+  }),
+  setup(props, { slots }) {
+    const instance = getCurrentInstance();
+    const state = useTransitionState();
+    let prevChildren;
+    let children;
+    onUpdated(() => {
+      if (!prevChildren.length) {
+        return;
+      }
+      const moveClass = props.moveClass || `${props.name || "v"}-move`;
+      if (!hasCSSTransform(
+        prevChildren[0].el,
+        instance.vnode.el,
+        moveClass
+      )) {
+        return;
+      }
+      prevChildren.forEach(callPendingCbs);
+      prevChildren.forEach(recordPosition);
+      const movedChildren = prevChildren.filter(applyTranslation);
+      forceReflow();
+      movedChildren.forEach((c) => {
+        const el = c.el;
+        const style = el.style;
+        addTransitionClass(el, moveClass);
+        style.transform = style.webkitTransform = style.transitionDuration = "";
+        const cb = el[moveCbKey] = (e) => {
+          if (e && e.target !== el) {
+            return;
+          }
+          if (!e || /transform$/.test(e.propertyName)) {
+            el.removeEventListener("transitionend", cb);
+            el[moveCbKey] = null;
+            removeTransitionClass(el, moveClass);
+          }
+        };
+        el.addEventListener("transitionend", cb);
+      });
+    });
+    return () => {
+      const rawProps = toRaw(props);
+      const cssTransitionProps = resolveTransitionProps(rawProps);
+      let tag = rawProps.tag || Fragment;
+      prevChildren = [];
+      if (children) {
+        for (let i = 0; i < children.length; i++) {
+          const child = children[i];
+          if (child.el && child.el instanceof Element) {
+            prevChildren.push(child);
+            setTransitionHooks(
+              child,
+              resolveTransitionHooks(
+                child,
+                cssTransitionProps,
+                state,
+                instance
+              )
+            );
+            positionMap.set(
+              child,
+              child.el.getBoundingClientRect()
+            );
+          }
+        }
+      }
+      children = slots.default ? getTransitionRawChildren(slots.default()) : [];
+      for (let i = 0; i < children.length; i++) {
+        const child = children[i];
+        if (child.key != null) {
+          setTransitionHooks(
+            child,
+            resolveTransitionHooks(child, cssTransitionProps, state, instance)
+          );
+        } else if (child.type !== Text) {
+          warn(`<TransitionGroup> children must be keyed.`);
+        }
+      }
+      return createVNode(tag, null, children);
+    };
+  }
+});
+const TransitionGroup = TransitionGroupImpl;
+function callPendingCbs(c) {
+  const el = c.el;
+  if (el[moveCbKey]) {
+    el[moveCbKey]();
+  }
+  if (el[enterCbKey]) {
+    el[enterCbKey]();
+  }
+}
+function recordPosition(c) {
+  newPositionMap.set(c, c.el.getBoundingClientRect());
+}
+function applyTranslation(c) {
+  const oldPos = positionMap.get(c);
+  const newPos = newPositionMap.get(c);
+  const dx = oldPos.left - newPos.left;
+  const dy = oldPos.top - newPos.top;
+  if (dx || dy) {
+    const s = c.el.style;
+    s.transform = s.webkitTransform = `translate(${dx}px,${dy}px)`;
+    s.transitionDuration = "0s";
+    return c;
+  }
+}
+function hasCSSTransform(el, root, moveClass) {
+  const clone = el.cloneNode();
+  const _vtc = el[vtcKey];
+  if (_vtc) {
+    _vtc.forEach((cls) => {
+      cls.split(/\s+/).forEach((c) => c && clone.classList.remove(c));
+    });
+  }
+  moveClass.split(/\s+/).forEach((c) => c && clone.classList.add(c));
+  clone.style.display = "none";
+  const container = root.nodeType === 1 ? root : root.parentNode;
+  container.appendChild(clone);
+  const { hasTransform } = getTransitionInfo(clone);
+  container.removeChild(clone);
+  return hasTransform;
+}
+
+const getModelAssigner = (vnode) => {
+  const fn = vnode.props["onUpdate:modelValue"] || false;
+  return isArray(fn) ? (value) => invokeArrayFns(fn, value) : fn;
+};
+function onCompositionStart(e) {
+  e.target.composing = true;
+}
+function onCompositionEnd(e) {
+  const target = e.target;
+  if (target.composing) {
+    target.composing = false;
+    target.dispatchEvent(new Event("input"));
+  }
+}
+const assignKey = Symbol("_assign");
+const vModelText = {
+  created(el, { modifiers: { lazy, trim, number } }, vnode) {
+    el[assignKey] = getModelAssigner(vnode);
+    const castToNumber = number || vnode.props && vnode.props.type === "number";
+    addEventListener(el, lazy ? "change" : "input", (e) => {
+      if (e.target.composing) return;
+      let domValue = el.value;
+      if (trim) {
+        domValue = domValue.trim();
+      }
+      if (castToNumber) {
+        domValue = looseToNumber(domValue);
+      }
+      el[assignKey](domValue);
+    });
+    if (trim) {
+      addEventListener(el, "change", () => {
+        el.value = el.value.trim();
+      });
+    }
+    if (!lazy) {
+      addEventListener(el, "compositionstart", onCompositionStart);
+      addEventListener(el, "compositionend", onCompositionEnd);
+      addEventListener(el, "change", onCompositionEnd);
+    }
+  },
+  // set value on mounted so it's after min/max for type="range"
+  mounted(el, { value }) {
+    el.value = value == null ? "" : value;
+  },
+  beforeUpdate(el, { value, oldValue, modifiers: { lazy, trim, number } }, vnode) {
+    el[assignKey] = getModelAssigner(vnode);
+    if (el.composing) return;
+    const elValue = (number || el.type === "number") && !/^0\d/.test(el.value) ? looseToNumber(el.value) : el.value;
+    const newValue = value == null ? "" : value;
+    if (elValue === newValue) {
+      return;
+    }
+    if (document.activeElement === el && el.type !== "range") {
+      if (lazy && value === oldValue) {
+        return;
+      }
+      if (trim && el.value.trim() === newValue) {
+        return;
+      }
+    }
+    el.value = newValue;
+  }
+};
+const vModelCheckbox = {
+  // #4096 array checkboxes need to be deep traversed
+  deep: true,
+  created(el, _, vnode) {
+    el[assignKey] = getModelAssigner(vnode);
+    addEventListener(el, "change", () => {
+      const modelValue = el._modelValue;
+      const elementValue = getValue(el);
+      const checked = el.checked;
+      const assign = el[assignKey];
+      if (isArray(modelValue)) {
+        const index = looseIndexOf(modelValue, elementValue);
+        const found = index !== -1;
+        if (checked && !found) {
+          assign(modelValue.concat(elementValue));
+        } else if (!checked && found) {
+          const filtered = [...modelValue];
+          filtered.splice(index, 1);
+          assign(filtered);
+        }
+      } else if (isSet(modelValue)) {
+        const cloned = new Set(modelValue);
+        if (checked) {
+          cloned.add(elementValue);
+        } else {
+          cloned.delete(elementValue);
+        }
+        assign(cloned);
+      } else {
+        assign(getCheckboxValue(el, checked));
+      }
+    });
+  },
+  // set initial checked on mount to wait for true-value/false-value
+  mounted: setChecked,
+  beforeUpdate(el, binding, vnode) {
+    el[assignKey] = getModelAssigner(vnode);
+    setChecked(el, binding, vnode);
+  }
+};
+function setChecked(el, { value, oldValue }, vnode) {
+  el._modelValue = value;
+  let checked;
+  if (isArray(value)) {
+    checked = looseIndexOf(value, vnode.props.value) > -1;
+  } else if (isSet(value)) {
+    checked = value.has(vnode.props.value);
+  } else {
+    if (value === oldValue) return;
+    checked = looseEqual(value, getCheckboxValue(el, true));
+  }
+  if (el.checked !== checked) {
+    el.checked = checked;
+  }
+}
+const vModelRadio = {
+  created(el, { value }, vnode) {
+    el.checked = looseEqual(value, vnode.props.value);
+    el[assignKey] = getModelAssigner(vnode);
+    addEventListener(el, "change", () => {
+      el[assignKey](getValue(el));
+    });
+  },
+  beforeUpdate(el, { value, oldValue }, vnode) {
+    el[assignKey] = getModelAssigner(vnode);
+    if (value !== oldValue) {
+      el.checked = looseEqual(value, vnode.props.value);
+    }
+  }
+};
+const vModelSelect = {
+  // <select multiple> value need to be deep traversed
+  deep: true,
+  created(el, { value, modifiers: { number } }, vnode) {
+    const isSetModel = isSet(value);
+    addEventListener(el, "change", () => {
+      const selectedVal = Array.prototype.filter.call(el.options, (o) => o.selected).map(
+        (o) => number ? looseToNumber(getValue(o)) : getValue(o)
+      );
+      el[assignKey](
+        el.multiple ? isSetModel ? new Set(selectedVal) : selectedVal : selectedVal[0]
+      );
+      el._assigning = true;
+      nextTick(() => {
+        el._assigning = false;
+      });
+    });
+    el[assignKey] = getModelAssigner(vnode);
+  },
+  // set value in mounted & updated because <select> relies on its children
+  // <option>s.
+  mounted(el, { value }) {
+    setSelected(el, value);
+  },
+  beforeUpdate(el, _binding, vnode) {
+    el[assignKey] = getModelAssigner(vnode);
+  },
+  updated(el, { value }) {
+    if (!el._assigning) {
+      setSelected(el, value);
+    }
+  }
+};
+function setSelected(el, value) {
+  const isMultiple = el.multiple;
+  const isArrayValue = isArray(value);
+  if (isMultiple && !isArrayValue && !isSet(value)) {
+    warn(
+      `<select multiple v-model> expects an Array or Set value for its binding, but got ${Object.prototype.toString.call(value).slice(8, -1)}.`
+    );
+    return;
+  }
+  for (let i = 0, l = el.options.length; i < l; i++) {
+    const option = el.options[i];
+    const optionValue = getValue(option);
+    if (isMultiple) {
+      if (isArrayValue) {
+        const optionType = typeof optionValue;
+        if (optionType === "string" || optionType === "number") {
+          option.selected = value.some((v) => String(v) === String(optionValue));
+        } else {
+          option.selected = looseIndexOf(value, optionValue) > -1;
+        }
+      } else {
+        option.selected = value.has(optionValue);
+      }
+    } else if (looseEqual(getValue(option), value)) {
+      if (el.selectedIndex !== i) el.selectedIndex = i;
+      return;
+    }
+  }
+  if (!isMultiple && el.selectedIndex !== -1) {
+    el.selectedIndex = -1;
+  }
+}
+function getValue(el) {
+  return "_value" in el ? el._value : el.value;
+}
+function getCheckboxValue(el, checked) {
+  const key = checked ? "_trueValue" : "_falseValue";
+  return key in el ? el[key] : checked;
+}
+const vModelDynamic = {
+  created(el, binding, vnode) {
+    callModelHook(el, binding, vnode, null, "created");
+  },
+  mounted(el, binding, vnode) {
+    callModelHook(el, binding, vnode, null, "mounted");
+  },
+  beforeUpdate(el, binding, vnode, prevVNode) {
+    callModelHook(el, binding, vnode, prevVNode, "beforeUpdate");
+  },
+  updated(el, binding, vnode, prevVNode) {
+    callModelHook(el, binding, vnode, prevVNode, "updated");
+  }
+};
+function resolveDynamicModel(tagName, type) {
+  switch (tagName) {
+    case "SELECT":
+      return vModelSelect;
+    case "TEXTAREA":
+      return vModelText;
+    default:
+      switch (type) {
+        case "checkbox":
+          return vModelCheckbox;
+        case "radio":
+          return vModelRadio;
+        default:
+          return vModelText;
+      }
+  }
+}
+function callModelHook(el, binding, vnode, prevVNode, hook) {
+  const modelToUse = resolveDynamicModel(
+    el.tagName,
+    vnode.props && vnode.props.type
+  );
+  const fn = modelToUse[hook];
+  fn && fn(el, binding, vnode, prevVNode);
+}
+function initVModelForSSR() {
+  vModelText.getSSRProps = ({ value }) => ({ value });
+  vModelRadio.getSSRProps = ({ value }, vnode) => {
+    if (vnode.props && looseEqual(vnode.props.value, value)) {
+      return { checked: true };
+    }
+  };
+  vModelCheckbox.getSSRProps = ({ value }, vnode) => {
+    if (isArray(value)) {
+      if (vnode.props && looseIndexOf(value, vnode.props.value) > -1) {
+        return { checked: true };
+      }
+    } else if (isSet(value)) {
+      if (vnode.props && value.has(vnode.props.value)) {
+        return { checked: true };
+      }
+    } else if (value) {
+      return { checked: true };
+    }
+  };
+  vModelDynamic.getSSRProps = (binding, vnode) => {
+    if (typeof vnode.type !== "string") {
+      return;
+    }
+    const modelToUse = resolveDynamicModel(
+      // resolveDynamicModel expects an uppercase tag name, but vnode.type is lowercase
+      vnode.type.toUpperCase(),
+      vnode.props && vnode.props.type
+    );
+    if (modelToUse.getSSRProps) {
+      return modelToUse.getSSRProps(binding, vnode);
+    }
+  };
+}
+
+const systemModifiers = ["ctrl", "shift", "alt", "meta"];
+const modifierGuards = {
+  stop: (e) => e.stopPropagation(),
+  prevent: (e) => e.preventDefault(),
+  self: (e) => e.target !== e.currentTarget,
+  ctrl: (e) => !e.ctrlKey,
+  shift: (e) => !e.shiftKey,
+  alt: (e) => !e.altKey,
+  meta: (e) => !e.metaKey,
+  left: (e) => "button" in e && e.button !== 0,
+  middle: (e) => "button" in e && e.button !== 1,
+  right: (e) => "button" in e && e.button !== 2,
+  exact: (e, modifiers) => systemModifiers.some((m) => e[`${m}Key`] && !modifiers.includes(m))
+};
+const withModifiers = (fn, modifiers) => {
+  const cache = fn._withMods || (fn._withMods = {});
+  const cacheKey = modifiers.join(".");
+  return cache[cacheKey] || (cache[cacheKey] = (event, ...args) => {
+    for (let i = 0; i < modifiers.length; i++) {
+      const guard = modifierGuards[modifiers[i]];
+      if (guard && guard(event, modifiers)) return;
+    }
+    return fn(event, ...args);
+  });
+};
+const keyNames = {
+  esc: "escape",
+  space: " ",
+  up: "arrow-up",
+  left: "arrow-left",
+  right: "arrow-right",
+  down: "arrow-down",
+  delete: "backspace"
+};
+const withKeys = (fn, modifiers) => {
+  const cache = fn._withKeys || (fn._withKeys = {});
+  const cacheKey = modifiers.join(".");
+  return cache[cacheKey] || (cache[cacheKey] = (event) => {
+    if (!("key" in event)) {
+      return;
+    }
+    const eventKey = hyphenate(event.key);
+    if (modifiers.some(
+      (k) => k === eventKey || keyNames[k] === eventKey
+    )) {
+      return fn(event);
+    }
+  });
+};
+
+const rendererOptions = /* @__PURE__ */ extend({ patchProp }, nodeOps);
+let renderer;
+let enabledHydration = false;
+function ensureRenderer() {
+  return renderer || (renderer = createRenderer(rendererOptions));
+}
+function ensureHydrationRenderer() {
+  renderer = enabledHydration ? renderer : createHydrationRenderer(rendererOptions);
+  enabledHydration = true;
+  return renderer;
+}
+const render = (...args) => {
+  ensureRenderer().render(...args);
+};
+const hydrate = (...args) => {
+  ensureHydrationRenderer().hydrate(...args);
+};
+const createApp = (...args) => {
+  const app = ensureRenderer().createApp(...args);
+  {
+    injectNativeTagCheck(app);
+    injectCompilerOptionsCheck(app);
+  }
+  const { mount } = app;
+  app.mount = (containerOrSelector) => {
+    const container = normalizeContainer(containerOrSelector);
+    if (!container) return;
+    const component = app._component;
+    if (!isFunction(component) && !component.render && !component.template) {
+      component.template = container.innerHTML;
+    }
+    if (container.nodeType === 1) {
+      container.textContent = "";
+    }
+    const proxy = mount(container, false, resolveRootNamespace(container));
+    if (container instanceof Element) {
+      container.removeAttribute("v-cloak");
+      container.setAttribute("data-v-app", "");
+    }
+    return proxy;
+  };
+  return app;
+};
+const createSSRApp = (...args) => {
+  const app = ensureHydrationRenderer().createApp(...args);
+  {
+    injectNativeTagCheck(app);
+    injectCompilerOptionsCheck(app);
+  }
+  const { mount } = app;
+  app.mount = (containerOrSelector) => {
+    const container = normalizeContainer(containerOrSelector);
+    if (container) {
+      return mount(container, true, resolveRootNamespace(container));
+    }
+  };
+  return app;
+};
+function resolveRootNamespace(container) {
+  if (container instanceof SVGElement) {
+    return "svg";
+  }
+  if (typeof MathMLElement === "function" && container instanceof MathMLElement) {
+    return "mathml";
+  }
+}
+function injectNativeTagCheck(app) {
+  Object.defineProperty(app.config, "isNativeTag", {
+    value: (tag) => isHTMLTag(tag) || isSVGTag(tag) || isMathMLTag(tag),
+    writable: false
+  });
+}
+function injectCompilerOptionsCheck(app) {
+  if (isRuntimeOnly()) {
+    const isCustomElement = app.config.isCustomElement;
+    Object.defineProperty(app.config, "isCustomElement", {
+      get() {
+        return isCustomElement;
+      },
+      set() {
+        warn(
+          `The \`isCustomElement\` config option is deprecated. Use \`compilerOptions.isCustomElement\` instead.`
+        );
+      }
+    });
+    const compilerOptions = app.config.compilerOptions;
+    const msg = `The \`compilerOptions\` config option is only respected when using a build of Vue.js that includes the runtime compiler (aka "full build"). Since you are using the runtime-only build, \`compilerOptions\` must be passed to \`@vue/compiler-dom\` in the build setup instead.
+- For vue-loader: pass it via vue-loader's \`compilerOptions\` loader option.
+- For vue-cli: see https://cli.vuejs.org/guide/webpack.html#modifying-options-of-a-loader
+- For vite: pass it via @vitejs/plugin-vue options. See https://github.com/vitejs/vite-plugin-vue/tree/main/packages/plugin-vue#example-for-passing-options-to-vuecompiler-sfc`;
+    Object.defineProperty(app.config, "compilerOptions", {
+      get() {
+        warn(msg);
+        return compilerOptions;
+      },
+      set() {
+        warn(msg);
+      }
+    });
+  }
+}
+function normalizeContainer(container) {
+  if (isString(container)) {
+    const res = document.querySelector(container);
+    if (!res) {
+      warn(
+        `Failed to mount app: mount target selector "${container}" returned null.`
+      );
+    }
+    return res;
+  }
+  if (window.ShadowRoot && container instanceof window.ShadowRoot && container.mode === "closed") {
+    warn(
+      `mounting on a ShadowRoot with \`{mode: "closed"}\` may lead to unpredictable bugs`
+    );
+  }
+  return container;
+}
+let ssrDirectiveInitialized = false;
+const initDirectivesForSSR = () => {
+  if (!ssrDirectiveInitialized) {
+    ssrDirectiveInitialized = true;
+    initVModelForSSR();
+    initVShowForSSR();
+  }
+} ;
+
+var runtimeDom = /*#__PURE__*/Object.freeze({
+  __proto__: null,
+  BaseTransition: BaseTransition,
+  BaseTransitionPropsValidators: BaseTransitionPropsValidators,
+  Comment: Comment,
+  DeprecationTypes: DeprecationTypes,
+  EffectScope: EffectScope,
+  ErrorCodes: ErrorCodes,
+  ErrorTypeStrings: ErrorTypeStrings,
+  Fragment: Fragment,
+  KeepAlive: KeepAlive,
+  ReactiveEffect: ReactiveEffect,
+  Static: Static,
+  Suspense: Suspense,
+  Teleport: Teleport,
+  Text: Text,
+  TrackOpTypes: TrackOpTypes,
+  Transition: Transition,
+  TransitionGroup: TransitionGroup,
+  TriggerOpTypes: TriggerOpTypes,
+  VueElement: VueElement,
+  assertNumber: assertNumber,
+  callWithAsyncErrorHandling: callWithAsyncErrorHandling,
+  callWithErrorHandling: callWithErrorHandling,
+  camelize: camelize,
+  capitalize: capitalize,
+  cloneVNode: cloneVNode,
+  compatUtils: compatUtils,
+  computed: computed,
+  createApp: createApp,
+  createBlock: createBlock,
+  createCommentVNode: createCommentVNode,
+  createElementBlock: createElementBlock,
+  createElementVNode: createBaseVNode,
+  createHydrationRenderer: createHydrationRenderer,
+  createPropsRestProxy: createPropsRestProxy,
+  createRenderer: createRenderer,
+  createSSRApp: createSSRApp,
+  createSlots: createSlots,
+  createStaticVNode: createStaticVNode,
+  createTextVNode: createTextVNode,
+  createVNode: createVNode,
+  customRef: customRef,
+  defineAsyncComponent: defineAsyncComponent,
+  defineComponent: defineComponent,
+  defineCustomElement: defineCustomElement,
+  defineEmits: defineEmits,
+  defineExpose: defineExpose,
+  defineModel: defineModel,
+  defineOptions: defineOptions,
+  defineProps: defineProps,
+  defineSSRCustomElement: defineSSRCustomElement,
+  defineSlots: defineSlots,
+  devtools: devtools,
+  effect: effect,
+  effectScope: effectScope,
+  getCurrentInstance: getCurrentInstance,
+  getCurrentScope: getCurrentScope,
+  getCurrentWatcher: getCurrentWatcher,
+  getTransitionRawChildren: getTransitionRawChildren,
+  guardReactiveProps: guardReactiveProps,
+  h: h,
+  handleError: handleError,
+  hasInjectionContext: hasInjectionContext,
+  hydrate: hydrate,
+  hydrateOnIdle: hydrateOnIdle,
+  hydrateOnInteraction: hydrateOnInteraction,
+  hydrateOnMediaQuery: hydrateOnMediaQuery,
+  hydrateOnVisible: hydrateOnVisible,
+  initCustomFormatter: initCustomFormatter,
+  initDirectivesForSSR: initDirectivesForSSR,
+  inject: inject,
+  isMemoSame: isMemoSame,
+  isProxy: isProxy,
+  isReactive: isReactive,
+  isReadonly: isReadonly,
+  isRef: isRef,
+  isRuntimeOnly: isRuntimeOnly,
+  isShallow: isShallow,
+  isVNode: isVNode,
+  markRaw: markRaw,
+  mergeDefaults: mergeDefaults,
+  mergeModels: mergeModels,
+  mergeProps: mergeProps,
+  nextTick: nextTick,
+  normalizeClass: normalizeClass,
+  normalizeProps: normalizeProps,
+  normalizeStyle: normalizeStyle,
+  onActivated: onActivated,
+  onBeforeMount: onBeforeMount,
+  onBeforeUnmount: onBeforeUnmount,
+  onBeforeUpdate: onBeforeUpdate,
+  onDeactivated: onDeactivated,
+  onErrorCaptured: onErrorCaptured,
+  onMounted: onMounted,
+  onRenderTracked: onRenderTracked,
+  onRenderTriggered: onRenderTriggered,
+  onScopeDispose: onScopeDispose,
+  onServerPrefetch: onServerPrefetch,
+  onUnmounted: onUnmounted,
+  onUpdated: onUpdated,
+  onWatcherCleanup: onWatcherCleanup,
+  openBlock: openBlock,
+  popScopeId: popScopeId,
+  provide: provide,
+  proxyRefs: proxyRefs,
+  pushScopeId: pushScopeId,
+  queuePostFlushCb: queuePostFlushCb,
+  reactive: reactive,
+  readonly: readonly,
+  ref: ref,
+  registerRuntimeCompiler: registerRuntimeCompiler,
+  render: render,
+  renderList: renderList,
+  renderSlot: renderSlot,
+  resolveComponent: resolveComponent,
+  resolveDirective: resolveDirective,
+  resolveDynamicComponent: resolveDynamicComponent,
+  resolveFilter: resolveFilter,
+  resolveTransitionHooks: resolveTransitionHooks,
+  setBlockTracking: setBlockTracking,
+  setDevtoolsHook: setDevtoolsHook,
+  setTransitionHooks: setTransitionHooks,
+  shallowReactive: shallowReactive,
+  shallowReadonly: shallowReadonly,
+  shallowRef: shallowRef,
+  ssrContextKey: ssrContextKey,
+  ssrUtils: ssrUtils,
+  stop: stop,
+  toDisplayString: toDisplayString,
+  toHandlerKey: toHandlerKey,
+  toHandlers: toHandlers,
+  toRaw: toRaw,
+  toRef: toRef,
+  toRefs: toRefs,
+  toValue: toValue,
+  transformVNodeArgs: transformVNodeArgs,
+  triggerRef: triggerRef,
+  unref: unref,
+  useAttrs: useAttrs,
+  useCssModule: useCssModule,
+  useCssVars: useCssVars,
+  useHost: useHost,
+  useId: useId,
+  useModel: useModel,
+  useSSRContext: useSSRContext,
+  useShadowRoot: useShadowRoot,
+  useSlots: useSlots,
+  useTemplateRef: useTemplateRef,
+  useTransitionState: useTransitionState,
+  vModelCheckbox: vModelCheckbox,
+  vModelDynamic: vModelDynamic,
+  vModelRadio: vModelRadio,
+  vModelSelect: vModelSelect,
+  vModelText: vModelText,
+  vShow: vShow,
+  version: version,
+  warn: warn,
+  watch: watch,
+  watchEffect: watchEffect,
+  watchPostEffect: watchPostEffect,
+  watchSyncEffect: watchSyncEffect,
+  withAsyncContext: withAsyncContext,
+  withCtx: withCtx,
+  withDefaults: withDefaults,
+  withDirectives: withDirectives,
+  withKeys: withKeys,
+  withMemo: withMemo,
+  withModifiers: withModifiers,
+  withScopeId: withScopeId
+});
+
+function initDev() {
+  {
+    {
+      console.info(
+        `You are running a development build of Vue.
+Make sure to use the production build (*.prod.js) when deploying for production.`
+      );
+    }
+    initCustomFormatter();
+  }
+}
+
+const FRAGMENT = Symbol(`Fragment` );
+const TELEPORT = Symbol(`Teleport` );
+const SUSPENSE = Symbol(`Suspense` );
+const KEEP_ALIVE = Symbol(`KeepAlive` );
+const BASE_TRANSITION = Symbol(
+  `BaseTransition` 
+);
+const OPEN_BLOCK = Symbol(`openBlock` );
+const CREATE_BLOCK = Symbol(`createBlock` );
+const CREATE_ELEMENT_BLOCK = Symbol(
+  `createElementBlock` 
+);
+const CREATE_VNODE = Symbol(`createVNode` );
+const CREATE_ELEMENT_VNODE = Symbol(
+  `createElementVNode` 
+);
+const CREATE_COMMENT = Symbol(
+  `createCommentVNode` 
+);
+const CREATE_TEXT = Symbol(
+  `createTextVNode` 
+);
+const CREATE_STATIC = Symbol(
+  `createStaticVNode` 
+);
+const RESOLVE_COMPONENT = Symbol(
+  `resolveComponent` 
+);
+const RESOLVE_DYNAMIC_COMPONENT = Symbol(
+  `resolveDynamicComponent` 
+);
+const RESOLVE_DIRECTIVE = Symbol(
+  `resolveDirective` 
+);
+const RESOLVE_FILTER = Symbol(
+  `resolveFilter` 
+);
+const WITH_DIRECTIVES = Symbol(
+  `withDirectives` 
+);
+const RENDER_LIST = Symbol(`renderList` );
+const RENDER_SLOT = Symbol(`renderSlot` );
+const CREATE_SLOTS = Symbol(`createSlots` );
+const TO_DISPLAY_STRING = Symbol(
+  `toDisplayString` 
+);
+const MERGE_PROPS = Symbol(`mergeProps` );
+const NORMALIZE_CLASS = Symbol(
+  `normalizeClass` 
+);
+const NORMALIZE_STYLE = Symbol(
+  `normalizeStyle` 
+);
+const NORMALIZE_PROPS = Symbol(
+  `normalizeProps` 
+);
+const GUARD_REACTIVE_PROPS = Symbol(
+  `guardReactiveProps` 
+);
+const TO_HANDLERS = Symbol(`toHandlers` );
+const CAMELIZE = Symbol(`camelize` );
+const CAPITALIZE = Symbol(`capitalize` );
+const TO_HANDLER_KEY = Symbol(
+  `toHandlerKey` 
+);
+const SET_BLOCK_TRACKING = Symbol(
+  `setBlockTracking` 
+);
+const PUSH_SCOPE_ID = Symbol(`pushScopeId` );
+const POP_SCOPE_ID = Symbol(`popScopeId` );
+const WITH_CTX = Symbol(`withCtx` );
+const UNREF = Symbol(`unref` );
+const IS_REF = Symbol(`isRef` );
+const WITH_MEMO = Symbol(`withMemo` );
+const IS_MEMO_SAME = Symbol(`isMemoSame` );
+const helperNameMap = {
+  [FRAGMENT]: `Fragment`,
+  [TELEPORT]: `Teleport`,
+  [SUSPENSE]: `Suspense`,
+  [KEEP_ALIVE]: `KeepAlive`,
+  [BASE_TRANSITION]: `BaseTransition`,
+  [OPEN_BLOCK]: `openBlock`,
+  [CREATE_BLOCK]: `createBlock`,
+  [CREATE_ELEMENT_BLOCK]: `createElementBlock`,
+  [CREATE_VNODE]: `createVNode`,
+  [CREATE_ELEMENT_VNODE]: `createElementVNode`,
+  [CREATE_COMMENT]: `createCommentVNode`,
+  [CREATE_TEXT]: `createTextVNode`,
+  [CREATE_STATIC]: `createStaticVNode`,
+  [RESOLVE_COMPONENT]: `resolveComponent`,
+  [RESOLVE_DYNAMIC_COMPONENT]: `resolveDynamicComponent`,
+  [RESOLVE_DIRECTIVE]: `resolveDirective`,
+  [RESOLVE_FILTER]: `resolveFilter`,
+  [WITH_DIRECTIVES]: `withDirectives`,
+  [RENDER_LIST]: `renderList`,
+  [RENDER_SLOT]: `renderSlot`,
+  [CREATE_SLOTS]: `createSlots`,
+  [TO_DISPLAY_STRING]: `toDisplayString`,
+  [MERGE_PROPS]: `mergeProps`,
+  [NORMALIZE_CLASS]: `normalizeClass`,
+  [NORMALIZE_STYLE]: `normalizeStyle`,
+  [NORMALIZE_PROPS]: `normalizeProps`,
+  [GUARD_REACTIVE_PROPS]: `guardReactiveProps`,
+  [TO_HANDLERS]: `toHandlers`,
+  [CAMELIZE]: `camelize`,
+  [CAPITALIZE]: `capitalize`,
+  [TO_HANDLER_KEY]: `toHandlerKey`,
+  [SET_BLOCK_TRACKING]: `setBlockTracking`,
+  [PUSH_SCOPE_ID]: `pushScopeId`,
+  [POP_SCOPE_ID]: `popScopeId`,
+  [WITH_CTX]: `withCtx`,
+  [UNREF]: `unref`,
+  [IS_REF]: `isRef`,
+  [WITH_MEMO]: `withMemo`,
+  [IS_MEMO_SAME]: `isMemoSame`
+};
+function registerRuntimeHelpers(helpers) {
+  Object.getOwnPropertySymbols(helpers).forEach((s) => {
+    helperNameMap[s] = helpers[s];
+  });
+}
+
+const locStub = {
+  start: { line: 1, column: 1, offset: 0 },
+  end: { line: 1, column: 1, offset: 0 },
+  source: ""
+};
+function createRoot(children, source = "") {
+  return {
+    type: 0,
+    source,
+    children,
+    helpers: /* @__PURE__ */ new Set(),
+    components: [],
+    directives: [],
+    hoists: [],
+    imports: [],
+    cached: [],
+    temps: 0,
+    codegenNode: void 0,
+    loc: locStub
+  };
+}
+function createVNodeCall(context, tag, props, children, patchFlag, dynamicProps, directives, isBlock = false, disableTracking = false, isComponent = false, loc = locStub) {
+  if (context) {
+    if (isBlock) {
+      context.helper(OPEN_BLOCK);
+      context.helper(getVNodeBlockHelper(context.inSSR, isComponent));
+    } else {
+      context.helper(getVNodeHelper(context.inSSR, isComponent));
+    }
+    if (directives) {
+      context.helper(WITH_DIRECTIVES);
+    }
+  }
+  return {
+    type: 13,
+    tag,
+    props,
+    children,
+    patchFlag,
+    dynamicProps,
+    directives,
+    isBlock,
+    disableTracking,
+    isComponent,
+    loc
+  };
+}
+function createArrayExpression(elements, loc = locStub) {
+  return {
+    type: 17,
+    loc,
+    elements
+  };
+}
+function createObjectExpression(properties, loc = locStub) {
+  return {
+    type: 15,
+    loc,
+    properties
+  };
+}
+function createObjectProperty(key, value) {
+  return {
+    type: 16,
+    loc: locStub,
+    key: isString(key) ? createSimpleExpression(key, true) : key,
+    value
+  };
+}
+function createSimpleExpression(content, isStatic = false, loc = locStub, constType = 0) {
+  return {
+    type: 4,
+    loc,
+    content,
+    isStatic,
+    constType: isStatic ? 3 : constType
+  };
+}
+function createCompoundExpression(children, loc = locStub) {
+  return {
+    type: 8,
+    loc,
+    children
+  };
+}
+function createCallExpression(callee, args = [], loc = locStub) {
+  return {
+    type: 14,
+    loc,
+    callee,
+    arguments: args
+  };
+}
+function createFunctionExpression(params, returns = void 0, newline = false, isSlot = false, loc = locStub) {
+  return {
+    type: 18,
+    params,
+    returns,
+    newline,
+    isSlot,
+    loc
+  };
+}
+function createConditionalExpression(test, consequent, alternate, newline = true) {
+  return {
+    type: 19,
+    test,
+    consequent,
+    alternate,
+    newline,
+    loc: locStub
+  };
+}
+function createCacheExpression(index, value, needPauseTracking = false) {
+  return {
+    type: 20,
+    index,
+    value,
+    needPauseTracking,
+    needArraySpread: false,
+    loc: locStub
+  };
+}
+function createBlockStatement(body) {
+  return {
+    type: 21,
+    body,
+    loc: locStub
+  };
+}
+function getVNodeHelper(ssr, isComponent) {
+  return ssr || isComponent ? CREATE_VNODE : CREATE_ELEMENT_VNODE;
+}
+function getVNodeBlockHelper(ssr, isComponent) {
+  return ssr || isComponent ? CREATE_BLOCK : CREATE_ELEMENT_BLOCK;
+}
+function convertToBlock(node, { helper, removeHelper, inSSR }) {
+  if (!node.isBlock) {
+    node.isBlock = true;
+    removeHelper(getVNodeHelper(inSSR, node.isComponent));
+    helper(OPEN_BLOCK);
+    helper(getVNodeBlockHelper(inSSR, node.isComponent));
+  }
+}
+
+const defaultDelimitersOpen = new Uint8Array([123, 123]);
+const defaultDelimitersClose = new Uint8Array([125, 125]);
+function isTagStartChar(c) {
+  return c >= 97 && c <= 122 || c >= 65 && c <= 90;
+}
+function isWhitespace(c) {
+  return c === 32 || c === 10 || c === 9 || c === 12 || c === 13;
+}
+function isEndOfTagSection(c) {
+  return c === 47 || c === 62 || isWhitespace(c);
+}
+function toCharCodes(str) {
+  const ret = new Uint8Array(str.length);
+  for (let i = 0; i < str.length; i++) {
+    ret[i] = str.charCodeAt(i);
+  }
+  return ret;
+}
+const Sequences = {
+  Cdata: new Uint8Array([67, 68, 65, 84, 65, 91]),
+  // CDATA[
+  CdataEnd: new Uint8Array([93, 93, 62]),
+  // ]]>
+  CommentEnd: new Uint8Array([45, 45, 62]),
+  // `-->`
+  ScriptEnd: new Uint8Array([60, 47, 115, 99, 114, 105, 112, 116]),
+  // `<\/script`
+  StyleEnd: new Uint8Array([60, 47, 115, 116, 121, 108, 101]),
+  // `</style`
+  TitleEnd: new Uint8Array([60, 47, 116, 105, 116, 108, 101]),
+  // `</title`
+  TextareaEnd: new Uint8Array([
+    60,
+    47,
+    116,
+    101,
+    120,
+    116,
+    97,
+    114,
+    101,
+    97
+  ])
+  // `</textarea
+};
+class Tokenizer {
+  constructor(stack, cbs) {
+    this.stack = stack;
+    this.cbs = cbs;
+    /** The current state the tokenizer is in. */
+    this.state = 1;
+    /** The read buffer. */
+    this.buffer = "";
+    /** The beginning of the section that is currently being read. */
+    this.sectionStart = 0;
+    /** The index within the buffer that we are currently looking at. */
+    this.index = 0;
+    /** The start of the last entity. */
+    this.entityStart = 0;
+    /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
+    this.baseState = 1;
+    /** For special parsing behavior inside of script and style tags. */
+    this.inRCDATA = false;
+    /** For disabling RCDATA tags handling */
+    this.inXML = false;
+    /** For disabling interpolation parsing in v-pre */
+    this.inVPre = false;
+    /** Record newline positions for fast line / column calculation */
+    this.newlines = [];
+    this.mode = 0;
+    this.delimiterOpen = defaultDelimitersOpen;
+    this.delimiterClose = defaultDelimitersClose;
+    this.delimiterIndex = -1;
+    this.currentSequence = void 0;
+    this.sequenceIndex = 0;
+  }
+  get inSFCRoot() {
+    return this.mode === 2 && this.stack.length === 0;
+  }
+  reset() {
+    this.state = 1;
+    this.mode = 0;
+    this.buffer = "";
+    this.sectionStart = 0;
+    this.index = 0;
+    this.baseState = 1;
+    this.inRCDATA = false;
+    this.currentSequence = void 0;
+    this.newlines.length = 0;
+    this.delimiterOpen = defaultDelimitersOpen;
+    this.delimiterClose = defaultDelimitersClose;
+  }
+  /**
+   * Generate Position object with line / column information using recorded
+   * newline positions. We know the index is always going to be an already
+   * processed index, so all the newlines up to this index should have been
+   * recorded.
+   */
+  getPos(index) {
+    let line = 1;
+    let column = index + 1;
+    for (let i = this.newlines.length - 1; i >= 0; i--) {
+      const newlineIndex = this.newlines[i];
+      if (index > newlineIndex) {
+        line = i + 2;
+        column = index - newlineIndex;
+        break;
+      }
+    }
+    return {
+      column,
+      line,
+      offset: index
+    };
+  }
+  peek() {
+    return this.buffer.charCodeAt(this.index + 1);
+  }
+  stateText(c) {
+    if (c === 60) {
+      if (this.index > this.sectionStart) {
+        this.cbs.ontext(this.sectionStart, this.index);
+      }
+      this.state = 5;
+      this.sectionStart = this.index;
+    } else if (!this.inVPre && c === this.delimiterOpen[0]) {
+      this.state = 2;
+      this.delimiterIndex = 0;
+      this.stateInterpolationOpen(c);
+    }
+  }
+  stateInterpolationOpen(c) {
+    if (c === this.delimiterOpen[this.delimiterIndex]) {
+      if (this.delimiterIndex === this.delimiterOpen.length - 1) {
+        const start = this.index + 1 - this.delimiterOpen.length;
+        if (start > this.sectionStart) {
+          this.cbs.ontext(this.sectionStart, start);
+        }
+        this.state = 3;
+        this.sectionStart = start;
+      } else {
+        this.delimiterIndex++;
+      }
+    } else if (this.inRCDATA) {
+      this.state = 32;
+      this.stateInRCDATA(c);
+    } else {
+      this.state = 1;
+      this.stateText(c);
+    }
+  }
+  stateInterpolation(c) {
+    if (c === this.delimiterClose[0]) {
+      this.state = 4;
+      this.delimiterIndex = 0;
+      this.stateInterpolationClose(c);
+    }
+  }
+  stateInterpolationClose(c) {
+    if (c === this.delimiterClose[this.delimiterIndex]) {
+      if (this.delimiterIndex === this.delimiterClose.length - 1) {
+        this.cbs.oninterpolation(this.sectionStart, this.index + 1);
+        if (this.inRCDATA) {
+          this.state = 32;
+        } else {
+          this.state = 1;
+        }
+        this.sectionStart = this.index + 1;
+      } else {
+        this.delimiterIndex++;
+      }
+    } else {
+      this.state = 3;
+      this.stateInterpolation(c);
+    }
+  }
+  stateSpecialStartSequence(c) {
+    const isEnd = this.sequenceIndex === this.currentSequence.length;
+    const isMatch = isEnd ? (
+      // If we are at the end of the sequence, make sure the tag name has ended
+      isEndOfTagSection(c)
+    ) : (
+      // Otherwise, do a case-insensitive comparison
+      (c | 32) === this.currentSequence[this.sequenceIndex]
+    );
+    if (!isMatch) {
+      this.inRCDATA = false;
+    } else if (!isEnd) {
+      this.sequenceIndex++;
+      return;
+    }
+    this.sequenceIndex = 0;
+    this.state = 6;
+    this.stateInTagName(c);
+  }
+  /** Look for an end tag. For <title> and <textarea>, also decode entities. */
+  stateInRCDATA(c) {
+    if (this.sequenceIndex === this.currentSequence.length) {
+      if (c === 62 || isWhitespace(c)) {
+        const endOfText = this.index - this.currentSequence.length;
+        if (this.sectionStart < endOfText) {
+          const actualIndex = this.index;
+          this.index = endOfText;
+          this.cbs.ontext(this.sectionStart, endOfText);
+          this.index = actualIndex;
+        }
+        this.sectionStart = endOfText + 2;
+        this.stateInClosingTagName(c);
+        this.inRCDATA = false;
+        return;
+      }
+      this.sequenceIndex = 0;
+    }
+    if ((c | 32) === this.currentSequence[this.sequenceIndex]) {
+      this.sequenceIndex += 1;
+    } else if (this.sequenceIndex === 0) {
+      if (this.currentSequence === Sequences.TitleEnd || this.currentSequence === Sequences.TextareaEnd && !this.inSFCRoot) {
+        if (!this.inVPre && c === this.delimiterOpen[0]) {
+          this.state = 2;
+          this.delimiterIndex = 0;
+          this.stateInterpolationOpen(c);
+        }
+      } else if (this.fastForwardTo(60)) {
+        this.sequenceIndex = 1;
+      }
+    } else {
+      this.sequenceIndex = Number(c === 60);
+    }
+  }
+  stateCDATASequence(c) {
+    if (c === Sequences.Cdata[this.sequenceIndex]) {
+      if (++this.sequenceIndex === Sequences.Cdata.length) {
+        this.state = 28;
+        this.currentSequence = Sequences.CdataEnd;
+        this.sequenceIndex = 0;
+        this.sectionStart = this.index + 1;
+      }
+    } else {
+      this.sequenceIndex = 0;
+      this.state = 23;
+      this.stateInDeclaration(c);
+    }
+  }
+  /**
+   * When we wait for one specific character, we can speed things up
+   * by skipping through the buffer until we find it.
+   *
+   * @returns Whether the character was found.
+   */
+  fastForwardTo(c) {
+    while (++this.index < this.buffer.length) {
+      const cc = this.buffer.charCodeAt(this.index);
+      if (cc === 10) {
+        this.newlines.push(this.index);
+      }
+      if (cc === c) {
+        return true;
+      }
+    }
+    this.index = this.buffer.length - 1;
+    return false;
+  }
+  /**
+   * Comments and CDATA end with `-->` and `]]>`.
+   *
+   * Their common qualities are:
+   * - Their end sequences have a distinct character they start with.
+   * - That character is then repeated, so we have to check multiple repeats.
+   * - All characters but the start character of the sequence can be skipped.
+   */
+  stateInCommentLike(c) {
+    if (c === this.currentSequence[this.sequenceIndex]) {
+      if (++this.sequenceIndex === this.currentSequence.length) {
+        if (this.currentSequence === Sequences.CdataEnd) {
+          this.cbs.oncdata(this.sectionStart, this.index - 2);
+        } else {
+          this.cbs.oncomment(this.sectionStart, this.index - 2);
+        }
+        this.sequenceIndex = 0;
+        this.sectionStart = this.index + 1;
+        this.state = 1;
+      }
+    } else if (this.sequenceIndex === 0) {
+      if (this.fastForwardTo(this.currentSequence[0])) {
+        this.sequenceIndex = 1;
+      }
+    } else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
+      this.sequenceIndex = 0;
+    }
+  }
+  startSpecial(sequence, offset) {
+    this.enterRCDATA(sequence, offset);
+    this.state = 31;
+  }
+  enterRCDATA(sequence, offset) {
+    this.inRCDATA = true;
+    this.currentSequence = sequence;
+    this.sequenceIndex = offset;
+  }
+  stateBeforeTagName(c) {
+    if (c === 33) {
+      this.state = 22;
+      this.sectionStart = this.index + 1;
+    } else if (c === 63) {
+      this.state = 24;
+      this.sectionStart = this.index + 1;
+    } else if (isTagStartChar(c)) {
+      this.sectionStart = this.index;
+      if (this.mode === 0) {
+        this.state = 6;
+      } else if (this.inSFCRoot) {
+        this.state = 34;
+      } else if (!this.inXML) {
+        if (c === 116) {
+          this.state = 30;
+        } else {
+          this.state = c === 115 ? 29 : 6;
+        }
+      } else {
+        this.state = 6;
+      }
+    } else if (c === 47) {
+      this.state = 8;
+    } else {
+      this.state = 1;
+      this.stateText(c);
+    }
+  }
+  stateInTagName(c) {
+    if (isEndOfTagSection(c)) {
+      this.handleTagName(c);
+    }
+  }
+  stateInSFCRootTagName(c) {
+    if (isEndOfTagSection(c)) {
+      const tag = this.buffer.slice(this.sectionStart, this.index);
+      if (tag !== "template") {
+        this.enterRCDATA(toCharCodes(`</` + tag), 0);
+      }
+      this.handleTagName(c);
+    }
+  }
+  handleTagName(c) {
+    this.cbs.onopentagname(this.sectionStart, this.index);
+    this.sectionStart = -1;
+    this.state = 11;
+    this.stateBeforeAttrName(c);
+  }
+  stateBeforeClosingTagName(c) {
+    if (isWhitespace(c)) ; else if (c === 62) {
+      {
+        this.cbs.onerr(14, this.index);
+      }
+      this.state = 1;
+      this.sectionStart = this.index + 1;
+    } else {
+      this.state = isTagStartChar(c) ? 9 : 27;
+      this.sectionStart = this.index;
+    }
+  }
+  stateInClosingTagName(c) {
+    if (c === 62 || isWhitespace(c)) {
+      this.cbs.onclosetag(this.sectionStart, this.index);
+      this.sectionStart = -1;
+      this.state = 10;
+      this.stateAfterClosingTagName(c);
+    }
+  }
+  stateAfterClosingTagName(c) {
+    if (c === 62) {
+      this.state = 1;
+      this.sectionStart = this.index + 1;
+    }
+  }
+  stateBeforeAttrName(c) {
+    if (c === 62) {
+      this.cbs.onopentagend(this.index);
+      if (this.inRCDATA) {
+        this.state = 32;
+      } else {
+        this.state = 1;
+      }
+      this.sectionStart = this.index + 1;
+    } else if (c === 47) {
+      this.state = 7;
+      if (this.peek() !== 62) {
+        this.cbs.onerr(22, this.index);
+      }
+    } else if (c === 60 && this.peek() === 47) {
+      this.cbs.onopentagend(this.index);
+      this.state = 5;
+      this.sectionStart = this.index;
+    } else if (!isWhitespace(c)) {
+      if (c === 61) {
+        this.cbs.onerr(
+          19,
+          this.index
+        );
+      }
+      this.handleAttrStart(c);
+    }
+  }
+  handleAttrStart(c) {
+    if (c === 118 && this.peek() === 45) {
+      this.state = 13;
+      this.sectionStart = this.index;
+    } else if (c === 46 || c === 58 || c === 64 || c === 35) {
+      this.cbs.ondirname(this.index, this.index + 1);
+      this.state = 14;
+      this.sectionStart = this.index + 1;
+    } else {
+      this.state = 12;
+      this.sectionStart = this.index;
+    }
+  }
+  stateInSelfClosingTag(c) {
+    if (c === 62) {
+      this.cbs.onselfclosingtag(this.index);
+      this.state = 1;
+      this.sectionStart = this.index + 1;
+      this.inRCDATA = false;
+    } else if (!isWhitespace(c)) {
+      this.state = 11;
+      this.stateBeforeAttrName(c);
+    }
+  }
+  stateInAttrName(c) {
+    if (c === 61 || isEndOfTagSection(c)) {
+      this.cbs.onattribname(this.sectionStart, this.index);
+      this.handleAttrNameEnd(c);
+    } else if (c === 34 || c === 39 || c === 60) {
+      this.cbs.onerr(
+        17,
+        this.index
+      );
+    }
+  }
+  stateInDirName(c) {
+    if (c === 61 || isEndOfTagSection(c)) {
+      this.cbs.ondirname(this.sectionStart, this.index);
+      this.handleAttrNameEnd(c);
+    } else if (c === 58) {
+      this.cbs.ondirname(this.sectionStart, this.index);
+      this.state = 14;
+      this.sectionStart = this.index + 1;
+    } else if (c === 46) {
+      this.cbs.ondirname(this.sectionStart, this.index);
+      this.state = 16;
+      this.sectionStart = this.index + 1;
+    }
+  }
+  stateInDirArg(c) {
+    if (c === 61 || isEndOfTagSection(c)) {
+      this.cbs.ondirarg(this.sectionStart, this.index);
+      this.handleAttrNameEnd(c);
+    } else if (c === 91) {
+      this.state = 15;
+    } else if (c === 46) {
+      this.cbs.ondirarg(this.sectionStart, this.index);
+      this.state = 16;
+      this.sectionStart = this.index + 1;
+    }
+  }
+  stateInDynamicDirArg(c) {
+    if (c === 93) {
+      this.state = 14;
+    } else if (c === 61 || isEndOfTagSection(c)) {
+      this.cbs.ondirarg(this.sectionStart, this.index + 1);
+      this.handleAttrNameEnd(c);
+      {
+        this.cbs.onerr(
+          27,
+          this.index
+        );
+      }
+    }
+  }
+  stateInDirModifier(c) {
+    if (c === 61 || isEndOfTagSection(c)) {
+      this.cbs.ondirmodifier(this.sectionStart, this.index);
+      this.handleAttrNameEnd(c);
+    } else if (c === 46) {
+      this.cbs.ondirmodifier(this.sectionStart, this.index);
+      this.sectionStart = this.index + 1;
+    }
+  }
+  handleAttrNameEnd(c) {
+    this.sectionStart = this.index;
+    this.state = 17;
+    this.cbs.onattribnameend(this.index);
+    this.stateAfterAttrName(c);
+  }
+  stateAfterAttrName(c) {
+    if (c === 61) {
+      this.state = 18;
+    } else if (c === 47 || c === 62) {
+      this.cbs.onattribend(0, this.sectionStart);
+      this.sectionStart = -1;
+      this.state = 11;
+      this.stateBeforeAttrName(c);
+    } else if (!isWhitespace(c)) {
+      this.cbs.onattribend(0, this.sectionStart);
+      this.handleAttrStart(c);
+    }
+  }
+  stateBeforeAttrValue(c) {
+    if (c === 34) {
+      this.state = 19;
+      this.sectionStart = this.index + 1;
+    } else if (c === 39) {
+      this.state = 20;
+      this.sectionStart = this.index + 1;
+    } else if (!isWhitespace(c)) {
+      this.sectionStart = this.index;
+      this.state = 21;
+      this.stateInAttrValueNoQuotes(c);
+    }
+  }
+  handleInAttrValue(c, quote) {
+    if (c === quote || this.fastForwardTo(quote)) {
+      this.cbs.onattribdata(this.sectionStart, this.index);
+      this.sectionStart = -1;
+      this.cbs.onattribend(
+        quote === 34 ? 3 : 2,
+        this.index + 1
+      );
+      this.state = 11;
+    }
+  }
+  stateInAttrValueDoubleQuotes(c) {
+    this.handleInAttrValue(c, 34);
+  }
+  stateInAttrValueSingleQuotes(c) {
+    this.handleInAttrValue(c, 39);
+  }
+  stateInAttrValueNoQuotes(c) {
+    if (isWhitespace(c) || c === 62) {
+      this.cbs.onattribdata(this.sectionStart, this.index);
+      this.sectionStart = -1;
+      this.cbs.onattribend(1, this.index);
+      this.state = 11;
+      this.stateBeforeAttrName(c);
+    } else if (c === 34 || c === 39 || c === 60 || c === 61 || c === 96) {
+      this.cbs.onerr(
+        18,
+        this.index
+      );
+    } else ;
+  }
+  stateBeforeDeclaration(c) {
+    if (c === 91) {
+      this.state = 26;
+      this.sequenceIndex = 0;
+    } else {
+      this.state = c === 45 ? 25 : 23;
+    }
+  }
+  stateInDeclaration(c) {
+    if (c === 62 || this.fastForwardTo(62)) {
+      this.state = 1;
+      this.sectionStart = this.index + 1;
+    }
+  }
+  stateInProcessingInstruction(c) {
+    if (c === 62 || this.fastForwardTo(62)) {
+      this.cbs.onprocessinginstruction(this.sectionStart, this.index);
+      this.state = 1;
+      this.sectionStart = this.index + 1;
+    }
+  }
+  stateBeforeComment(c) {
+    if (c === 45) {
+      this.state = 28;
+      this.currentSequence = Sequences.CommentEnd;
+      this.sequenceIndex = 2;
+      this.sectionStart = this.index + 1;
+    } else {
+      this.state = 23;
+    }
+  }
+  stateInSpecialComment(c) {
+    if (c === 62 || this.fastForwardTo(62)) {
+      this.cbs.oncomment(this.sectionStart, this.index);
+      this.state = 1;
+      this.sectionStart = this.index + 1;
+    }
+  }
+  stateBeforeSpecialS(c) {
+    if (c === Sequences.ScriptEnd[3]) {
+      this.startSpecial(Sequences.ScriptEnd, 4);
+    } else if (c === Sequences.StyleEnd[3]) {
+      this.startSpecial(Sequences.StyleEnd, 4);
+    } else {
+      this.state = 6;
+      this.stateInTagName(c);
+    }
+  }
+  stateBeforeSpecialT(c) {
+    if (c === Sequences.TitleEnd[3]) {
+      this.startSpecial(Sequences.TitleEnd, 4);
+    } else if (c === Sequences.TextareaEnd[3]) {
+      this.startSpecial(Sequences.TextareaEnd, 4);
+    } else {
+      this.state = 6;
+      this.stateInTagName(c);
+    }
+  }
+  startEntity() {
+  }
+  stateInEntity() {
+  }
+  /**
+   * Iterates through the buffer, calling the function corresponding to the current state.
+   *
+   * States that are more likely to be hit are higher up, as a performance improvement.
+   */
+  parse(input) {
+    this.buffer = input;
+    while (this.index < this.buffer.length) {
+      const c = this.buffer.charCodeAt(this.index);
+      if (c === 10) {
+        this.newlines.push(this.index);
+      }
+      switch (this.state) {
+        case 1: {
+          this.stateText(c);
+          break;
+        }
+        case 2: {
+          this.stateInterpolationOpen(c);
+          break;
+        }
+        case 3: {
+          this.stateInterpolation(c);
+          break;
+        }
+        case 4: {
+          this.stateInterpolationClose(c);
+          break;
+        }
+        case 31: {
+          this.stateSpecialStartSequence(c);
+          break;
+        }
+        case 32: {
+          this.stateInRCDATA(c);
+          break;
+        }
+        case 26: {
+          this.stateCDATASequence(c);
+          break;
+        }
+        case 19: {
+          this.stateInAttrValueDoubleQuotes(c);
+          break;
+        }
+        case 12: {
+          this.stateInAttrName(c);
+          break;
+        }
+        case 13: {
+          this.stateInDirName(c);
+          break;
+        }
+        case 14: {
+          this.stateInDirArg(c);
+          break;
+        }
+        case 15: {
+          this.stateInDynamicDirArg(c);
+          break;
+        }
+        case 16: {
+          this.stateInDirModifier(c);
+          break;
+        }
+        case 28: {
+          this.stateInCommentLike(c);
+          break;
+        }
+        case 27: {
+          this.stateInSpecialComment(c);
+          break;
+        }
+        case 11: {
+          this.stateBeforeAttrName(c);
+          break;
+        }
+        case 6: {
+          this.stateInTagName(c);
+          break;
+        }
+        case 34: {
+          this.stateInSFCRootTagName(c);
+          break;
+        }
+        case 9: {
+          this.stateInClosingTagName(c);
+          break;
+        }
+        case 5: {
+          this.stateBeforeTagName(c);
+          break;
+        }
+        case 17: {
+          this.stateAfterAttrName(c);
+          break;
+        }
+        case 20: {
+          this.stateInAttrValueSingleQuotes(c);
+          break;
+        }
+        case 18: {
+          this.stateBeforeAttrValue(c);
+          break;
+        }
+        case 8: {
+          this.stateBeforeClosingTagName(c);
+          break;
+        }
+        case 10: {
+          this.stateAfterClosingTagName(c);
+          break;
+        }
+        case 29: {
+          this.stateBeforeSpecialS(c);
+          break;
+        }
+        case 30: {
+          this.stateBeforeSpecialT(c);
+          break;
+        }
+        case 21: {
+          this.stateInAttrValueNoQuotes(c);
+          break;
+        }
+        case 7: {
+          this.stateInSelfClosingTag(c);
+          break;
+        }
+        case 23: {
+          this.stateInDeclaration(c);
+          break;
+        }
+        case 22: {
+          this.stateBeforeDeclaration(c);
+          break;
+        }
+        case 25: {
+          this.stateBeforeComment(c);
+          break;
+        }
+        case 24: {
+          this.stateInProcessingInstruction(c);
+          break;
+        }
+        case 33: {
+          this.stateInEntity();
+          break;
+        }
+      }
+      this.index++;
+    }
+    this.cleanup();
+    this.finish();
+  }
+  /**
+   * Remove data that has already been consumed from the buffer.
+   */
+  cleanup() {
+    if (this.sectionStart !== this.index) {
+      if (this.state === 1 || this.state === 32 && this.sequenceIndex === 0) {
+        this.cbs.ontext(this.sectionStart, this.index);
+        this.sectionStart = this.index;
+      } else if (this.state === 19 || this.state === 20 || this.state === 21) {
+        this.cbs.onattribdata(this.sectionStart, this.index);
+        this.sectionStart = this.index;
+      }
+    }
+  }
+  finish() {
+    this.handleTrailingData();
+    this.cbs.onend();
+  }
+  /** Handle any trailing data. */
+  handleTrailingData() {
+    const endIndex = this.buffer.length;
+    if (this.sectionStart >= endIndex) {
+      return;
+    }
+    if (this.state === 28) {
+      if (this.currentSequence === Sequences.CdataEnd) {
+        this.cbs.oncdata(this.sectionStart, endIndex);
+      } else {
+        this.cbs.oncomment(this.sectionStart, endIndex);
+      }
+    } else if (this.state === 6 || this.state === 11 || this.state === 18 || this.state === 17 || this.state === 12 || this.state === 13 || this.state === 14 || this.state === 15 || this.state === 16 || this.state === 20 || this.state === 19 || this.state === 21 || this.state === 9) ; else {
+      this.cbs.ontext(this.sectionStart, endIndex);
+    }
+  }
+  emitCodePoint(cp, consumed) {
+  }
+}
+
+function defaultOnError(error) {
+  throw error;
+}
+function defaultOnWarn(msg) {
+  console.warn(`[Vue warn] ${msg.message}`);
+}
+function createCompilerError(code, loc, messages, additionalMessage) {
+  const msg = (messages || errorMessages)[code] + (additionalMessage || ``) ;
+  const error = new SyntaxError(String(msg));
+  error.code = code;
+  error.loc = loc;
+  return error;
+}
+const errorMessages = {
+  // parse errors
+  [0]: "Illegal comment.",
+  [1]: "CDATA section is allowed only in XML context.",
+  [2]: "Duplicate attribute.",
+  [3]: "End tag cannot have attributes.",
+  [4]: "Illegal '/' in tags.",
+  [5]: "Unexpected EOF in tag.",
+  [6]: "Unexpected EOF in CDATA section.",
+  [7]: "Unexpected EOF in comment.",
+  [8]: "Unexpected EOF in script.",
+  [9]: "Unexpected EOF in tag.",
+  [10]: "Incorrectly closed comment.",
+  [11]: "Incorrectly opened comment.",
+  [12]: "Illegal tag name. Use '&lt;' to print '<'.",
+  [13]: "Attribute value was expected.",
+  [14]: "End tag name was expected.",
+  [15]: "Whitespace was expected.",
+  [16]: "Unexpected '<!--' in comment.",
+  [17]: `Attribute name cannot contain U+0022 ("), U+0027 ('), and U+003C (<).`,
+  [18]: "Unquoted attribute value cannot contain U+0022 (\"), U+0027 ('), U+003C (<), U+003D (=), and U+0060 (`).",
+  [19]: "Attribute name cannot start with '='.",
+  [21]: "'<?' is allowed only in XML context.",
+  [20]: `Unexpected null character.`,
+  [22]: "Illegal '/' in tags.",
+  // Vue-specific parse errors
+  [23]: "Invalid end tag.",
+  [24]: "Element is missing end tag.",
+  [25]: "Interpolation end sign was not found.",
+  [27]: "End bracket for dynamic directive argument was not found. Note that dynamic directive argument cannot contain spaces.",
+  [26]: "Legal directive name was expected.",
+  // transform errors
+  [28]: `v-if/v-else-if is missing expression.`,
+  [29]: `v-if/else branches must use unique keys.`,
+  [30]: `v-else/v-else-if has no adjacent v-if or v-else-if.`,
+  [31]: `v-for is missing expression.`,
+  [32]: `v-for has invalid expression.`,
+  [33]: `<template v-for> key should be placed on the <template> tag.`,
+  [34]: `v-bind is missing expression.`,
+  [52]: `v-bind with same-name shorthand only allows static argument.`,
+  [35]: `v-on is missing expression.`,
+  [36]: `Unexpected custom directive on <slot> outlet.`,
+  [37]: `Mixed v-slot usage on both the component and nested <template>. When there are multiple named slots, all slots should use <template> syntax to avoid scope ambiguity.`,
+  [38]: `Duplicate slot names found. `,
+  [39]: `Extraneous children found when component already has explicitly named default slot. These children will be ignored.`,
+  [40]: `v-slot can only be used on components or <template> tags.`,
+  [41]: `v-model is missing expression.`,
+  [42]: `v-model value must be a valid JavaScript member expression.`,
+  [43]: `v-model cannot be used on v-for or v-slot scope variables because they are not writable.`,
+  [44]: `v-model cannot be used on a prop, because local prop bindings are not writable.
+Use a v-bind binding combined with a v-on listener that emits update:x event instead.`,
+  [45]: `Error parsing JavaScript expression: `,
+  [46]: `<KeepAlive> expects exactly one child component.`,
+  [51]: `@vnode-* hooks in templates are no longer supported. Use the vue: prefix instead. For example, @vnode-mounted should be changed to @vue:mounted. @vnode-* hooks support has been removed in 3.4.`,
+  // generic errors
+  [47]: `"prefixIdentifiers" option is not supported in this build of compiler.`,
+  [48]: `ES module mode is not supported in this build of compiler.`,
+  [49]: `"cacheHandlers" option is only supported when the "prefixIdentifiers" option is enabled.`,
+  [50]: `"scopeId" option is only supported in module mode.`,
+  // just to fulfill types
+  [53]: ``
+};
+
+const isStaticExp = (p) => p.type === 4 && p.isStatic;
+function isCoreComponent(tag) {
+  switch (tag) {
+    case "Teleport":
+    case "teleport":
+      return TELEPORT;
+    case "Suspense":
+    case "suspense":
+      return SUSPENSE;
+    case "KeepAlive":
+    case "keep-alive":
+      return KEEP_ALIVE;
+    case "BaseTransition":
+    case "base-transition":
+      return BASE_TRANSITION;
+  }
+}
+const nonIdentifierRE = /^\d|[^\$\w\xA0-\uFFFF]/;
+const isSimpleIdentifier = (name) => !nonIdentifierRE.test(name);
+const validFirstIdentCharRE = /[A-Za-z_$\xA0-\uFFFF]/;
+const validIdentCharRE = /[\.\?\w$\xA0-\uFFFF]/;
+const whitespaceRE = /\s+[.[]\s*|\s*[.[]\s+/g;
+const getExpSource = (exp) => exp.type === 4 ? exp.content : exp.loc.source;
+const isMemberExpressionBrowser = (exp) => {
+  const path = getExpSource(exp).trim().replace(whitespaceRE, (s) => s.trim());
+  let state = 0 /* inMemberExp */;
+  let stateStack = [];
+  let currentOpenBracketCount = 0;
+  let currentOpenParensCount = 0;
+  let currentStringType = null;
+  for (let i = 0; i < path.length; i++) {
+    const char = path.charAt(i);
+    switch (state) {
+      case 0 /* inMemberExp */:
+        if (char === "[") {
+          stateStack.push(state);
+          state = 1 /* inBrackets */;
+          currentOpenBracketCount++;
+        } else if (char === "(") {
+          stateStack.push(state);
+          state = 2 /* inParens */;
+          currentOpenParensCount++;
+        } else if (!(i === 0 ? validFirstIdentCharRE : validIdentCharRE).test(char)) {
+          return false;
+        }
+        break;
+      case 1 /* inBrackets */:
+        if (char === `'` || char === `"` || char === "`") {
+          stateStack.push(state);
+          state = 3 /* inString */;
+          currentStringType = char;
+        } else if (char === `[`) {
+          currentOpenBracketCount++;
+        } else if (char === `]`) {
+          if (!--currentOpenBracketCount) {
+            state = stateStack.pop();
+          }
+        }
+        break;
+      case 2 /* inParens */:
+        if (char === `'` || char === `"` || char === "`") {
+          stateStack.push(state);
+          state = 3 /* inString */;
+          currentStringType = char;
+        } else if (char === `(`) {
+          currentOpenParensCount++;
+        } else if (char === `)`) {
+          if (i === path.length - 1) {
+            return false;
+          }
+          if (!--currentOpenParensCount) {
+            state = stateStack.pop();
+          }
+        }
+        break;
+      case 3 /* inString */:
+        if (char === currentStringType) {
+          state = stateStack.pop();
+          currentStringType = null;
+        }
+        break;
+    }
+  }
+  return !currentOpenBracketCount && !currentOpenParensCount;
+};
+const isMemberExpression = isMemberExpressionBrowser ;
+const fnExpRE = /^\s*(async\s*)?(\([^)]*?\)|[\w$_]+)\s*(:[^=]+)?=>|^\s*(async\s+)?function(?:\s+[\w$]+)?\s*\(/;
+const isFnExpressionBrowser = (exp) => fnExpRE.test(getExpSource(exp));
+const isFnExpression = isFnExpressionBrowser ;
+function assert(condition, msg) {
+  if (!condition) {
+    throw new Error(msg || `unexpected compiler condition`);
+  }
+}
+function findDir(node, name, allowEmpty = false) {
+  for (let i = 0; i < node.props.length; i++) {
+    const p = node.props[i];
+    if (p.type === 7 && (allowEmpty || p.exp) && (isString(name) ? p.name === name : name.test(p.name))) {
+      return p;
+    }
+  }
+}
+function findProp(node, name, dynamicOnly = false, allowEmpty = false) {
+  for (let i = 0; i < node.props.length; i++) {
+    const p = node.props[i];
+    if (p.type === 6) {
+      if (dynamicOnly) continue;
+      if (p.name === name && (p.value || allowEmpty)) {
+        return p;
+      }
+    } else if (p.name === "bind" && (p.exp || allowEmpty) && isStaticArgOf(p.arg, name)) {
+      return p;
+    }
+  }
+}
+function isStaticArgOf(arg, name) {
+  return !!(arg && isStaticExp(arg) && arg.content === name);
+}
+function hasDynamicKeyVBind(node) {
+  return node.props.some(
+    (p) => p.type === 7 && p.name === "bind" && (!p.arg || // v-bind="obj"
+    p.arg.type !== 4 || // v-bind:[_ctx.foo]
+    !p.arg.isStatic)
+    // v-bind:[foo]
+  );
+}
+function isText$1(node) {
+  return node.type === 5 || node.type === 2;
+}
+function isVSlot(p) {
+  return p.type === 7 && p.name === "slot";
+}
+function isTemplateNode(node) {
+  return node.type === 1 && node.tagType === 3;
+}
+function isSlotOutlet(node) {
+  return node.type === 1 && node.tagType === 2;
+}
+const propsHelperSet = /* @__PURE__ */ new Set([NORMALIZE_PROPS, GUARD_REACTIVE_PROPS]);
+function getUnnormalizedProps(props, callPath = []) {
+  if (props && !isString(props) && props.type === 14) {
+    const callee = props.callee;
+    if (!isString(callee) && propsHelperSet.has(callee)) {
+      return getUnnormalizedProps(
+        props.arguments[0],
+        callPath.concat(props)
+      );
+    }
+  }
+  return [props, callPath];
+}
+function injectProp(node, prop, context) {
+  let propsWithInjection;
+  let props = node.type === 13 ? node.props : node.arguments[2];
+  let callPath = [];
+  let parentCall;
+  if (props && !isString(props) && props.type === 14) {
+    const ret = getUnnormalizedProps(props);
+    props = ret[0];
+    callPath = ret[1];
+    parentCall = callPath[callPath.length - 1];
+  }
+  if (props == null || isString(props)) {
+    propsWithInjection = createObjectExpression([prop]);
+  } else if (props.type === 14) {
+    const first = props.arguments[0];
+    if (!isString(first) && first.type === 15) {
+      if (!hasProp(prop, first)) {
+        first.properties.unshift(prop);
+      }
+    } else {
+      if (props.callee === TO_HANDLERS) {
+        propsWithInjection = createCallExpression(context.helper(MERGE_PROPS), [
+          createObjectExpression([prop]),
+          props
+        ]);
+      } else {
+        props.arguments.unshift(createObjectExpression([prop]));
+      }
+    }
+    !propsWithInjection && (propsWithInjection = props);
+  } else if (props.type === 15) {
+    if (!hasProp(prop, props)) {
+      props.properties.unshift(prop);
+    }
+    propsWithInjection = props;
+  } else {
+    propsWithInjection = createCallExpression(context.helper(MERGE_PROPS), [
+      createObjectExpression([prop]),
+      props
+    ]);
+    if (parentCall && parentCall.callee === GUARD_REACTIVE_PROPS) {
+      parentCall = callPath[callPath.length - 2];
+    }
+  }
+  if (node.type === 13) {
+    if (parentCall) {
+      parentCall.arguments[0] = propsWithInjection;
+    } else {
+      node.props = propsWithInjection;
+    }
+  } else {
+    if (parentCall) {
+      parentCall.arguments[0] = propsWithInjection;
+    } else {
+      node.arguments[2] = propsWithInjection;
+    }
+  }
+}
+function hasProp(prop, props) {
+  let result = false;
+  if (prop.key.type === 4) {
+    const propKeyName = prop.key.content;
+    result = props.properties.some(
+      (p) => p.key.type === 4 && p.key.content === propKeyName
+    );
+  }
+  return result;
+}
+function toValidAssetId(name, type) {
+  return `_${type}_${name.replace(/[^\w]/g, (searchValue, replaceValue) => {
+    return searchValue === "-" ? "_" : name.charCodeAt(replaceValue).toString();
+  })}`;
+}
+function getMemoedVNodeCall(node) {
+  if (node.type === 14 && node.callee === WITH_MEMO) {
+    return node.arguments[1].returns;
+  } else {
+    return node;
+  }
+}
+const forAliasRE = /([\s\S]*?)\s+(?:in|of)\s+(\S[\s\S]*)/;
+
+const defaultParserOptions = {
+  parseMode: "base",
+  ns: 0,
+  delimiters: [`{{`, `}}`],
+  getNamespace: () => 0,
+  isVoidTag: NO,
+  isPreTag: NO,
+  isIgnoreNewlineTag: NO,
+  isCustomElement: NO,
+  onError: defaultOnError,
+  onWarn: defaultOnWarn,
+  comments: true,
+  prefixIdentifiers: false
+};
+let currentOptions = defaultParserOptions;
+let currentRoot = null;
+let currentInput = "";
+let currentOpenTag = null;
+let currentProp = null;
+let currentAttrValue = "";
+let currentAttrStartIndex = -1;
+let currentAttrEndIndex = -1;
+let inPre = 0;
+let inVPre = false;
+let currentVPreBoundary = null;
+const stack = [];
+const tokenizer = new Tokenizer(stack, {
+  onerr: emitError,
+  ontext(start, end) {
+    onText(getSlice(start, end), start, end);
+  },
+  ontextentity(char, start, end) {
+    onText(char, start, end);
+  },
+  oninterpolation(start, end) {
+    if (inVPre) {
+      return onText(getSlice(start, end), start, end);
+    }
+    let innerStart = start + tokenizer.delimiterOpen.length;
+    let innerEnd = end - tokenizer.delimiterClose.length;
+    while (isWhitespace(currentInput.charCodeAt(innerStart))) {
+      innerStart++;
+    }
+    while (isWhitespace(currentInput.charCodeAt(innerEnd - 1))) {
+      innerEnd--;
+    }
+    let exp = getSlice(innerStart, innerEnd);
+    if (exp.includes("&")) {
+      {
+        exp = currentOptions.decodeEntities(exp, false);
+      }
+    }
+    addNode({
+      type: 5,
+      content: createExp(exp, false, getLoc(innerStart, innerEnd)),
+      loc: getLoc(start, end)
+    });
+  },
+  onopentagname(start, end) {
+    const name = getSlice(start, end);
+    currentOpenTag = {
+      type: 1,
+      tag: name,
+      ns: currentOptions.getNamespace(name, stack[0], currentOptions.ns),
+      tagType: 0,
+      // will be refined on tag close
+      props: [],
+      children: [],
+      loc: getLoc(start - 1, end),
+      codegenNode: void 0
+    };
+  },
+  onopentagend(end) {
+    endOpenTag(end);
+  },
+  onclosetag(start, end) {
+    const name = getSlice(start, end);
+    if (!currentOptions.isVoidTag(name)) {
+      let found = false;
+      for (let i = 0; i < stack.length; i++) {
+        const e = stack[i];
+        if (e.tag.toLowerCase() === name.toLowerCase()) {
+          found = true;
+          if (i > 0) {
+            emitError(24, stack[0].loc.start.offset);
+          }
+          for (let j = 0; j <= i; j++) {
+            const el = stack.shift();
+            onCloseTag(el, end, j < i);
+          }
+          break;
+        }
+      }
+      if (!found) {
+        emitError(23, backTrack(start, 60));
+      }
+    }
+  },
+  onselfclosingtag(end) {
+    const name = currentOpenTag.tag;
+    currentOpenTag.isSelfClosing = true;
+    endOpenTag(end);
+    if (stack[0] && stack[0].tag === name) {
+      onCloseTag(stack.shift(), end);
+    }
+  },
+  onattribname(start, end) {
+    currentProp = {
+      type: 6,
+      name: getSlice(start, end),
+      nameLoc: getLoc(start, end),
+      value: void 0,
+      loc: getLoc(start)
+    };
+  },
+  ondirname(start, end) {
+    const raw = getSlice(start, end);
+    const name = raw === "." || raw === ":" ? "bind" : raw === "@" ? "on" : raw === "#" ? "slot" : raw.slice(2);
+    if (!inVPre && name === "") {
+      emitError(26, start);
+    }
+    if (inVPre || name === "") {
+      currentProp = {
+        type: 6,
+        name: raw,
+        nameLoc: getLoc(start, end),
+        value: void 0,
+        loc: getLoc(start)
+      };
+    } else {
+      currentProp = {
+        type: 7,
+        name,
+        rawName: raw,
+        exp: void 0,
+        arg: void 0,
+        modifiers: raw === "." ? [createSimpleExpression("prop")] : [],
+        loc: getLoc(start)
+      };
+      if (name === "pre") {
+        inVPre = tokenizer.inVPre = true;
+        currentVPreBoundary = currentOpenTag;
+        const props = currentOpenTag.props;
+        for (let i = 0; i < props.length; i++) {
+          if (props[i].type === 7) {
+            props[i] = dirToAttr(props[i]);
+          }
+        }
+      }
+    }
+  },
+  ondirarg(start, end) {
+    if (start === end) return;
+    const arg = getSlice(start, end);
+    if (inVPre) {
+      currentProp.name += arg;
+      setLocEnd(currentProp.nameLoc, end);
+    } else {
+      const isStatic = arg[0] !== `[`;
+      currentProp.arg = createExp(
+        isStatic ? arg : arg.slice(1, -1),
+        isStatic,
+        getLoc(start, end),
+        isStatic ? 3 : 0
+      );
+    }
+  },
+  ondirmodifier(start, end) {
+    const mod = getSlice(start, end);
+    if (inVPre) {
+      currentProp.name += "." + mod;
+      setLocEnd(currentProp.nameLoc, end);
+    } else if (currentProp.name === "slot") {
+      const arg = currentProp.arg;
+      if (arg) {
+        arg.content += "." + mod;
+        setLocEnd(arg.loc, end);
+      }
+    } else {
+      const exp = createSimpleExpression(mod, true, getLoc(start, end));
+      currentProp.modifiers.push(exp);
+    }
+  },
+  onattribdata(start, end) {
+    currentAttrValue += getSlice(start, end);
+    if (currentAttrStartIndex < 0) currentAttrStartIndex = start;
+    currentAttrEndIndex = end;
+  },
+  onattribentity(char, start, end) {
+    currentAttrValue += char;
+    if (currentAttrStartIndex < 0) currentAttrStartIndex = start;
+    currentAttrEndIndex = end;
+  },
+  onattribnameend(end) {
+    const start = currentProp.loc.start.offset;
+    const name = getSlice(start, end);
+    if (currentProp.type === 7) {
+      currentProp.rawName = name;
+    }
+    if (currentOpenTag.props.some(
+      (p) => (p.type === 7 ? p.rawName : p.name) === name
+    )) {
+      emitError(2, start);
+    }
+  },
+  onattribend(quote, end) {
+    if (currentOpenTag && currentProp) {
+      setLocEnd(currentProp.loc, end);
+      if (quote !== 0) {
+        if (currentAttrValue.includes("&")) {
+          currentAttrValue = currentOptions.decodeEntities(
+            currentAttrValue,
+            true
+          );
+        }
+        if (currentProp.type === 6) {
+          if (currentProp.name === "class") {
+            currentAttrValue = condense(currentAttrValue).trim();
+          }
+          if (quote === 1 && !currentAttrValue) {
+            emitError(13, end);
+          }
+          currentProp.value = {
+            type: 2,
+            content: currentAttrValue,
+            loc: quote === 1 ? getLoc(currentAttrStartIndex, currentAttrEndIndex) : getLoc(currentAttrStartIndex - 1, currentAttrEndIndex + 1)
+          };
+          if (tokenizer.inSFCRoot && currentOpenTag.tag === "template" && currentProp.name === "lang" && currentAttrValue && currentAttrValue !== "html") {
+            tokenizer.enterRCDATA(toCharCodes(`</template`), 0);
+          }
+        } else {
+          let expParseMode = 0 /* Normal */;
+          currentProp.exp = createExp(
+            currentAttrValue,
+            false,
+            getLoc(currentAttrStartIndex, currentAttrEndIndex),
+            0,
+            expParseMode
+          );
+          if (currentProp.name === "for") {
+            currentProp.forParseResult = parseForExpression(currentProp.exp);
+          }
+        }
+      }
+      if (currentProp.type !== 7 || currentProp.name !== "pre") {
+        currentOpenTag.props.push(currentProp);
+      }
+    }
+    currentAttrValue = "";
+    currentAttrStartIndex = currentAttrEndIndex = -1;
+  },
+  oncomment(start, end) {
+    if (currentOptions.comments) {
+      addNode({
+        type: 3,
+        content: getSlice(start, end),
+        loc: getLoc(start - 4, end + 3)
+      });
+    }
+  },
+  onend() {
+    const end = currentInput.length;
+    if (tokenizer.state !== 1) {
+      switch (tokenizer.state) {
+        case 5:
+        case 8:
+          emitError(5, end);
+          break;
+        case 3:
+        case 4:
+          emitError(
+            25,
+            tokenizer.sectionStart
+          );
+          break;
+        case 28:
+          if (tokenizer.currentSequence === Sequences.CdataEnd) {
+            emitError(6, end);
+          } else {
+            emitError(7, end);
+          }
+          break;
+        case 6:
+        case 7:
+        case 9:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+        case 16:
+        case 17:
+        case 18:
+        case 19:
+        // "
+        case 20:
+        // '
+        case 21:
+          emitError(9, end);
+          break;
+      }
+    }
+    for (let index = 0; index < stack.length; index++) {
+      onCloseTag(stack[index], end - 1);
+      emitError(24, stack[index].loc.start.offset);
+    }
+  },
+  oncdata(start, end) {
+    if (stack[0].ns !== 0) {
+      onText(getSlice(start, end), start, end);
+    } else {
+      emitError(1, start - 9);
+    }
+  },
+  onprocessinginstruction(start) {
+    if ((stack[0] ? stack[0].ns : currentOptions.ns) === 0) {
+      emitError(
+        21,
+        start - 1
+      );
+    }
+  }
+});
+const forIteratorRE = /,([^,\}\]]*)(?:,([^,\}\]]*))?$/;
+const stripParensRE = /^\(|\)$/g;
+function parseForExpression(input) {
+  const loc = input.loc;
+  const exp = input.content;
+  const inMatch = exp.match(forAliasRE);
+  if (!inMatch) return;
+  const [, LHS, RHS] = inMatch;
+  const createAliasExpression = (content, offset, asParam = false) => {
+    const start = loc.start.offset + offset;
+    const end = start + content.length;
+    return createExp(
+      content,
+      false,
+      getLoc(start, end),
+      0,
+      asParam ? 1 /* Params */ : 0 /* Normal */
+    );
+  };
+  const result = {
+    source: createAliasExpression(RHS.trim(), exp.indexOf(RHS, LHS.length)),
+    value: void 0,
+    key: void 0,
+    index: void 0,
+    finalized: false
+  };
+  let valueContent = LHS.trim().replace(stripParensRE, "").trim();
+  const trimmedOffset = LHS.indexOf(valueContent);
+  const iteratorMatch = valueContent.match(forIteratorRE);
+  if (iteratorMatch) {
+    valueContent = valueContent.replace(forIteratorRE, "").trim();
+    const keyContent = iteratorMatch[1].trim();
+    let keyOffset;
+    if (keyContent) {
+      keyOffset = exp.indexOf(keyContent, trimmedOffset + valueContent.length);
+      result.key = createAliasExpression(keyContent, keyOffset, true);
+    }
+    if (iteratorMatch[2]) {
+      const indexContent = iteratorMatch[2].trim();
+      if (indexContent) {
+        result.index = createAliasExpression(
+          indexContent,
+          exp.indexOf(
+            indexContent,
+            result.key ? keyOffset + keyContent.length : trimmedOffset + valueContent.length
+          ),
+          true
+        );
+      }
+    }
+  }
+  if (valueContent) {
+    result.value = createAliasExpression(valueContent, trimmedOffset, true);
+  }
+  return result;
+}
+function getSlice(start, end) {
+  return currentInput.slice(start, end);
+}
+function endOpenTag(end) {
+  if (tokenizer.inSFCRoot) {
+    currentOpenTag.innerLoc = getLoc(end + 1, end + 1);
+  }
+  addNode(currentOpenTag);
+  const { tag, ns } = currentOpenTag;
+  if (ns === 0 && currentOptions.isPreTag(tag)) {
+    inPre++;
+  }
+  if (currentOptions.isVoidTag(tag)) {
+    onCloseTag(currentOpenTag, end);
+  } else {
+    stack.unshift(currentOpenTag);
+    if (ns === 1 || ns === 2) {
+      tokenizer.inXML = true;
+    }
+  }
+  currentOpenTag = null;
+}
+function onText(content, start, end) {
+  {
+    const tag = stack[0] && stack[0].tag;
+    if (tag !== "script" && tag !== "style" && content.includes("&")) {
+      content = currentOptions.decodeEntities(content, false);
+    }
+  }
+  const parent = stack[0] || currentRoot;
+  const lastNode = parent.children[parent.children.length - 1];
+  if (lastNode && lastNode.type === 2) {
+    lastNode.content += content;
+    setLocEnd(lastNode.loc, end);
+  } else {
+    parent.children.push({
+      type: 2,
+      content,
+      loc: getLoc(start, end)
+    });
+  }
+}
+function onCloseTag(el, end, isImplied = false) {
+  if (isImplied) {
+    setLocEnd(el.loc, backTrack(end, 60));
+  } else {
+    setLocEnd(el.loc, lookAhead(end, 62) + 1);
+  }
+  if (tokenizer.inSFCRoot) {
+    if (el.children.length) {
+      el.innerLoc.end = extend({}, el.children[el.children.length - 1].loc.end);
+    } else {
+      el.innerLoc.end = extend({}, el.innerLoc.start);
+    }
+    el.innerLoc.source = getSlice(
+      el.innerLoc.start.offset,
+      el.innerLoc.end.offset
+    );
+  }
+  const { tag, ns, children } = el;
+  if (!inVPre) {
+    if (tag === "slot") {
+      el.tagType = 2;
+    } else if (isFragmentTemplate(el)) {
+      el.tagType = 3;
+    } else if (isComponent(el)) {
+      el.tagType = 1;
+    }
+  }
+  if (!tokenizer.inRCDATA) {
+    el.children = condenseWhitespace(children);
+  }
+  if (ns === 0 && currentOptions.isIgnoreNewlineTag(tag)) {
+    const first = children[0];
+    if (first && first.type === 2) {
+      first.content = first.content.replace(/^\r?\n/, "");
+    }
+  }
+  if (ns === 0 && currentOptions.isPreTag(tag)) {
+    inPre--;
+  }
+  if (currentVPreBoundary === el) {
+    inVPre = tokenizer.inVPre = false;
+    currentVPreBoundary = null;
+  }
+  if (tokenizer.inXML && (stack[0] ? stack[0].ns : currentOptions.ns) === 0) {
+    tokenizer.inXML = false;
+  }
+}
+function lookAhead(index, c) {
+  let i = index;
+  while (currentInput.charCodeAt(i) !== c && i < currentInput.length - 1) i++;
+  return i;
+}
+function backTrack(index, c) {
+  let i = index;
+  while (currentInput.charCodeAt(i) !== c && i >= 0) i--;
+  return i;
+}
+const specialTemplateDir = /* @__PURE__ */ new Set(["if", "else", "else-if", "for", "slot"]);
+function isFragmentTemplate({ tag, props }) {
+  if (tag === "template") {
+    for (let i = 0; i < props.length; i++) {
+      if (props[i].type === 7 && specialTemplateDir.has(props[i].name)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+function isComponent({ tag, props }) {
+  if (currentOptions.isCustomElement(tag)) {
+    return false;
+  }
+  if (tag === "component" || isUpperCase(tag.charCodeAt(0)) || isCoreComponent(tag) || currentOptions.isBuiltInComponent && currentOptions.isBuiltInComponent(tag) || currentOptions.isNativeTag && !currentOptions.isNativeTag(tag)) {
+    return true;
+  }
+  for (let i = 0; i < props.length; i++) {
+    const p = props[i];
+    if (p.type === 6) {
+      if (p.name === "is" && p.value) {
+        if (p.value.content.startsWith("vue:")) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+function isUpperCase(c) {
+  return c > 64 && c < 91;
+}
+const windowsNewlineRE = /\r\n/g;
+function condenseWhitespace(nodes, tag) {
+  const shouldCondense = currentOptions.whitespace !== "preserve";
+  let removedWhitespace = false;
+  for (let i = 0; i < nodes.length; i++) {
+    const node = nodes[i];
+    if (node.type === 2) {
+      if (!inPre) {
+        if (isAllWhitespace(node.content)) {
+          const prev = nodes[i - 1] && nodes[i - 1].type;
+          const next = nodes[i + 1] && nodes[i + 1].type;
+          if (!prev || !next || shouldCondense && (prev === 3 && (next === 3 || next === 1) || prev === 1 && (next === 3 || next === 1 && hasNewlineChar(node.content)))) {
+            removedWhitespace = true;
+            nodes[i] = null;
+          } else {
+            node.content = " ";
+          }
+        } else if (shouldCondense) {
+          node.content = condense(node.content);
+        }
+      } else {
+        node.content = node.content.replace(windowsNewlineRE, "\n");
+      }
+    }
+  }
+  return removedWhitespace ? nodes.filter(Boolean) : nodes;
+}
+function isAllWhitespace(str) {
+  for (let i = 0; i < str.length; i++) {
+    if (!isWhitespace(str.charCodeAt(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+function hasNewlineChar(str) {
+  for (let i = 0; i < str.length; i++) {
+    const c = str.charCodeAt(i);
+    if (c === 10 || c === 13) {
+      return true;
+    }
+  }
+  return false;
+}
+function condense(str) {
+  let ret = "";
+  let prevCharIsWhitespace = false;
+  for (let i = 0; i < str.length; i++) {
+    if (isWhitespace(str.charCodeAt(i))) {
+      if (!prevCharIsWhitespace) {
+        ret += " ";
+        prevCharIsWhitespace = true;
+      }
+    } else {
+      ret += str[i];
+      prevCharIsWhitespace = false;
+    }
+  }
+  return ret;
+}
+function addNode(node) {
+  (stack[0] || currentRoot).children.push(node);
+}
+function getLoc(start, end) {
+  return {
+    start: tokenizer.getPos(start),
+    // @ts-expect-error allow late attachment
+    end: end == null ? end : tokenizer.getPos(end),
+    // @ts-expect-error allow late attachment
+    source: end == null ? end : getSlice(start, end)
+  };
+}
+function cloneLoc(loc) {
+  return getLoc(loc.start.offset, loc.end.offset);
+}
+function setLocEnd(loc, end) {
+  loc.end = tokenizer.getPos(end);
+  loc.source = getSlice(loc.start.offset, end);
+}
+function dirToAttr(dir) {
+  const attr = {
+    type: 6,
+    name: dir.rawName,
+    nameLoc: getLoc(
+      dir.loc.start.offset,
+      dir.loc.start.offset + dir.rawName.length
+    ),
+    value: void 0,
+    loc: dir.loc
+  };
+  if (dir.exp) {
+    const loc = dir.exp.loc;
+    if (loc.end.offset < dir.loc.end.offset) {
+      loc.start.offset--;
+      loc.start.column--;
+      loc.end.offset++;
+      loc.end.column++;
+    }
+    attr.value = {
+      type: 2,
+      content: dir.exp.content,
+      loc
+    };
+  }
+  return attr;
+}
+function createExp(content, isStatic = false, loc, constType = 0, parseMode = 0 /* Normal */) {
+  const exp = createSimpleExpression(content, isStatic, loc, constType);
+  return exp;
+}
+function emitError(code, index, message) {
+  currentOptions.onError(
+    createCompilerError(code, getLoc(index, index), void 0, message)
+  );
+}
+function reset() {
+  tokenizer.reset();
+  currentOpenTag = null;
+  currentProp = null;
+  currentAttrValue = "";
+  currentAttrStartIndex = -1;
+  currentAttrEndIndex = -1;
+  stack.length = 0;
+}
+function baseParse(input, options) {
+  reset();
+  currentInput = input;
+  currentOptions = extend({}, defaultParserOptions);
+  if (options) {
+    let key;
+    for (key in options) {
+      if (options[key] != null) {
+        currentOptions[key] = options[key];
+      }
+    }
+  }
+  {
+    if (!currentOptions.decodeEntities) {
+      throw new Error(
+        `[@vue/compiler-core] decodeEntities option is required in browser builds.`
+      );
+    }
+  }
+  tokenizer.mode = currentOptions.parseMode === "html" ? 1 : currentOptions.parseMode === "sfc" ? 2 : 0;
+  tokenizer.inXML = currentOptions.ns === 1 || currentOptions.ns === 2;
+  const delimiters = options && options.delimiters;
+  if (delimiters) {
+    tokenizer.delimiterOpen = toCharCodes(delimiters[0]);
+    tokenizer.delimiterClose = toCharCodes(delimiters[1]);
+  }
+  const root = currentRoot = createRoot([], input);
+  tokenizer.parse(currentInput);
+  root.loc = getLoc(0, input.length);
+  root.children = condenseWhitespace(root.children);
+  currentRoot = null;
+  return root;
+}
+
+function cacheStatic(root, context) {
+  walk(
+    root,
+    void 0,
+    context,
+    // Root node is unfortunately non-hoistable due to potential parent
+    // fallthrough attributes.
+    isSingleElementRoot(root, root.children[0])
+  );
+}
+function isSingleElementRoot(root, child) {
+  const { children } = root;
+  return children.length === 1 && child.type === 1 && !isSlotOutlet(child);
+}
+function walk(node, parent, context, doNotHoistNode = false, inFor = false) {
+  const { children } = node;
+  const toCache = [];
+  for (let i = 0; i < children.length; i++) {
+    const child = children[i];
+    if (child.type === 1 && child.tagType === 0) {
+      const constantType = doNotHoistNode ? 0 : getConstantType(child, context);
+      if (constantType > 0) {
+        if (constantType >= 2) {
+          child.codegenNode.patchFlag = -1;
+          toCache.push(child);
+          continue;
+        }
+      } else {
+        const codegenNode = child.codegenNode;
+        if (codegenNode.type === 13) {
+          const flag = codegenNode.patchFlag;
+          if ((flag === void 0 || flag === 512 || flag === 1) && getGeneratedPropsConstantType(child, context) >= 2) {
+            const props = getNodeProps(child);
+            if (props) {
+              codegenNode.props = context.hoist(props);
+            }
+          }
+          if (codegenNode.dynamicProps) {
+            codegenNode.dynamicProps = context.hoist(codegenNode.dynamicProps);
+          }
+        }
+      }
+    } else if (child.type === 12) {
+      const constantType = doNotHoistNode ? 0 : getConstantType(child, context);
+      if (constantType >= 2) {
+        toCache.push(child);
+        continue;
+      }
+    }
+    if (child.type === 1) {
+      const isComponent = child.tagType === 1;
+      if (isComponent) {
+        context.scopes.vSlot++;
+      }
+      walk(child, node, context, false, inFor);
+      if (isComponent) {
+        context.scopes.vSlot--;
+      }
+    } else if (child.type === 11) {
+      walk(child, node, context, child.children.length === 1, true);
+    } else if (child.type === 9) {
+      for (let i2 = 0; i2 < child.branches.length; i2++) {
+        walk(
+          child.branches[i2],
+          node,
+          context,
+          child.branches[i2].children.length === 1,
+          inFor
+        );
+      }
+    }
+  }
+  let cachedAsArray = false;
+  if (toCache.length === children.length && node.type === 1) {
+    if (node.tagType === 0 && node.codegenNode && node.codegenNode.type === 13 && isArray(node.codegenNode.children)) {
+      node.codegenNode.children = getCacheExpression(
+        createArrayExpression(node.codegenNode.children)
+      );
+      cachedAsArray = true;
+    } else if (node.tagType === 1 && node.codegenNode && node.codegenNode.type === 13 && node.codegenNode.children && !isArray(node.codegenNode.children) && node.codegenNode.children.type === 15) {
+      const slot = getSlotNode(node.codegenNode, "default");
+      if (slot) {
+        slot.returns = getCacheExpression(
+          createArrayExpression(slot.returns)
+        );
+        cachedAsArray = true;
+      }
+    } else if (node.tagType === 3 && parent && parent.type === 1 && parent.tagType === 1 && parent.codegenNode && parent.codegenNode.type === 13 && parent.codegenNode.children && !isArray(parent.codegenNode.children) && parent.codegenNode.children.type === 15) {
+      const slotName = findDir(node, "slot", true);
+      const slot = slotName && slotName.arg && getSlotNode(parent.codegenNode, slotName.arg);
+      if (slot) {
+        slot.returns = getCacheExpression(
+          createArrayExpression(slot.returns)
+        );
+        cachedAsArray = true;
+      }
+    }
+  }
+  if (!cachedAsArray) {
+    for (const child of toCache) {
+      child.codegenNode = context.cache(child.codegenNode);
+    }
+  }
+  function getCacheExpression(value) {
+    const exp = context.cache(value);
+    if (inFor && context.hmr) {
+      exp.needArraySpread = true;
+    }
+    return exp;
+  }
+  function getSlotNode(node2, name) {
+    if (node2.children && !isArray(node2.children) && node2.children.type === 15) {
+      const slot = node2.children.properties.find(
+        (p) => p.key === name || p.key.content === name
+      );
+      return slot && slot.value;
+    }
+  }
+  if (toCache.length && context.transformHoist) {
+    context.transformHoist(children, context, node);
+  }
+}
+function getConstantType(node, context) {
+  const { constantCache } = context;
+  switch (node.type) {
+    case 1:
+      if (node.tagType !== 0) {
+        return 0;
+      }
+      const cached = constantCache.get(node);
+      if (cached !== void 0) {
+        return cached;
+      }
+      const codegenNode = node.codegenNode;
+      if (codegenNode.type !== 13) {
+        return 0;
+      }
+      if (codegenNode.isBlock && node.tag !== "svg" && node.tag !== "foreignObject" && node.tag !== "math") {
+        return 0;
+      }
+      if (codegenNode.patchFlag === void 0) {
+        let returnType2 = 3;
+        const generatedPropsType = getGeneratedPropsConstantType(node, context);
+        if (generatedPropsType === 0) {
+          constantCache.set(node, 0);
+          return 0;
+        }
+        if (generatedPropsType < returnType2) {
+          returnType2 = generatedPropsType;
+        }
+        for (let i = 0; i < node.children.length; i++) {
+          const childType = getConstantType(node.children[i], context);
+          if (childType === 0) {
+            constantCache.set(node, 0);
+            return 0;
+          }
+          if (childType < returnType2) {
+            returnType2 = childType;
+          }
+        }
+        if (returnType2 > 1) {
+          for (let i = 0; i < node.props.length; i++) {
+            const p = node.props[i];
+            if (p.type === 7 && p.name === "bind" && p.exp) {
+              const expType = getConstantType(p.exp, context);
+              if (expType === 0) {
+                constantCache.set(node, 0);
+                return 0;
+              }
+              if (expType < returnType2) {
+                returnType2 = expType;
+              }
+            }
+          }
+        }
+        if (codegenNode.isBlock) {
+          for (let i = 0; i < node.props.length; i++) {
+            const p = node.props[i];
+            if (p.type === 7) {
+              constantCache.set(node, 0);
+              return 0;
+            }
+          }
+          context.removeHelper(OPEN_BLOCK);
+          context.removeHelper(
+            getVNodeBlockHelper(context.inSSR, codegenNode.isComponent)
+          );
+          codegenNode.isBlock = false;
+          context.helper(getVNodeHelper(context.inSSR, codegenNode.isComponent));
+        }
+        constantCache.set(node, returnType2);
+        return returnType2;
+      } else {
+        constantCache.set(node, 0);
+        return 0;
+      }
+    case 2:
+    case 3:
+      return 3;
+    case 9:
+    case 11:
+    case 10:
+      return 0;
+    case 5:
+    case 12:
+      return getConstantType(node.content, context);
+    case 4:
+      return node.constType;
+    case 8:
+      let returnType = 3;
+      for (let i = 0; i < node.children.length; i++) {
+        const child = node.children[i];
+        if (isString(child) || isSymbol(child)) {
+          continue;
+        }
+        const childType = getConstantType(child, context);
+        if (childType === 0) {
+          return 0;
+        } else if (childType < returnType) {
+          returnType = childType;
+        }
+      }
+      return returnType;
+    case 20:
+      return 2;
+    default:
+      return 0;
+  }
+}
+const allowHoistedHelperSet = /* @__PURE__ */ new Set([
+  NORMALIZE_CLASS,
+  NORMALIZE_STYLE,
+  NORMALIZE_PROPS,
+  GUARD_REACTIVE_PROPS
+]);
+function getConstantTypeOfHelperCall(value, context) {
+  if (value.type === 14 && !isString(value.callee) && allowHoistedHelperSet.has(value.callee)) {
+    const arg = value.arguments[0];
+    if (arg.type === 4) {
+      return getConstantType(arg, context);
+    } else if (arg.type === 14) {
+      return getConstantTypeOfHelperCall(arg, context);
+    }
+  }
+  return 0;
+}
+function getGeneratedPropsConstantType(node, context) {
+  let returnType = 3;
+  const props = getNodeProps(node);
+  if (props && props.type === 15) {
+    const { properties } = props;
+    for (let i = 0; i < properties.length; i++) {
+      const { key, value } = properties[i];
+      const keyType = getConstantType(key, context);
+      if (keyType === 0) {
+        return keyType;
+      }
+      if (keyType < returnType) {
+        returnType = keyType;
+      }
+      let valueType;
+      if (value.type === 4) {
+        valueType = getConstantType(value, context);
+      } else if (value.type === 14) {
+        valueType = getConstantTypeOfHelperCall(value, context);
+      } else {
+        valueType = 0;
+      }
+      if (valueType === 0) {
+        return valueType;
+      }
+      if (valueType < returnType) {
+        returnType = valueType;
+      }
+    }
+  }
+  return returnType;
+}
+function getNodeProps(node) {
+  const codegenNode = node.codegenNode;
+  if (codegenNode.type === 13) {
+    return codegenNode.props;
+  }
+}
+
+function createTransformContext(root, {
+  filename = "",
+  prefixIdentifiers = false,
+  hoistStatic = false,
+  hmr = false,
+  cacheHandlers = false,
+  nodeTransforms = [],
+  directiveTransforms = {},
+  transformHoist = null,
+  isBuiltInComponent = NOOP,
+  isCustomElement = NOOP,
+  expressionPlugins = [],
+  scopeId = null,
+  slotted = true,
+  ssr = false,
+  inSSR = false,
+  ssrCssVars = ``,
+  bindingMetadata = EMPTY_OBJ,
+  inline = false,
+  isTS = false,
+  onError = defaultOnError,
+  onWarn = defaultOnWarn,
+  compatConfig
+}) {
+  const nameMatch = filename.replace(/\?.*$/, "").match(/([^/\\]+)\.\w+$/);
+  const context = {
+    // options
+    filename,
+    selfName: nameMatch && capitalize(camelize(nameMatch[1])),
+    prefixIdentifiers,
+    hoistStatic,
+    hmr,
+    cacheHandlers,
+    nodeTransforms,
+    directiveTransforms,
+    transformHoist,
+    isBuiltInComponent,
+    isCustomElement,
+    expressionPlugins,
+    scopeId,
+    slotted,
+    ssr,
+    inSSR,
+    ssrCssVars,
+    bindingMetadata,
+    inline,
+    isTS,
+    onError,
+    onWarn,
+    compatConfig,
+    // state
+    root,
+    helpers: /* @__PURE__ */ new Map(),
+    components: /* @__PURE__ */ new Set(),
+    directives: /* @__PURE__ */ new Set(),
+    hoists: [],
+    imports: [],
+    cached: [],
+    constantCache: /* @__PURE__ */ new WeakMap(),
+    temps: 0,
+    identifiers: /* @__PURE__ */ Object.create(null),
+    scopes: {
+      vFor: 0,
+      vSlot: 0,
+      vPre: 0,
+      vOnce: 0
+    },
+    parent: null,
+    grandParent: null,
+    currentNode: root,
+    childIndex: 0,
+    inVOnce: false,
+    // methods
+    helper(name) {
+      const count = context.helpers.get(name) || 0;
+      context.helpers.set(name, count + 1);
+      return name;
+    },
+    removeHelper(name) {
+      const count = context.helpers.get(name);
+      if (count) {
+        const currentCount = count - 1;
+        if (!currentCount) {
+          context.helpers.delete(name);
+        } else {
+          context.helpers.set(name, currentCount);
+        }
+      }
+    },
+    helperString(name) {
+      return `_${helperNameMap[context.helper(name)]}`;
+    },
+    replaceNode(node) {
+      {
+        if (!context.currentNode) {
+          throw new Error(`Node being replaced is already removed.`);
+        }
+        if (!context.parent) {
+          throw new Error(`Cannot replace root node.`);
+        }
+      }
+      context.parent.children[context.childIndex] = context.currentNode = node;
+    },
+    removeNode(node) {
+      if (!context.parent) {
+        throw new Error(`Cannot remove root node.`);
+      }
+      const list = context.parent.children;
+      const removalIndex = node ? list.indexOf(node) : context.currentNode ? context.childIndex : -1;
+      if (removalIndex < 0) {
+        throw new Error(`node being removed is not a child of current parent`);
+      }
+      if (!node || node === context.currentNode) {
+        context.currentNode = null;
+        context.onNodeRemoved();
+      } else {
+        if (context.childIndex > removalIndex) {
+          context.childIndex--;
+          context.onNodeRemoved();
+        }
+      }
+      context.parent.children.splice(removalIndex, 1);
+    },
+    onNodeRemoved: NOOP,
+    addIdentifiers(exp) {
+    },
+    removeIdentifiers(exp) {
+    },
+    hoist(exp) {
+      if (isString(exp)) exp = createSimpleExpression(exp);
+      context.hoists.push(exp);
+      const identifier = createSimpleExpression(
+        `_hoisted_${context.hoists.length}`,
+        false,
+        exp.loc,
+        2
+      );
+      identifier.hoisted = exp;
+      return identifier;
+    },
+    cache(exp, isVNode = false) {
+      const cacheExp = createCacheExpression(
+        context.cached.length,
+        exp,
+        isVNode
+      );
+      context.cached.push(cacheExp);
+      return cacheExp;
+    }
+  };
+  return context;
+}
+function transform(root, options) {
+  const context = createTransformContext(root, options);
+  traverseNode(root, context);
+  if (options.hoistStatic) {
+    cacheStatic(root, context);
+  }
+  if (!options.ssr) {
+    createRootCodegen(root, context);
+  }
+  root.helpers = /* @__PURE__ */ new Set([...context.helpers.keys()]);
+  root.components = [...context.components];
+  root.directives = [...context.directives];
+  root.imports = context.imports;
+  root.hoists = context.hoists;
+  root.temps = context.temps;
+  root.cached = context.cached;
+  root.transformed = true;
+}
+function createRootCodegen(root, context) {
+  const { helper } = context;
+  const { children } = root;
+  if (children.length === 1) {
+    const child = children[0];
+    if (isSingleElementRoot(root, child) && child.codegenNode) {
+      const codegenNode = child.codegenNode;
+      if (codegenNode.type === 13) {
+        convertToBlock(codegenNode, context);
+      }
+      root.codegenNode = codegenNode;
+    } else {
+      root.codegenNode = child;
+    }
+  } else if (children.length > 1) {
+    let patchFlag = 64;
+    if (children.filter((c) => c.type !== 3).length === 1) {
+      patchFlag |= 2048;
+    }
+    root.codegenNode = createVNodeCall(
+      context,
+      helper(FRAGMENT),
+      void 0,
+      root.children,
+      patchFlag,
+      void 0,
+      void 0,
+      true,
+      void 0,
+      false
+    );
+  } else ;
+}
+function traverseChildren(parent, context) {
+  let i = 0;
+  const nodeRemoved = () => {
+    i--;
+  };
+  for (; i < parent.children.length; i++) {
+    const child = parent.children[i];
+    if (isString(child)) continue;
+    context.grandParent = context.parent;
+    context.parent = parent;
+    context.childIndex = i;
+    context.onNodeRemoved = nodeRemoved;
+    traverseNode(child, context);
+  }
+}
+function traverseNode(node, context) {
+  context.currentNode = node;
+  const { nodeTransforms } = context;
+  const exitFns = [];
+  for (let i2 = 0; i2 < nodeTransforms.length; i2++) {
+    const onExit = nodeTransforms[i2](node, context);
+    if (onExit) {
+      if (isArray(onExit)) {
+        exitFns.push(...onExit);
+      } else {
+        exitFns.push(onExit);
+      }
+    }
+    if (!context.currentNode) {
+      return;
+    } else {
+      node = context.currentNode;
+    }
+  }
+  switch (node.type) {
+    case 3:
+      if (!context.ssr) {
+        context.helper(CREATE_COMMENT);
+      }
+      break;
+    case 5:
+      if (!context.ssr) {
+        context.helper(TO_DISPLAY_STRING);
+      }
+      break;
+    // for container types, further traverse downwards
+    case 9:
+      for (let i2 = 0; i2 < node.branches.length; i2++) {
+        traverseNode(node.branches[i2], context);
+      }
+      break;
+    case 10:
+    case 11:
+    case 1:
+    case 0:
+      traverseChildren(node, context);
+      break;
+  }
+  context.currentNode = node;
+  let i = exitFns.length;
+  while (i--) {
+    exitFns[i]();
+  }
+}
+function createStructuralDirectiveTransform(name, fn) {
+  const matches = isString(name) ? (n) => n === name : (n) => name.test(n);
+  return (node, context) => {
+    if (node.type === 1) {
+      const { props } = node;
+      if (node.tagType === 3 && props.some(isVSlot)) {
+        return;
+      }
+      const exitFns = [];
+      for (let i = 0; i < props.length; i++) {
+        const prop = props[i];
+        if (prop.type === 7 && matches(prop.name)) {
+          props.splice(i, 1);
+          i--;
+          const onExit = fn(node, prop, context);
+          if (onExit) exitFns.push(onExit);
+        }
+      }
+      return exitFns;
+    }
+  };
+}
+
+const PURE_ANNOTATION = `/*@__PURE__*/`;
+const aliasHelper = (s) => `${helperNameMap[s]}: _${helperNameMap[s]}`;
+function createCodegenContext(ast, {
+  mode = "function",
+  prefixIdentifiers = mode === "module",
+  sourceMap = false,
+  filename = `template.vue.html`,
+  scopeId = null,
+  optimizeImports = false,
+  runtimeGlobalName = `Vue`,
+  runtimeModuleName = `vue`,
+  ssrRuntimeModuleName = "vue/server-renderer",
+  ssr = false,
+  isTS = false,
+  inSSR = false
+}) {
+  const context = {
+    mode,
+    prefixIdentifiers,
+    sourceMap,
+    filename,
+    scopeId,
+    optimizeImports,
+    runtimeGlobalName,
+    runtimeModuleName,
+    ssrRuntimeModuleName,
+    ssr,
+    isTS,
+    inSSR,
+    source: ast.source,
+    code: ``,
+    column: 1,
+    line: 1,
+    offset: 0,
+    indentLevel: 0,
+    pure: false,
+    map: void 0,
+    helper(key) {
+      return `_${helperNameMap[key]}`;
+    },
+    push(code, newlineIndex = -2 /* None */, node) {
+      context.code += code;
+    },
+    indent() {
+      newline(++context.indentLevel);
+    },
+    deindent(withoutNewLine = false) {
+      if (withoutNewLine) {
+        --context.indentLevel;
+      } else {
+        newline(--context.indentLevel);
+      }
+    },
+    newline() {
+      newline(context.indentLevel);
+    }
+  };
+  function newline(n) {
+    context.push("\n" + `  `.repeat(n), 0 /* Start */);
+  }
+  return context;
+}
+function generate(ast, options = {}) {
+  const context = createCodegenContext(ast, options);
+  if (options.onContextCreated) options.onContextCreated(context);
+  const {
+    mode,
+    push,
+    prefixIdentifiers,
+    indent,
+    deindent,
+    newline,
+    scopeId,
+    ssr
+  } = context;
+  const helpers = Array.from(ast.helpers);
+  const hasHelpers = helpers.length > 0;
+  const useWithBlock = !prefixIdentifiers && mode !== "module";
+  const preambleContext = context;
+  {
+    genFunctionPreamble(ast, preambleContext);
+  }
+  const functionName = ssr ? `ssrRender` : `render`;
+  const args = ssr ? ["_ctx", "_push", "_parent", "_attrs"] : ["_ctx", "_cache"];
+  const signature = args.join(", ");
+  {
+    push(`function ${functionName}(${signature}) {`);
+  }
+  indent();
+  if (useWithBlock) {
+    push(`with (_ctx) {`);
+    indent();
+    if (hasHelpers) {
+      push(
+        `const { ${helpers.map(aliasHelper).join(", ")} } = _Vue
+`,
+        -1 /* End */
+      );
+      newline();
+    }
+  }
+  if (ast.components.length) {
+    genAssets(ast.components, "component", context);
+    if (ast.directives.length || ast.temps > 0) {
+      newline();
+    }
+  }
+  if (ast.directives.length) {
+    genAssets(ast.directives, "directive", context);
+    if (ast.temps > 0) {
+      newline();
+    }
+  }
+  if (ast.temps > 0) {
+    push(`let `);
+    for (let i = 0; i < ast.temps; i++) {
+      push(`${i > 0 ? `, ` : ``}_temp${i}`);
+    }
+  }
+  if (ast.components.length || ast.directives.length || ast.temps) {
+    push(`
+`, 0 /* Start */);
+    newline();
+  }
+  if (!ssr) {
+    push(`return `);
+  }
+  if (ast.codegenNode) {
+    genNode(ast.codegenNode, context);
+  } else {
+    push(`null`);
+  }
+  if (useWithBlock) {
+    deindent();
+    push(`}`);
+  }
+  deindent();
+  push(`}`);
+  return {
+    ast,
+    code: context.code,
+    preamble: ``,
+    map: context.map ? context.map.toJSON() : void 0
+  };
+}
+function genFunctionPreamble(ast, context) {
+  const {
+    ssr,
+    prefixIdentifiers,
+    push,
+    newline,
+    runtimeModuleName,
+    runtimeGlobalName,
+    ssrRuntimeModuleName
+  } = context;
+  const VueBinding = runtimeGlobalName;
+  const helpers = Array.from(ast.helpers);
+  if (helpers.length > 0) {
+    {
+      push(`const _Vue = ${VueBinding}
+`, -1 /* End */);
+      if (ast.hoists.length) {
+        const staticHelpers = [
+          CREATE_VNODE,
+          CREATE_ELEMENT_VNODE,
+          CREATE_COMMENT,
+          CREATE_TEXT,
+          CREATE_STATIC
+        ].filter((helper) => helpers.includes(helper)).map(aliasHelper).join(", ");
+        push(`const { ${staticHelpers} } = _Vue
+`, -1 /* End */);
+      }
+    }
+  }
+  genHoists(ast.hoists, context);
+  newline();
+  push(`return `);
+}
+function genAssets(assets, type, { helper, push, newline, isTS }) {
+  const resolver = helper(
+    type === "component" ? RESOLVE_COMPONENT : RESOLVE_DIRECTIVE
+  );
+  for (let i = 0; i < assets.length; i++) {
+    let id = assets[i];
+    const maybeSelfReference = id.endsWith("__self");
+    if (maybeSelfReference) {
+      id = id.slice(0, -6);
+    }
+    push(
+      `const ${toValidAssetId(id, type)} = ${resolver}(${JSON.stringify(id)}${maybeSelfReference ? `, true` : ``})${isTS ? `!` : ``}`
+    );
+    if (i < assets.length - 1) {
+      newline();
+    }
+  }
+}
+function genHoists(hoists, context) {
+  if (!hoists.length) {
+    return;
+  }
+  context.pure = true;
+  const { push, newline } = context;
+  newline();
+  for (let i = 0; i < hoists.length; i++) {
+    const exp = hoists[i];
+    if (exp) {
+      push(`const _hoisted_${i + 1} = `);
+      genNode(exp, context);
+      newline();
+    }
+  }
+  context.pure = false;
+}
+function isText(n) {
+  return isString(n) || n.type === 4 || n.type === 2 || n.type === 5 || n.type === 8;
+}
+function genNodeListAsArray(nodes, context) {
+  const multilines = nodes.length > 3 || nodes.some((n) => isArray(n) || !isText(n));
+  context.push(`[`);
+  multilines && context.indent();
+  genNodeList(nodes, context, multilines);
+  multilines && context.deindent();
+  context.push(`]`);
+}
+function genNodeList(nodes, context, multilines = false, comma = true) {
+  const { push, newline } = context;
+  for (let i = 0; i < nodes.length; i++) {
+    const node = nodes[i];
+    if (isString(node)) {
+      push(node, -3 /* Unknown */);
+    } else if (isArray(node)) {
+      genNodeListAsArray(node, context);
+    } else {
+      genNode(node, context);
+    }
+    if (i < nodes.length - 1) {
+      if (multilines) {
+        comma && push(",");
+        newline();
+      } else {
+        comma && push(", ");
+      }
+    }
+  }
+}
+function genNode(node, context) {
+  if (isString(node)) {
+    context.push(node, -3 /* Unknown */);
+    return;
+  }
+  if (isSymbol(node)) {
+    context.push(context.helper(node));
+    return;
+  }
+  switch (node.type) {
+    case 1:
+    case 9:
+    case 11:
+      assert(
+        node.codegenNode != null,
+        `Codegen node is missing for element/if/for node. Apply appropriate transforms first.`
+      );
+      genNode(node.codegenNode, context);
+      break;
+    case 2:
+      genText(node, context);
+      break;
+    case 4:
+      genExpression(node, context);
+      break;
+    case 5:
+      genInterpolation(node, context);
+      break;
+    case 12:
+      genNode(node.codegenNode, context);
+      break;
+    case 8:
+      genCompoundExpression(node, context);
+      break;
+    case 3:
+      genComment(node, context);
+      break;
+    case 13:
+      genVNodeCall(node, context);
+      break;
+    case 14:
+      genCallExpression(node, context);
+      break;
+    case 15:
+      genObjectExpression(node, context);
+      break;
+    case 17:
+      genArrayExpression(node, context);
+      break;
+    case 18:
+      genFunctionExpression(node, context);
+      break;
+    case 19:
+      genConditionalExpression(node, context);
+      break;
+    case 20:
+      genCacheExpression(node, context);
+      break;
+    case 21:
+      genNodeList(node.body, context, true, false);
+      break;
+    // SSR only types
+    case 22:
+      break;
+    case 23:
+      break;
+    case 24:
+      break;
+    case 25:
+      break;
+    case 26:
+      break;
+    /* v8 ignore start */
+    case 10:
+      break;
+    default:
+      {
+        assert(false, `unhandled codegen node type: ${node.type}`);
+        const exhaustiveCheck = node;
+        return exhaustiveCheck;
+      }
+  }
+}
+function genText(node, context) {
+  context.push(JSON.stringify(node.content), -3 /* Unknown */, node);
+}
+function genExpression(node, context) {
+  const { content, isStatic } = node;
+  context.push(
+    isStatic ? JSON.stringify(content) : content,
+    -3 /* Unknown */,
+    node
+  );
+}
+function genInterpolation(node, context) {
+  const { push, helper, pure } = context;
+  if (pure) push(PURE_ANNOTATION);
+  push(`${helper(TO_DISPLAY_STRING)}(`);
+  genNode(node.content, context);
+  push(`)`);
+}
+function genCompoundExpression(node, context) {
+  for (let i = 0; i < node.children.length; i++) {
+    const child = node.children[i];
+    if (isString(child)) {
+      context.push(child, -3 /* Unknown */);
+    } else {
+      genNode(child, context);
+    }
+  }
+}
+function genExpressionAsPropertyKey(node, context) {
+  const { push } = context;
+  if (node.type === 8) {
+    push(`[`);
+    genCompoundExpression(node, context);
+    push(`]`);
+  } else if (node.isStatic) {
+    const text = isSimpleIdentifier(node.content) ? node.content : JSON.stringify(node.content);
+    push(text, -2 /* None */, node);
+  } else {
+    push(`[${node.content}]`, -3 /* Unknown */, node);
+  }
+}
+function genComment(node, context) {
+  const { push, helper, pure } = context;
+  if (pure) {
+    push(PURE_ANNOTATION);
+  }
+  push(
+    `${helper(CREATE_COMMENT)}(${JSON.stringify(node.content)})`,
+    -3 /* Unknown */,
+    node
+  );
+}
+function genVNodeCall(node, context) {
+  const { push, helper, pure } = context;
+  const {
+    tag,
+    props,
+    children,
+    patchFlag,
+    dynamicProps,
+    directives,
+    isBlock,
+    disableTracking,
+    isComponent
+  } = node;
+  let patchFlagString;
+  if (patchFlag) {
+    {
+      if (patchFlag < 0) {
+        patchFlagString = patchFlag + ` /* ${PatchFlagNames[patchFlag]} */`;
+      } else {
+        const flagNames = Object.keys(PatchFlagNames).map(Number).filter((n) => n > 0 && patchFlag & n).map((n) => PatchFlagNames[n]).join(`, `);
+        patchFlagString = patchFlag + ` /* ${flagNames} */`;
+      }
+    }
+  }
+  if (directives) {
+    push(helper(WITH_DIRECTIVES) + `(`);
+  }
+  if (isBlock) {
+    push(`(${helper(OPEN_BLOCK)}(${disableTracking ? `true` : ``}), `);
+  }
+  if (pure) {
+    push(PURE_ANNOTATION);
+  }
+  const callHelper = isBlock ? getVNodeBlockHelper(context.inSSR, isComponent) : getVNodeHelper(context.inSSR, isComponent);
+  push(helper(callHelper) + `(`, -2 /* None */, node);
+  genNodeList(
+    genNullableArgs([tag, props, children, patchFlagString, dynamicProps]),
+    context
+  );
+  push(`)`);
+  if (isBlock) {
+    push(`)`);
+  }
+  if (directives) {
+    push(`, `);
+    genNode(directives, context);
+    push(`)`);
+  }
+}
+function genNullableArgs(args) {
+  let i = args.length;
+  while (i--) {
+    if (args[i] != null) break;
+  }
+  return args.slice(0, i + 1).map((arg) => arg || `null`);
+}
+function genCallExpression(node, context) {
+  const { push, helper, pure } = context;
+  const callee = isString(node.callee) ? node.callee : helper(node.callee);
+  if (pure) {
+    push(PURE_ANNOTATION);
+  }
+  push(callee + `(`, -2 /* None */, node);
+  genNodeList(node.arguments, context);
+  push(`)`);
+}
+function genObjectExpression(node, context) {
+  const { push, indent, deindent, newline } = context;
+  const { properties } = node;
+  if (!properties.length) {
+    push(`{}`, -2 /* None */, node);
+    return;
+  }
+  const multilines = properties.length > 1 || properties.some((p) => p.value.type !== 4);
+  push(multilines ? `{` : `{ `);
+  multilines && indent();
+  for (let i = 0; i < properties.length; i++) {
+    const { key, value } = properties[i];
+    genExpressionAsPropertyKey(key, context);
+    push(`: `);
+    genNode(value, context);
+    if (i < properties.length - 1) {
+      push(`,`);
+      newline();
+    }
+  }
+  multilines && deindent();
+  push(multilines ? `}` : ` }`);
+}
+function genArrayExpression(node, context) {
+  genNodeListAsArray(node.elements, context);
+}
+function genFunctionExpression(node, context) {
+  const { push, indent, deindent } = context;
+  const { params, returns, body, newline, isSlot } = node;
+  if (isSlot) {
+    push(`_${helperNameMap[WITH_CTX]}(`);
+  }
+  push(`(`, -2 /* None */, node);
+  if (isArray(params)) {
+    genNodeList(params, context);
+  } else if (params) {
+    genNode(params, context);
+  }
+  push(`) => `);
+  if (newline || body) {
+    push(`{`);
+    indent();
+  }
+  if (returns) {
+    if (newline) {
+      push(`return `);
+    }
+    if (isArray(returns)) {
+      genNodeListAsArray(returns, context);
+    } else {
+      genNode(returns, context);
+    }
+  } else if (body) {
+    genNode(body, context);
+  }
+  if (newline || body) {
+    deindent();
+    push(`}`);
+  }
+  if (isSlot) {
+    push(`)`);
+  }
+}
+function genConditionalExpression(node, context) {
+  const { test, consequent, alternate, newline: needNewline } = node;
+  const { push, indent, deindent, newline } = context;
+  if (test.type === 4) {
+    const needsParens = !isSimpleIdentifier(test.content);
+    needsParens && push(`(`);
+    genExpression(test, context);
+    needsParens && push(`)`);
+  } else {
+    push(`(`);
+    genNode(test, context);
+    push(`)`);
+  }
+  needNewline && indent();
+  context.indentLevel++;
+  needNewline || push(` `);
+  push(`? `);
+  genNode(consequent, context);
+  context.indentLevel--;
+  needNewline && newline();
+  needNewline || push(` `);
+  push(`: `);
+  const isNested = alternate.type === 19;
+  if (!isNested) {
+    context.indentLevel++;
+  }
+  genNode(alternate, context);
+  if (!isNested) {
+    context.indentLevel--;
+  }
+  needNewline && deindent(
+    true
+    /* without newline */
+  );
+}
+function genCacheExpression(node, context) {
+  const { push, helper, indent, deindent, newline } = context;
+  const { needPauseTracking, needArraySpread } = node;
+  if (needArraySpread) {
+    push(`[...(`);
+  }
+  push(`_cache[${node.index}] || (`);
+  if (needPauseTracking) {
+    indent();
+    push(`${helper(SET_BLOCK_TRACKING)}(-1),`);
+    newline();
+    push(`(`);
+  }
+  push(`_cache[${node.index}] = `);
+  genNode(node.value, context);
+  if (needPauseTracking) {
+    push(`).cacheIndex = ${node.index},`);
+    newline();
+    push(`${helper(SET_BLOCK_TRACKING)}(1),`);
+    newline();
+    push(`_cache[${node.index}]`);
+    deindent();
+  }
+  push(`)`);
+  if (needArraySpread) {
+    push(`)]`);
+  }
+}
+
+const prohibitedKeywordRE = new RegExp(
+  "\\b" + "arguments,await,break,case,catch,class,const,continue,debugger,default,delete,do,else,export,extends,finally,for,function,if,import,let,new,return,super,switch,throw,try,var,void,while,with,yield".split(",").join("\\b|\\b") + "\\b"
+);
+const stripStringRE = /'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|`(?:[^`\\]|\\.)*\$\{|\}(?:[^`\\]|\\.)*`|`(?:[^`\\]|\\.)*`/g;
+function validateBrowserExpression(node, context, asParams = false, asRawStatements = false) {
+  const exp = node.content;
+  if (!exp.trim()) {
+    return;
+  }
+  try {
+    new Function(
+      asRawStatements ? ` ${exp} ` : `return ${asParams ? `(${exp}) => {}` : `(${exp})`}`
+    );
+  } catch (e) {
+    let message = e.message;
+    const keywordMatch = exp.replace(stripStringRE, "").match(prohibitedKeywordRE);
+    if (keywordMatch) {
+      message = `avoid using JavaScript keyword as property name: "${keywordMatch[0]}"`;
+    }
+    context.onError(
+      createCompilerError(
+        45,
+        node.loc,
+        void 0,
+        message
+      )
+    );
+  }
+}
+
+const transformExpression = (node, context) => {
+  if (node.type === 5) {
+    node.content = processExpression(
+      node.content,
+      context
+    );
+  } else if (node.type === 1) {
+    for (let i = 0; i < node.props.length; i++) {
+      const dir = node.props[i];
+      if (dir.type === 7 && dir.name !== "for") {
+        const exp = dir.exp;
+        const arg = dir.arg;
+        if (exp && exp.type === 4 && !(dir.name === "on" && arg)) {
+          dir.exp = processExpression(
+            exp,
+            context,
+            // slot args must be processed as function params
+            dir.name === "slot"
+          );
+        }
+        if (arg && arg.type === 4 && !arg.isStatic) {
+          dir.arg = processExpression(arg, context);
+        }
+      }
+    }
+  }
+};
+function processExpression(node, context, asParams = false, asRawStatements = false, localVars = Object.create(context.identifiers)) {
+  {
+    {
+      validateBrowserExpression(node, context, asParams, asRawStatements);
+    }
+    return node;
+  }
+}
+
+const transformIf = createStructuralDirectiveTransform(
+  /^(if|else|else-if)$/,
+  (node, dir, context) => {
+    return processIf(node, dir, context, (ifNode, branch, isRoot) => {
+      const siblings = context.parent.children;
+      let i = siblings.indexOf(ifNode);
+      let key = 0;
+      while (i-- >= 0) {
+        const sibling = siblings[i];
+        if (sibling && sibling.type === 9) {
+          key += sibling.branches.length;
+        }
+      }
+      return () => {
+        if (isRoot) {
+          ifNode.codegenNode = createCodegenNodeForBranch(
+            branch,
+            key,
+            context
+          );
+        } else {
+          const parentCondition = getParentCondition(ifNode.codegenNode);
+          parentCondition.alternate = createCodegenNodeForBranch(
+            branch,
+            key + ifNode.branches.length - 1,
+            context
+          );
+        }
+      };
+    });
+  }
+);
+function processIf(node, dir, context, processCodegen) {
+  if (dir.name !== "else" && (!dir.exp || !dir.exp.content.trim())) {
+    const loc = dir.exp ? dir.exp.loc : node.loc;
+    context.onError(
+      createCompilerError(28, dir.loc)
+    );
+    dir.exp = createSimpleExpression(`true`, false, loc);
+  }
+  if (dir.exp) {
+    validateBrowserExpression(dir.exp, context);
+  }
+  if (dir.name === "if") {
+    const branch = createIfBranch(node, dir);
+    const ifNode = {
+      type: 9,
+      loc: cloneLoc(node.loc),
+      branches: [branch]
+    };
+    context.replaceNode(ifNode);
+    if (processCodegen) {
+      return processCodegen(ifNode, branch, true);
+    }
+  } else {
+    const siblings = context.parent.children;
+    const comments = [];
+    let i = siblings.indexOf(node);
+    while (i-- >= -1) {
+      const sibling = siblings[i];
+      if (sibling && sibling.type === 3) {
+        context.removeNode(sibling);
+        comments.unshift(sibling);
+        continue;
+      }
+      if (sibling && sibling.type === 2 && !sibling.content.trim().length) {
+        context.removeNode(sibling);
+        continue;
+      }
+      if (sibling && sibling.type === 9) {
+        if (dir.name === "else-if" && sibling.branches[sibling.branches.length - 1].condition === void 0) {
+          context.onError(
+            createCompilerError(30, node.loc)
+          );
+        }
+        context.removeNode();
+        const branch = createIfBranch(node, dir);
+        if (comments.length && // #3619 ignore comments if the v-if is direct child of <transition>
+        !(context.parent && context.parent.type === 1 && (context.parent.tag === "transition" || context.parent.tag === "Transition"))) {
+          branch.children = [...comments, ...branch.children];
+        }
+        {
+          const key = branch.userKey;
+          if (key) {
+            sibling.branches.forEach(({ userKey }) => {
+              if (isSameKey(userKey, key)) {
+                context.onError(
+                  createCompilerError(
+                    29,
+                    branch.userKey.loc
+                  )
+                );
+              }
+            });
+          }
+        }
+        sibling.branches.push(branch);
+        const onExit = processCodegen && processCodegen(sibling, branch, false);
+        traverseNode(branch, context);
+        if (onExit) onExit();
+        context.currentNode = null;
+      } else {
+        context.onError(
+          createCompilerError(30, node.loc)
+        );
+      }
+      break;
+    }
+  }
+}
+function createIfBranch(node, dir) {
+  const isTemplateIf = node.tagType === 3;
+  return {
+    type: 10,
+    loc: node.loc,
+    condition: dir.name === "else" ? void 0 : dir.exp,
+    children: isTemplateIf && !findDir(node, "for") ? node.children : [node],
+    userKey: findProp(node, `key`),
+    isTemplateIf
+  };
+}
+function createCodegenNodeForBranch(branch, keyIndex, context) {
+  if (branch.condition) {
+    return createConditionalExpression(
+      branch.condition,
+      createChildrenCodegenNode(branch, keyIndex, context),
+      // make sure to pass in asBlock: true so that the comment node call
+      // closes the current block.
+      createCallExpression(context.helper(CREATE_COMMENT), [
+        '"v-if"' ,
+        "true"
+      ])
+    );
+  } else {
+    return createChildrenCodegenNode(branch, keyIndex, context);
+  }
+}
+function createChildrenCodegenNode(branch, keyIndex, context) {
+  const { helper } = context;
+  const keyProperty = createObjectProperty(
+    `key`,
+    createSimpleExpression(
+      `${keyIndex}`,
+      false,
+      locStub,
+      2
+    )
+  );
+  const { children } = branch;
+  const firstChild = children[0];
+  const needFragmentWrapper = children.length !== 1 || firstChild.type !== 1;
+  if (needFragmentWrapper) {
+    if (children.length === 1 && firstChild.type === 11) {
+      const vnodeCall = firstChild.codegenNode;
+      injectProp(vnodeCall, keyProperty, context);
+      return vnodeCall;
+    } else {
+      let patchFlag = 64;
+      if (!branch.isTemplateIf && children.filter((c) => c.type !== 3).length === 1) {
+        patchFlag |= 2048;
+      }
+      return createVNodeCall(
+        context,
+        helper(FRAGMENT),
+        createObjectExpression([keyProperty]),
+        children,
+        patchFlag,
+        void 0,
+        void 0,
+        true,
+        false,
+        false,
+        branch.loc
+      );
+    }
+  } else {
+    const ret = firstChild.codegenNode;
+    const vnodeCall = getMemoedVNodeCall(ret);
+    if (vnodeCall.type === 13) {
+      convertToBlock(vnodeCall, context);
+    }
+    injectProp(vnodeCall, keyProperty, context);
+    return ret;
+  }
+}
+function isSameKey(a, b) {
+  if (!a || a.type !== b.type) {
+    return false;
+  }
+  if (a.type === 6) {
+    if (a.value.content !== b.value.content) {
+      return false;
+    }
+  } else {
+    const exp = a.exp;
+    const branchExp = b.exp;
+    if (exp.type !== branchExp.type) {
+      return false;
+    }
+    if (exp.type !== 4 || exp.isStatic !== branchExp.isStatic || exp.content !== branchExp.content) {
+      return false;
+    }
+  }
+  return true;
+}
+function getParentCondition(node) {
+  while (true) {
+    if (node.type === 19) {
+      if (node.alternate.type === 19) {
+        node = node.alternate;
+      } else {
+        return node;
+      }
+    } else if (node.type === 20) {
+      node = node.value;
+    }
+  }
+}
+
+const transformBind = (dir, _node, context) => {
+  const { modifiers, loc } = dir;
+  const arg = dir.arg;
+  let { exp } = dir;
+  if (exp && exp.type === 4 && !exp.content.trim()) {
+    {
+      exp = void 0;
+    }
+  }
+  if (!exp) {
+    if (arg.type !== 4 || !arg.isStatic) {
+      context.onError(
+        createCompilerError(
+          52,
+          arg.loc
+        )
+      );
+      return {
+        props: [
+          createObjectProperty(arg, createSimpleExpression("", true, loc))
+        ]
+      };
+    }
+    transformBindShorthand(dir);
+    exp = dir.exp;
+  }
+  if (arg.type !== 4) {
+    arg.children.unshift(`(`);
+    arg.children.push(`) || ""`);
+  } else if (!arg.isStatic) {
+    arg.content = `${arg.content} || ""`;
+  }
+  if (modifiers.some((mod) => mod.content === "camel")) {
+    if (arg.type === 4) {
+      if (arg.isStatic) {
+        arg.content = camelize(arg.content);
+      } else {
+        arg.content = `${context.helperString(CAMELIZE)}(${arg.content})`;
+      }
+    } else {
+      arg.children.unshift(`${context.helperString(CAMELIZE)}(`);
+      arg.children.push(`)`);
+    }
+  }
+  if (!context.inSSR) {
+    if (modifiers.some((mod) => mod.content === "prop")) {
+      injectPrefix(arg, ".");
+    }
+    if (modifiers.some((mod) => mod.content === "attr")) {
+      injectPrefix(arg, "^");
+    }
+  }
+  return {
+    props: [createObjectProperty(arg, exp)]
+  };
+};
+const transformBindShorthand = (dir, context) => {
+  const arg = dir.arg;
+  const propName = camelize(arg.content);
+  dir.exp = createSimpleExpression(propName, false, arg.loc);
+};
+const injectPrefix = (arg, prefix) => {
+  if (arg.type === 4) {
+    if (arg.isStatic) {
+      arg.content = prefix + arg.content;
+    } else {
+      arg.content = `\`${prefix}\${${arg.content}}\``;
+    }
+  } else {
+    arg.children.unshift(`'${prefix}' + (`);
+    arg.children.push(`)`);
+  }
+};
+
+const transformFor = createStructuralDirectiveTransform(
+  "for",
+  (node, dir, context) => {
+    const { helper, removeHelper } = context;
+    return processFor(node, dir, context, (forNode) => {
+      const renderExp = createCallExpression(helper(RENDER_LIST), [
+        forNode.source
+      ]);
+      const isTemplate = isTemplateNode(node);
+      const memo = findDir(node, "memo");
+      const keyProp = findProp(node, `key`, false, true);
+      if (keyProp && keyProp.type === 7 && !keyProp.exp) {
+        transformBindShorthand(keyProp);
+      }
+      const keyExp = keyProp && (keyProp.type === 6 ? keyProp.value ? createSimpleExpression(keyProp.value.content, true) : void 0 : keyProp.exp);
+      const keyProperty = keyProp && keyExp ? createObjectProperty(`key`, keyExp) : null;
+      const isStableFragment = forNode.source.type === 4 && forNode.source.constType > 0;
+      const fragmentFlag = isStableFragment ? 64 : keyProp ? 128 : 256;
+      forNode.codegenNode = createVNodeCall(
+        context,
+        helper(FRAGMENT),
+        void 0,
+        renderExp,
+        fragmentFlag,
+        void 0,
+        void 0,
+        true,
+        !isStableFragment,
+        false,
+        node.loc
+      );
+      return () => {
+        let childBlock;
+        const { children } = forNode;
+        if (isTemplate) {
+          node.children.some((c) => {
+            if (c.type === 1) {
+              const key = findProp(c, "key");
+              if (key) {
+                context.onError(
+                  createCompilerError(
+                    33,
+                    key.loc
+                  )
+                );
+                return true;
+              }
+            }
+          });
+        }
+        const needFragmentWrapper = children.length !== 1 || children[0].type !== 1;
+        const slotOutlet = isSlotOutlet(node) ? node : isTemplate && node.children.length === 1 && isSlotOutlet(node.children[0]) ? node.children[0] : null;
+        if (slotOutlet) {
+          childBlock = slotOutlet.codegenNode;
+          if (isTemplate && keyProperty) {
+            injectProp(childBlock, keyProperty, context);
+          }
+        } else if (needFragmentWrapper) {
+          childBlock = createVNodeCall(
+            context,
+            helper(FRAGMENT),
+            keyProperty ? createObjectExpression([keyProperty]) : void 0,
+            node.children,
+            64,
+            void 0,
+            void 0,
+            true,
+            void 0,
+            false
+          );
+        } else {
+          childBlock = children[0].codegenNode;
+          if (isTemplate && keyProperty) {
+            injectProp(childBlock, keyProperty, context);
+          }
+          if (childBlock.isBlock !== !isStableFragment) {
+            if (childBlock.isBlock) {
+              removeHelper(OPEN_BLOCK);
+              removeHelper(
+                getVNodeBlockHelper(context.inSSR, childBlock.isComponent)
+              );
+            } else {
+              removeHelper(
+                getVNodeHelper(context.inSSR, childBlock.isComponent)
+              );
+            }
+          }
+          childBlock.isBlock = !isStableFragment;
+          if (childBlock.isBlock) {
+            helper(OPEN_BLOCK);
+            helper(getVNodeBlockHelper(context.inSSR, childBlock.isComponent));
+          } else {
+            helper(getVNodeHelper(context.inSSR, childBlock.isComponent));
+          }
+        }
+        if (memo) {
+          const loop = createFunctionExpression(
+            createForLoopParams(forNode.parseResult, [
+              createSimpleExpression(`_cached`)
+            ])
+          );
+          loop.body = createBlockStatement([
+            createCompoundExpression([`const _memo = (`, memo.exp, `)`]),
+            createCompoundExpression([
+              `if (_cached`,
+              ...keyExp ? [` && _cached.key === `, keyExp] : [],
+              ` && ${context.helperString(
+                IS_MEMO_SAME
+              )}(_cached, _memo)) return _cached`
+            ]),
+            createCompoundExpression([`const _item = `, childBlock]),
+            createSimpleExpression(`_item.memo = _memo`),
+            createSimpleExpression(`return _item`)
+          ]);
+          renderExp.arguments.push(
+            loop,
+            createSimpleExpression(`_cache`),
+            createSimpleExpression(String(context.cached.length))
+          );
+          context.cached.push(null);
+        } else {
+          renderExp.arguments.push(
+            createFunctionExpression(
+              createForLoopParams(forNode.parseResult),
+              childBlock,
+              true
+            )
+          );
+        }
+      };
+    });
+  }
+);
+function processFor(node, dir, context, processCodegen) {
+  if (!dir.exp) {
+    context.onError(
+      createCompilerError(31, dir.loc)
+    );
+    return;
+  }
+  const parseResult = dir.forParseResult;
+  if (!parseResult) {
+    context.onError(
+      createCompilerError(32, dir.loc)
+    );
+    return;
+  }
+  finalizeForParseResult(parseResult, context);
+  const { addIdentifiers, removeIdentifiers, scopes } = context;
+  const { source, value, key, index } = parseResult;
+  const forNode = {
+    type: 11,
+    loc: dir.loc,
+    source,
+    valueAlias: value,
+    keyAlias: key,
+    objectIndexAlias: index,
+    parseResult,
+    children: isTemplateNode(node) ? node.children : [node]
+  };
+  context.replaceNode(forNode);
+  scopes.vFor++;
+  const onExit = processCodegen && processCodegen(forNode);
+  return () => {
+    scopes.vFor--;
+    if (onExit) onExit();
+  };
+}
+function finalizeForParseResult(result, context) {
+  if (result.finalized) return;
+  {
+    validateBrowserExpression(result.source, context);
+    if (result.key) {
+      validateBrowserExpression(
+        result.key,
+        context,
+        true
+      );
+    }
+    if (result.index) {
+      validateBrowserExpression(
+        result.index,
+        context,
+        true
+      );
+    }
+    if (result.value) {
+      validateBrowserExpression(
+        result.value,
+        context,
+        true
+      );
+    }
+  }
+  result.finalized = true;
+}
+function createForLoopParams({ value, key, index }, memoArgs = []) {
+  return createParamsList([value, key, index, ...memoArgs]);
+}
+function createParamsList(args) {
+  let i = args.length;
+  while (i--) {
+    if (args[i]) break;
+  }
+  return args.slice(0, i + 1).map((arg, i2) => arg || createSimpleExpression(`_`.repeat(i2 + 1), false));
+}
+
+const defaultFallback = createSimpleExpression(`undefined`, false);
+const trackSlotScopes = (node, context) => {
+  if (node.type === 1 && (node.tagType === 1 || node.tagType === 3)) {
+    const vSlot = findDir(node, "slot");
+    if (vSlot) {
+      vSlot.exp;
+      context.scopes.vSlot++;
+      return () => {
+        context.scopes.vSlot--;
+      };
+    }
+  }
+};
+const buildClientSlotFn = (props, _vForExp, children, loc) => createFunctionExpression(
+  props,
+  children,
+  false,
+  true,
+  children.length ? children[0].loc : loc
+);
+function buildSlots(node, context, buildSlotFn = buildClientSlotFn) {
+  context.helper(WITH_CTX);
+  const { children, loc } = node;
+  const slotsProperties = [];
+  const dynamicSlots = [];
+  let hasDynamicSlots = context.scopes.vSlot > 0 || context.scopes.vFor > 0;
+  const onComponentSlot = findDir(node, "slot", true);
+  if (onComponentSlot) {
+    const { arg, exp } = onComponentSlot;
+    if (arg && !isStaticExp(arg)) {
+      hasDynamicSlots = true;
+    }
+    slotsProperties.push(
+      createObjectProperty(
+        arg || createSimpleExpression("default", true),
+        buildSlotFn(exp, void 0, children, loc)
+      )
+    );
+  }
+  let hasTemplateSlots = false;
+  let hasNamedDefaultSlot = false;
+  const implicitDefaultChildren = [];
+  const seenSlotNames = /* @__PURE__ */ new Set();
+  let conditionalBranchIndex = 0;
+  for (let i = 0; i < children.length; i++) {
+    const slotElement = children[i];
+    let slotDir;
+    if (!isTemplateNode(slotElement) || !(slotDir = findDir(slotElement, "slot", true))) {
+      if (slotElement.type !== 3) {
+        implicitDefaultChildren.push(slotElement);
+      }
+      continue;
+    }
+    if (onComponentSlot) {
+      context.onError(
+        createCompilerError(37, slotDir.loc)
+      );
+      break;
+    }
+    hasTemplateSlots = true;
+    const { children: slotChildren, loc: slotLoc } = slotElement;
+    const {
+      arg: slotName = createSimpleExpression(`default`, true),
+      exp: slotProps,
+      loc: dirLoc
+    } = slotDir;
+    let staticSlotName;
+    if (isStaticExp(slotName)) {
+      staticSlotName = slotName ? slotName.content : `default`;
+    } else {
+      hasDynamicSlots = true;
+    }
+    const vFor = findDir(slotElement, "for");
+    const slotFunction = buildSlotFn(slotProps, vFor, slotChildren, slotLoc);
+    let vIf;
+    let vElse;
+    if (vIf = findDir(slotElement, "if")) {
+      hasDynamicSlots = true;
+      dynamicSlots.push(
+        createConditionalExpression(
+          vIf.exp,
+          buildDynamicSlot(slotName, slotFunction, conditionalBranchIndex++),
+          defaultFallback
+        )
+      );
+    } else if (vElse = findDir(
+      slotElement,
+      /^else(-if)?$/,
+      true
+      /* allowEmpty */
+    )) {
+      let j = i;
+      let prev;
+      while (j--) {
+        prev = children[j];
+        if (prev.type !== 3) {
+          break;
+        }
+      }
+      if (prev && isTemplateNode(prev) && findDir(prev, /^(else-)?if$/)) {
+        let conditional = dynamicSlots[dynamicSlots.length - 1];
+        while (conditional.alternate.type === 19) {
+          conditional = conditional.alternate;
+        }
+        conditional.alternate = vElse.exp ? createConditionalExpression(
+          vElse.exp,
+          buildDynamicSlot(
+            slotName,
+            slotFunction,
+            conditionalBranchIndex++
+          ),
+          defaultFallback
+        ) : buildDynamicSlot(slotName, slotFunction, conditionalBranchIndex++);
+      } else {
+        context.onError(
+          createCompilerError(30, vElse.loc)
+        );
+      }
+    } else if (vFor) {
+      hasDynamicSlots = true;
+      const parseResult = vFor.forParseResult;
+      if (parseResult) {
+        finalizeForParseResult(parseResult, context);
+        dynamicSlots.push(
+          createCallExpression(context.helper(RENDER_LIST), [
+            parseResult.source,
+            createFunctionExpression(
+              createForLoopParams(parseResult),
+              buildDynamicSlot(slotName, slotFunction),
+              true
+            )
+          ])
+        );
+      } else {
+        context.onError(
+          createCompilerError(
+            32,
+            vFor.loc
+          )
+        );
+      }
+    } else {
+      if (staticSlotName) {
+        if (seenSlotNames.has(staticSlotName)) {
+          context.onError(
+            createCompilerError(
+              38,
+              dirLoc
+            )
+          );
+          continue;
+        }
+        seenSlotNames.add(staticSlotName);
+        if (staticSlotName === "default") {
+          hasNamedDefaultSlot = true;
+        }
+      }
+      slotsProperties.push(createObjectProperty(slotName, slotFunction));
+    }
+  }
+  if (!onComponentSlot) {
+    const buildDefaultSlotProperty = (props, children2) => {
+      const fn = buildSlotFn(props, void 0, children2, loc);
+      return createObjectProperty(`default`, fn);
+    };
+    if (!hasTemplateSlots) {
+      slotsProperties.push(buildDefaultSlotProperty(void 0, children));
+    } else if (implicitDefaultChildren.length && // #3766
+    // with whitespace: 'preserve', whitespaces between slots will end up in
+    // implicitDefaultChildren. Ignore if all implicit children are whitespaces.
+    implicitDefaultChildren.some((node2) => isNonWhitespaceContent(node2))) {
+      if (hasNamedDefaultSlot) {
+        context.onError(
+          createCompilerError(
+            39,
+            implicitDefaultChildren[0].loc
+          )
+        );
+      } else {
+        slotsProperties.push(
+          buildDefaultSlotProperty(void 0, implicitDefaultChildren)
+        );
+      }
+    }
+  }
+  const slotFlag = hasDynamicSlots ? 2 : hasForwardedSlots(node.children) ? 3 : 1;
+  let slots = createObjectExpression(
+    slotsProperties.concat(
+      createObjectProperty(
+        `_`,
+        // 2 = compiled but dynamic = can skip normalization, but must run diff
+        // 1 = compiled and static = can skip normalization AND diff as optimized
+        createSimpleExpression(
+          slotFlag + (` /* ${slotFlagsText[slotFlag]} */` ),
+          false
+        )
+      )
+    ),
+    loc
+  );
+  if (dynamicSlots.length) {
+    slots = createCallExpression(context.helper(CREATE_SLOTS), [
+      slots,
+      createArrayExpression(dynamicSlots)
+    ]);
+  }
+  return {
+    slots,
+    hasDynamicSlots
+  };
+}
+function buildDynamicSlot(name, fn, index) {
+  const props = [
+    createObjectProperty(`name`, name),
+    createObjectProperty(`fn`, fn)
+  ];
+  if (index != null) {
+    props.push(
+      createObjectProperty(`key`, createSimpleExpression(String(index), true))
+    );
+  }
+  return createObjectExpression(props);
+}
+function hasForwardedSlots(children) {
+  for (let i = 0; i < children.length; i++) {
+    const child = children[i];
+    switch (child.type) {
+      case 1:
+        if (child.tagType === 2 || hasForwardedSlots(child.children)) {
+          return true;
+        }
+        break;
+      case 9:
+        if (hasForwardedSlots(child.branches)) return true;
+        break;
+      case 10:
+      case 11:
+        if (hasForwardedSlots(child.children)) return true;
+        break;
+    }
+  }
+  return false;
+}
+function isNonWhitespaceContent(node) {
+  if (node.type !== 2 && node.type !== 12)
+    return true;
+  return node.type === 2 ? !!node.content.trim() : isNonWhitespaceContent(node.content);
+}
+
+const directiveImportMap = /* @__PURE__ */ new WeakMap();
+const transformElement = (node, context) => {
+  return function postTransformElement() {
+    node = context.currentNode;
+    if (!(node.type === 1 && (node.tagType === 0 || node.tagType === 1))) {
+      return;
+    }
+    const { tag, props } = node;
+    const isComponent = node.tagType === 1;
+    let vnodeTag = isComponent ? resolveComponentType(node, context) : `"${tag}"`;
+    const isDynamicComponent = isObject(vnodeTag) && vnodeTag.callee === RESOLVE_DYNAMIC_COMPONENT;
+    let vnodeProps;
+    let vnodeChildren;
+    let patchFlag = 0;
+    let vnodeDynamicProps;
+    let dynamicPropNames;
+    let vnodeDirectives;
+    let shouldUseBlock = (
+      // dynamic component may resolve to plain elements
+      isDynamicComponent || vnodeTag === TELEPORT || vnodeTag === SUSPENSE || !isComponent && // <svg> and <foreignObject> must be forced into blocks so that block
+      // updates inside get proper isSVG flag at runtime. (#639, #643)
+      // This is technically web-specific, but splitting the logic out of core
+      // leads to too much unnecessary complexity.
+      (tag === "svg" || tag === "foreignObject" || tag === "math")
+    );
+    if (props.length > 0) {
+      const propsBuildResult = buildProps(
+        node,
+        context,
+        void 0,
+        isComponent,
+        isDynamicComponent
+      );
+      vnodeProps = propsBuildResult.props;
+      patchFlag = propsBuildResult.patchFlag;
+      dynamicPropNames = propsBuildResult.dynamicPropNames;
+      const directives = propsBuildResult.directives;
+      vnodeDirectives = directives && directives.length ? createArrayExpression(
+        directives.map((dir) => buildDirectiveArgs(dir, context))
+      ) : void 0;
+      if (propsBuildResult.shouldUseBlock) {
+        shouldUseBlock = true;
+      }
+    }
+    if (node.children.length > 0) {
+      if (vnodeTag === KEEP_ALIVE) {
+        shouldUseBlock = true;
+        patchFlag |= 1024;
+        if (node.children.length > 1) {
+          context.onError(
+            createCompilerError(46, {
+              start: node.children[0].loc.start,
+              end: node.children[node.children.length - 1].loc.end,
+              source: ""
+            })
+          );
+        }
+      }
+      const shouldBuildAsSlots = isComponent && // Teleport is not a real component and has dedicated runtime handling
+      vnodeTag !== TELEPORT && // explained above.
+      vnodeTag !== KEEP_ALIVE;
+      if (shouldBuildAsSlots) {
+        const { slots, hasDynamicSlots } = buildSlots(node, context);
+        vnodeChildren = slots;
+        if (hasDynamicSlots) {
+          patchFlag |= 1024;
+        }
+      } else if (node.children.length === 1 && vnodeTag !== TELEPORT) {
+        const child = node.children[0];
+        const type = child.type;
+        const hasDynamicTextChild = type === 5 || type === 8;
+        if (hasDynamicTextChild && getConstantType(child, context) === 0) {
+          patchFlag |= 1;
+        }
+        if (hasDynamicTextChild || type === 2) {
+          vnodeChildren = child;
+        } else {
+          vnodeChildren = node.children;
+        }
+      } else {
+        vnodeChildren = node.children;
+      }
+    }
+    if (dynamicPropNames && dynamicPropNames.length) {
+      vnodeDynamicProps = stringifyDynamicPropNames(dynamicPropNames);
+    }
+    node.codegenNode = createVNodeCall(
+      context,
+      vnodeTag,
+      vnodeProps,
+      vnodeChildren,
+      patchFlag === 0 ? void 0 : patchFlag,
+      vnodeDynamicProps,
+      vnodeDirectives,
+      !!shouldUseBlock,
+      false,
+      isComponent,
+      node.loc
+    );
+  };
+};
+function resolveComponentType(node, context, ssr = false) {
+  let { tag } = node;
+  const isExplicitDynamic = isComponentTag(tag);
+  const isProp = findProp(
+    node,
+    "is",
+    false,
+    true
+    /* allow empty */
+  );
+  if (isProp) {
+    if (isExplicitDynamic || false) {
+      let exp;
+      if (isProp.type === 6) {
+        exp = isProp.value && createSimpleExpression(isProp.value.content, true);
+      } else {
+        exp = isProp.exp;
+        if (!exp) {
+          exp = createSimpleExpression(`is`, false, isProp.arg.loc);
+        }
+      }
+      if (exp) {
+        return createCallExpression(context.helper(RESOLVE_DYNAMIC_COMPONENT), [
+          exp
+        ]);
+      }
+    } else if (isProp.type === 6 && isProp.value.content.startsWith("vue:")) {
+      tag = isProp.value.content.slice(4);
+    }
+  }
+  const builtIn = isCoreComponent(tag) || context.isBuiltInComponent(tag);
+  if (builtIn) {
+    if (!ssr) context.helper(builtIn);
+    return builtIn;
+  }
+  context.helper(RESOLVE_COMPONENT);
+  context.components.add(tag);
+  return toValidAssetId(tag, `component`);
+}
+function buildProps(node, context, props = node.props, isComponent, isDynamicComponent, ssr = false) {
+  const { tag, loc: elementLoc, children } = node;
+  let properties = [];
+  const mergeArgs = [];
+  const runtimeDirectives = [];
+  const hasChildren = children.length > 0;
+  let shouldUseBlock = false;
+  let patchFlag = 0;
+  let hasRef = false;
+  let hasClassBinding = false;
+  let hasStyleBinding = false;
+  let hasHydrationEventBinding = false;
+  let hasDynamicKeys = false;
+  let hasVnodeHook = false;
+  const dynamicPropNames = [];
+  const pushMergeArg = (arg) => {
+    if (properties.length) {
+      mergeArgs.push(
+        createObjectExpression(dedupeProperties(properties), elementLoc)
+      );
+      properties = [];
+    }
+    if (arg) mergeArgs.push(arg);
+  };
+  const pushRefVForMarker = () => {
+    if (context.scopes.vFor > 0) {
+      properties.push(
+        createObjectProperty(
+          createSimpleExpression("ref_for", true),
+          createSimpleExpression("true")
+        )
+      );
+    }
+  };
+  const analyzePatchFlag = ({ key, value }) => {
+    if (isStaticExp(key)) {
+      const name = key.content;
+      const isEventHandler = isOn(name);
+      if (isEventHandler && (!isComponent || isDynamicComponent) && // omit the flag for click handlers because hydration gives click
+      // dedicated fast path.
+      name.toLowerCase() !== "onclick" && // omit v-model handlers
+      name !== "onUpdate:modelValue" && // omit onVnodeXXX hooks
+      !isReservedProp(name)) {
+        hasHydrationEventBinding = true;
+      }
+      if (isEventHandler && isReservedProp(name)) {
+        hasVnodeHook = true;
+      }
+      if (isEventHandler && value.type === 14) {
+        value = value.arguments[0];
+      }
+      if (value.type === 20 || (value.type === 4 || value.type === 8) && getConstantType(value, context) > 0) {
+        return;
+      }
+      if (name === "ref") {
+        hasRef = true;
+      } else if (name === "class") {
+        hasClassBinding = true;
+      } else if (name === "style") {
+        hasStyleBinding = true;
+      } else if (name !== "key" && !dynamicPropNames.includes(name)) {
+        dynamicPropNames.push(name);
+      }
+      if (isComponent && (name === "class" || name === "style") && !dynamicPropNames.includes(name)) {
+        dynamicPropNames.push(name);
+      }
+    } else {
+      hasDynamicKeys = true;
+    }
+  };
+  for (let i = 0; i < props.length; i++) {
+    const prop = props[i];
+    if (prop.type === 6) {
+      const { loc, name, nameLoc, value } = prop;
+      let isStatic = true;
+      if (name === "ref") {
+        hasRef = true;
+        pushRefVForMarker();
+      }
+      if (name === "is" && (isComponentTag(tag) || value && value.content.startsWith("vue:") || false)) {
+        continue;
+      }
+      properties.push(
+        createObjectProperty(
+          createSimpleExpression(name, true, nameLoc),
+          createSimpleExpression(
+            value ? value.content : "",
+            isStatic,
+            value ? value.loc : loc
+          )
+        )
+      );
+    } else {
+      const { name, arg, exp, loc, modifiers } = prop;
+      const isVBind = name === "bind";
+      const isVOn = name === "on";
+      if (name === "slot") {
+        if (!isComponent) {
+          context.onError(
+            createCompilerError(40, loc)
+          );
+        }
+        continue;
+      }
+      if (name === "once" || name === "memo") {
+        continue;
+      }
+      if (name === "is" || isVBind && isStaticArgOf(arg, "is") && (isComponentTag(tag) || false)) {
+        continue;
+      }
+      if (isVOn && ssr) {
+        continue;
+      }
+      if (
+        // #938: elements with dynamic keys should be forced into blocks
+        isVBind && isStaticArgOf(arg, "key") || // inline before-update hooks need to force block so that it is invoked
+        // before children
+        isVOn && hasChildren && isStaticArgOf(arg, "vue:before-update")
+      ) {
+        shouldUseBlock = true;
+      }
+      if (isVBind && isStaticArgOf(arg, "ref")) {
+        pushRefVForMarker();
+      }
+      if (!arg && (isVBind || isVOn)) {
+        hasDynamicKeys = true;
+        if (exp) {
+          if (isVBind) {
+            pushRefVForMarker();
+            pushMergeArg();
+            mergeArgs.push(exp);
+          } else {
+            pushMergeArg({
+              type: 14,
+              loc,
+              callee: context.helper(TO_HANDLERS),
+              arguments: isComponent ? [exp] : [exp, `true`]
+            });
+          }
+        } else {
+          context.onError(
+            createCompilerError(
+              isVBind ? 34 : 35,
+              loc
+            )
+          );
+        }
+        continue;
+      }
+      if (isVBind && modifiers.some((mod) => mod.content === "prop")) {
+        patchFlag |= 32;
+      }
+      const directiveTransform = context.directiveTransforms[name];
+      if (directiveTransform) {
+        const { props: props2, needRuntime } = directiveTransform(prop, node, context);
+        !ssr && props2.forEach(analyzePatchFlag);
+        if (isVOn && arg && !isStaticExp(arg)) {
+          pushMergeArg(createObjectExpression(props2, elementLoc));
+        } else {
+          properties.push(...props2);
+        }
+        if (needRuntime) {
+          runtimeDirectives.push(prop);
+          if (isSymbol(needRuntime)) {
+            directiveImportMap.set(prop, needRuntime);
+          }
+        }
+      } else if (!isBuiltInDirective(name)) {
+        runtimeDirectives.push(prop);
+        if (hasChildren) {
+          shouldUseBlock = true;
+        }
+      }
+    }
+  }
+  let propsExpression = void 0;
+  if (mergeArgs.length) {
+    pushMergeArg();
+    if (mergeArgs.length > 1) {
+      propsExpression = createCallExpression(
+        context.helper(MERGE_PROPS),
+        mergeArgs,
+        elementLoc
+      );
+    } else {
+      propsExpression = mergeArgs[0];
+    }
+  } else if (properties.length) {
+    propsExpression = createObjectExpression(
+      dedupeProperties(properties),
+      elementLoc
+    );
+  }
+  if (hasDynamicKeys) {
+    patchFlag |= 16;
+  } else {
+    if (hasClassBinding && !isComponent) {
+      patchFlag |= 2;
+    }
+    if (hasStyleBinding && !isComponent) {
+      patchFlag |= 4;
+    }
+    if (dynamicPropNames.length) {
+      patchFlag |= 8;
+    }
+    if (hasHydrationEventBinding) {
+      patchFlag |= 32;
+    }
+  }
+  if (!shouldUseBlock && (patchFlag === 0 || patchFlag === 32) && (hasRef || hasVnodeHook || runtimeDirectives.length > 0)) {
+    patchFlag |= 512;
+  }
+  if (!context.inSSR && propsExpression) {
+    switch (propsExpression.type) {
+      case 15:
+        let classKeyIndex = -1;
+        let styleKeyIndex = -1;
+        let hasDynamicKey = false;
+        for (let i = 0; i < propsExpression.properties.length; i++) {
+          const key = propsExpression.properties[i].key;
+          if (isStaticExp(key)) {
+            if (key.content === "class") {
+              classKeyIndex = i;
+            } else if (key.content === "style") {
+              styleKeyIndex = i;
+            }
+          } else if (!key.isHandlerKey) {
+            hasDynamicKey = true;
+          }
+        }
+        const classProp = propsExpression.properties[classKeyIndex];
+        const styleProp = propsExpression.properties[styleKeyIndex];
+        if (!hasDynamicKey) {
+          if (classProp && !isStaticExp(classProp.value)) {
+            classProp.value = createCallExpression(
+              context.helper(NORMALIZE_CLASS),
+              [classProp.value]
+            );
+          }
+          if (styleProp && // the static style is compiled into an object,
+          // so use `hasStyleBinding` to ensure that it is a dynamic style binding
+          (hasStyleBinding || styleProp.value.type === 4 && styleProp.value.content.trim()[0] === `[` || // v-bind:style and style both exist,
+          // v-bind:style with static literal object
+          styleProp.value.type === 17)) {
+            styleProp.value = createCallExpression(
+              context.helper(NORMALIZE_STYLE),
+              [styleProp.value]
+            );
+          }
+        } else {
+          propsExpression = createCallExpression(
+            context.helper(NORMALIZE_PROPS),
+            [propsExpression]
+          );
+        }
+        break;
+      case 14:
+        break;
+      default:
+        propsExpression = createCallExpression(
+          context.helper(NORMALIZE_PROPS),
+          [
+            createCallExpression(context.helper(GUARD_REACTIVE_PROPS), [
+              propsExpression
+            ])
+          ]
+        );
+        break;
+    }
+  }
+  return {
+    props: propsExpression,
+    directives: runtimeDirectives,
+    patchFlag,
+    dynamicPropNames,
+    shouldUseBlock
+  };
+}
+function dedupeProperties(properties) {
+  const knownProps = /* @__PURE__ */ new Map();
+  const deduped = [];
+  for (let i = 0; i < properties.length; i++) {
+    const prop = properties[i];
+    if (prop.key.type === 8 || !prop.key.isStatic) {
+      deduped.push(prop);
+      continue;
+    }
+    const name = prop.key.content;
+    const existing = knownProps.get(name);
+    if (existing) {
+      if (name === "style" || name === "class" || isOn(name)) {
+        mergeAsArray(existing, prop);
+      }
+    } else {
+      knownProps.set(name, prop);
+      deduped.push(prop);
+    }
+  }
+  return deduped;
+}
+function mergeAsArray(existing, incoming) {
+  if (existing.value.type === 17) {
+    existing.value.elements.push(incoming.value);
+  } else {
+    existing.value = createArrayExpression(
+      [existing.value, incoming.value],
+      existing.loc
+    );
+  }
+}
+function buildDirectiveArgs(dir, context) {
+  const dirArgs = [];
+  const runtime = directiveImportMap.get(dir);
+  if (runtime) {
+    dirArgs.push(context.helperString(runtime));
+  } else {
+    {
+      context.helper(RESOLVE_DIRECTIVE);
+      context.directives.add(dir.name);
+      dirArgs.push(toValidAssetId(dir.name, `directive`));
+    }
+  }
+  const { loc } = dir;
+  if (dir.exp) dirArgs.push(dir.exp);
+  if (dir.arg) {
+    if (!dir.exp) {
+      dirArgs.push(`void 0`);
+    }
+    dirArgs.push(dir.arg);
+  }
+  if (Object.keys(dir.modifiers).length) {
+    if (!dir.arg) {
+      if (!dir.exp) {
+        dirArgs.push(`void 0`);
+      }
+      dirArgs.push(`void 0`);
+    }
+    const trueExpression = createSimpleExpression(`true`, false, loc);
+    dirArgs.push(
+      createObjectExpression(
+        dir.modifiers.map(
+          (modifier) => createObjectProperty(modifier, trueExpression)
+        ),
+        loc
+      )
+    );
+  }
+  return createArrayExpression(dirArgs, dir.loc);
+}
+function stringifyDynamicPropNames(props) {
+  let propsNamesString = `[`;
+  for (let i = 0, l = props.length; i < l; i++) {
+    propsNamesString += JSON.stringify(props[i]);
+    if (i < l - 1) propsNamesString += ", ";
+  }
+  return propsNamesString + `]`;
+}
+function isComponentTag(tag) {
+  return tag === "component" || tag === "Component";
+}
+
+const transformSlotOutlet = (node, context) => {
+  if (isSlotOutlet(node)) {
+    const { children, loc } = node;
+    const { slotName, slotProps } = processSlotOutlet(node, context);
+    const slotArgs = [
+      context.prefixIdentifiers ? `_ctx.$slots` : `$slots`,
+      slotName,
+      "{}",
+      "undefined",
+      "true"
+    ];
+    let expectedLen = 2;
+    if (slotProps) {
+      slotArgs[2] = slotProps;
+      expectedLen = 3;
+    }
+    if (children.length) {
+      slotArgs[3] = createFunctionExpression([], children, false, false, loc);
+      expectedLen = 4;
+    }
+    if (context.scopeId && !context.slotted) {
+      expectedLen = 5;
+    }
+    slotArgs.splice(expectedLen);
+    node.codegenNode = createCallExpression(
+      context.helper(RENDER_SLOT),
+      slotArgs,
+      loc
+    );
+  }
+};
+function processSlotOutlet(node, context) {
+  let slotName = `"default"`;
+  let slotProps = void 0;
+  const nonNameProps = [];
+  for (let i = 0; i < node.props.length; i++) {
+    const p = node.props[i];
+    if (p.type === 6) {
+      if (p.value) {
+        if (p.name === "name") {
+          slotName = JSON.stringify(p.value.content);
+        } else {
+          p.name = camelize(p.name);
+          nonNameProps.push(p);
+        }
+      }
+    } else {
+      if (p.name === "bind" && isStaticArgOf(p.arg, "name")) {
+        if (p.exp) {
+          slotName = p.exp;
+        } else if (p.arg && p.arg.type === 4) {
+          const name = camelize(p.arg.content);
+          slotName = p.exp = createSimpleExpression(name, false, p.arg.loc);
+        }
+      } else {
+        if (p.name === "bind" && p.arg && isStaticExp(p.arg)) {
+          p.arg.content = camelize(p.arg.content);
+        }
+        nonNameProps.push(p);
+      }
+    }
+  }
+  if (nonNameProps.length > 0) {
+    const { props, directives } = buildProps(
+      node,
+      context,
+      nonNameProps,
+      false,
+      false
+    );
+    slotProps = props;
+    if (directives.length) {
+      context.onError(
+        createCompilerError(
+          36,
+          directives[0].loc
+        )
+      );
+    }
+  }
+  return {
+    slotName,
+    slotProps
+  };
+}
+
+const transformOn$1 = (dir, node, context, augmentor) => {
+  const { loc, modifiers, arg } = dir;
+  if (!dir.exp && !modifiers.length) {
+    context.onError(createCompilerError(35, loc));
+  }
+  let eventName;
+  if (arg.type === 4) {
+    if (arg.isStatic) {
+      let rawName = arg.content;
+      if (rawName.startsWith("vnode")) {
+        context.onError(createCompilerError(51, arg.loc));
+      }
+      if (rawName.startsWith("vue:")) {
+        rawName = `vnode-${rawName.slice(4)}`;
+      }
+      const eventString = node.tagType !== 0 || rawName.startsWith("vnode") || !/[A-Z]/.test(rawName) ? (
+        // for non-element and vnode lifecycle event listeners, auto convert
+        // it to camelCase. See issue #2249
+        toHandlerKey(camelize(rawName))
+      ) : (
+        // preserve case for plain element listeners that have uppercase
+        // letters, as these may be custom elements' custom events
+        `on:${rawName}`
+      );
+      eventName = createSimpleExpression(eventString, true, arg.loc);
+    } else {
+      eventName = createCompoundExpression([
+        `${context.helperString(TO_HANDLER_KEY)}(`,
+        arg,
+        `)`
+      ]);
+    }
+  } else {
+    eventName = arg;
+    eventName.children.unshift(`${context.helperString(TO_HANDLER_KEY)}(`);
+    eventName.children.push(`)`);
+  }
+  let exp = dir.exp;
+  if (exp && !exp.content.trim()) {
+    exp = void 0;
+  }
+  let shouldCache = context.cacheHandlers && !exp && !context.inVOnce;
+  if (exp) {
+    const isMemberExp = isMemberExpression(exp);
+    const isInlineStatement = !(isMemberExp || isFnExpression(exp));
+    const hasMultipleStatements = exp.content.includes(`;`);
+    {
+      validateBrowserExpression(
+        exp,
+        context,
+        false,
+        hasMultipleStatements
+      );
+    }
+    if (isInlineStatement || shouldCache && isMemberExp) {
+      exp = createCompoundExpression([
+        `${isInlineStatement ? `$event` : `${``}(...args)`} => ${hasMultipleStatements ? `{` : `(`}`,
+        exp,
+        hasMultipleStatements ? `}` : `)`
+      ]);
+    }
+  }
+  let ret = {
+    props: [
+      createObjectProperty(
+        eventName,
+        exp || createSimpleExpression(`() => {}`, false, loc)
+      )
+    ]
+  };
+  if (augmentor) {
+    ret = augmentor(ret);
+  }
+  if (shouldCache) {
+    ret.props[0].value = context.cache(ret.props[0].value);
+  }
+  ret.props.forEach((p) => p.key.isHandlerKey = true);
+  return ret;
+};
+
+const transformText = (node, context) => {
+  if (node.type === 0 || node.type === 1 || node.type === 11 || node.type === 10) {
+    return () => {
+      const children = node.children;
+      let currentContainer = void 0;
+      let hasText = false;
+      for (let i = 0; i < children.length; i++) {
+        const child = children[i];
+        if (isText$1(child)) {
+          hasText = true;
+          for (let j = i + 1; j < children.length; j++) {
+            const next = children[j];
+            if (isText$1(next)) {
+              if (!currentContainer) {
+                currentContainer = children[i] = createCompoundExpression(
+                  [child],
+                  child.loc
+                );
+              }
+              currentContainer.children.push(` + `, next);
+              children.splice(j, 1);
+              j--;
+            } else {
+              currentContainer = void 0;
+              break;
+            }
+          }
+        }
+      }
+      if (!hasText || // if this is a plain element with a single text child, leave it
+      // as-is since the runtime has dedicated fast path for this by directly
+      // setting textContent of the element.
+      // for component root it's always normalized anyway.
+      children.length === 1 && (node.type === 0 || node.type === 1 && node.tagType === 0 && // #3756
+      // custom directives can potentially add DOM elements arbitrarily,
+      // we need to avoid setting textContent of the element at runtime
+      // to avoid accidentally overwriting the DOM elements added
+      // by the user through custom directives.
+      !node.props.find(
+        (p) => p.type === 7 && !context.directiveTransforms[p.name]
+      ) && // in compat mode, <template> tags with no special directives
+      // will be rendered as a fragment so its children must be
+      // converted into vnodes.
+      true)) {
+        return;
+      }
+      for (let i = 0; i < children.length; i++) {
+        const child = children[i];
+        if (isText$1(child) || child.type === 8) {
+          const callArgs = [];
+          if (child.type !== 2 || child.content !== " ") {
+            callArgs.push(child);
+          }
+          if (!context.ssr && getConstantType(child, context) === 0) {
+            callArgs.push(
+              1 + (` /* ${PatchFlagNames[1]} */` )
+            );
+          }
+          children[i] = {
+            type: 12,
+            content: child,
+            loc: child.loc,
+            codegenNode: createCallExpression(
+              context.helper(CREATE_TEXT),
+              callArgs
+            )
+          };
+        }
+      }
+    };
+  }
+};
+
+const seen$1 = /* @__PURE__ */ new WeakSet();
+const transformOnce = (node, context) => {
+  if (node.type === 1 && findDir(node, "once", true)) {
+    if (seen$1.has(node) || context.inVOnce || context.inSSR) {
+      return;
+    }
+    seen$1.add(node);
+    context.inVOnce = true;
+    context.helper(SET_BLOCK_TRACKING);
+    return () => {
+      context.inVOnce = false;
+      const cur = context.currentNode;
+      if (cur.codegenNode) {
+        cur.codegenNode = context.cache(
+          cur.codegenNode,
+          true
+          /* isVNode */
+        );
+      }
+    };
+  }
+};
+
+const transformModel$1 = (dir, node, context) => {
+  const { exp, arg } = dir;
+  if (!exp) {
+    context.onError(
+      createCompilerError(41, dir.loc)
+    );
+    return createTransformProps();
+  }
+  const rawExp = exp.loc.source.trim();
+  const expString = exp.type === 4 ? exp.content : rawExp;
+  const bindingType = context.bindingMetadata[rawExp];
+  if (bindingType === "props" || bindingType === "props-aliased") {
+    context.onError(createCompilerError(44, exp.loc));
+    return createTransformProps();
+  }
+  const maybeRef = false;
+  if (!expString.trim() || !isMemberExpression(exp) && !maybeRef) {
+    context.onError(
+      createCompilerError(42, exp.loc)
+    );
+    return createTransformProps();
+  }
+  const propName = arg ? arg : createSimpleExpression("modelValue", true);
+  const eventName = arg ? isStaticExp(arg) ? `onUpdate:${camelize(arg.content)}` : createCompoundExpression(['"onUpdate:" + ', arg]) : `onUpdate:modelValue`;
+  let assignmentExp;
+  const eventArg = context.isTS ? `($event: any)` : `$event`;
+  {
+    assignmentExp = createCompoundExpression([
+      `${eventArg} => ((`,
+      exp,
+      `) = $event)`
+    ]);
+  }
+  const props = [
+    // modelValue: foo
+    createObjectProperty(propName, dir.exp),
+    // "onUpdate:modelValue": $event => (foo = $event)
+    createObjectProperty(eventName, assignmentExp)
+  ];
+  if (dir.modifiers.length && node.tagType === 1) {
+    const modifiers = dir.modifiers.map((m) => m.content).map((m) => (isSimpleIdentifier(m) ? m : JSON.stringify(m)) + `: true`).join(`, `);
+    const modifiersKey = arg ? isStaticExp(arg) ? `${arg.content}Modifiers` : createCompoundExpression([arg, ' + "Modifiers"']) : `modelModifiers`;
+    props.push(
+      createObjectProperty(
+        modifiersKey,
+        createSimpleExpression(
+          `{ ${modifiers} }`,
+          false,
+          dir.loc,
+          2
+        )
+      )
+    );
+  }
+  return createTransformProps(props);
+};
+function createTransformProps(props = []) {
+  return { props };
+}
+
+const seen = /* @__PURE__ */ new WeakSet();
+const transformMemo = (node, context) => {
+  if (node.type === 1) {
+    const dir = findDir(node, "memo");
+    if (!dir || seen.has(node)) {
+      return;
+    }
+    seen.add(node);
+    return () => {
+      const codegenNode = node.codegenNode || context.currentNode.codegenNode;
+      if (codegenNode && codegenNode.type === 13) {
+        if (node.tagType !== 1) {
+          convertToBlock(codegenNode, context);
+        }
+        node.codegenNode = createCallExpression(context.helper(WITH_MEMO), [
+          dir.exp,
+          createFunctionExpression(void 0, codegenNode),
+          `_cache`,
+          String(context.cached.length)
+        ]);
+        context.cached.push(null);
+      }
+    };
+  }
+};
+
+function getBaseTransformPreset(prefixIdentifiers) {
+  return [
+    [
+      transformOnce,
+      transformIf,
+      transformMemo,
+      transformFor,
+      ...[],
+      ...[transformExpression] ,
+      transformSlotOutlet,
+      transformElement,
+      trackSlotScopes,
+      transformText
+    ],
+    {
+      on: transformOn$1,
+      bind: transformBind,
+      model: transformModel$1
+    }
+  ];
+}
+function baseCompile(source, options = {}) {
+  const onError = options.onError || defaultOnError;
+  const isModuleMode = options.mode === "module";
+  {
+    if (options.prefixIdentifiers === true) {
+      onError(createCompilerError(47));
+    } else if (isModuleMode) {
+      onError(createCompilerError(48));
+    }
+  }
+  const prefixIdentifiers = false;
+  if (options.cacheHandlers) {
+    onError(createCompilerError(49));
+  }
+  if (options.scopeId && !isModuleMode) {
+    onError(createCompilerError(50));
+  }
+  const resolvedOptions = extend({}, options, {
+    prefixIdentifiers
+  });
+  const ast = isString(source) ? baseParse(source, resolvedOptions) : source;
+  const [nodeTransforms, directiveTransforms] = getBaseTransformPreset();
+  transform(
+    ast,
+    extend({}, resolvedOptions, {
+      nodeTransforms: [
+        ...nodeTransforms,
+        ...options.nodeTransforms || []
+        // user transforms
+      ],
+      directiveTransforms: extend(
+        {},
+        directiveTransforms,
+        options.directiveTransforms || {}
+        // user transforms
+      )
+    })
+  );
+  return generate(ast, resolvedOptions);
+}
+
+const noopDirectiveTransform = () => ({ props: [] });
+
+const V_MODEL_RADIO = Symbol(`vModelRadio` );
+const V_MODEL_CHECKBOX = Symbol(
+  `vModelCheckbox` 
+);
+const V_MODEL_TEXT = Symbol(`vModelText` );
+const V_MODEL_SELECT = Symbol(
+  `vModelSelect` 
+);
+const V_MODEL_DYNAMIC = Symbol(
+  `vModelDynamic` 
+);
+const V_ON_WITH_MODIFIERS = Symbol(
+  `vOnModifiersGuard` 
+);
+const V_ON_WITH_KEYS = Symbol(
+  `vOnKeysGuard` 
+);
+const V_SHOW = Symbol(`vShow` );
+const TRANSITION = Symbol(`Transition` );
+const TRANSITION_GROUP = Symbol(
+  `TransitionGroup` 
+);
+registerRuntimeHelpers({
+  [V_MODEL_RADIO]: `vModelRadio`,
+  [V_MODEL_CHECKBOX]: `vModelCheckbox`,
+  [V_MODEL_TEXT]: `vModelText`,
+  [V_MODEL_SELECT]: `vModelSelect`,
+  [V_MODEL_DYNAMIC]: `vModelDynamic`,
+  [V_ON_WITH_MODIFIERS]: `withModifiers`,
+  [V_ON_WITH_KEYS]: `withKeys`,
+  [V_SHOW]: `vShow`,
+  [TRANSITION]: `Transition`,
+  [TRANSITION_GROUP]: `TransitionGroup`
+});
+
+let decoder;
+function decodeHtmlBrowser(raw, asAttr = false) {
+  if (!decoder) {
+    decoder = document.createElement("div");
+  }
+  if (asAttr) {
+    decoder.innerHTML = `<div foo="${raw.replace(/"/g, "&quot;")}">`;
+    return decoder.children[0].getAttribute("foo");
+  } else {
+    decoder.innerHTML = raw;
+    return decoder.textContent;
+  }
+}
+
+const parserOptions = {
+  parseMode: "html",
+  isVoidTag,
+  isNativeTag: (tag) => isHTMLTag(tag) || isSVGTag(tag) || isMathMLTag(tag),
+  isPreTag: (tag) => tag === "pre",
+  isIgnoreNewlineTag: (tag) => tag === "pre" || tag === "textarea",
+  decodeEntities: decodeHtmlBrowser ,
+  isBuiltInComponent: (tag) => {
+    if (tag === "Transition" || tag === "transition") {
+      return TRANSITION;
+    } else if (tag === "TransitionGroup" || tag === "transition-group") {
+      return TRANSITION_GROUP;
+    }
+  },
+  // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher
+  getNamespace(tag, parent, rootNamespace) {
+    let ns = parent ? parent.ns : rootNamespace;
+    if (parent && ns === 2) {
+      if (parent.tag === "annotation-xml") {
+        if (tag === "svg") {
+          return 1;
+        }
+        if (parent.props.some(
+          (a) => a.type === 6 && a.name === "encoding" && a.value != null && (a.value.content === "text/html" || a.value.content === "application/xhtml+xml")
+        )) {
+          ns = 0;
+        }
+      } else if (/^m(?:[ions]|text)$/.test(parent.tag) && tag !== "mglyph" && tag !== "malignmark") {
+        ns = 0;
+      }
+    } else if (parent && ns === 1) {
+      if (parent.tag === "foreignObject" || parent.tag === "desc" || parent.tag === "title") {
+        ns = 0;
+      }
+    }
+    if (ns === 0) {
+      if (tag === "svg") {
+        return 1;
+      }
+      if (tag === "math") {
+        return 2;
+      }
+    }
+    return ns;
+  }
+};
+
+const transformStyle = (node) => {
+  if (node.type === 1) {
+    node.props.forEach((p, i) => {
+      if (p.type === 6 && p.name === "style" && p.value) {
+        node.props[i] = {
+          type: 7,
+          name: `bind`,
+          arg: createSimpleExpression(`style`, true, p.loc),
+          exp: parseInlineCSS(p.value.content, p.loc),
+          modifiers: [],
+          loc: p.loc
+        };
+      }
+    });
+  }
+};
+const parseInlineCSS = (cssText, loc) => {
+  const normalized = parseStringStyle(cssText);
+  return createSimpleExpression(
+    JSON.stringify(normalized),
+    false,
+    loc,
+    3
+  );
+};
+
+function createDOMCompilerError(code, loc) {
+  return createCompilerError(
+    code,
+    loc,
+    DOMErrorMessages 
+  );
+}
+const DOMErrorMessages = {
+  [53]: `v-html is missing expression.`,
+  [54]: `v-html will override element children.`,
+  [55]: `v-text is missing expression.`,
+  [56]: `v-text will override element children.`,
+  [57]: `v-model can only be used on <input>, <textarea> and <select> elements.`,
+  [58]: `v-model argument is not supported on plain elements.`,
+  [59]: `v-model cannot be used on file inputs since they are read-only. Use a v-on:change listener instead.`,
+  [60]: `Unnecessary value binding used alongside v-model. It will interfere with v-model's behavior.`,
+  [61]: `v-show is missing expression.`,
+  [62]: `<Transition> expects exactly one child element or component.`,
+  [63]: `Tags with side effect (<script> and <style>) are ignored in client component templates.`
+};
+
+const transformVHtml = (dir, node, context) => {
+  const { exp, loc } = dir;
+  if (!exp) {
+    context.onError(
+      createDOMCompilerError(53, loc)
+    );
+  }
+  if (node.children.length) {
+    context.onError(
+      createDOMCompilerError(54, loc)
+    );
+    node.children.length = 0;
+  }
+  return {
+    props: [
+      createObjectProperty(
+        createSimpleExpression(`innerHTML`, true, loc),
+        exp || createSimpleExpression("", true)
+      )
+    ]
+  };
+};
+
+const transformVText = (dir, node, context) => {
+  const { exp, loc } = dir;
+  if (!exp) {
+    context.onError(
+      createDOMCompilerError(55, loc)
+    );
+  }
+  if (node.children.length) {
+    context.onError(
+      createDOMCompilerError(56, loc)
+    );
+    node.children.length = 0;
+  }
+  return {
+    props: [
+      createObjectProperty(
+        createSimpleExpression(`textContent`, true),
+        exp ? getConstantType(exp, context) > 0 ? exp : createCallExpression(
+          context.helperString(TO_DISPLAY_STRING),
+          [exp],
+          loc
+        ) : createSimpleExpression("", true)
+      )
+    ]
+  };
+};
+
+const transformModel = (dir, node, context) => {
+  const baseResult = transformModel$1(dir, node, context);
+  if (!baseResult.props.length || node.tagType === 1) {
+    return baseResult;
+  }
+  if (dir.arg) {
+    context.onError(
+      createDOMCompilerError(
+        58,
+        dir.arg.loc
+      )
+    );
+  }
+  function checkDuplicatedValue() {
+    const value = findDir(node, "bind");
+    if (value && isStaticArgOf(value.arg, "value")) {
+      context.onError(
+        createDOMCompilerError(
+          60,
+          value.loc
+        )
+      );
+    }
+  }
+  const { tag } = node;
+  const isCustomElement = context.isCustomElement(tag);
+  if (tag === "input" || tag === "textarea" || tag === "select" || isCustomElement) {
+    let directiveToUse = V_MODEL_TEXT;
+    let isInvalidType = false;
+    if (tag === "input" || isCustomElement) {
+      const type = findProp(node, `type`);
+      if (type) {
+        if (type.type === 7) {
+          directiveToUse = V_MODEL_DYNAMIC;
+        } else if (type.value) {
+          switch (type.value.content) {
+            case "radio":
+              directiveToUse = V_MODEL_RADIO;
+              break;
+            case "checkbox":
+              directiveToUse = V_MODEL_CHECKBOX;
+              break;
+            case "file":
+              isInvalidType = true;
+              context.onError(
+                createDOMCompilerError(
+                  59,
+                  dir.loc
+                )
+              );
+              break;
+            default:
+              checkDuplicatedValue();
+              break;
+          }
+        }
+      } else if (hasDynamicKeyVBind(node)) {
+        directiveToUse = V_MODEL_DYNAMIC;
+      } else {
+        checkDuplicatedValue();
+      }
+    } else if (tag === "select") {
+      directiveToUse = V_MODEL_SELECT;
+    } else {
+      checkDuplicatedValue();
+    }
+    if (!isInvalidType) {
+      baseResult.needRuntime = context.helper(directiveToUse);
+    }
+  } else {
+    context.onError(
+      createDOMCompilerError(
+        57,
+        dir.loc
+      )
+    );
+  }
+  baseResult.props = baseResult.props.filter(
+    (p) => !(p.key.type === 4 && p.key.content === "modelValue")
+  );
+  return baseResult;
+};
+
+const isEventOptionModifier = /* @__PURE__ */ makeMap(`passive,once,capture`);
+const isNonKeyModifier = /* @__PURE__ */ makeMap(
+  // event propagation management
+  `stop,prevent,self,ctrl,shift,alt,meta,exact,middle`
+);
+const maybeKeyModifier = /* @__PURE__ */ makeMap("left,right");
+const isKeyboardEvent = /* @__PURE__ */ makeMap(`onkeyup,onkeydown,onkeypress`);
+const resolveModifiers = (key, modifiers, context, loc) => {
+  const keyModifiers = [];
+  const nonKeyModifiers = [];
+  const eventOptionModifiers = [];
+  for (let i = 0; i < modifiers.length; i++) {
+    const modifier = modifiers[i].content;
+    if (isEventOptionModifier(modifier)) {
+      eventOptionModifiers.push(modifier);
+    } else {
+      if (maybeKeyModifier(modifier)) {
+        if (isStaticExp(key)) {
+          if (isKeyboardEvent(key.content.toLowerCase())) {
+            keyModifiers.push(modifier);
+          } else {
+            nonKeyModifiers.push(modifier);
+          }
+        } else {
+          keyModifiers.push(modifier);
+          nonKeyModifiers.push(modifier);
+        }
+      } else {
+        if (isNonKeyModifier(modifier)) {
+          nonKeyModifiers.push(modifier);
+        } else {
+          keyModifiers.push(modifier);
+        }
+      }
+    }
+  }
+  return {
+    keyModifiers,
+    nonKeyModifiers,
+    eventOptionModifiers
+  };
+};
+const transformClick = (key, event) => {
+  const isStaticClick = isStaticExp(key) && key.content.toLowerCase() === "onclick";
+  return isStaticClick ? createSimpleExpression(event, true) : key.type !== 4 ? createCompoundExpression([
+    `(`,
+    key,
+    `) === "onClick" ? "${event}" : (`,
+    key,
+    `)`
+  ]) : key;
+};
+const transformOn = (dir, node, context) => {
+  return transformOn$1(dir, node, context, (baseResult) => {
+    const { modifiers } = dir;
+    if (!modifiers.length) return baseResult;
+    let { key, value: handlerExp } = baseResult.props[0];
+    const { keyModifiers, nonKeyModifiers, eventOptionModifiers } = resolveModifiers(key, modifiers, context, dir.loc);
+    if (nonKeyModifiers.includes("right")) {
+      key = transformClick(key, `onContextmenu`);
+    }
+    if (nonKeyModifiers.includes("middle")) {
+      key = transformClick(key, `onMouseup`);
+    }
+    if (nonKeyModifiers.length) {
+      handlerExp = createCallExpression(context.helper(V_ON_WITH_MODIFIERS), [
+        handlerExp,
+        JSON.stringify(nonKeyModifiers)
+      ]);
+    }
+    if (keyModifiers.length && // if event name is dynamic, always wrap with keys guard
+    (!isStaticExp(key) || isKeyboardEvent(key.content.toLowerCase()))) {
+      handlerExp = createCallExpression(context.helper(V_ON_WITH_KEYS), [
+        handlerExp,
+        JSON.stringify(keyModifiers)
+      ]);
+    }
+    if (eventOptionModifiers.length) {
+      const modifierPostfix = eventOptionModifiers.map(capitalize).join("");
+      key = isStaticExp(key) ? createSimpleExpression(`${key.content}${modifierPostfix}`, true) : createCompoundExpression([`(`, key, `) + "${modifierPostfix}"`]);
+    }
+    return {
+      props: [createObjectProperty(key, handlerExp)]
+    };
+  });
+};
+
+const transformShow = (dir, node, context) => {
+  const { exp, loc } = dir;
+  if (!exp) {
+    context.onError(
+      createDOMCompilerError(61, loc)
+    );
+  }
+  return {
+    props: [],
+    needRuntime: context.helper(V_SHOW)
+  };
+};
+
+const transformTransition = (node, context) => {
+  if (node.type === 1 && node.tagType === 1) {
+    const component = context.isBuiltInComponent(node.tag);
+    if (component === TRANSITION) {
+      return () => {
+        if (!node.children.length) {
+          return;
+        }
+        if (hasMultipleChildren(node)) {
+          context.onError(
+            createDOMCompilerError(
+              62,
+              {
+                start: node.children[0].loc.start,
+                end: node.children[node.children.length - 1].loc.end,
+                source: ""
+              }
+            )
+          );
+        }
+        const child = node.children[0];
+        if (child.type === 1) {
+          for (const p of child.props) {
+            if (p.type === 7 && p.name === "show") {
+              node.props.push({
+                type: 6,
+                name: "persisted",
+                nameLoc: node.loc,
+                value: void 0,
+                loc: node.loc
+              });
+            }
+          }
+        }
+      };
+    }
+  }
+};
+function hasMultipleChildren(node) {
+  const children = node.children = node.children.filter(
+    (c) => c.type !== 3 && !(c.type === 2 && !c.content.trim())
+  );
+  const child = children[0];
+  return children.length !== 1 || child.type === 11 || child.type === 9 && child.branches.some(hasMultipleChildren);
+}
+
+const ignoreSideEffectTags = (node, context) => {
+  if (node.type === 1 && node.tagType === 0 && (node.tag === "script" || node.tag === "style")) {
+    context.onError(
+      createDOMCompilerError(
+        63,
+        node.loc
+      )
+    );
+    context.removeNode();
+  }
+};
+
+function isValidHTMLNesting(parent, child) {
+  if (parent in onlyValidChildren) {
+    return onlyValidChildren[parent].has(child);
+  }
+  if (child in onlyValidParents) {
+    return onlyValidParents[child].has(parent);
+  }
+  if (parent in knownInvalidChildren) {
+    if (knownInvalidChildren[parent].has(child)) return false;
+  }
+  if (child in knownInvalidParents) {
+    if (knownInvalidParents[child].has(parent)) return false;
+  }
+  return true;
+}
+const headings = /* @__PURE__ */ new Set(["h1", "h2", "h3", "h4", "h5", "h6"]);
+const emptySet = /* @__PURE__ */ new Set([]);
+const onlyValidChildren = {
+  head: /* @__PURE__ */ new Set([
+    "base",
+    "basefront",
+    "bgsound",
+    "link",
+    "meta",
+    "title",
+    "noscript",
+    "noframes",
+    "style",
+    "script",
+    "template"
+  ]),
+  optgroup: /* @__PURE__ */ new Set(["option"]),
+  select: /* @__PURE__ */ new Set(["optgroup", "option", "hr"]),
+  // table
+  table: /* @__PURE__ */ new Set(["caption", "colgroup", "tbody", "tfoot", "thead"]),
+  tr: /* @__PURE__ */ new Set(["td", "th"]),
+  colgroup: /* @__PURE__ */ new Set(["col"]),
+  tbody: /* @__PURE__ */ new Set(["tr"]),
+  thead: /* @__PURE__ */ new Set(["tr"]),
+  tfoot: /* @__PURE__ */ new Set(["tr"]),
+  // these elements can not have any children elements
+  script: emptySet,
+  iframe: emptySet,
+  option: emptySet,
+  textarea: emptySet,
+  style: emptySet,
+  title: emptySet
+};
+const onlyValidParents = {
+  // sections
+  html: emptySet,
+  body: /* @__PURE__ */ new Set(["html"]),
+  head: /* @__PURE__ */ new Set(["html"]),
+  // table
+  td: /* @__PURE__ */ new Set(["tr"]),
+  colgroup: /* @__PURE__ */ new Set(["table"]),
+  caption: /* @__PURE__ */ new Set(["table"]),
+  tbody: /* @__PURE__ */ new Set(["table"]),
+  tfoot: /* @__PURE__ */ new Set(["table"]),
+  col: /* @__PURE__ */ new Set(["colgroup"]),
+  th: /* @__PURE__ */ new Set(["tr"]),
+  thead: /* @__PURE__ */ new Set(["table"]),
+  tr: /* @__PURE__ */ new Set(["tbody", "thead", "tfoot"]),
+  // data list
+  dd: /* @__PURE__ */ new Set(["dl", "div"]),
+  dt: /* @__PURE__ */ new Set(["dl", "div"]),
+  // other
+  figcaption: /* @__PURE__ */ new Set(["figure"]),
+  // li: new Set(["ul", "ol"]),
+  summary: /* @__PURE__ */ new Set(["details"]),
+  area: /* @__PURE__ */ new Set(["map"])
+};
+const knownInvalidChildren = {
+  p: /* @__PURE__ */ new Set([
+    "address",
+    "article",
+    "aside",
+    "blockquote",
+    "center",
+    "details",
+    "dialog",
+    "dir",
+    "div",
+    "dl",
+    "fieldset",
+    "figure",
+    "footer",
+    "form",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "header",
+    "hgroup",
+    "hr",
+    "li",
+    "main",
+    "nav",
+    "menu",
+    "ol",
+    "p",
+    "pre",
+    "section",
+    "table",
+    "ul"
+  ]),
+  svg: /* @__PURE__ */ new Set([
+    "b",
+    "blockquote",
+    "br",
+    "code",
+    "dd",
+    "div",
+    "dl",
+    "dt",
+    "em",
+    "embed",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "hr",
+    "i",
+    "img",
+    "li",
+    "menu",
+    "meta",
+    "ol",
+    "p",
+    "pre",
+    "ruby",
+    "s",
+    "small",
+    "span",
+    "strong",
+    "sub",
+    "sup",
+    "table",
+    "u",
+    "ul",
+    "var"
+  ])
+};
+const knownInvalidParents = {
+  a: /* @__PURE__ */ new Set(["a"]),
+  button: /* @__PURE__ */ new Set(["button"]),
+  dd: /* @__PURE__ */ new Set(["dd", "dt"]),
+  dt: /* @__PURE__ */ new Set(["dd", "dt"]),
+  form: /* @__PURE__ */ new Set(["form"]),
+  li: /* @__PURE__ */ new Set(["li"]),
+  h1: headings,
+  h2: headings,
+  h3: headings,
+  h4: headings,
+  h5: headings,
+  h6: headings
+};
+
+const validateHtmlNesting = (node, context) => {
+  if (node.type === 1 && node.tagType === 0 && context.parent && context.parent.type === 1 && context.parent.tagType === 0 && !isValidHTMLNesting(context.parent.tag, node.tag)) {
+    const error = new SyntaxError(
+      `<${node.tag}> cannot be child of <${context.parent.tag}>, according to HTML specifications. This can cause hydration errors or potentially disrupt future functionality.`
+    );
+    error.loc = node.loc;
+    context.onWarn(error);
+  }
+};
+
+const DOMNodeTransforms = [
+  transformStyle,
+  ...[transformTransition, validateHtmlNesting] 
+];
+const DOMDirectiveTransforms = {
+  cloak: noopDirectiveTransform,
+  html: transformVHtml,
+  text: transformVText,
+  model: transformModel,
+  // override compiler-core
+  on: transformOn,
+  // override compiler-core
+  show: transformShow
+};
+function compile(src, options = {}) {
+  return baseCompile(
+    src,
+    extend({}, parserOptions, options, {
+      nodeTransforms: [
+        // ignore <script> and <tag>
+        // this is not put inside DOMNodeTransforms because that list is used
+        // by compiler-ssr to generate vnode fallback branches
+        ignoreSideEffectTags,
+        ...DOMNodeTransforms,
+        ...options.nodeTransforms || []
+      ],
+      directiveTransforms: extend(
+        {},
+        DOMDirectiveTransforms,
+        options.directiveTransforms || {}
+      ),
+      transformHoist: null 
+    })
+  );
+}
+
+{
+  initDev();
+}
+const compileCache = /* @__PURE__ */ Object.create(null);
+function compileToFunction(template, options) {
+  if (!isString(template)) {
+    if (template.nodeType) {
+      template = template.innerHTML;
+    } else {
+      warn(`invalid template option: `, template);
+      return NOOP;
+    }
+  }
+  const key = genCacheKey(template, options);
+  const cached = compileCache[key];
+  if (cached) {
+    return cached;
+  }
+  if (template[0] === "#") {
+    const el = document.querySelector(template);
+    if (!el) {
+      warn(`Template element not found or is empty: ${template}`);
+    }
+    template = el ? el.innerHTML : ``;
+  }
+  const opts = extend(
+    {
+      hoistStatic: true,
+      onError: onError ,
+      onWarn: (e) => onError(e, true) 
+    },
+    options
+  );
+  if (!opts.isCustomElement && typeof customElements !== "undefined") {
+    opts.isCustomElement = (tag) => !!customElements.get(tag);
+  }
+  const { code } = compile(template, opts);
+  function onError(err, asWarning = false) {
+    const message = asWarning ? err.message : `Template compilation error: ${err.message}`;
+    const codeFrame = err.loc && generateCodeFrame(
+      template,
+      err.loc.start.offset,
+      err.loc.end.offset
+    );
+    warn(codeFrame ? `${message}
+${codeFrame}` : message);
+  }
+  const render = new Function("Vue", code)(runtimeDom);
+  render._rc = true;
+  return compileCache[key] = render;
+}
+registerRuntimeCompiler(compileToFunction);
+
+export { BaseTransition, BaseTransitionPropsValidators, Comment, DeprecationTypes, EffectScope, ErrorCodes, ErrorTypeStrings, Fragment, KeepAlive, ReactiveEffect, Static, Suspense, Teleport, Text, TrackOpTypes, Transition, TransitionGroup, TriggerOpTypes, VueElement, assertNumber, callWithAsyncErrorHandling, callWithErrorHandling, camelize, capitalize, cloneVNode, compatUtils, compileToFunction as compile, computed, createApp, createBlock, createCommentVNode, createElementBlock, createBaseVNode as createElementVNode, createHydrationRenderer, createPropsRestProxy, createRenderer, createSSRApp, createSlots, createStaticVNode, createTextVNode, createVNode, customRef, defineAsyncComponent, defineComponent, defineCustomElement, defineEmits, defineExpose, defineModel, defineOptions, defineProps, defineSSRCustomElement, defineSlots, devtools, effect, effectScope, getCurrentInstance, getCurrentScope, getCurrentWatcher, getTransitionRawChildren, guardReactiveProps, h, handleError, hasInjectionContext, hydrate, hydrateOnIdle, hydrateOnInteraction, hydrateOnMediaQuery, hydrateOnVisible, initCustomFormatter, initDirectivesForSSR, inject, isMemoSame, isProxy, isReactive, isReadonly, isRef, isRuntimeOnly, isShallow, isVNode, markRaw, mergeDefaults, mergeModels, mergeProps, nextTick, normalizeClass, normalizeProps, normalizeStyle, onActivated, onBeforeMount, onBeforeUnmount, onBeforeUpdate, onDeactivated, onErrorCaptured, onMounted, onRenderTracked, onRenderTriggered, onScopeDispose, onServerPrefetch, onUnmounted, onUpdated, onWatcherCleanup, openBlock, popScopeId, provide, proxyRefs, pushScopeId, queuePostFlushCb, reactive, readonly, ref, registerRuntimeCompiler, render, renderList, renderSlot, resolveComponent, resolveDirective, resolveDynamicComponent, resolveFilter, resolveTransitionHooks, setBlockTracking, setDevtoolsHook, setTransitionHooks, shallowReactive, shallowReadonly, shallowRef, ssrContextKey, ssrUtils, stop, toDisplayString, toHandlerKey, toHandlers, toRaw, toRef, toRefs, toValue, transformVNodeArgs, triggerRef, unref, useAttrs, useCssModule, useCssVars, useHost, useId, useModel, useSSRContext, useShadowRoot, useSlots, useTemplateRef, useTransitionState, vModelCheckbox, vModelDynamic, vModelRadio, vModelSelect, vModelText, vShow, version, warn, watch, watchEffect, watchPostEffect, watchSyncEffect, withAsyncContext, withCtx, withDefaults, withDirectives, withKeys, withMemo, withModifiers, withScopeId };
+
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index a95f5c6df..850c652ac 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -3,1301 +3,582 @@
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
   <meta name="color-scheme" content="light dark">
-  <title>llama.cpp - chat</title>
+  <title>🦙 llama.cpp - chat</title>
 
-  <style>
-    body {
-      font-family: system-ui;
-      font-size: 90%;
-    }
-
-    .grid-container {
-      display: grid;
-      grid-template-columns: auto auto auto;
-      padding: 10px;
-    }
-
-    .grid-item {
-      padding: 5px;
-      /* font-size: 30px; */
-      text-align: center;
-    }
-
-    #container {
-      margin: 0em auto;
-      display: flex;
-      flex-direction: column;
-      justify-content: space-between;
-      height: 100%;
-    }
-
-    main {
-      margin: 3px;
-      display: flex;
-      flex-direction: column;
-      justify-content: space-between;
-      gap: 1em;
-
-      flex-grow: 1;
-      overflow-y: auto;
-
-      border: 1px solid #ccc;
-      border-radius: 5px;
-      padding: 0.5em;
-    }
-
-    h1 {
-      text-align: center;
-    }
-
-    .customlink:link {
-      color: white;
-      background-color: #007aff;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      border-radius: 4px;
-      padding: 8px;
-    }
-
-    .customlink:visited {
-      color: white;
-      background-color: #007aff;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      padding: 8px;
-    }
-
-    .customlink:hover {
-      color: white;
-      background-color: #0070ee;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      padding: 8px;
-    }
-
-    .customlink:active {
-      color: #0070ee;
-      background-color: #80b3ef;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      padding: 8px;
-    }
-
-    body {
-      max-width: 600px;
-      min-width: 300px;
-      line-height: 1.2;
-      margin: 0 auto;
-      padding: 0 0.5em;
-    }
-
-    p {
-      overflow-wrap: break-word;
-      word-wrap: break-word;
-      hyphens: auto;
-      margin-top: 0.5em;
-      margin-bottom: 0.5em;
-    }
-
-    #write form {
-      margin: 1em 0 0 0;
-      display: flex;
-      flex-direction: column;
-      gap: 0.5em;
-      align-items: stretch;
-    }
-
-    .message-controls {
-      display: flex;
-      justify-content: flex-end;
-    }
-    .message-controls > div:nth-child(2) {
-      display: flex;
-      flex-direction: column;
-      gap: 0.5em;
-    }
-    .message-controls > div:nth-child(2) > div {
-      display: flex;
-      margin-left: auto;
-      gap: 0.5em;
-    }
-
-    fieldset {
-      border: none;
-      padding: 0;
-      margin: 0;
-    }
-
-    fieldset.two {
-      display: grid;
-      grid-template: "a a";
-      gap: 1em;
-    }
-
-    fieldset.three {
-      display: grid;
-      grid-template: "a a a";
-      gap: 1em;
-    }
-
-    details {
-      border: 1px solid #aaa;
-      border-radius: 4px;
-      padding: 0.5em 0.5em 0;
-      margin-top: 0.5em;
-    }
-
-    summary {
-      font-weight: bold;
-      margin: -0.5em -0.5em 0;
-      padding: 0.5em;
-      cursor: pointer;
-    }
-
-    details[open] {
-      padding: 0.5em;
-    }
-
-    .prob-set {
-      padding: 0.3em;
-      border-bottom: 1px solid #ccc;
-    }
-
-    .popover-content {
-      position: absolute;
-      background-color: white;
-      padding: 0.2em;
-      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
-    }
-
-    textarea {
-      padding: 5px;
-      flex-grow: 1;
-      width: 100%;
-    }
-
-    pre code {
-      display: block;
-      background-color: #222;
-      color: #ddd;
-    }
-
-    code {
-      font-family: monospace;
-      padding: 0.1em 0.3em;
-      border-radius: 3px;
-    }
-
-    fieldset label {
-      margin: 0.5em 0;
-      display: block;
-    }
-
-    fieldset label.slim {
-      margin: 0 0.5em;
-      display: inline;
-    }
-
-    header,
-    footer {
-      text-align: center;
-    }
-
-    footer {
-      font-size: 80%;
-      color: #888;
-    }
-
-    .mode-chat textarea[name=prompt] {
-      height: 4.5em;
-    }
-
-    .mode-completion textarea[name=prompt] {
-      height: 10em;
-    }
-
-    [contenteditable] {
-      display: inline-block;
-      white-space: pre-wrap;
-      outline: 0px solid transparent;
-    }
-
-    @keyframes loading-bg-wipe {
-      0% {
-        background-position: 0%;
-      }
-
-      100% {
-        background-position: 100%;
+  <!-- Note: dependencies can de updated using ./deps.sh script -->
+  <link href="./deps_daisyui.min.css" rel="stylesheet" type="text/css" />
+  <script src="./deps_tailwindcss.js"></script>
+  <style type="text/tailwindcss">
+    .markdown {
+      h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
+      pre {
+        @apply whitespace-pre-wrap my-4 rounded-lg p-2;
+        border: 1px solid currentColor;
       }
+      /* TODO: fix markdown table */
     }
-
-    .loading {
-      --loading-color-1: #eeeeee00;
-      --loading-color-2: #eeeeeeff;
-      background-size: 50% 100%;
-      background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
-      animation: loading-bg-wipe 2s linear infinite;
+    /*
+      Note for daisyui: because we're using a subset of daisyui via CDN, many things won't be included
+      We can manually add the missing styles from https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/full.css
+    */
+    .bg-base-100 {background-color: var(--fallback-b1,oklch(var(--b1)/1))}
+    .bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
+    .bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
+    .text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
+    .btn-mini {
+      @apply cursor-pointer opacity-0 group-hover:opacity-100 hover:shadow-md;
     }
-
-    @media (prefers-color-scheme: dark) {
-      .loading {
-        --loading-color-1: #22222200;
-        --loading-color-2: #222222ff;
-      }
-
-      .popover-content {
-        background-color: black;
-      }
+    .chat-screen { max-width: 900px; }
+    /* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
+    .chat-bubble-base-300 {
+      --tw-bg-opacity: 1;
+      --tw-text-opacity: 1;
+      @apply bg-base-300 text-base-content;
     }
   </style>
-
-  <script type="module">
-    import {
-      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
-    } from './index.js';
-
-    import { llama } from './completion.js';
-    import { SchemaConverter } from './json-schema-to-grammar.mjs';
-
-    let selected_image = false;
-    var slot_id = -1;
-
-    const session = signal({
-      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
-      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
-      historyTemplate: "{{name}}: {{message}}",
-      transcript: [],
-      type: "chat",  // "chat" | "completion"
-      char: "Llama",
-      user: "User",
-      image_selected: ''
-    })
-
-    const params = signal({
-      n_predict: 400,
-      temperature: 0.7,
-      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
-      repeat_penalty: 1.18, // 1.0 = disabled
-      penalize_nl: false,
-      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
-      dry_base: 1.75,     // 0.0 = disabled
-      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
-      dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
-      top_k: 40, // <= 0 to use vocab size
-      top_p: 0.95, // 1.0 = disabled
-      min_p: 0.05, // 0 = disabled
-      xtc_probability: 0.0, // 0 = disabled;
-      xtc_threshold: 0.1, // > 0.5 disables XTC;
-      typical_p: 1.0, // 1.0 = disabled
-      presence_penalty: 0.0, // 0.0 = disabled
-      frequency_penalty: 0.0, // 0.0 = disabled
-      mirostat: 0, // 0/1/2
-      mirostat_tau: 5, // target entropy
-      mirostat_eta: 0.1, // learning rate
-      grammar: '',
-      n_probs: 0, // no completion_probabilities,
-      min_keep: 0, // min probs from each sampler,
-      image_data: [],
-      cache_prompt: true,
-      api_key: ''
-    })
-
-    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
-
-    const local_storage_storageKey = "llamacpp_server_local_storage";
-
-    function local_storage_setDataFromObject(tag, content) {
-      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
-    }
-
-    function local_storage_setDataFromRawText(tag, content) {
-      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
-    }
-
-    function local_storage_getDataAsObject(tag) {
-      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
-      if (!item) {
-        return null;
-      } else {
-        return JSON.parse(item);
-      }
-    }
-
-    function local_storage_getDataAsRawText(tag) {
-      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
-      if (!item) {
-        return null;
-      } else {
-        return item;
-      }
-    }
-
-    // create a container for user templates and settings
-
-    const savedUserTemplates = signal({})
-    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
-
-    // let's import locally saved templates and settings if there are any
-    // user templates and settings are stored in one object
-    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
-
-    console.log('Importing saved templates')
-
-    let importedTemplates = local_storage_getDataAsObject('user_templates')
-
-    if (importedTemplates) {
-      // saved templates were successfully imported.
-
-      console.log('Processing saved templates and updating default template')
-      params.value = { ...params.value, image_data: [] };
-
-      //console.log(importedTemplates);
-      savedUserTemplates.value = importedTemplates;
-
-      //override default template
-      savedUserTemplates.value.default = { session: session.value, params: params.value }
-      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
-    } else {
-      // no saved templates detected.
-
-      console.log('Initializing LocalStorage and saving default template')
-
-      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
-      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
-    }
-
-    function userTemplateResetToDefault() {
-      console.log('Resetting template to default')
-      selectedUserTemplate.value.name = 'default';
-      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
-    }
-
-    function userTemplateApply(t) {
-      session.value = t.data.session;
-      session.value = { ...session.value, image_selected: '' };
-      params.value = t.data.params;
-      params.value = { ...params.value, image_data: [] };
-    }
-
-    function userTemplateResetToDefaultAndApply() {
-      userTemplateResetToDefault()
-      userTemplateApply(selectedUserTemplate.value)
-    }
-
-    function userTemplateLoadAndApplyAutosaved() {
-      // get autosaved last used template
-      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
-
-      if (lastUsedTemplate) {
-
-        console.log('Autosaved template found, restoring')
-
-        selectedUserTemplate.value = lastUsedTemplate
-      }
-      else {
-
-        console.log('No autosaved template found, using default template')
-        // no autosaved last used template was found, so load from default.
-
-        userTemplateResetToDefault()
-      }
-
-      console.log('Applying template')
-      // and update internal data from templates
-
-      userTemplateApply(selectedUserTemplate.value)
-    }
-
-    //console.log(savedUserTemplates.value)
-    //console.log(selectedUserTemplate.value)
-
-    function userTemplateAutosave() {
-      console.log('Template Autosave...')
-      if (selectedUserTemplate.value.name == 'default') {
-        // we don't want to save over default template, so let's create a new one
-        let newTemplateName = 'UserTemplate-' + Date.now().toString()
-        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
-
-        console.log('Saving as ' + newTemplateName)
-
-        // save in the autosave slot
-        local_storage_setDataFromObject('user_templates_last', newTemplate)
-
-        // and load it back and apply
-        userTemplateLoadAndApplyAutosaved()
-      } else {
-        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
-      }
-    }
-
-    console.log('Checking for autosaved last used template')
-    userTemplateLoadAndApplyAutosaved()
-
-    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
-
-    const tts = window.speechSynthesis;
-    const ttsVoice = signal(null)
-
-    const llamaStats = signal(null)
-    const controller = signal(null)
-
-    // currently generating a completion?
-    const generating = computed(() => controller.value != null)
-
-    // has the user started a chat?
-    const chatStarted = computed(() => session.value.transcript.length > 0)
-
-    const transcriptUpdate = (transcript) => {
-      session.value = {
-        ...session.value,
-        transcript
-      }
-    }
-
-    // simple template replace
-    const template = (str, extraSettings) => {
-      let settings = session.value;
-      if (extraSettings) {
-        settings = { ...settings, ...extraSettings };
-      }
-      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
-    }
-
-    async function runLlama(prompt, llamaParams, char) {
-      const currentMessages = [];
-      const history = session.value.transcript;
-      if (controller.value) {
-        throw new Error("already running");
-      }
-      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
-        const data = chunk.data;
-
-        if (data.stop) {
-          while (
-            currentMessages.length > 0 &&
-            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
-          ) {
-            currentMessages.pop();
-          }
-          transcriptUpdate([...history, [char, currentMessages]])
-          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
-        } else {
-          currentMessages.push(data);
-          slot_id = data.slot_id;
-          if (selected_image && !data.multimodal) {
-            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
-            return;
-          }
-          transcriptUpdate([...history, [char, currentMessages]])
-        }
-
-        if (data.timings) {
-          llamaStats.value = data;
-        }
-      }
-
-      controller.value = null;
-    }
-
-    // send message to server
-    const chat = async (msg) => {
-      if (controller.value) {
-        console.log('already running...');
-        return;
-      }
-
-      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
-
-      let prompt = template(session.value.template, {
-        message: msg,
-        history: session.value.transcript.flatMap(
-          ([name, data]) =>
-            template(
-              session.value.historyTemplate,
-              {
-                name,
-                message: Array.isArray(data) ?
-                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
-                  data,
-              }
-            )
-        ).join("\n"),
-      });
-      if (selected_image) {
-        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
-      }
-      await runLlama(prompt, {
-        ...params.value,
-        slot_id: slot_id,
-        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
-      }, "{{char}}");
-    }
-
-    const runCompletion = () => {
-      if (controller.value) {
-        console.log('already running...');
-        return;
-      }
-      const { prompt } = session.value;
-      transcriptUpdate([...session.value.transcript, ["", prompt]]);
-      runLlama(prompt, {
-        ...params.value,
-        slot_id: slot_id,
-        stop: [],
-      }, "").finally(() => {
-        session.value.prompt = session.value.transcript.map(([_, data]) =>
-          Array.isArray(data) ? data.map(msg => msg.content).join('') : data
-        ).join('');
-        session.value.transcript = [];
-      })
-    }
-
-    const stop = (e) => {
-      e.preventDefault();
-      if (controller.value) {
-        controller.value.abort();
-        controller.value = null;
-      }
-    }
-
-    const reset = (e) => {
-      stop(e);
-      transcriptUpdate([]);
-    }
-
-    const uploadImage = (e) => {
-      e.preventDefault();
-      document.getElementById("fileInput").click();
-      document.getElementById("fileInput").addEventListener("change", function (event) {
-        const selectedFile = event.target.files[0];
-        if (selectedFile) {
-          const reader = new FileReader();
-          reader.onload = function () {
-            const image_data = reader.result;
-            session.value = { ...session.value, image_selected: image_data };
-            params.value = {
-              ...params.value, image_data: [
-                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
-            }
-          };
-          selected_image = true;
-          reader.readAsDataURL(selectedFile);
-        }
-      });
-    }
-
-    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
-    const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
-    function MessageInput() {
-      const message = useSignal("");
-
-      const talkActive = useSignal(false);
-      const sendOnTalk = useSignal(false);
-      const talkStop = (e) => {
-        if (e) e.preventDefault();
-
-        talkActive.value = false;
-        talkRecognition?.stop();
-      }
-      const talk = (e) => {
-        e.preventDefault();
-
-        if (talkRecognition)
-          talkRecognition.start();
-        else
-          alert("Speech recognition is not supported by this browser.");
-      }
-      if(talkRecognition) {
-        talkRecognition.onstart = () => {
-          talkActive.value = true;
-        }
-        talkRecognition.onresult = (e) => {
-          if (event.results.length > 0) {
-            message.value = event.results[0][0].transcript;
-            if (sendOnTalk.value) {
-              submit(e);
-            }
-          }
-        }
-        talkRecognition.onspeechend = () => {
-          talkStop();
-        }
-      }
-
-      const ttsVoices = useSignal(tts?.getVoices() || []);
-      const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
-      if (tts) {
-        tts.onvoiceschanged = () => {
-          ttsVoices.value = tts.getVoices();
-        }
-      }
-
-      const submit = (e) => {
-        stop(e);
-        chat(message.value);
-        message.value = "";
-      }
-
-      const enterSubmits = (event) => {
-        if (event.which === 13 && !event.shiftKey) {
-          submit(event);
-        }
-      }
-
-      return html`
-        <form onsubmit=${submit}>
-          <div>
-            <textarea
-               className=${generating.value ? "loading" : null}
-               oninput=${(e) => message.value = e.target.value}
-               onkeypress=${enterSubmits}
-               placeholder="Say something..."
-               rows=2
-               type="text"
-               value="${message}"
-            />
-          </div>
-          <div class="message-controls">
-            <div> </div>
-            <div>
-              <div>
-                <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
-                <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
-                <button onclick=${stop} disabled=${!generating.value}>Stop</button>
-                <button onclick=${reset}>Reset</button>
-              </div>
-              <div>
-                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
-                  e.preventDefault();
-                  alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
-                  `(TTS and speech recognition are not provided by llama.cpp)\n` +
-                  `Note: STT requires HTTPS to work.`);
-                }}>[?]</a>
-                <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
-                <div>
-                  <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
-                  <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
-                </div>
-              </div>
-              <div>
-                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
-                  e.preventDefault();
-                  alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
-                }}>[?]</a>
-                <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
-                <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
-                  <option value="" selected="${!ttsVoice.value}">None</option>
-                  ${[
-                    ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
-                    ...ttsVoices.value.filter(v => !v.default),
-                  ].map(
-                    v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
-                  )}
-                </select>
-              </div>
-            </div>
-          </div>
-        </form>
-      `
-    }
-
-    function CompletionControls() {
-      const submit = (e) => {
-        stop(e);
-        runCompletion();
-      }
-      return html`
-        <div>
-          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
-          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
-          <button onclick=${reset}>Reset</button>
-        </div>`;
-    }
-
-    const ChatLog = (props) => {
-      const messages = session.value.transcript;
-      const container = useRef(null)
-
-      useEffect(() => {
-        // scroll to bottom (if needed)
-        const parent = container.current.parentElement;
-        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
-          parent.scrollTo(0, parent.scrollHeight)
-        }
-      }, [messages])
-
-      const ttsChatLineActiveIx = useSignal(undefined);
-      const ttsChatLine = (e, ix, msg) => {
-        if (e) e.preventDefault();
-
-        if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
-
-        const ttsVoices = tts.getVoices();
-        const voice = ttsVoices.find(v => v.name === ttsVoice.value);
-        if (!voice) return;
-
-        if (ttsChatLineActiveIx.value !== undefined) {
-          tts.cancel();
-          if (ttsChatLineActiveIx.value === ix) {
-            ttsChatLineActiveIx.value = undefined;
-            return;
-          }
-        }
-
-        ttsChatLineActiveIx.value = ix;
-        let ttsUtter = new SpeechSynthesisUtterance(msg);
-        ttsUtter.voice = voice;
-        ttsUtter.onend = e => {
-          ttsChatLineActiveIx.value = undefined;
-        };
-        tts.speak(ttsUtter);
-      }
-
-      const isCompletionMode = session.value.type === 'completion'
-
-      // Try play the last bot message
-      const lastCharChatLinesIxs = useSignal([]);
-      const lastCharChatLinesIxsOld = useSignal([]);
-      useEffect(() => {
-        if (
-          !isCompletionMode
-          && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
-          && !generating.value
-        ) {
-          const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
-          if (ix !== undefined) {
-            const msg = messages[ix];
-            ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
-          }
-
-          lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
-        }
-      }, [generating.value]);
-
-      const chatLine = ([user, data], index) => {
-        let message
-        const isArrayMessage = Array.isArray(data);
-        const text = isArrayMessage ?
-            data.map(msg => msg.content).join('') :
-            data;
-        if (params.value.n_probs > 0 && isArrayMessage) {
-          message = html`<${Probabilities} data=${data} />`
-        } else {
-          message = isCompletionMode ?
-            text :
-            html`<${Markdownish} text=${template(text)} />`
-        }
-
-        const fromBot = user && user === '{{char}}';
-        if (fromBot && !lastCharChatLinesIxs.value.includes(index))
-          lastCharChatLinesIxs.value.push(index);
-
-        if (user) {
-          return html`
-          <div>
-            <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
-            ${
-              fromBot && ttsVoice.value
-              && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
-            }
-          </div>
-          `;
-        } else {
-          return isCompletionMode ?
-            html`<span key=${index}>${message}</span>` :
-            html`<div><p key=${index}>${message}</p></div>`
-        }
-      };
-
-      const handleCompletionEdit = (e) => {
-        session.value.prompt = e.target.innerText;
-        session.value.transcript = [];
-      }
-
-      return html`
-        <div id="chat" ref=${container} key=${messages.length}>
-          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
-          <span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
-            ${messages.flatMap(chatLine)}
-          </span>
-        </div>`;
-    };
-
-    const ConfigForm = (props) => {
-      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
-      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
-      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
-      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
-      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
-
-      const grammarJsonSchemaPropOrder = signal('')
-      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
-      const convertJSONSchemaGrammar = async () => {
-        try {
-          let schema = JSON.parse(params.value.grammar)
-          const converter = new SchemaConverter({
-            prop_order: grammarJsonSchemaPropOrder.value
-              .split(',')
-              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
-            allow_fetch: true,
-          })
-          schema = await converter.resolveRefs(schema, 'input')
-          converter.visit(schema, '')
-          params.value = {
-            ...params.value,
-            grammar: converter.formatGrammar(),
-          }
-        } catch (e) {
-          alert(`Convert failed: ${e.message}`)
-        }
-      }
-
-      const FloatField = ({ label, max, min, name, step, value }) => {
-        return html`
-          <div>
-            <label for="${name}">${label}</label>
-            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
-            <span>${value}</span>
-          </div>
-        `
-      };
-
-      const IntField = ({ label, max, min, name, value }) => {
-        return html`
-          <div>
-            <label for="${name}">${label}</label>
-            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
-            <span>${value}</span>
-          </div>
-        `
-      };
-
-      const BoolField = ({ label, name, value }) => {
-        return html`
-          <div>
-            <label for="${name}">${label}</label>
-            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
-          </div>
-        `
-      };
-
-      const userTemplateReset = (e) => {
-        e.preventDefault();
-        userTemplateResetToDefaultAndApply()
-      }
-
-      const UserTemplateResetButton = () => {
-        if (selectedUserTemplate.value.name == 'default') {
-          return html`
-            <button disabled>Using default template</button>
-          `
-        }
-
-        return html`
-          <button onclick=${userTemplateReset}>Reset all to default</button>
-        `
-      };
-
-      useEffect(() => {
-        // autosave template on every change
-        userTemplateAutosave()
-      }, [session.value, params.value])
-
-      const GrammarControl = () => (
-        html`
-          <div>
-            <label for="template">Grammar</label>
-            <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
-            <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-            <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
-          </div>
-          `
-      );
-
-      const PromptControlFieldSet = () => (
-        html`
-        <fieldset>
-          <div>
-            <label htmlFor="prompt">Prompt</label>
-            <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
-          </div>
-        </fieldset>
-        `
-      );
-
-      const ChatConfigForm = () => (
-        html`
-          ${PromptControlFieldSet()}
-
-          <fieldset class="two">
-            <div>
-              <label for="user">User name</label>
-              <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
-            </div>
-
-            <div>
-              <label for="bot">Bot name</label>
-              <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
-            </div>
-          </fieldset>
-
-          <fieldset>
-            <div>
-              <label for="template">Prompt template</label>
-              <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
-            </div>
-
-            <div>
-              <label for="template">Chat history template</label>
-              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
-            </div>
-            ${GrammarControl()}
-          </fieldset>
-      `
-      );
-
-      const CompletionConfigForm = () => (
-        html`
-          ${PromptControlFieldSet()}
-          <fieldset>${GrammarControl()}</fieldset>
-        `
-      );
-
-      return html`
-        <form>
-          <fieldset class="two">
-            <${UserTemplateResetButton}/>
-            <div>
-              <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
-              <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
-            </div>
-          </fieldset>
-
-          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
-
-          <fieldset class="two">
-            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
-            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
-            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
-            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
-            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
-            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
-            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
-            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
-          </fieldset>
-          <details>
-            <summary>More options</summary>
-            <fieldset class="two">
-              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
-              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
-              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
-              ${FloatField({ label: "DRY Penalty Multiplier", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })}
-              ${FloatField({ label: "DRY Base", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
-              ${IntField({ label: "DRY Allowed Length", max: 10, min: 2, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
-              ${IntField({ label: "DRY Penalty Last N", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
-              ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
-              ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
-            </fieldset>
-            <hr />
-            <fieldset class="three">
-              <div>
-                <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
-                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
-                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
-              </div>
-              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
-              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
-            </fieldset>
-            <fieldset>
-              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
-            </fieldset>
-            <fieldset>
-              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
-            </fieldset>
-            <fieldset>
-              <label for="api_key">API Key</label>
-              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
-            </fieldset>
-          </details>
-        </form>
-      `
-    }
-
-    const probColor = (p) => {
-      const r = Math.floor(192 * (1 - p));
-      const g = Math.floor(192 * p);
-      return `rgba(${r},${g},0,0.3)`;
-    }
-
-    const Probabilities = (params) => {
-      return params.data.map(msg => {
-        const { completion_probabilities } = msg;
-        if (
-          !completion_probabilities ||
-          completion_probabilities.length === 0
-        ) return msg.content
-
-        if (completion_probabilities.length > 1) {
-          // Not for byte pair
-          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
-
-          const splitData = completion_probabilities.map(prob => ({
-            content: prob.content,
-            completion_probabilities: [prob]
-          }))
-          return html`<${Probabilities} data=${splitData} />`
-        }
-
-        const { probs, content } = completion_probabilities[0]
-        const found = probs.find(p => p.tok_str === msg.content)
-        const pColor = found ? probColor(found.prob) : 'transparent'
-
-        const popoverChildren = html`
-          <div class="prob-set">
-            ${probs.map((p, index) => {
-          return html`
-                <div
-                  key=${index}
-                  title=${`prob: ${p.prob}`}
-                  style=${{
-              padding: '0.3em',
-              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
-            }}
-                >
-                  <span>${p.tok_str}: </span>
-                  <span>${Math.floor(p.prob * 100)}%</span>
-                </div>
-              `
-        })}
-          </div>
-        `
-
-        return html`
-          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
-            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
-          </>
-        `
-      });
-    }
-
-    // poor mans markdown replacement
-    const Markdownish = (params) => {
-      const chunks = params.text.split('```');
-
-      for (let i = 0; i < chunks.length; i++) {
-        if (i % 2 === 0) { // outside code block
-          chunks[i] = chunks[i]
-          .replace(/&/g, '&amp;')
-          .replace(/</g, '&lt;')
-          .replace(/>/g, '&gt;')
-          .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
-          .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
-          .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
-          .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
-          .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
-          .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
-          .replace(/`(.*?)`/g, '<code>$1</code>')
-          .replace(/\n/gim, '<br />');
-        } else { // inside code block
-          chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
-        }
-      }
-
-      const restoredText = chunks.join('');
-
-      return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
-    };
-
-    const ModelGenerationInfo = (params) => {
-      if (!llamaStats.value) {
-        return html`<span/>`
-      }
-      return html`
-        <span>
-          ${llamaStats.value.tokens_predicted} predicted, ${llamaStats.value.tokens_cached} cached, ${llamaStats.value.timings.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.timings.predicted_per_second.toFixed(2)} tokens per second
-        </span>
-      `
-    }
-
-
-    // simple popover impl
-    const Popover = (props) => {
-      const isOpen = useSignal(false);
-      const position = useSignal({ top: '0px', left: '0px' });
-      const buttonRef = useRef(null);
-      const popoverRef = useRef(null);
-
-      const togglePopover = () => {
-        if (buttonRef.current) {
-          const rect = buttonRef.current.getBoundingClientRect();
-          position.value = {
-            top: `${rect.bottom + window.scrollY}px`,
-            left: `${rect.left + window.scrollX}px`,
-          };
-        }
-        isOpen.value = !isOpen.value;
-      };
-
-      const handleClickOutside = (event) => {
-        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
-          isOpen.value = false;
-        }
-      };
-
-      useEffect(() => {
-        document.addEventListener('mousedown', handleClickOutside);
-        return () => {
-          document.removeEventListener('mousedown', handleClickOutside);
-        };
-      }, []);
-
-      return html`
-        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
-        ${isOpen.value && html`
-          <${Portal} into="#portal">
-            <div
-              ref=${popoverRef}
-              class="popover-content"
-              style=${{
-            top: position.value.top,
-            left: position.value.left,
-          }}
-            >
-              ${props.popoverChildren}
-            </div>
-          </${Portal}>
-        `}
-      `;
-    };
-
-    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
-    /** Redirect rendering of descendants into the given CSS selector */
-    class Portal extends Component {
-      componentDidUpdate(props) {
-        for (let i in props) {
-          if (props[i] !== this.props[i]) {
-            return setTimeout(this.renderLayer);
-          }
-        }
-      }
-
-      componentDidMount() {
-        this.isMounted = true;
-        this.renderLayer = this.renderLayer.bind(this);
-        this.renderLayer();
-      }
-
-      componentWillUnmount() {
-        this.renderLayer(false);
-        this.isMounted = false;
-        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
-      }
-
-      findNode(node) {
-        return typeof node === 'string' ? document.querySelector(node) : node;
-      }
-
-      renderLayer(show = true) {
-        if (!this.isMounted) return;
-
-        // clean up old node if moving bases:
-        if (this.props.into !== this.intoPointer) {
-          this.intoPointer = this.props.into;
-          if (this.into && this.remote) {
-            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
-          }
-          this.into = this.findNode(this.props.into);
-        }
-
-        this.remote = render(html`
-          <${PortalProxy} context=${this.context}>
-            ${show && this.props.children || null}
-          </${PortalProxy}>
-        `, this.into, this.remote);
-      }
-
-      render() {
-        return null;
-      }
-    }
-    // high-order component that renders its first child if it exists.
-    // used as a conditional rendering proxy.
-    class PortalProxy extends Component {
-      getChildContext() {
-        return this.props.context;
-      }
-      render({ children }) {
-        return children || null;
-      }
-    }
-
-    function App(props) {
-      useEffect(() => {
-        const query = new URLSearchParams(location.search).get("q");
-        if (query) chat(query);
-      }, []);
-
-      return html`
-        <div class="mode-${session.value.type}">
-          <header>
-            <div class="grid-container">
-              <div class="grid-item"></div>
-              <div class="grid-item"><h1>llama.cpp</h1></div>
-              <div class="grid-item"><a class="customlink" href="index-new.html">New UI</a></div>
-            </div>
-          </header>
-
-          <main id="content">
-            <${chatStarted.value ? ChatLog : ConfigForm} />
-          </main>
-
-          <section id="write">
-            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
-          </section>
-
-          <footer>
-            <p><${ModelGenerationInfo} /></p>
-            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
-          </footer>
-        </div>
-      `;
-    }
-
-    render(h(App), document.querySelector('#container'));
-  </script>
 </head>
 
 <body>
-  <div id="container">
-    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  <div id="app" class="flex flex-row opacity-0"> <!-- opacity-0 will be removed on app mounted -->
+    <!-- sidebar -->
+    <div class="flex flex-col bg-black bg-opacity-5 w-64 py-8 px-4 h-screen overflow-y-auto">
+      <h2 class="font-bold mb-4 ml-4">Conversations</h2>
+
+      <!-- list of conversations -->
+      <div :class="{
+        'btn btn-ghost justify-start': true,
+        'btn-active': messages.length === 0,
+      }" @click="newConversation">
+        + New conversation
+      </div>
+      <div v-for="conv in conversations" :class="{
+        'btn btn-ghost justify-start font-normal': true,
+        'btn-active': conv.id === viewingConvId,
+      }" @click="setViewingConv(conv.id)">
+        <span class="truncate">{{ conv.messages[0].content }}</span>
+      </div>
+      <div class="text-center text-xs opacity-40 mt-auto mx-4">
+        Conversations are saved to browser's localStorage
+      </div>
+    </div>
+
+    <div class="chat-screen flex flex-col w-screen h-screen px-8 mx-auto">
+      <!-- header -->
+      <div class="flex flex-row items-center">
+        <div class="grow text-2xl font-bold mt-8 mb-6">
+          🦙 llama.cpp - chat
+        </div>
+
+        <!-- action buttons (top right) -->
+        <div class="flex items-center">
+          <button v-if="messages.length > 0" class="btn mr-1" @click="deleteConv(viewingConvId)" :disabled="isGenerating">
+            <!-- delete conversation button -->
+            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-trash" viewBox="0 0 16 16">
+              <path d="M5.5 5.5A.5.5 0 0 1 6 6v6a.5.5 0 0 1-1 0V6a.5.5 0 0 1 .5-.5m2.5 0a.5.5 0 0 1 .5.5v6a.5.5 0 0 1-1 0V6a.5.5 0 0 1 .5-.5m3 .5a.5.5 0 0 0-1 0v6a.5.5 0 0 0 1 0z"/>
+              <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
+            </svg>
+          </button>
+
+          <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
+            <!-- edit config button -->
+            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
+              <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
+              <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
+            </svg>
+          </button>
+
+          <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
+          <div class="dropdown dropdown-end dropdown-bottom">
+            <div tabindex="0" role="button" class="btn m-1">
+              Theme
+              <svg width="12px" height="12px" class="inline-block h-2 w-2 fill-current opacity-60" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 2048 2048">
+                <path d="M1799 349l242 241-1017 1017L7 590l242-241 775 775 775-775z"></path>
+              </svg>
+            </div>
+            <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
+              <li>
+                <button
+                  class="btn btn-sm btn-block w-full btn-ghost justify-start"
+                  :class="{ 'btn-active': selectedTheme === 'auto' }"
+                  @click="setSelectedTheme('auto')">
+                  auto
+                </button>
+              </li>
+              <li v-for="theme in themes">
+                <input
+                  type="radio"
+                  name="theme-dropdown"
+                  class="theme-controller btn btn-sm btn-block w-full btn-ghost justify-start"
+                  :aria-label="theme"
+                  :value="theme"
+                  :checked="selectedTheme === theme"
+                  @click="setSelectedTheme(theme)" />
+              </li>
+            </ul>
+          </div>
+        </div>
+      </div>
+
+      <!-- chat messages -->
+      <div id="messages-list" class="flex flex-col grow overflow-y-auto">
+        <div class="mt-auto flex justify-center">
+          <!-- placeholder to shift the message to the bottom -->
+           {{ messages.length === 0 ? 'Send a message to start' : '' }}
+        </div>
+        <div v-for="msg in messages" class="group">
+          <div :class="{
+            'chat': true,
+            'chat-start': msg.role !== 'user',
+            'chat-end': msg.role === 'user',
+          }">
+            <div :class="{
+              'chat-bubble markdown': true,
+              'chat-bubble-base-300': msg.role !== 'user',
+            }">
+              <!-- textarea for editing message -->
+              <template v-if="editingMsg && editingMsg.id === msg.id">
+                <textarea
+                  class="textarea textarea-bordered bg-base-100 text-base-content w-96"
+                  v-model="msg.content"></textarea>
+                <br/>
+                <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
+                <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
+              </template>
+              <!-- render message as markdown -->
+              <vue-markdown v-else :source="msg.content" />
+            </div>
+          </div>
+
+          <!-- actions for each message -->
+          <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
+            <!-- user message -->
+            <button v-if="msg.role === 'user'" class="badge btn-mini" @click="editingMsg = msg" :disabled="isGenerating">
+              ✍️ Edit
+            </button>
+            <!-- assistant message -->
+            <button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+              🔄 Regenerate
+            </button>
+            <button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
+              📋 Copy
+            </button>
+          </div>
+        </div>
+
+        <!-- pending (ongoing) assistant message -->
+        <div id="pending-msg" class="chat chat-start">
+          <div v-if="pendingMsg" class="chat-bubble markdown">
+            <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
+            <vue-markdown v-else :source="pendingMsg.content" />
+          </div>
+        </div>
+      </div>
+
+      <!-- chat input -->
+      <div class="flex flex-row items-center mt-8 mb-6">
+        <textarea
+          class="textarea textarea-bordered w-full"
+          placeholder="Type a message (Shift+Enter to add a new line)"
+          v-model="inputMsg"
+          @keydown.enter.exact.prevent="sendMessage"
+          @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
+          :disabled="isGenerating"
+          id="msg-input"
+        ></textarea>
+        <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
+        <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
+      </div>
+    </div>
+
+    <!-- modal for editing config -->
+    <dialog class="modal" :class="{'modal-open': showConfigDialog}">
+      <div class="modal-box">
+        <h3 class="text-lg font-bold mb-6">Settings</h3>
+        <div class="h-[calc(90vh-12rem)] overflow-y-auto">
+          <p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
+          <label class="form-control mb-2">
+            <div class="label">System Message</div>
+            <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
+          </label>
+          <template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
+            <label class="input input-bordered flex items-center gap-2 mb-2">
+              <b>{{ key }}</b>
+              <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
+            </label>
+          </template>
+          <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
+          <div class="collapse collapse-arrow bg-base-200 mb-2">
+            <input type="checkbox" />
+            <div class="collapse-title font-bold">Advanced config</div>
+            <div class="collapse-content">
+              <label class="form-control mb-2">
+                <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
+                <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
+              </label>
+            </div>
+          </div>
+        </div>
+
+        <!-- action buttons -->
+        <div class="modal-action">
+          <button class="btn" @click="resetConfigDialog">Reset to default</button>
+          <button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
+          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save and close</button>
+        </div>
+      </div>
+    </dialog>
   </div>
-  <div id="portal"></div>
+
+  <script src="./deps_markdown-it.js"></script>
+  <script type="module">
+    import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
+    import { llama } from './completion.js';
+
+    const isString = (x) => !!x.toLowerCase;
+    const isNumeric = (n) => !isString(n) && !isNaN(n);
+
+    const BASE_URL = localStorage.getItem('base') // for debugging
+      || (new URL('.', document.baseURI).href).toString(); // for production
+    const CONFIG_DEFAULT = {
+      // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
+      apiKey: '',
+      systemMessage: 'You are a helpful assistant.',
+      // make sure these default values are in sync with `common.h`
+      temperature: 0.8,
+      top_k: 40,
+      top_p: 0.95,
+      min_p: 0.05,
+      max_tokens: -1,
+      custom: '', // custom json-stringified object
+    };
+    // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
+    const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
+    // list of themes supported by daisyui
+    const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
+
+    // markdown support
+    const VueMarkdown = defineComponent(
+      (props) => {
+        const md = shallowRef(new markdownit(props.options ?? { breaks: true }));
+        for (const plugin of props.plugins ?? []) {
+          md.value.use(plugin);
+        }
+        const content = computed(() => md.value.render(props.source));
+        return () => h("div", { innerHTML: content.value });
+      },
+      { props: ["source", "options", "plugins"] }
+    );
+
+    // coversations is stored in localStorage
+    // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
+    // convId is a string prefixed with 'conv-'
+    const StorageUtils = {
+      // manage conversations
+      getAllConversations() {
+        const res = [];
+        for (const key in localStorage) {
+          if (key.startsWith('conv-')) {
+            res.push(JSON.parse(localStorage.getItem(key)));
+          }
+        }
+        res.sort((a, b) => b.lastModified - a.lastModified);
+        return res;
+      },
+      // can return null if convId does not exist
+      getOneConversation(convId) {
+        return JSON.parse(localStorage.getItem(convId) || 'null');
+      },
+      // if convId does not exist, create one
+      appendMsg(convId, msg) {
+        if (msg.content === null) return;
+        const conv = StorageUtils.getOneConversation(convId) || {
+          id: convId,
+          lastModified: Date.now(),
+          messages: [],
+        };
+        conv.messages.push(msg);
+        conv.lastModified = Date.now();
+        localStorage.setItem(convId, JSON.stringify(conv));
+      },
+      getNewConvId() {
+        return `conv-${Date.now()}`;
+      },
+      remove(convId) {
+        localStorage.removeItem(convId);
+      },
+      filterAndKeepMsgs(convId, predicate) {
+        const conv = StorageUtils.getOneConversation(convId);
+        if (!conv) return;
+        conv.messages = conv.messages.filter(predicate);
+        conv.lastModified = Date.now();
+        localStorage.setItem(convId, JSON.stringify(conv));
+      },
+      popMsg(convId) {
+        const conv = StorageUtils.getOneConversation(convId);
+        if (!conv) return;
+        const msg = conv.messages.pop();
+        conv.lastModified = Date.now();
+        localStorage.setItem(convId, JSON.stringify(conv));
+        return msg;
+      },
+
+      // manage config
+      getConfig() {
+        const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
+        // to prevent breaking changes in the future, we always provide default value for missing keys
+        return {
+          ...CONFIG_DEFAULT,
+          ...savedVal,
+        };
+      },
+      setConfig(config) {
+        localStorage.setItem('config', JSON.stringify(config));
+      },
+      getTheme() {
+        return localStorage.getItem('theme') || 'auto';
+      },
+      setTheme(theme) {
+        if (theme === 'auto') {
+          localStorage.removeItem('theme');
+        } else {
+          localStorage.setItem('theme', theme);
+        }
+      },
+    };
+
+    // scroll to bottom of chat messages
+    // if requiresNearBottom is true, only auto-scroll if user is near bottom
+    const chatScrollToBottom = (requiresNearBottom) => {
+      const msgListElem = document.getElementById('messages-list');
+      const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
+      if (!requiresNearBottom || (spaceToBottom < 100)) {
+        setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
+      }
+    };
+
+    const mainApp = createApp({
+      components: {
+        VueMarkdown,
+      },
+      data() {
+        return {
+          conversations: StorageUtils.getAllConversations(),
+          messages: [], // { id: number, role: 'user' | 'assistant', content: string }
+          viewingConvId: StorageUtils.getNewConvId(),
+          inputMsg: '',
+          isGenerating: false,
+          pendingMsg: null, // the on-going message from assistant
+          stopGeneration: () => {},
+          selectedTheme: StorageUtils.getTheme(),
+          config: StorageUtils.getConfig(),
+          showConfigDialog: false,
+          editingMsg: null,
+          // const
+          themes: THEMES,
+          configDefault: {...CONFIG_DEFAULT},
+        }
+      },
+      computed: {},
+      mounted() {
+        document.getElementById('app').classList.remove('opacity-0'); // show app
+        // scroll to the bottom when the pending message height is updated
+        const pendingMsgElem = document.getElementById('pending-msg');
+        const resizeObserver = new ResizeObserver(() => {
+          if (this.isGenerating) chatScrollToBottom(true);
+        });
+        resizeObserver.observe(pendingMsgElem);
+      },
+      methods: {
+        setSelectedTheme(theme) {
+          this.selectedTheme = theme;
+          StorageUtils.setTheme(theme);
+        },
+        newConversation() {
+          if (this.isGenerating) return;
+          this.viewingConvId = StorageUtils.getNewConvId();
+          this.editingMsg = null;
+          this.fetchMessages();
+          chatScrollToBottom();
+        },
+        setViewingConv(convId) {
+          if (this.isGenerating) return;
+          this.viewingConvId = convId;
+          this.editingMsg = null;
+          this.fetchMessages();
+          chatScrollToBottom();
+        },
+        deleteConv(convId) {
+          if (this.isGenerating) return;
+          if (window.confirm('Are you sure to delete this conversation?')) {
+            StorageUtils.remove(convId);
+            if (this.viewingConvId === convId) {
+              this.viewingConvId = StorageUtils.getNewConvId();
+              this.editingMsg = null;
+            }
+            this.fetchConversation();
+            this.fetchMessages();
+          }
+        },
+        async sendMessage() {
+          if (!this.inputMsg) return;
+          const currConvId = this.viewingConvId;
+
+          StorageUtils.appendMsg(currConvId, {
+            id: Date.now(),
+            role: 'user',
+            content: this.inputMsg,
+          });
+          this.fetchConversation();
+          this.fetchMessages();
+          this.inputMsg = '';
+          this.editingMsg = null;
+          this.generateMessage(currConvId);
+          chatScrollToBottom();
+        },
+        async generateMessage(currConvId) {
+          if (this.isGenerating) return;
+          this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
+          this.isGenerating = true;
+          this.editingMsg = null;
+
+          try {
+            const abortController = new AbortController();
+            this.stopGeneration = () => abortController.abort();
+            const params = {
+              messages: [
+                { role: 'system', content: this.config.systemMessage },
+                ...this.messages,
+              ],
+              stream: true,
+              cache_prompt: true,
+              temperature: this.config.temperature,
+              top_k: this.config.top_k,
+              top_p: this.config.top_p,
+              max_tokens: this.config.max_tokens,
+              ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
+              ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
+            };
+            const config = {
+              controller: abortController,
+              api_url: BASE_URL,
+              endpoint: '/chat/completions',
+            };
+            for await (const chunk of llama(prompt, params, config)) {
+              const stop = chunk.data.stop;
+              const addedContent = chunk.data.choices[0].delta.content;
+              const lastContent = this.pendingMsg.content || '';
+              if (addedContent) {
+                this.pendingMsg = {
+                  id: this.pendingMsg.id,
+                  role: 'assistant',
+                  content: lastContent + addedContent,
+                };
+              }
+            }
+
+            StorageUtils.appendMsg(currConvId, this.pendingMsg);
+            this.fetchConversation();
+            this.fetchMessages();
+            setTimeout(() => document.getElementById('msg-input').focus(), 1);
+          } catch (error) {
+            if (error.name === 'AbortError') {
+              // user stopped the generation via stopGeneration() function
+              StorageUtils.appendMsg(currConvId, this.pendingMsg);
+              this.fetchConversation();
+              this.fetchMessages();
+            } else {
+              console.error(error);
+              alert(error);
+              // pop last user message
+              const lastUserMsg = StorageUtils.popMsg(currConvId);
+              this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
+            }
+          }
+
+          this.pendingMsg = null;
+          this.isGenerating = false;
+          this.stopGeneration = () => {};
+          this.fetchMessages();
+        },
+
+        // message actions
+        regenerateMsg(msg) {
+          if (this.isGenerating) return;
+          // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
+          const currConvId = this.viewingConvId;
+          StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
+          this.fetchConversation();
+          this.fetchMessages();
+          this.generateMessage(currConvId);
+        },
+        copyMsg(msg) {
+          navigator.clipboard.writeText(msg.content);
+        },
+        editUserMsgAndRegenerate(msg) {
+          if (this.isGenerating) return;
+          const currConvId = this.viewingConvId;
+          const newContent = msg.content;
+          this.editingMsg = null;
+          StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
+          StorageUtils.appendMsg(currConvId, {
+            id: Date.now(),
+            role: 'user',
+            content: newContent,
+          });
+          this.fetchConversation();
+          this.fetchMessages();
+          this.generateMessage(currConvId);
+        },
+
+        // settings dialog methods
+        closeAndSaveConfigDialog() {
+          try {
+            if (this.config.custom.length) JSON.parse(this.config.custom);
+          } catch (error) {
+            alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
+            return;
+          }
+          for (const key of CONFIG_NUMERIC_KEYS) {
+            if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
+              alert(`Invalid number for ${key} (expected an integer or a float)`);
+              return;
+            }
+            this.config[key] = parseFloat(this.config[key]);
+          }
+          this.showConfigDialog = false;
+          StorageUtils.setConfig(this.config);
+        },
+        closeAndDiscardConfigDialog() {
+          this.showConfigDialog = false;
+          this.config = StorageUtils.getConfig();
+        },
+        resetConfigDialog() {
+          if (window.confirm('Are you sure to reset all settings?')) {
+            this.config = {...CONFIG_DEFAULT};
+          }
+        },
+
+        // sync state functions
+        fetchConversation() {
+          this.conversations = StorageUtils.getAllConversations();
+        },
+        fetchMessages() {
+          this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
+        },
+      },
+    });
+    mainApp.config.errorHandler = alert;
+    try {
+      mainApp.mount('#app');
+    } catch (err) {
+      console.error(err);
+      document.getElementById('app').innerHTML = `<div style="margin:2em auto">
+        Failed to start app. Please try clearing localStorage and try again.<br/>
+        <br/>
+        <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
+      </div>`;
+    }
+  </script>
 </body>
 
 </html>
diff --git a/examples/server/public/colorthemes.css b/examples/server/public_legacy/colorthemes.css
similarity index 100%
rename from examples/server/public/colorthemes.css
rename to examples/server/public_legacy/colorthemes.css
diff --git a/examples/server/public_legacy/completion.js b/examples/server/public_legacy/completion.js
new file mode 100644
index 000000000..30df7c2fa
--- /dev/null
+++ b/examples/server/public_legacy/completion.js
@@ -0,0 +1,209 @@
+const paramDefaults = {
+  stream: true,
+  n_predict: 500,
+  temperature: 0.2,
+  stop: ["</s>"]
+};
+
+let generation_settings = null;
+
+
+// Completes the prompt as a generator. Recommended for most use cases.
+//
+// Example:
+//
+//    import { llama } from '/completion.js'
+//
+//    const request = llama("Tell me a joke", {n_predict: 800})
+//    for await (const chunk of request) {
+//      document.write(chunk.data.content)
+//    }
+//
+export async function* llama(prompt, params = {}, config = {}) {
+  let controller = config.controller;
+  const api_url = config.api_url?.replace(/\/+$/, '') || "";
+
+  if (!controller) {
+    controller = new AbortController();
+  }
+
+  const completionParams = { ...paramDefaults, ...params, prompt };
+
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
+    method: 'POST',
+    body: JSON.stringify(completionParams),
+    headers: {
+      'Connection': 'keep-alive',
+      'Content-Type': 'application/json',
+      'Accept': 'text/event-stream',
+      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
+    },
+    signal: controller.signal,
+  });
+
+  const reader = response.body.getReader();
+  const decoder = new TextDecoder();
+
+  let content = "";
+  let leftover = ""; // Buffer for partially read lines
+
+  try {
+    let cont = true;
+
+    while (cont) {
+      const result = await reader.read();
+      if (result.done) {
+        break;
+      }
+
+      // Add any leftover data to the current chunk of data
+      const text = leftover + decoder.decode(result.value);
+
+      // Check if the last character is a line break
+      const endsWithLineBreak = text.endsWith('\n');
+
+      // Split the text into lines
+      let lines = text.split('\n');
+
+      // If the text doesn't end with a line break, then the last line is incomplete
+      // Store it in leftover to be added to the next chunk of data
+      if (!endsWithLineBreak) {
+        leftover = lines.pop();
+      } else {
+        leftover = ""; // Reset leftover if we have a line break at the end
+      }
+
+      // Parse all sse events and add them to result
+      const regex = /^(\S+):\s(.*)$/gm;
+      for (const line of lines) {
+        const match = regex.exec(line);
+        if (match) {
+          result[match[1]] = match[2];
+          if (result.data === '[DONE]') {
+            cont = false;
+            break;
+          }
+
+          // since we know this is llama.cpp, let's just decode the json in data
+          if (result.data) {
+            result.data = JSON.parse(result.data);
+            content += result.data.content;
+
+            // yield
+            yield result;
+
+            // if we got a stop token from server, we will break here
+            if (result.data.stop) {
+              if (result.data.generation_settings) {
+                generation_settings = result.data.generation_settings;
+              }
+              cont = false;
+              break;
+            }
+          }
+          if (result.error) {
+            try {
+              result.error = JSON.parse(result.error);
+              if (result.error.message.includes('slot unavailable')) {
+                // Throw an error to be caught by upstream callers
+                throw new Error('slot unavailable');
+              } else {
+                console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
+              }
+            } catch(e) {
+              console.error(`llama.cpp error ${result.error}`)
+            }
+          }
+        }
+      }
+    }
+  } catch (e) {
+    if (e.name !== 'AbortError') {
+      console.error("llama error: ", e);
+    }
+    throw e;
+  }
+  finally {
+    controller.abort();
+  }
+
+  return content;
+}
+
+// Call llama, return an event target that you can subscribe to
+//
+// Example:
+//
+//    import { llamaEventTarget } from '/completion.js'
+//
+//    const conn = llamaEventTarget(prompt)
+//    conn.addEventListener("message", (chunk) => {
+//      document.write(chunk.detail.content)
+//    })
+//
+export const llamaEventTarget = (prompt, params = {}, config = {}) => {
+  const eventTarget = new EventTarget();
+  (async () => {
+    let content = "";
+    for await (const chunk of llama(prompt, params, config)) {
+      if (chunk.data) {
+        content += chunk.data.content;
+        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
+      }
+      if (chunk.data.generation_settings) {
+        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
+      }
+      if (chunk.data.timings) {
+        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
+      }
+    }
+    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
+  })();
+  return eventTarget;
+}
+
+// Call llama, return a promise that resolves to the completed text. This does not support streaming
+//
+// Example:
+//
+//     llamaPromise(prompt).then((content) => {
+//       document.write(content)
+//     })
+//
+//     or
+//
+//     const content = await llamaPromise(prompt)
+//     document.write(content)
+//
+export const llamaPromise = (prompt, params = {}, config = {}) => {
+  return new Promise(async (resolve, reject) => {
+    let content = "";
+    try {
+      for await (const chunk of llama(prompt, params, config)) {
+        content += chunk.data.content;
+      }
+      resolve(content);
+    } catch (error) {
+      reject(error);
+    }
+  });
+};
+
+/**
+ * (deprecated)
+ */
+export const llamaComplete = async (params, controller, callback) => {
+  for await (const chunk of llama(params.prompt, params, { controller })) {
+    callback(chunk);
+  }
+}
+
+// Get the model info from the server. This is useful for getting the context window and so on.
+export const llamaModelInfo = async (config = {}) => {
+  if (!generation_settings) {
+    const api_url = config.api_url?.replace(/\/+$/, '') || "";
+    const props = await fetch(`${api_url}/props`).then(r => r.json());
+    generation_settings = props.default_generation_settings;
+  }
+  return generation_settings;
+}
diff --git a/examples/server/public/favicon.ico b/examples/server/public_legacy/favicon.ico
similarity index 100%
rename from examples/server/public/favicon.ico
rename to examples/server/public_legacy/favicon.ico
diff --git a/examples/server/public/index-new.html b/examples/server/public_legacy/index-new.html
similarity index 100%
rename from examples/server/public/index-new.html
rename to examples/server/public_legacy/index-new.html
diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html
new file mode 100644
index 000000000..a95f5c6df
--- /dev/null
+++ b/examples/server/public_legacy/index.html
@@ -0,0 +1,1303 @@
+<html>
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
+  <title>llama.cpp - chat</title>
+
+  <style>
+    body {
+      font-family: system-ui;
+      font-size: 90%;
+    }
+
+    .grid-container {
+      display: grid;
+      grid-template-columns: auto auto auto;
+      padding: 10px;
+    }
+
+    .grid-item {
+      padding: 5px;
+      /* font-size: 30px; */
+      text-align: center;
+    }
+
+    #container {
+      margin: 0em auto;
+      display: flex;
+      flex-direction: column;
+      justify-content: space-between;
+      height: 100%;
+    }
+
+    main {
+      margin: 3px;
+      display: flex;
+      flex-direction: column;
+      justify-content: space-between;
+      gap: 1em;
+
+      flex-grow: 1;
+      overflow-y: auto;
+
+      border: 1px solid #ccc;
+      border-radius: 5px;
+      padding: 0.5em;
+    }
+
+    h1 {
+      text-align: center;
+    }
+
+    .customlink:link {
+      color: white;
+      background-color: #007aff;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      border-radius: 4px;
+      padding: 8px;
+    }
+
+    .customlink:visited {
+      color: white;
+      background-color: #007aff;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      padding: 8px;
+    }
+
+    .customlink:hover {
+      color: white;
+      background-color: #0070ee;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      padding: 8px;
+    }
+
+    .customlink:active {
+      color: #0070ee;
+      background-color: #80b3ef;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      padding: 8px;
+    }
+
+    body {
+      max-width: 600px;
+      min-width: 300px;
+      line-height: 1.2;
+      margin: 0 auto;
+      padding: 0 0.5em;
+    }
+
+    p {
+      overflow-wrap: break-word;
+      word-wrap: break-word;
+      hyphens: auto;
+      margin-top: 0.5em;
+      margin-bottom: 0.5em;
+    }
+
+    #write form {
+      margin: 1em 0 0 0;
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+      align-items: stretch;
+    }
+
+    .message-controls {
+      display: flex;
+      justify-content: flex-end;
+    }
+    .message-controls > div:nth-child(2) {
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+    }
+    .message-controls > div:nth-child(2) > div {
+      display: flex;
+      margin-left: auto;
+      gap: 0.5em;
+    }
+
+    fieldset {
+      border: none;
+      padding: 0;
+      margin: 0;
+    }
+
+    fieldset.two {
+      display: grid;
+      grid-template: "a a";
+      gap: 1em;
+    }
+
+    fieldset.three {
+      display: grid;
+      grid-template: "a a a";
+      gap: 1em;
+    }
+
+    details {
+      border: 1px solid #aaa;
+      border-radius: 4px;
+      padding: 0.5em 0.5em 0;
+      margin-top: 0.5em;
+    }
+
+    summary {
+      font-weight: bold;
+      margin: -0.5em -0.5em 0;
+      padding: 0.5em;
+      cursor: pointer;
+    }
+
+    details[open] {
+      padding: 0.5em;
+    }
+
+    .prob-set {
+      padding: 0.3em;
+      border-bottom: 1px solid #ccc;
+    }
+
+    .popover-content {
+      position: absolute;
+      background-color: white;
+      padding: 0.2em;
+      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+    }
+
+    textarea {
+      padding: 5px;
+      flex-grow: 1;
+      width: 100%;
+    }
+
+    pre code {
+      display: block;
+      background-color: #222;
+      color: #ddd;
+    }
+
+    code {
+      font-family: monospace;
+      padding: 0.1em 0.3em;
+      border-radius: 3px;
+    }
+
+    fieldset label {
+      margin: 0.5em 0;
+      display: block;
+    }
+
+    fieldset label.slim {
+      margin: 0 0.5em;
+      display: inline;
+    }
+
+    header,
+    footer {
+      text-align: center;
+    }
+
+    footer {
+      font-size: 80%;
+      color: #888;
+    }
+
+    .mode-chat textarea[name=prompt] {
+      height: 4.5em;
+    }
+
+    .mode-completion textarea[name=prompt] {
+      height: 10em;
+    }
+
+    [contenteditable] {
+      display: inline-block;
+      white-space: pre-wrap;
+      outline: 0px solid transparent;
+    }
+
+    @keyframes loading-bg-wipe {
+      0% {
+        background-position: 0%;
+      }
+
+      100% {
+        background-position: 100%;
+      }
+    }
+
+    .loading {
+      --loading-color-1: #eeeeee00;
+      --loading-color-2: #eeeeeeff;
+      background-size: 50% 100%;
+      background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
+      animation: loading-bg-wipe 2s linear infinite;
+    }
+
+    @media (prefers-color-scheme: dark) {
+      .loading {
+        --loading-color-1: #22222200;
+        --loading-color-2: #222222ff;
+      }
+
+      .popover-content {
+        background-color: black;
+      }
+    }
+  </style>
+
+  <script type="module">
+    import {
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
+    } from './index.js';
+
+    import { llama } from './completion.js';
+    import { SchemaConverter } from './json-schema-to-grammar.mjs';
+
+    let selected_image = false;
+    var slot_id = -1;
+
+    const session = signal({
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
+      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
+      historyTemplate: "{{name}}: {{message}}",
+      transcript: [],
+      type: "chat",  // "chat" | "completion"
+      char: "Llama",
+      user: "User",
+      image_selected: ''
+    })
+
+    const params = signal({
+      n_predict: 400,
+      temperature: 0.7,
+      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+      repeat_penalty: 1.18, // 1.0 = disabled
+      penalize_nl: false,
+      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
+      dry_base: 1.75,     // 0.0 = disabled
+      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
+      dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+      top_k: 40, // <= 0 to use vocab size
+      top_p: 0.95, // 1.0 = disabled
+      min_p: 0.05, // 0 = disabled
+      xtc_probability: 0.0, // 0 = disabled;
+      xtc_threshold: 0.1, // > 0.5 disables XTC;
+      typical_p: 1.0, // 1.0 = disabled
+      presence_penalty: 0.0, // 0.0 = disabled
+      frequency_penalty: 0.0, // 0.0 = disabled
+      mirostat: 0, // 0/1/2
+      mirostat_tau: 5, // target entropy
+      mirostat_eta: 0.1, // learning rate
+      grammar: '',
+      n_probs: 0, // no completion_probabilities,
+      min_keep: 0, // min probs from each sampler,
+      image_data: [],
+      cache_prompt: true,
+      api_key: ''
+    })
+
+    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfully imported.
+
+      console.log('Processing saved templates and updating default template')
+      params.value = { ...params.value, image_data: [] };
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Resetting template to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      session.value = { ...session.value, image_selected: '' };
+      params.value = t.data.params;
+      params.value = { ...params.value, image_data: [] };
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
+
+    const tts = window.speechSynthesis;
+    const ttsVoice = signal(null)
+
+    const llamaStats = signal(null)
+    const controller = signal(null)
+
+    // currently generating a completion?
+    const generating = computed(() => controller.value != null)
+
+    // has the user started a chat?
+    const chatStarted = computed(() => session.value.transcript.length > 0)
+
+    const transcriptUpdate = (transcript) => {
+      session.value = {
+        ...session.value,
+        transcript
+      }
+    }
+
+    // simple template replace
+    const template = (str, extraSettings) => {
+      let settings = session.value;
+      if (extraSettings) {
+        settings = { ...settings, ...extraSettings };
+      }
+      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
+    }
+
+    async function runLlama(prompt, llamaParams, char) {
+      const currentMessages = [];
+      const history = session.value.transcript;
+      if (controller.value) {
+        throw new Error("already running");
+      }
+      controller.value = new AbortController();
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
+        const data = chunk.data;
+
+        if (data.stop) {
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          slot_id = data.slot_id;
+          if (selected_image && !data.multimodal) {
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
+            return;
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+        }
+
+        if (data.timings) {
+          llamaStats.value = data;
+        }
+      }
+
+      controller.value = null;
+    }
+
+    // send message to server
+    const chat = async (msg) => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+
+      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
+
+      let prompt = template(session.value.template, {
+        message: msg,
+        history: session.value.transcript.flatMap(
+          ([name, data]) =>
+            template(
+              session.value.historyTemplate,
+              {
+                name,
+                message: Array.isArray(data) ?
+                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
+                  data,
+              }
+            )
+        ).join("\n"),
+      });
+      if (selected_image) {
+        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+      }
+      await runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
+      }, "{{char}}");
+    }
+
+    const runCompletion = () => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+      const { prompt } = session.value;
+      transcriptUpdate([...session.value.transcript, ["", prompt]]);
+      runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: [],
+      }, "").finally(() => {
+        session.value.prompt = session.value.transcript.map(([_, data]) =>
+          Array.isArray(data) ? data.map(msg => msg.content).join('') : data
+        ).join('');
+        session.value.transcript = [];
+      })
+    }
+
+    const stop = (e) => {
+      e.preventDefault();
+      if (controller.value) {
+        controller.value.abort();
+        controller.value = null;
+      }
+    }
+
+    const reset = (e) => {
+      stop(e);
+      transcriptUpdate([]);
+    }
+
+    const uploadImage = (e) => {
+      e.preventDefault();
+      document.getElementById("fileInput").click();
+      document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+          const reader = new FileReader();
+          reader.onload = function () {
+            const image_data = reader.result;
+            session.value = { ...session.value, image_selected: image_data };
+            params.value = {
+              ...params.value, image_data: [
+                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+            }
+          };
+          selected_image = true;
+          reader.readAsDataURL(selectedFile);
+        }
+      });
+    }
+
+    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+    const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
+    function MessageInput() {
+      const message = useSignal("");
+
+      const talkActive = useSignal(false);
+      const sendOnTalk = useSignal(false);
+      const talkStop = (e) => {
+        if (e) e.preventDefault();
+
+        talkActive.value = false;
+        talkRecognition?.stop();
+      }
+      const talk = (e) => {
+        e.preventDefault();
+
+        if (talkRecognition)
+          talkRecognition.start();
+        else
+          alert("Speech recognition is not supported by this browser.");
+      }
+      if(talkRecognition) {
+        talkRecognition.onstart = () => {
+          talkActive.value = true;
+        }
+        talkRecognition.onresult = (e) => {
+          if (event.results.length > 0) {
+            message.value = event.results[0][0].transcript;
+            if (sendOnTalk.value) {
+              submit(e);
+            }
+          }
+        }
+        talkRecognition.onspeechend = () => {
+          talkStop();
+        }
+      }
+
+      const ttsVoices = useSignal(tts?.getVoices() || []);
+      const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
+      if (tts) {
+        tts.onvoiceschanged = () => {
+          ttsVoices.value = tts.getVoices();
+        }
+      }
+
+      const submit = (e) => {
+        stop(e);
+        chat(message.value);
+        message.value = "";
+      }
+
+      const enterSubmits = (event) => {
+        if (event.which === 13 && !event.shiftKey) {
+          submit(event);
+        }
+      }
+
+      return html`
+        <form onsubmit=${submit}>
+          <div>
+            <textarea
+               className=${generating.value ? "loading" : null}
+               oninput=${(e) => message.value = e.target.value}
+               onkeypress=${enterSubmits}
+               placeholder="Say something..."
+               rows=2
+               type="text"
+               value="${message}"
+            />
+          </div>
+          <div class="message-controls">
+            <div> </div>
+            <div>
+              <div>
+                <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
+                <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
+                <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+                <button onclick=${reset}>Reset</button>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
+                  `(TTS and speech recognition are not provided by llama.cpp)\n` +
+                  `Note: STT requires HTTPS to work.`);
+                }}>[?]</a>
+                <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
+                <div>
+                  <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
+                  <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
+                </div>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
+                }}>[?]</a>
+                <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
+                <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
+                  <option value="" selected="${!ttsVoice.value}">None</option>
+                  ${[
+                    ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
+                    ...ttsVoices.value.filter(v => !v.default),
+                  ].map(
+                    v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
+                  )}
+                </select>
+              </div>
+            </div>
+          </div>
+        </form>
+      `
+    }
+
+    function CompletionControls() {
+      const submit = (e) => {
+        stop(e);
+        runCompletion();
+      }
+      return html`
+        <div>
+          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
+          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+          <button onclick=${reset}>Reset</button>
+        </div>`;
+    }
+
+    const ChatLog = (props) => {
+      const messages = session.value.transcript;
+      const container = useRef(null)
+
+      useEffect(() => {
+        // scroll to bottom (if needed)
+        const parent = container.current.parentElement;
+        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
+          parent.scrollTo(0, parent.scrollHeight)
+        }
+      }, [messages])
+
+      const ttsChatLineActiveIx = useSignal(undefined);
+      const ttsChatLine = (e, ix, msg) => {
+        if (e) e.preventDefault();
+
+        if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
+
+        const ttsVoices = tts.getVoices();
+        const voice = ttsVoices.find(v => v.name === ttsVoice.value);
+        if (!voice) return;
+
+        if (ttsChatLineActiveIx.value !== undefined) {
+          tts.cancel();
+          if (ttsChatLineActiveIx.value === ix) {
+            ttsChatLineActiveIx.value = undefined;
+            return;
+          }
+        }
+
+        ttsChatLineActiveIx.value = ix;
+        let ttsUtter = new SpeechSynthesisUtterance(msg);
+        ttsUtter.voice = voice;
+        ttsUtter.onend = e => {
+          ttsChatLineActiveIx.value = undefined;
+        };
+        tts.speak(ttsUtter);
+      }
+
+      const isCompletionMode = session.value.type === 'completion'
+
+      // Try play the last bot message
+      const lastCharChatLinesIxs = useSignal([]);
+      const lastCharChatLinesIxsOld = useSignal([]);
+      useEffect(() => {
+        if (
+          !isCompletionMode
+          && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
+          && !generating.value
+        ) {
+          const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
+          if (ix !== undefined) {
+            const msg = messages[ix];
+            ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
+          }
+
+          lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
+        }
+      }, [generating.value]);
+
+      const chatLine = ([user, data], index) => {
+        let message
+        const isArrayMessage = Array.isArray(data);
+        const text = isArrayMessage ?
+            data.map(msg => msg.content).join('') :
+            data;
+        if (params.value.n_probs > 0 && isArrayMessage) {
+          message = html`<${Probabilities} data=${data} />`
+        } else {
+          message = isCompletionMode ?
+            text :
+            html`<${Markdownish} text=${template(text)} />`
+        }
+
+        const fromBot = user && user === '{{char}}';
+        if (fromBot && !lastCharChatLinesIxs.value.includes(index))
+          lastCharChatLinesIxs.value.push(index);
+
+        if (user) {
+          return html`
+          <div>
+            <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
+            ${
+              fromBot && ttsVoice.value
+              && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
+            }
+          </div>
+          `;
+        } else {
+          return isCompletionMode ?
+            html`<span key=${index}>${message}</span>` :
+            html`<div><p key=${index}>${message}</p></div>`
+        }
+      };
+
+      const handleCompletionEdit = (e) => {
+        session.value.prompt = e.target.innerText;
+        session.value.transcript = [];
+      }
+
+      return html`
+        <div id="chat" ref=${container} key=${messages.length}>
+          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
+          <span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
+            ${messages.flatMap(chatLine)}
+          </span>
+        </div>`;
+    };
+
+    const ConfigForm = (props) => {
+      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
+      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
+      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
+      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
+
+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = async () => {
+        try {
+          let schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter({
+            prop_order: grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+            allow_fetch: true,
+          })
+          schema = await converter.resolveRefs(schema, 'input')
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
+      const FloatField = ({ label, max, min, name, step, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const IntField = ({ label, max, min, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const BoolField = ({ label, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
+          </div>
+        `
+      };
+
+      const userTemplateReset = (e) => {
+        e.preventDefault();
+        userTemplateResetToDefaultAndApply()
+      }
+
+      const UserTemplateResetButton = () => {
+        if (selectedUserTemplate.value.name == 'default') {
+          return html`
+            <button disabled>Using default template</button>
+          `
+        }
+
+        return html`
+          <button onclick=${userTemplateReset}>Reset all to default</button>
+        `
+      };
+
+      useEffect(() => {
+        // autosave template on every change
+        userTemplateAutosave()
+      }, [session.value, params.value])
+
+      const GrammarControl = () => (
+        html`
+          <div>
+            <label for="template">Grammar</label>
+            <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+            <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+            <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+          </div>
+          `
+      );
+
+      const PromptControlFieldSet = () => (
+        html`
+        <fieldset>
+          <div>
+            <label htmlFor="prompt">Prompt</label>
+            <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
+          </div>
+        </fieldset>
+        `
+      );
+
+      const ChatConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+
+          <fieldset class="two">
+            <div>
+              <label for="user">User name</label>
+              <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
+            </div>
+
+            <div>
+              <label for="bot">Bot name</label>
+              <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
+            </div>
+          </fieldset>
+
+          <fieldset>
+            <div>
+              <label for="template">Prompt template</label>
+              <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
+            </div>
+
+            <div>
+              <label for="template">Chat history template</label>
+              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
+            </div>
+            ${GrammarControl()}
+          </fieldset>
+      `
+      );
+
+      const CompletionConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+          <fieldset>${GrammarControl()}</fieldset>
+        `
+      );
+
+      return html`
+        <form>
+          <fieldset class="two">
+            <${UserTemplateResetButton}/>
+            <div>
+              <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
+              <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
+            </div>
+          </fieldset>
+
+          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
+
+          <fieldset class="two">
+            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
+            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
+            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
+            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
+          </fieldset>
+          <details>
+            <summary>More options</summary>
+            <fieldset class="two">
+              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+              ${FloatField({ label: "DRY Penalty Multiplier", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })}
+              ${FloatField({ label: "DRY Base", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
+              ${IntField({ label: "DRY Allowed Length", max: 10, min: 2, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
+              ${IntField({ label: "DRY Penalty Last N", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
+              ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
+              ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
+            </fieldset>
+            <hr />
+            <fieldset class="three">
+              <div>
+                <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
+                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
+                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
+              </div>
+              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
+            </fieldset>
+            <fieldset>
+              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
+            </fieldset>
+            <fieldset>
+              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+            </fieldset>
+            <fieldset>
+              <label for="api_key">API Key</label>
+              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+            </fieldset>
+          </details>
+        </form>
+      `
+    }
+
+    const probColor = (p) => {
+      const r = Math.floor(192 * (1 - p));
+      const g = Math.floor(192 * p);
+      return `rgba(${r},${g},0,0.3)`;
+    }
+
+    const Probabilities = (params) => {
+      return params.data.map(msg => {
+        const { completion_probabilities } = msg;
+        if (
+          !completion_probabilities ||
+          completion_probabilities.length === 0
+        ) return msg.content
+
+        if (completion_probabilities.length > 1) {
+          // Not for byte pair
+          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
+
+          const splitData = completion_probabilities.map(prob => ({
+            content: prob.content,
+            completion_probabilities: [prob]
+          }))
+          return html`<${Probabilities} data=${splitData} />`
+        }
+
+        const { probs, content } = completion_probabilities[0]
+        const found = probs.find(p => p.tok_str === msg.content)
+        const pColor = found ? probColor(found.prob) : 'transparent'
+
+        const popoverChildren = html`
+          <div class="prob-set">
+            ${probs.map((p, index) => {
+          return html`
+                <div
+                  key=${index}
+                  title=${`prob: ${p.prob}`}
+                  style=${{
+              padding: '0.3em',
+              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+            }}
+                >
+                  <span>${p.tok_str}: </span>
+                  <span>${Math.floor(p.prob * 100)}%</span>
+                </div>
+              `
+        })}
+          </div>
+        `
+
+        return html`
+          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
+            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
+          </>
+        `
+      });
+    }
+
+    // poor mans markdown replacement
+    const Markdownish = (params) => {
+      const chunks = params.text.split('```');
+
+      for (let i = 0; i < chunks.length; i++) {
+        if (i % 2 === 0) { // outside code block
+          chunks[i] = chunks[i]
+          .replace(/&/g, '&amp;')
+          .replace(/</g, '&lt;')
+          .replace(/>/g, '&gt;')
+          .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+          .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+          .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+          .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+          .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+          .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
+          .replace(/`(.*?)`/g, '<code>$1</code>')
+          .replace(/\n/gim, '<br />');
+        } else { // inside code block
+          chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
+        }
+      }
+
+      const restoredText = chunks.join('');
+
+      return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
+    };
+
+    const ModelGenerationInfo = (params) => {
+      if (!llamaStats.value) {
+        return html`<span/>`
+      }
+      return html`
+        <span>
+          ${llamaStats.value.tokens_predicted} predicted, ${llamaStats.value.tokens_cached} cached, ${llamaStats.value.timings.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.timings.predicted_per_second.toFixed(2)} tokens per second
+        </span>
+      `
+    }
+
+
+    // simple popover impl
+    const Popover = (props) => {
+      const isOpen = useSignal(false);
+      const position = useSignal({ top: '0px', left: '0px' });
+      const buttonRef = useRef(null);
+      const popoverRef = useRef(null);
+
+      const togglePopover = () => {
+        if (buttonRef.current) {
+          const rect = buttonRef.current.getBoundingClientRect();
+          position.value = {
+            top: `${rect.bottom + window.scrollY}px`,
+            left: `${rect.left + window.scrollX}px`,
+          };
+        }
+        isOpen.value = !isOpen.value;
+      };
+
+      const handleClickOutside = (event) => {
+        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
+          isOpen.value = false;
+        }
+      };
+
+      useEffect(() => {
+        document.addEventListener('mousedown', handleClickOutside);
+        return () => {
+          document.removeEventListener('mousedown', handleClickOutside);
+        };
+      }, []);
+
+      return html`
+        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
+        ${isOpen.value && html`
+          <${Portal} into="#portal">
+            <div
+              ref=${popoverRef}
+              class="popover-content"
+              style=${{
+            top: position.value.top,
+            left: position.value.left,
+          }}
+            >
+              ${props.popoverChildren}
+            </div>
+          </${Portal}>
+        `}
+      `;
+    };
+
+    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
+    /** Redirect rendering of descendants into the given CSS selector */
+    class Portal extends Component {
+      componentDidUpdate(props) {
+        for (let i in props) {
+          if (props[i] !== this.props[i]) {
+            return setTimeout(this.renderLayer);
+          }
+        }
+      }
+
+      componentDidMount() {
+        this.isMounted = true;
+        this.renderLayer = this.renderLayer.bind(this);
+        this.renderLayer();
+      }
+
+      componentWillUnmount() {
+        this.renderLayer(false);
+        this.isMounted = false;
+        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
+      }
+
+      findNode(node) {
+        return typeof node === 'string' ? document.querySelector(node) : node;
+      }
+
+      renderLayer(show = true) {
+        if (!this.isMounted) return;
+
+        // clean up old node if moving bases:
+        if (this.props.into !== this.intoPointer) {
+          this.intoPointer = this.props.into;
+          if (this.into && this.remote) {
+            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
+          }
+          this.into = this.findNode(this.props.into);
+        }
+
+        this.remote = render(html`
+          <${PortalProxy} context=${this.context}>
+            ${show && this.props.children || null}
+          </${PortalProxy}>
+        `, this.into, this.remote);
+      }
+
+      render() {
+        return null;
+      }
+    }
+    // high-order component that renders its first child if it exists.
+    // used as a conditional rendering proxy.
+    class PortalProxy extends Component {
+      getChildContext() {
+        return this.props.context;
+      }
+      render({ children }) {
+        return children || null;
+      }
+    }
+
+    function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);
+
+      return html`
+        <div class="mode-${session.value.type}">
+          <header>
+            <div class="grid-container">
+              <div class="grid-item"></div>
+              <div class="grid-item"><h1>llama.cpp</h1></div>
+              <div class="grid-item"><a class="customlink" href="index-new.html">New UI</a></div>
+            </div>
+          </header>
+
+          <main id="content">
+            <${chatStarted.value ? ChatLog : ConfigForm} />
+          </main>
+
+          <section id="write">
+            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
+          </section>
+
+          <footer>
+            <p><${ModelGenerationInfo} /></p>
+            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
+          </footer>
+        </div>
+      `;
+    }
+
+    render(h(App), document.querySelector('#container'));
+  </script>
+</head>
+
+<body>
+  <div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  </div>
+  <div id="portal"></div>
+</body>
+
+</html>
diff --git a/examples/server/public/index.js b/examples/server/public_legacy/index.js
similarity index 100%
rename from examples/server/public/index.js
rename to examples/server/public_legacy/index.js
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs
similarity index 100%
rename from examples/server/public/json-schema-to-grammar.mjs
rename to examples/server/public_legacy/json-schema-to-grammar.mjs
diff --git a/examples/server/public_legacy/loading.html b/examples/server/public_legacy/loading.html
new file mode 100644
index 000000000..c3fd19a0f
--- /dev/null
+++ b/examples/server/public_legacy/loading.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta http-equiv="refresh" content="5">
+    </head>
+    <body>
+        <div id="loading">
+            The model is loading. Please wait.<br/>
+            The user interface will appear soon.
+        </div>
+    </body>
+</html>
diff --git a/examples/server/public/prompt-formats.js b/examples/server/public_legacy/prompt-formats.js
similarity index 100%
rename from examples/server/public/prompt-formats.js
rename to examples/server/public_legacy/prompt-formats.js
diff --git a/examples/server/public/style.css b/examples/server/public_legacy/style.css
similarity index 100%
rename from examples/server/public/style.css
rename to examples/server/public_legacy/style.css
diff --git a/examples/server/public/system-prompts.js b/examples/server/public_legacy/system-prompts.js
similarity index 100%
rename from examples/server/public/system-prompts.js
rename to examples/server/public_legacy/system-prompts.js
diff --git a/examples/server/public/theme-beeninorder.css b/examples/server/public_legacy/theme-beeninorder.css
similarity index 100%
rename from examples/server/public/theme-beeninorder.css
rename to examples/server/public_legacy/theme-beeninorder.css
diff --git a/examples/server/public/theme-ketivah.css b/examples/server/public_legacy/theme-ketivah.css
similarity index 100%
rename from examples/server/public/theme-ketivah.css
rename to examples/server/public_legacy/theme-ketivah.css
diff --git a/examples/server/public/theme-mangotango.css b/examples/server/public_legacy/theme-mangotango.css
similarity index 100%
rename from examples/server/public/theme-mangotango.css
rename to examples/server/public_legacy/theme-mangotango.css
diff --git a/examples/server/public/theme-playground.css b/examples/server/public_legacy/theme-playground.css
similarity index 100%
rename from examples/server/public/theme-playground.css
rename to examples/server/public_legacy/theme-playground.css
diff --git a/examples/server/public/theme-polarnight.css b/examples/server/public_legacy/theme-polarnight.css
similarity index 100%
rename from examples/server/public/theme-polarnight.css
rename to examples/server/public_legacy/theme-polarnight.css
diff --git a/examples/server/public/theme-snowstorm.css b/examples/server/public_legacy/theme-snowstorm.css
similarity index 100%
rename from examples/server/public/theme-snowstorm.css
rename to examples/server/public_legacy/theme-snowstorm.css
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1c7f0fd1d..a6d3a1c95 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -14,22 +14,13 @@
 #define MIMETYPE_JSON "application/json; charset=utf-8"
 
 // auto generated files (update with ./deps.sh)
-#include "colorthemes.css.hpp"
-#include "style.css.hpp"
-#include "theme-beeninorder.css.hpp"
-#include "theme-ketivah.css.hpp"
-#include "theme-mangotango.css.hpp"
-#include "theme-playground.css.hpp"
-#include "theme-polarnight.css.hpp"
-#include "theme-snowstorm.css.hpp"
 #include "index.html.hpp"
-#include "index-new.html.hpp"
-#include "index.js.hpp"
 #include "completion.js.hpp"
-#include "system-prompts.js.hpp"
-#include "prompt-formats.js.hpp"
-#include "json-schema-to-grammar.mjs.hpp"
 #include "loading.html.hpp"
+#include "deps_daisyui.min.css.hpp"
+#include "deps_markdown-it.js.hpp"
+#include "deps_tailwindcss.js.hpp"
+#include "deps_vue.esm-browser.js.hpp"
 
 #include <atomic>
 #include <condition_variable>
@@ -2285,16 +2276,6 @@ int main(int argc, char ** argv) {
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
 
     svr->set_default_headers({{"Server", "llama.cpp"}});
-
-    // CORS preflight
-    svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
-        // Access-Control-Allow-Origin is already set by middleware
-        res.set_header("Access-Control-Allow-Credentials", "true");
-        res.set_header("Access-Control-Allow-Methods",     "POST");
-        res.set_header("Access-Control-Allow-Headers",     "*");
-        return res.set_content("", "text/html"); // blank response, no data
-    });
-
     svr->set_logger(log_server_request);
 
     auto res_error = [](httplib::Response & res, const json & error_data) {
@@ -2407,6 +2388,14 @@ int main(int argc, char ** argv) {
     // register server middlewares
     svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        // If this is OPTIONS request, skip validation because browsers don't include Authorization header
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Credentials", "true");
+            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
+            res.set_header("Access-Control-Allow-Headers",     "*");
+            res.set_content("", "text/html"); // blank response, no data
+            return httplib::Server::HandlerResponse::Handled; // skip further processing
+        }
         if (!middleware_server_state(req, res)) {
             return httplib::Server::HandlerResponse::Handled;
         }
@@ -3116,33 +3105,19 @@ int main(int argc, char ** argv) {
     // register static assets routes
     if (!params.public_path.empty()) {
         // Set the base directory for serving static files
-        svr->set_base_dir(params.public_path);
-    }
-
-    if (!params.api_keys.empty()) {
-        // for now, if API key is set, web UI is unusable
-        svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
-            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
-        });
+        bool is_found = svr->set_mount_point("/", params.public_path);
+        if (!is_found) {
+            LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+            return 1;
+        }
     } else {
         // using embedded static files
-        svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-        svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
-
-        // add new-ui files
-        svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
-        svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
-        svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
-        svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
-        svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
-        svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
-        svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
-        svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
-        svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
-        svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/",                        handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+        svr->Get("/completion.js",           handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/deps_daisyui.min.css",    handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
+        svr->Get("/deps_markdown-it.js",     handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/deps_tailwindcss.js",     handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
     }
 
     // register API routes
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
index 0a3c5cc77..ef30007c3 100644
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -64,5 +64,5 @@ Feature: Security
       | localhost       | Access-Control-Allow-Origin      | localhost         |
       | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
       | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | POST              |
+      | web.mydomain.fr | Access-Control-Allow-Methods     | GET, POST         |
       | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
diff --git a/grammars/README.md b/grammars/README.md
index 4e8b4e2fc..4e57bca5f 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -124,7 +124,7 @@ You can use GBNF grammars:
 - In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
 - To convert to a grammar ahead of time:
     - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
-    - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
+    - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
 
 Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
 
diff --git a/tests/run-json-schema-to-grammar.mjs b/tests/run-json-schema-to-grammar.mjs
index 71bf62ed3..b20ac1d6b 100644
--- a/tests/run-json-schema-to-grammar.mjs
+++ b/tests/run-json-schema-to-grammar.mjs
@@ -1,5 +1,5 @@
 import { readFileSync } from "fs"
-import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
+import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
 
 const [, , file] = process.argv
 const url = `file://${file}`

From 76c6e7f10551960e4ec9e14e0535b72081f1c7ad Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 7 Nov 2024 18:44:38 -0400
Subject: [PATCH 180/396] server : minor UI fix (#10207)

---
 examples/server/public/index.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 850c652ac..bf1d1b794 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -167,7 +167,7 @@
 
         <!-- pending (ongoing) assistant message -->
         <div id="pending-msg" class="chat chat-start">
-          <div v-if="pendingMsg" class="chat-bubble markdown">
+          <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
             <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
             <vue-markdown v-else :source="pendingMsg.content" />
           </div>

From d05b3127bd30515955aa4ee2bacdb68ebafe88f4 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <iainst0409@gmail.com>
Date: Fri, 8 Nov 2024 17:34:06 +0800
Subject: [PATCH 181/396] swift : exclude ggml-metal-embed.metal (#10211)

* llama.swift : exclude ggml-metal-embed.metal

* swift : exclude build/
---
 Package.swift | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Package.swift b/Package.swift
index d3661d13c..0f4f19018 100644
--- a/Package.swift
+++ b/Package.swift
@@ -61,13 +61,15 @@ let package = Package(
             name: "llama",
             path: ".",
             exclude: [
+               "build",
                "cmake",
                "examples",
                "scripts",
                "models",
                "tests",
                "CMakeLists.txt",
-               "Makefile"
+               "Makefile",
+               "ggml/src/ggml-metal-embed.metal"
             ],
             sources: sources,
             resources: resources,

From 841f27abdbbcecc9daac14dc540ba6202e4ffe40 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 8 Nov 2024 13:47:22 +0200
Subject: [PATCH 182/396] metal : optimize FA kernels (#10171)

* ggml : add ggml_flash_attn_ext_get_prec

* metal : use F16 precision in FA kernels

ggml-ci

* metal : minor clean-up

* metal : compile-guard bf16 FA kernels

ggml-ci

* build : remove obsolete compile flag [no ci]

* metal : prevent int overflows [no ci]

* cuda : disable BF16 FA

ggml-ci

* metal : fix BF16 requirement for FA kernels

ggml-ci

* make : clean-up [no ci]
---
 examples/llama-bench/llama-bench.cpp |   3 +
 ggml/include/ggml.h                  |   3 +
 ggml/src/ggml-cuda.cu                |   3 +
 ggml/src/ggml-cuda/fattn.cu          |  10 +-
 ggml/src/ggml-metal.m                |  74 ++-
 ggml/src/ggml-metal.metal            | 733 +++++++++++++++------------
 ggml/src/ggml.c                      |   9 +
 tests/test-backend-ops.cpp           |   2 +-
 8 files changed, 498 insertions(+), 339 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index e7873a143..1eddfd0db 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 0d143d2fe..73ede1813 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1746,6 +1746,9 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_prec       prec);
 
+    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+            const struct ggml_tensor * a);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
            struct ggml_context * ctx,
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index e27c8e87d..357cee660 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3159,6 +3159,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 #ifndef FLASH_ATTN_AVAILABLE
             return false;
 #endif
+            if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
+                return false;
+            }
             if (op->src[0]->ne[0] ==  64 && op->src[1]->type == GGML_TYPE_F16) {
                 return true;
             }
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 83e5589a1..0e7ebbc53 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -13,9 +13,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
     const ggml_tensor * KQV = dst;
     const ggml_tensor * Q   = dst->src[0];
 
-    const int32_t precision = KQV->op_params[3];
+    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
 
-    if (precision != GGML_PREC_DEFAULT) {
+    if (prec != GGML_PREC_DEFAULT) {
         if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
             constexpr int cols_per_block = 16;
             switch (Q->ne[0]) {
@@ -301,11 +301,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
 
     ggml_cuda_set_device(ctx.device);
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const int32_t precision = KQV->op_params[3];
+    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
 
     // On AMD the tile kernels perform poorly, use the vec kernel instead:
     if (cc >= CC_OFFSET_AMD) {
-        if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
+        if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
         } else {
             ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
@@ -332,7 +332,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 
     if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
-        if (precision == GGML_PREC_DEFAULT) {
+        if (prec == GGML_PREC_DEFAULT) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
             return;
         } else if(Q->ne[0] <= 128) {
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index f13adee38..e19397fd2 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -269,6 +269,12 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
@@ -300,12 +306,14 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
@@ -585,6 +593,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
             id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
             kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
+            GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
+                    (int) kernel->pipeline.threadExecutionWidth); \
             [metal_function release]; \
             if (error) { \
                 GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
@@ -777,6 +788,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,       flash_attn_ext_bf16_h64,        has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,       flash_attn_ext_bf16_h80,        has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,       flash_attn_ext_bf16_h96,        has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,      flash_attn_ext_bf16_h112,       has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,      flash_attn_ext_bf16_h128,       has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,      flash_attn_ext_bf16_h256,       has_simdgroup_mm && has_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,       flash_attn_ext_q4_0_h64,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,       flash_attn_ext_q4_0_h80,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,       flash_attn_ext_q4_0_h96,        has_simdgroup_mm);
@@ -808,12 +825,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,      flash_attn_ext_q8_0_h128,       has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,      flash_attn_ext_q8_0_h256,       has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,  flash_attn_ext_vec_bf16_h128,   has_simdgroup_reduction && has_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,  flash_attn_ext_vec_q4_0_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,  flash_attn_ext_vec_q4_1_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,  flash_attn_ext_vec_q5_0_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,  flash_attn_ext_vec_q5_1_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,  flash_attn_ext_vec_q8_0_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,  flash_attn_ext_vec_bf16_h256,   has_simdgroup_reduction && has_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,  flash_attn_ext_vec_q4_0_h256,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,  flash_attn_ext_vec_q4_1_h256,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,  flash_attn_ext_vec_q5_0_h256,   has_simdgroup_reduction);
@@ -1111,7 +1130,7 @@ static void ggml_metal_encode_node(
     const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
     const uint64_t nb21 = src2 ? src2->nb[1] : 0;
     const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-    const uint64_t nb23 = src2 ? src2->nb[3] : 0;
+    const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
 
     const int64_t  ne0  =  dst ?  dst->ne[0] : 0;
     const int64_t  ne1  =  dst ?  dst->ne[1] : 0;
@@ -3033,6 +3052,23 @@ static void ggml_metal_encode_node(
                                               }
                                 }
                             } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
                         case GGML_TYPE_Q4_0:
                             {
                                 switch (ne00) {
@@ -3133,6 +3169,7 @@ static void ggml_metal_encode_node(
                             {
                                 switch (src1->type) {
                                     case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
+                                    case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128].pipeline; break;
                                     case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break;
                                     case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break;
                                     case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break;
@@ -3150,6 +3187,7 @@ static void ggml_metal_encode_node(
                             {
                                 switch (src1->type) {
                                     case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
+                                    case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256].pipeline; break;
                                     case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break;
                                     case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break;
                                     case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break;
@@ -3194,18 +3232,15 @@ static void ggml_metal_encode_node(
                 [encoder setBytes:&nb11          length:sizeof(uint64_t)      atIndex:14];
                 [encoder setBytes:&nb12          length:sizeof(uint64_t)      atIndex:15];
                 [encoder setBytes:&nb13          length:sizeof(uint64_t)      atIndex:16];
-                [encoder setBytes:&nb21          length:sizeof(uint64_t)      atIndex:17];
-                [encoder setBytes:&nb22          length:sizeof(uint64_t)      atIndex:18];
-                [encoder setBytes:&nb23          length:sizeof(uint64_t)      atIndex:19];
-                [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:20];
-                [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:21];
-                [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:22];
-                [encoder setBytes:&scale         length:sizeof(   float)      atIndex:23];
-                [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:24];
-                [encoder setBytes:&m0            length:sizeof(m0)            atIndex:25];
-                [encoder setBytes:&m1            length:sizeof(m1)            atIndex:26];
-                [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:27];
-                [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
+                [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:17];
+                [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:18];
+                [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:19];
+                [encoder setBytes:&scale         length:sizeof(   float)      atIndex:20];
+                [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:21];
+                [encoder setBytes:&m0            length:sizeof(m0)            atIndex:22];
+                [encoder setBytes:&m1            length:sizeof(m1)            atIndex:23];
+                [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:24];
+                [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:25];
 
                 if (!use_vec_kernel) {
                     // half8x8 kernel
@@ -3216,11 +3251,14 @@ static void ggml_metal_encode_node(
                     GGML_ASSERT(nqptg  % 8  == 0);
                     GGML_ASSERT(ncpsg  % 32 == 0);
 
+                    // 2*(2*ncpsg + nqptg)*(nsg)
+                    // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
+                    //
                     // 16*32*(nsg)
                     // the shared memory needed for the simdgroups to load the KV cache
                     // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
                     //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
 
                     int64_t nsgmax = 2;
 
@@ -3254,12 +3292,12 @@ static void ggml_metal_encode_node(
 
                     // ne00 + 2*ncpsg*(nsg)
                     // for each query, we load it as f16 in shared memory (ne00)
-                    // and store the attention scores (nqptg x ncpsg) as f32
+                    // and store the soft_max values and the mask
                     //
-                    // 2*ne00*(nsg)
-                    // each simdgroup has a full f32 head vector in shared mem to accumulate results
+                    // ne00*(nsg)
+                    // each simdgroup has a full f16 head vector in shared mem to accumulate results
                     //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + ne00*(nsg))*(sizeof(float)/2), 16))
 
                     int64_t nsgmax = 2;
 
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 16b5da3ff..edce74108 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -57,10 +57,14 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
     const ushort mask0 = il ? 0x00F0 : 0x000F;
     const ushort mask1 = mask0 << 8;
 
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
-        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
+    float4x4 reg_f;
+
+    for (int i = 0; i < 8; i++) {
+        reg_f[i/2][2*(i%2) + 0] = d1 * (qs[i] & mask0) + md;
+        reg_f[i/2][2*(i%2) + 1] = d2 * (qs[i] & mask1) + md;
     }
+
+    reg = (type4x4) reg_f;
 }
 
 template <typename type4x4>
@@ -72,10 +76,14 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
     const ushort mask0 = il ? 0x00F0 : 0x000F;
     const ushort mask1 = mask0 << 8;
 
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
-        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
+    float4x4 reg_f;
+
+    for (int i = 0; i < 8; i++) {
+        reg_f[i/2][2*(i%2) + 0] = ((qs[i] & mask0) * d1) + m;
+        reg_f[i/2][2*(i%2) + 1] = ((qs[i] & mask1) * d2) + m;
     }
+
+    reg = (type4x4) reg_f;
 }
 
 template <typename type4x4>
@@ -92,6 +100,8 @@ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg
     const int gh_mv = il ? 12 : 0;
     const int gh_bk = il ?  0 : 4;
 
+    float4x4 reg_f;
+
     for (int i = 0; i < 8; i++) {
         // extract the 5-th bits for x0 and x1
         const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
@@ -101,9 +111,11 @@ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg
         const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
         const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
 
-        reg[i/2][2*(i%2)+0] = d * x0 + md;
-        reg[i/2][2*(i%2)+1] = d * x1 + md;
+        reg_f[i/2][2*(i%2) + 0] = d * x0 + md;
+        reg_f[i/2][2*(i%2) + 1] = d * x1 + md;
     }
+
+    reg = (type4x4) reg_f;
 }
 
 template <typename type4x4>
@@ -120,6 +132,8 @@ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg
     const int gh_mv = il ? 12 : 0;
     const int gh_bk = il ?  0 : 4;
 
+    float4x4 reg_f;
+
     for (int i = 0; i < 8; i++) {
         // extract the 5-th bits for x0 and x1
         const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
@@ -129,9 +143,11 @@ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg
         const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
         const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
 
-        reg[i/2][2*(i%2)+0] = d * x0 + m;
-        reg[i/2][2*(i%2)+1] = d * x1 + m;
+        reg_f[i/2][2*(i%2) + 0] = d * x0 + m;
+        reg_f[i/2][2*(i%2) + 1] = d * x1 + m;
     }
+
+    reg = (type4x4) reg_f;
 }
 
 template <typename type4x4>
@@ -139,9 +155,13 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg
     device const int8_t * qs = ((device const int8_t *)xb->qs);
     const half d = xb->d;
 
+    float4x4 reg_f;
+
     for (int i = 0; i < 16; i++) {
-        reg[i/4][i%4] = (qs[i + 16*il] * d);
+        reg_f[i/4][i%4] = (qs[i + 16*il] * d);
     }
+
+    reg = (type4x4) reg_f;
 }
 
 template <typename type4x4>
@@ -2755,44 +2775,65 @@ kernel void kernel_leaky_relu_f32(
 }
 
 // ref: https://arxiv.org/pdf/2307.08691.pdf
-// D - head size, Q - queries per threadgroup, KV - key/value processed per each simdgroup, C - cache items per threadgroup
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &), short D, short Q = 8, short KV = 8, short C = 32>
+template<
+    typename q_t,     // query types in shared memory
+    typename q4_t,
+    typename q8x8_t,
+    typename k_t,     // key types in shared memory
+    typename k4x4_t,
+    typename k8x8_t,
+    typename v_t,     // value types in shared memory
+    typename v4x4_t,
+    typename v8x8_t,
+    typename qk_t,    // Q*K types
+    typename qk8x8_t,
+    typename s_t,     // soft-max types
+    typename s8x8_t,
+    typename o_t,     // attention accumulation types
+    typename o4_t,
+    typename o8x8_t,
+    typename kd4x4_t, // key type in device memory
+    short nl_k,
+    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
+    typename vd4x4_t, // key type in device memory
+    short nl_v,
+    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
+    short D,         // head size
+    short Q  = 8,    // queries per threadgroup
+    short KV = 8,    // key/value processed per each simdgroup
+    short C  = 32>   // cache items per threadgroup
 kernel void kernel_flash_attn_ext(
         device const  char * q,
         device const  char * k,
         device const  char * v,
         device const  char * mask,
         device       float * dst,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant   int64_t & ne13,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant  uint64_t & nb21,
-        constant  uint64_t & nb22,
-        constant  uint64_t & nb23,
-        constant  uint64_t & nb31,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
+        constant   int32_t & ne01,
+        constant   int32_t & ne02,
+        constant   int32_t & ne03,
+        constant  uint32_t & nb01,
+        constant  uint32_t & nb02,
+        constant  uint32_t & nb03,
+        constant   int32_t & ne11,
+        constant   int32_t & ne_12_2, // assume K and V are same shape
+        constant   int32_t & ne_12_3,
+        constant  uint32_t & nb_12_1,
+        constant  uint32_t & nb_12_2,
+        constant  uint32_t & nb_12_3,
+        constant  uint32_t & nb31,
+        constant   int32_t & ne1,
+        constant   int32_t & ne2,
         constant     float & scale,
         constant     float & max_bias,
         constant     float & m0,
         constant     float & m1,
-        constant  uint32_t & n_head_log2,
+        constant  uint16_t & n_head_log2,
         constant     float & logit_softcap,
         threadgroup   half * shared [[threadgroup(0)]],
-        uint3  tgpig[[threadgroup_position_in_grid]],
-        uint3  tpitg[[thread_position_in_threadgroup]],
-        uint3    ntg[[threads_per_threadgroup]],
-        ushort tiisg[[thread_index_in_simdgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+        ushort3  tgpig[[threadgroup_position_in_grid]],
+        ushort3    ntg[[threads_per_threadgroup]],
+        ushort   tiisg[[thread_index_in_simdgroup]],
+        ushort   sgitg[[simdgroup_index_in_threadgroup]]) {
     const short nsg = ntg.y; // number of simdgroups
 
     const int iq3 = tgpig[2];
@@ -2803,21 +2844,25 @@ kernel void kernel_flash_attn_ext(
     const short D8  = D/8;
     const short D16 = D/16;
     const short NW  = N_SIMDWIDTH;
-    const short SH  = (C + Q); // shared memory per simdgroup in (half)
+    const short SH  = (2*C + Q); // shared memory per simdgroup (s_t == float)
 
-    const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
-    const short TF = T/2;        // shared memory size per query in (float)
-    const short T4 = T/4;        // shared memory size per query in (half4)
+    const short TS = nsg*SH;   // shared memory size per query in (s_t == float)
+    const short T  = D + 2*TS; // shared memory size per query in (half)
 
-    threadgroup half  * sq  = (threadgroup half  *) (shared +              0*D); // holds the query data
-    threadgroup half4 * sq4 = (threadgroup half4 *) (shared +              0*D); // same as above but in half4
-    threadgroup float * ss  = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
+    threadgroup q_t  * sq  = (threadgroup q_t  *) (shared +              0*D); // holds the query data
+    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shared +              0*D); // same as above but in q4_t
+    threadgroup o_t  * so  = (threadgroup o_t  *) (shared +              0*D); // reuse query data for accumulation
+    threadgroup o4_t * so4 = (threadgroup o4_t *) (shared +              0*D); // same as above but in o4_t
+    threadgroup s_t  * ss  = (threadgroup s_t  *) (shared + 2*sgitg*SH + Q*D); // scratch buffer for attention, mask and diagonal matrix
 
-    threadgroup half    * skv  = (threadgroup half    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K and V in shared memory
-    threadgroup half4x4 * skv4 = (threadgroup half4x4 *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in half4x4
+    threadgroup k_t    * sk    = (threadgroup k_t    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
+    threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t
+
+    threadgroup v_t    * sv    = (threadgroup v_t    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load V in shared memory
+    threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in v4x4_t
 
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
-    simdgroup_half8x8 lo[D8];
+    o8x8_t lo[D8];
 
     // load heads from Q to shared memory
     for (short j = sgitg; j < Q; j += nsg) {
@@ -2825,71 +2870,61 @@ kernel void kernel_flash_attn_ext(
 
         for (short i = tiisg; i < D4; i += NW) {
             if (iq1 + j < ne01) {
-                sq4[j*T4 + i] = (half4) q4[i];
+                sq4[j*D4 + i] = (q4_t) q4[i];
             } else {
-                sq4[j*T4 + i] = 0.0h;
+                sq4[j*D4 + i] = (q4_t) 0.0f;
             }
         }
     }
 
     // zero out lo
     for (short i = 0; i < D8; ++i) {
-        lo[i] = make_filled_simdgroup_matrix<half, 8>(0.0h);
+        lo[i] = make_filled_simdgroup_matrix<o_t, 8>((o_t) 0.0f);
     }
 
     // zero out shared memory SH
     for (short j = 0; j < Q; ++j) {
         for (short i = tiisg; i < SH; i += NW) {
-            ss[j*TF + i] = 0.0f;
+            ss[j*TS + i] = 0.0f;
         }
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     {
-        float S[Q] = { [0 ... Q-1] = 0.0f };
-        float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 };
+        half S[Q] = { [0 ... Q-1] = 0.0f };
+        half M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
 
         // thread indices inside the simdgroup
+        // TODO: see if we can utilize quad-group functions for better performance
+        //       https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (6.9.3)
         const short tx = tiisg%4;
         const short ty = tiisg/4;
 
-        // assume K and V are same shape
-        const short ne22 = ne12;
-        const short ne23 = ne13;
+        // broadcast kv
+        //const short rk2 = ne02/ne12;
+        //const short rk3 = ne03/ne13;
 
-        // broadcast k
-        const short rk2 = ne02/ne12;
-        const short rk3 = ne03/ne13;
-
-        const short ik2 = iq2/rk2;
-        const short ik3 = iq3/rk3;
-
-        // broadcast v
-        const short rv2 = ne02/ne22;
-        const short rv3 = ne03/ne23;
-
-        const short iv2 = iq2/rv2;
-        const short iv3 = iq3/rv3;
+        const short ikv2 = iq2/(ne02/ne_12_2);
+        const short ikv3 = iq3/(ne03/ne_12_3);
 
         // load the queries from shared memory into local memory
-        simdgroup_half8x8 mq[D8];
+        q8x8_t mq[D8];
 
         for (short i = 0; i < D8; ++i) {
-            simdgroup_load(mq[i], sq + i*8, T);
+            simdgroup_load(mq[i], sq + i*8, D);
         }
 
-        // pointer to the mask
-        device const half * mp = (device const half *) (mask + iq1*nb31);
+        const bool has_mask = mask != q;
 
-        float slope = 1.0f;
+        half slope = 1.0f;
 
         // ALiBi
         if (max_bias > 0.0f) {
-            const uint32_t h = iq2;
+            const short h = iq2;
 
-            const float base = h < n_head_log2 ? m0 : m1;
-            const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+            const half  base = h < n_head_log2 ? m0 : m1;
+            const short exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
 
             slope = pow(base, exph);
         }
@@ -2902,120 +2937,137 @@ kernel void kernel_flash_attn_ext(
                 break;
             }
 
+            if (has_mask) {
+                // used to detect blocks full of -INF
+                half smax = -INFINITY;
+
+                // load the mask in shared memory
+                for (short j = 0; j < Q; ++j) {
+                    device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*nb31);
+
+                    const half m = pm[ic + tiisg];
+
+                    ss[j*TS + C + tiisg] = m;
+                    smax = max(smax, m);
+                }
+
+                smax = simd_max(smax);
+
+                if (smax == -INFINITY) {
+                    continue;
+                }
+            }
+
             // Q*K^T
             {
                 for (short cc = 0; cc < C/8; ++cc) {
-                    simdgroup_float8x8 mqk = make_filled_simdgroup_matrix<float, 8>(0.h);
+                    qk8x8_t mqk = make_filled_simdgroup_matrix<qk_t, 8>((qk_t) 0.0f);
 
                     // this is compile-time check, so it does not have runtime overhead
-                    if (is_same<block_q, half4x4>::value) {
+                    if (is_same<kd4x4_t, k4x4_t>::value) {
                         // we can read directly from global memory
-                        device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
+                        device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
+#pragma unroll
                         for (short i = 0; i < D8; ++i) {
-                            simdgroup_half8x8 mk;
-                            simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
+                            k8x8_t mk;
+                            simdgroup_load(mk, pk + i*8, nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
 
                             simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
                         }
                     } else {
                         for (short ii = 0; ii < D16; ii += 4) {
-                            device const block_q * pk4 = (device const block_q *) ((device const char *) k + ((ic + 8*cc + ty)*nb11 + ik2*nb12 + ik3*nb13));
+                            device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
                             if (D16%4 == 0) {
                                 // the head is evenly divisible by 4*16 = 64, so no need for bound checks
-                                half4x4 tmp;
-                                dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
-                                skv4[4*ty + tx] = tmp;
+                                {
+                                    k4x4_t tmp;
+                                    deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp);
+                                    sk4x4[4*ty + tx] = tmp;
+                                }
 
                                 simdgroup_barrier(mem_flags::mem_threadgroup);
 
 #pragma unroll
                                 for (short k = 0; k < 4; ++k) {
-                                    simdgroup_half8x8 mk;
+                                    k8x8_t mk;
 
-                                    simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose
+                                    simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose
                                     simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
 
-                                    simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose
+                                    simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose
                                     simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
                                 }
                             } else {
                                 if (ii + tx < D16) {
-                                    half4x4 tmp;
-                                    dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
-                                    skv4[4*ty + tx] = tmp;
+                                    k4x4_t tmp;
+                                    deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp);
+                                    sk4x4[4*ty + tx] = tmp;
                                 }
 
                                 simdgroup_barrier(mem_flags::mem_threadgroup);
 
                                 for (short k = 0; k < 4 && ii + k < D16; ++k) {
-                                    simdgroup_half8x8 mk;
+                                    k8x8_t mk;
 
-                                    simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose
+                                    simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose
                                     simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
 
-                                    simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose
+                                    simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose
                                     simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
                                 }
                             }
                         }
                     }
 
-                    simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
+                    // cast qk_t -> s_t
+                    //s8x8_t mqks(1.0f);
+                    //simdgroup_multiply(mqks, mqk, mqks);
+                    //simdgroup_store(mqks, ss + 8*cc, TS, 0, false);
+
+                    simdgroup_store(mqk, ss + 8*cc, TS, 0, false);
                 }
             }
 
-            // used to detect blocks full of -INF
-            float smax = -INFINITY;
-
             // online softmax
             {
-                float ms[Q];
-
-                for (short j = 0; j < Q; ++j) {
-                    const float m = M[j];
+                for (ushort j = 0; j < Q; ++j) {
+                    const half m = M[j];
 
                     // scale and apply the logitcap / mask
-                    float s = ss[j*TF + tiisg]*scale;
+                    half s = ss[j*TS + tiisg]*scale;
 
                     if (logit_softcap != 0.0f) {
                         s = logit_softcap*precise::tanh(s);
                     }
 
-                    if (mask != q) {
-                        // mqk = mqk + mask*slope
-                        s += slope*mp[ic + j*nb31/sizeof(half) + tiisg];
-                    }
+                    // mqk = mqk + mask*slope
+                    s += slope*ss[j*TS + C + tiisg];
 
-                    smax = simd_max(max(smax, s));
                     M[j] = simd_max(max(M[j], s));
 
-                                ms[j] = exp(m - M[j]);
-                    const float vs    = exp(s - M[j]);
+                    const half ms = exp(m - M[j]);
+                    const half vs = exp(s - M[j]);
 
-                    S[j] = S[j]*ms[j] + simd_sum(vs);
+                    S[j] = S[j]*ms + simd_sum(vs);
 
                     // the P matrix from the paper (Q rows, C columns)
-                    ss[j*TF + tiisg] = vs;
-                }
+                    ss[j*TS + tiisg] = vs;
 
-                // create a QxQ diagonal matrix for rescaling the output
-                if (tiisg < Q) {
-                    ss[tiisg*TF + C + tiisg] = ms[tiisg];
+                    // create a QxQ diagonal matrix for rescaling the output
+                    if (tiisg == j) {
+                        ss[j*TS + 2*C + j] = ms;
+                    }
                 }
             }
 
-            // skip -INF blocks
-            if (smax == -INFINITY) {
-                continue;
-            }
-
             // O = diag(ms)*O
             {
-                simdgroup_float8x8 mm;
-                simdgroup_load(mm, ss + C, TF, 0, false);
+                s8x8_t mm;
+                simdgroup_load(mm, ss + 2*C, TS, 0, false);
 
+#pragma unroll
                 for (short i = 0; i < D8; ++i) {
                     simdgroup_multiply(lo[i], mm, lo[i]);
                 }
@@ -3024,57 +3076,59 @@ kernel void kernel_flash_attn_ext(
             // O = O + (Q*K^T)*V
             {
                 for (short cc = 0; cc < C/8; ++cc) {
-                    simdgroup_float8x8 ms;
-                    simdgroup_load(ms, ss + 8*cc, TF, 0, false);
+                    s8x8_t ms;
+                    simdgroup_load(ms, ss + 8*cc, TS, 0, false);
 
-                    if (is_same<block_q, half4x4>::value) {
+                    if (is_same<vd4x4_t, v4x4_t>::value) {
                         // we can read directly from global memory
-                        device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));
+                        device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 #pragma unroll
                         for (short i = 0; i < D8; ++i) {
-                            simdgroup_half8x8 mv;
-                            simdgroup_load(mv, pv + i*8, nb21/sizeof(half), 0, false);
+                            v8x8_t mv;
+                            simdgroup_load(mv, pv + i*8, nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
 
                             simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
                         }
                     } else {
                         for (short ii = 0; ii < D16; ii += 4) {
-                            device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 8*cc + ty)*nb21 + iv2*nb22 + iv3*nb23));
+                            device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
                             if (D16%4 == 0) {
                                 // no need for bound checks
-                                half4x4 tmp;
-                                dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
-                                skv4[4*ty + tx] = tmp;
+                                {
+                                    v4x4_t tmp;
+                                    deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp);
+                                    sv4x4[4*ty + tx] = tmp;
+                                }
 
                                 simdgroup_barrier(mem_flags::mem_threadgroup);
 
 #pragma unroll
                                 for (short k = 0; k < 4; ++k) {
-                                    simdgroup_half8x8 mv;
+                                    v8x8_t mv;
 
-                                    simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false);
+                                    simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
                                     simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
 
-                                    simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false);
+                                    simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
                                     simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
                                 }
                             } else {
                                 if (ii + tx < D16) {
-                                    half4x4 tmp;
-                                    dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp);
-                                    skv4[4*ty + tx] = tmp;
+                                    v4x4_t tmp;
+                                    deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp);
+                                    sv4x4[4*ty + tx] = tmp;
                                 }
 
                                 simdgroup_barrier(mem_flags::mem_threadgroup);
 
                                 for (short k = 0; k < 4 && ii + k < D16; ++k) {
-                                    simdgroup_half8x8 mv;
+                                    v8x8_t mv;
 
-                                    simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false);
+                                    simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
                                     simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
 
-                                    simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false);
+                                    simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
                                     simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
                                 }
                             }
@@ -3087,23 +3141,23 @@ kernel void kernel_flash_attn_ext(
         // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
         for (short j = 0; j < Q; ++j) {
             if (tiisg == 0) {
-                ss[j*TF + 0] = S[j];
-                ss[j*TF + 1] = M[j];
+                ss[j*TS + 0] = S[j];
+                ss[j*TS + 1] = M[j];
             }
         }
     }
 
     // reduce the warps sequentially
-    for (short sg = 1; sg < nsg; ++sg) {
-        float S = { 0.0f };
-        float M = { -FLT_MAX/2 };
+    for (ushort sg = 1; sg < nsg; ++sg) {
+        half S = { 0.0f };
+        half M = { -__FLT16_MAX__/2 };
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
         // each simdgroup stores its output to shared memory, reusing sq
         if (sgitg == sg) {
             for (short i = 0; i < D8; ++i) {
-                simdgroup_store(lo[i], sq + i*8, T, 0, false);
+                simdgroup_store(lo[i], so + i*8, D, 0, false);
             }
         }
 
@@ -3112,39 +3166,40 @@ kernel void kernel_flash_attn_ext(
         // the first simdgroup accumulates the results from the other simdgroups
         if (sgitg == 0) {
             for (short j = 0; j < Q; ++j) {
-                const float S0 = ss[j*TF +         0];
-                const float S1 = ss[j*TF + sg*SH + 0];
+                const half S0 = ss[j*TS +         0];
+                const half S1 = ss[j*TS + sg*SH + 0];
 
-                const float M0 = ss[j*TF +         1];
-                const float M1 = ss[j*TF + sg*SH + 1];
+                const half M0 = ss[j*TS +         1];
+                const half M1 = ss[j*TS + sg*SH + 1];
 
                 M = max(M0, M1);
 
-                const float ms0 = exp(M0 - M);
-                const float ms1 = exp(M1 - M);
+                const half ms0 = exp(M0 - M);
+                const half ms1 = exp(M1 - M);
 
                 S = S0*ms0 + S1*ms1;
 
                 if (tiisg == 0) {
-                    ss[j*TF + 0] = S;
-                    ss[j*TF + 1] = M;
+                    ss[j*TS + 0] = S;
+                    ss[j*TS + 1] = M;
 
-                    ss[j*TF + C + j        ] = ms0;
-                    ss[j*TF + C + j + sg*SH] = ms1;
+                    ss[j*TS + 2*C + j        ] = ms0;
+                    ss[j*TS + 2*C + j + sg*SH] = ms1;
                 }
             }
 
             // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
             {
-                simdgroup_half8x8 t;
-                simdgroup_float8x8 ms0;
-                simdgroup_float8x8 ms1;
+                s8x8_t ms0;
+                s8x8_t ms1;
 
-                simdgroup_load(ms0, ss + C,         TF, 0, false);
-                simdgroup_load(ms1, ss + C + sg*SH, TF, 0, false);
+                simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
+                simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
 
                 for (short i = 0; i < D8; ++i) {
-                    simdgroup_load    (t, sq + i*8, T, 0, false);
+                    o8x8_t t;
+
+                    simdgroup_load    (t, so + i*8, D, 0, false);
                     simdgroup_multiply(t, ms1, t);
 
                     simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
@@ -3156,7 +3211,7 @@ kernel void kernel_flash_attn_ext(
     // store result to shared memory (reuse sq)
     if (sgitg == 0) {
         for (short i = 0; i < D8; ++i) {
-            simdgroup_store(lo[i], sq + i*8, T, 0, false);
+            simdgroup_store(lo[i], so + i*8, D, 0, false);
         }
     }
 
@@ -3165,98 +3220,133 @@ kernel void kernel_flash_attn_ext(
     // final rescale with 1/S and store to global memory
     if (sgitg == 0) {
         for (short j = 0; j < Q && iq1 + j < ne01; ++j) {
-            const float S = ss[j*TF + 0];
+            const float S = ss[j*TS + 0];
 
             for (short i = tiisg; i < D4; i += NW) {
-                dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S;
+                dst4[((int64_t)iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) so4[j*D4 + i]/S;
             }
         }
     }
 }
 
-typedef decltype(kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 64>) flash_attn_ext_t;
+// TODO: this is quite ugly. in the future these types will be hardcoded in the kernel, but for now keep them as
+//       template to be able to explore different combinations
+//
+#define FA_TYPES \
+    half,  half4,   simdgroup_half8x8,  \
+    half,  half4x4, simdgroup_half8x8,  \
+    half,  half4x4, simdgroup_half8x8,  \
+    float,          simdgroup_float8x8, \
+    float,          simdgroup_float8x8, \
+    half,  half4,   simdgroup_half8x8
 
-template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 64>;
-template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 80>;
-template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 96>;
-template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 112>;
-template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 128>;
-template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<half4x4, 1, dequantize_f16, 256>;
+typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>) flash_attn_ext_t;
 
-template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 64>;
-template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 80>;
-template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 96>;
-template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 112>;
-template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 128>;
-template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_0, 2, dequantize_q4_0, 256>;
+template [[host_name("kernel_flash_attn_ext_f16_h64" )]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  64>;
+template [[host_name("kernel_flash_attn_ext_f16_h80" )]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  80>;
+template [[host_name("kernel_flash_attn_ext_f16_h96" )]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  96>;
+template [[host_name("kernel_flash_attn_ext_f16_h112")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  112>;
+template [[host_name("kernel_flash_attn_ext_f16_h128")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  128>;
+template [[host_name("kernel_flash_attn_ext_f16_h256")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  256>;
 
-template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 64>;
-template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 80>;
-template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 96>;
-template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 112>;
-template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 128>;
-template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q4_1, 2, dequantize_q4_1, 256>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 64>;
+template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 80>;
+template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 96>;
+template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 112>;
+template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 128>;
+template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 256>;
+#endif
 
-template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 64>;
-template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 80>;
-template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 96>;
-template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 112>;
-template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 128>;
-template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_0, 2, dequantize_q5_0, 256>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256>;
 
-template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 64>;
-template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 80>;
-template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 96>;
-template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 112>;
-template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 128>;
-template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q5_1, 2, dequantize_q5_1, 256>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 96>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 112>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 128>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256>;
 
-template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 64>;
-template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 80>;
-template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 96>;
-template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 112>;
-template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 128>;
-template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<block_q8_0, 2, dequantize_q8_0, 256>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256>;
 
-// NOTE: can use half instead of float precision for some extra perf
-// D - head size, Q - queries per threadgroup, C - cache items per threadgroup
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &), short D, short Q = 1, short C = 32>
+template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 96>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 112>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 128>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256>;
+
+#undef FA_TYPES
+
+template<
+    typename q4_t,    // query types in shared memory
+    typename q4x4_t,
+    typename k4x4_t,  // key types in shared memory
+    typename v4x4_t,  // value types in shared memory
+    typename qk_t,    // Q*K types
+    typename s_t,     // soft-max types
+    typename s4_t,
+    typename s4x4_t,
+    typename o4x4_t,  // attention accumulation types
+    typename kd4x4_t, // key type in device memory
+    short nl_k,
+    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
+    typename vd4x4_t, // key type in device memory
+    short nl_v,
+    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
+    short D,         // head size
+    short Q  = 1,    // queries per threadgroup
+    short C  = 32>   // cache items per threadgroup
 kernel void kernel_flash_attn_ext_vec(
         device const  char * q,
         device const  char * k,
         device const  char * v,
         device const  char * mask,
         device       float * dst,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant   int64_t & ne13,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant  uint64_t & nb21,
-        constant  uint64_t & nb22,
-        constant  uint64_t & nb23,
-        constant  uint64_t & nb31,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
+        constant   int32_t & ne01,
+        constant   int32_t & ne02,
+        constant   int32_t & ne03,
+        constant  uint32_t & nb01,
+        constant  uint32_t & nb02,
+        constant  uint32_t & nb03,
+        constant   int32_t & ne11,
+        constant   int32_t & ne_12_2, // assume K and V are same shape
+        constant   int32_t & ne_12_3,
+        constant  uint32_t & nb_12_1,
+        constant  uint32_t & nb_12_2,
+        constant  uint32_t & nb_12_3,
+        constant  uint32_t & nb31,
+        constant   int32_t & ne1,
+        constant   int32_t & ne2,
         constant     float & scale,
         constant     float & max_bias,
         constant     float & m0,
         constant     float & m1,
-        constant  uint32_t & n_head_log2,
+        constant  uint16_t & n_head_log2,
         constant     float & logit_softcap,
         threadgroup   half * shared [[threadgroup(0)]],
-        uint3  tgpig[[threadgroup_position_in_grid]],
-        uint3  tpitg[[thread_position_in_threadgroup]],
-        uint3    ntg[[threads_per_threadgroup]],
-        ushort tiisg[[thread_index_in_simdgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+        ushort3  tgpig[[threadgroup_position_in_grid]],
+        ushort3  tpitg[[thread_position_in_threadgroup]],
+        ushort3    ntg[[threads_per_threadgroup]],
+        ushort   tiisg[[thread_index_in_simdgroup]],
+        ushort   sgitg[[simdgroup_index_in_threadgroup]]) {
     const short nsg = ntg.y; // number of simdgroups
 
     const int iq3 = tgpig[2];
@@ -3267,89 +3357,81 @@ kernel void kernel_flash_attn_ext_vec(
     const short D16 = D/16;
     const short NW  = N_SIMDWIDTH;
     const short NW4 = NW/4;
-    const short SH  = C; // shared memory per simdgroup in (half)
+    const short SH  = 2*C; // shared memory per simdgroup
 
-    const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
+    const short T = D + nsg*SH; // shared memory size per query in (half)
 
-  //threadgroup half     * sq   = (threadgroup half     *) (shared +              0*D); // holds the query data
-    threadgroup half4    * sq4  = (threadgroup half4    *) (shared +              0*D); // same as above but in half4
-    threadgroup half4x4  * sq44 = (threadgroup half4x4  *) (shared +              0*D); // same as above but in half4x4
-    threadgroup float    * ss   = (threadgroup float    *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention
-    threadgroup float4   * ss4  = (threadgroup float4   *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4
-    threadgroup float4x4 * sr44 = (threadgroup float4x4 *) (shared + 2*sgitg*D  + Q*T); // scratch buffer for the results
+  //threadgroup q_t    * sq    = (threadgroup q_t    *) (shared +                0*D); // holds the query data
+    threadgroup q4_t   * sq4   = (threadgroup q4_t   *) (shared +                0*D); // same as above but in q4_t
+    threadgroup q4x4_t * sq4x4 = (threadgroup q4x4_t *) (shared +                0*D); // same as above but in q4x4_t
+    threadgroup s_t    * ss    = (threadgroup s_t    *) (shared + sgitg*SH     + Q*D); // scratch buffer for attention
+    threadgroup s4_t   * ss4   = (threadgroup s4_t   *) (shared + sgitg*SH     + Q*D); // same as above but in s4_t
+    threadgroup half   * sm    = (threadgroup half   *) (shared + sgitg*SH + C + Q*D); // scratch buffer for mask
+    threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shared + sgitg*D      + Q*T); // scratch buffer for the results
 
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
-    float4x4 lo[D16/NW4];
+    o4x4_t lo[D16/NW4];
 
     // load heads from Q to shared memory
     device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03));
 
     for (short i = tiisg; i < D4; i += NW) {
         if (iq1 < ne01) {
-            sq4[i] = (half4) q4[i];
+            sq4[i] = (q4_t) q4[i];
         } else {
-            sq4[i] = 0.0h;
+            sq4[i] = (q4_t) 0.0f;
         }
     }
 
     // zero out lo
     for (short i = 0; i < D16/NW4; i += NW4) {
-        lo[i] = float4x4(0.0f);
+        lo[i] = (o4x4_t) 0.0f;
     }
 
     // zero out shared memory SH
     for (short i = tiisg; i < SH/4; i += NW) {
-        ss4[i] = 0.0h;
+        ss4[i] = (s4_t) 0.0f;
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     {
-        float S = 0.0f;
-        float M = -FLT_MAX/2;
+        half S = 0.0f;
+        half M = -__FLT16_MAX__/2;
 
         // thread indices inside the simdgroup
         const short tx = tiisg%8;
         const short ty = tiisg/8;
 
-        // assume K and V are same shape
-        const short ne22 = ne12;
-        const short ne23 = ne13;
+        // broadcast kv
+        //const short rk2 = ne02/ne12;
+        //const short rk3 = ne03/ne13;
 
-        // broadcast k
-        const short rk2 = ne02/ne12;
-        const short rk3 = ne03/ne13;
-
-        const short ik2 = iq2/rk2;
-        const short ik3 = iq3/rk3;
-
-        // broadcast v
-        const short rv2 = ne02/ne22;
-        const short rv3 = ne03/ne23;
-
-        const short iv2 = iq2/rv2;
-        const short iv3 = iq3/rv3;
+        const short ikv2 = iq2/(ne02/ne_12_2);
+        const short ikv3 = iq3/(ne03/ne_12_3);
 
         // load the queries from shared memory into local memory
-        float4x4 mq[D16/NW4];
+        q4x4_t mq[D16/NW4];
 
         for (short ii = 0; ii < D16; ii += NW4) {
-            mq[ii/NW4] = (float4x4) sq44[ii + tx];
+            mq[ii/NW4] = sq4x4[ii + tx];
         }
 
-        // pointer to the mask
-        device const half * mp = (device const half *) (mask + iq1*nb31);
+        const bool has_mask = mask != q;
 
-        float slope = 1.0f;
+        // pointer to the mask
+        device const half * pm = (device const half *) (mask + iq1*nb31);
+
+        half slope = 1.0f;
 
         // ALiBi
         if (max_bias > 0.0f) {
-            const uint32_t h = iq2;
+            const short h = iq2;
 
-            const float base = h < n_head_log2 ? m0 : m1;
-            const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+            const half  base = h < n_head_log2 ? m0 : m1;
+            const short exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
 
-            slope = pow(base, exp);
+            slope = pow(base, exph);
         }
 
         // loop over the KV cache
@@ -3360,20 +3442,24 @@ kernel void kernel_flash_attn_ext_vec(
                 break;
             }
 
+            if (has_mask) {
+                sm[tiisg] = pm[ic + tiisg];
+            }
+
             // Q*K^T
             {
                 // each simdgroup processes 1 query and 4 keys
                 for (short cc = 0; cc < C/4; ++cc) {
-                    float mqk = 0.0;
+                    qk_t mqk = 0.0;
 
-                    device const block_q * pk = (device const block_q *) ((device const char *) k + ((ic + 4*cc + ty)*nb11 + ik2*nb12 + ik3*nb13));
+                    device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
 #pragma unroll
                     for (short ii = 0; ii < D16; ii += NW4) {
                         const short i = ii + tx;
 
-                        float4x4 mk;
-                        dequantize_func(pk + i/nl, i%nl, mk);
+                        k4x4_t mk;
+                        deq_k(pk + i/nl_k, i%nl_k, mk);
 
                         mqk +=
                             dot(mq[ii/NW4][0], mk[0]) +
@@ -3401,7 +3487,7 @@ kernel void kernel_flash_attn_ext_vec(
                             mqk = logit_softcap*precise::tanh(mqk);
                         }
 
-                        mqk += (mask != q) ? ((float) mp[ic + 4*cc + ty])*slope : (float) 0.0f;
+                        mqk += sm[4*cc + ty]*slope;
 
                         ss[4*cc + ty] = mqk;
                     }
@@ -3412,20 +3498,18 @@ kernel void kernel_flash_attn_ext_vec(
 
             // online softmax
             {
-                const short p = tiisg;
-
-                const float m = M;
-                const float s = ss[p];
+                const half m = M;
+                const half s = ss[tiisg];
 
                 M = simd_max(max(M, s));
 
-                const float ms = exp(m - M);
-                const float vs = exp(s - M);
+                const half ms = exp(m - M);
+                const half vs = exp(s - M);
 
                 S = S*ms + simd_sum(vs);
 
                 // the P matrix from the paper (Q rows, C columns)
-                ss[p] = vs;
+                ss[tiisg] = vs;
 
                 // O = diag(ms)*O
 #pragma unroll
@@ -3440,18 +3524,18 @@ kernel void kernel_flash_attn_ext_vec(
             {
 #pragma unroll
                 for (short cc = 0; cc < C/4; ++cc) {
-                    device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 4*cc + ty)*nb21 + iv2*nb22 + iv3*nb23));
+                    device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
-                    const float4x4 lss(ss[4*cc + ty]);
+                    const s4x4_t ms(ss[4*cc + ty]);
 
 #pragma unroll
                     for (short ii = 0; ii < D16; ii += NW4) {
                         const short i = ii + tx;
 
-                        float4x4 mv;
-                        dequantize_func(pv4 + i/nl, i%nl, mv);
+                        v4x4_t mv;
+                        deq_v(pv4 + i/nl_v, i%nl_v, mv);
 
-                        lo[ii/NW4] += mv*lss;
+                        lo[ii/NW4] += mv*ms;
                     }
                 }
             }
@@ -3459,8 +3543,8 @@ kernel void kernel_flash_attn_ext_vec(
 
         // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
         if (tiisg == 0) {
-            ss[0] = S;
-            ss[1] = M;
+            ss[0] = (s_t) S;
+            ss[1] = (s_t) M;
         }
     }
 
@@ -3489,7 +3573,7 @@ kernel void kernel_flash_attn_ext_vec(
 
     // store results to shared memory
     for (short i = tiisg; i < D16; i += NW4) {
-        sr44[i] = lo[i/NW4];
+        sr4x4[i] = lo[i/NW4];
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -3497,18 +3581,18 @@ kernel void kernel_flash_attn_ext_vec(
     // parallel reduce
     for (short r = nsg/2; r > 0; r >>= 1) {
         if (sgitg < r) {
-            const float S0 = ss[       0];
-            const float S1 = ss[r*SH + 0];
+            const half S0 = ss[       0];
+            const half S1 = ss[r*SH + 0];
 
-            const float M0 = ss[       1];
-            const float M1 = ss[r*SH + 1];
+            const half M0 = ss[       1];
+            const half M1 = ss[r*SH + 1];
 
-            const float M = max(M0, M1);
+            const half M = max(M0, M1);
 
-            const float ms0 = exp(M0 - M);
-            const float ms1 = exp(M1 - M);
+            const half ms0 = exp(M0 - M);
+            const half ms1 = exp(M1 - M);
 
-            const float S = S0*ms0 + S1*ms1;
+            const half S = S0*ms0 + S1*ms1;
 
             if (tiisg == 0) {
                 ss[0] = S;
@@ -3517,7 +3601,7 @@ kernel void kernel_flash_attn_ext_vec(
 
             // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
             for (short i = tiisg; i < D16; i += NW) {
-                sr44[i] = sr44[i]*ms0 + sr44[i + r*D16]*ms1;
+                sr4x4[i] = sr4x4[i]*ms0 + sr4x4[i + r*D16]*ms1;
             }
         }
 
@@ -3531,26 +3615,45 @@ kernel void kernel_flash_attn_ext_vec(
         const float S = ss[0];
 
         for (short i = tiisg; i < D16; i += NW) {
-            dst44[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = sr44[i]/S;
+            dst44[((int64_t)iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = (float4x4) sr4x4[i]/S;
         }
     }
 }
 
-typedef decltype(kernel_flash_attn_ext_vec<half4x4, 1, dequantize_f16, 64>) flash_attn_ext_vec_t;
+// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem
+//       in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
+//
+#define FA_TYPES \
+           half4,  half4x4, \
+                   half4x4, \
+                   half4x4, \
+    float,                  \
+    half,  half4,  half4x4, \
+                   half4x4
 
-template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<half4x4,    1, dequantize_f16,  128>;
-template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_0, 2, dequantize_q4_0, 128>;
-template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_1, 2, dequantize_q4_1, 128>;
-template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_0, 2, dequantize_q5_0, 128>;
-template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_1, 2, dequantize_q5_1, 128>;
-template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q8_0, 2, dequantize_q8_0, 128>;
+typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>) flash_attn_ext_vec_t;
 
-template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<half4x4,    1, dequantize_f16,  256>;
-template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_0, 2, dequantize_q4_0, 256>;
-template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q4_1, 2, dequantize_q4_1, 256>;
-template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_0, 2, dequantize_q5_0, 256>;
-template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q5_1, 2, dequantize_q5_1, 256>;
-template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<block_q8_0, 2, dequantize_q8_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  128>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,   1, dequantize_bf16, 128>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0,  2, dequantize_q4_0, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1,  2, dequantize_q4_1, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0,  2, dequantize_q5_0, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1,  2, dequantize_q5_1, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0,  2, dequantize_q8_0, 128>;
+
+template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  256>;
+#if !defined(GGML_METAL_NO_BFLOAT)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,   1, dequantize_bf16, 256>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0,  2, dequantize_q4_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1,  2, dequantize_q4_1, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0,  2, dequantize_q5_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1,  2, dequantize_q5_1, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0,  2, dequantize_q8_0, 256>;
+
+#undef FA_TYPES
 
 template<typename T0, typename T1>
 kernel void kernel_cpy(
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bc034015f..cd26a361b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4228,6 +4228,15 @@ void ggml_flash_attn_ext_set_prec(
     ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
 }
 
+enum ggml_prec ggml_flash_attn_ext_get_prec(
+        const struct ggml_tensor * a) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
+
+    return (enum ggml_prec) prec_i32;
+}
+
 // ggml_flash_attn_back
 
 struct ggml_tensor * ggml_flash_attn_back(
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9d48a2717..65be43281 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3745,7 +3745,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                     for (int nh : { 32, }) {
                         for (int kv : { 512, 1024, }) {
                             for (int nb : { 1, 3, 32, 35, }) {
-                                for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
+                                for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
                                     test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
                                 }
                             }

From 695ad752b2631af84ba321177656705b30c6e401 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 8 Nov 2024 18:37:41 +0200
Subject: [PATCH 183/396] metal : improve clarity (minor) (#10171)

---
 ggml/src/ggml-metal.metal | 70 +++++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index edce74108..89f12724d 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -3356,7 +3356,7 @@ kernel void kernel_flash_attn_ext_vec(
     const short D4  = D/4;
     const short D16 = D/16;
     const short NW  = N_SIMDWIDTH;
-    const short NW4 = NW/4;
+    const short NL  = NW/4;
     const short SH  = 2*C; // shared memory per simdgroup
 
     const short T = D + nsg*SH; // shared memory size per query in (half)
@@ -3370,7 +3370,7 @@ kernel void kernel_flash_attn_ext_vec(
     threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shared + sgitg*D      + Q*T); // scratch buffer for the results
 
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
-    o4x4_t lo[D16/NW4];
+    o4x4_t lo[D16/NL];
 
     // load heads from Q to shared memory
     device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03));
@@ -3384,7 +3384,7 @@ kernel void kernel_flash_attn_ext_vec(
     }
 
     // zero out lo
-    for (short i = 0; i < D16/NW4; i += NW4) {
+    for (short i = 0; i < D16/NL; ++i) {
         lo[i] = (o4x4_t) 0.0f;
     }
 
@@ -3400,8 +3400,8 @@ kernel void kernel_flash_attn_ext_vec(
         half M = -__FLT16_MAX__/2;
 
         // thread indices inside the simdgroup
-        const short tx = tiisg%8;
-        const short ty = tiisg/8;
+        const short tx = tiisg%NL;
+        const short ty = tiisg/NL;
 
         // broadcast kv
         //const short rk2 = ne02/ne12;
@@ -3411,10 +3411,10 @@ kernel void kernel_flash_attn_ext_vec(
         const short ikv3 = iq3/(ne03/ne_12_3);
 
         // load the queries from shared memory into local memory
-        q4x4_t mq[D16/NW4];
+        q4x4_t mq[D16/NL];
 
-        for (short ii = 0; ii < D16; ii += NW4) {
-            mq[ii/NW4] = sq4x4[ii + tx];
+        for (short ii = 0; ii < D16; ii += NL) {
+            mq[ii/NL] = sq4x4[ii + tx];
         }
 
         const bool has_mask = mask != q;
@@ -3455,17 +3455,17 @@ kernel void kernel_flash_attn_ext_vec(
                     device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
 #pragma unroll
-                    for (short ii = 0; ii < D16; ii += NW4) {
+                    for (short ii = 0; ii < D16; ii += NL) {
                         const short i = ii + tx;
 
                         k4x4_t mk;
                         deq_k(pk + i/nl_k, i%nl_k, mk);
 
                         mqk +=
-                            dot(mq[ii/NW4][0], mk[0]) +
-                            dot(mq[ii/NW4][1], mk[1]) +
-                            dot(mq[ii/NW4][2], mk[2]) +
-                            dot(mq[ii/NW4][3], mk[3]);
+                            dot(mq[ii/NL][0], mk[0]) +
+                            dot(mq[ii/NL][1], mk[1]) +
+                            dot(mq[ii/NL][2], mk[2]) +
+                            dot(mq[ii/NL][3], mk[3]);
                     }
 
                     // simdgroup reduce
@@ -3513,8 +3513,8 @@ kernel void kernel_flash_attn_ext_vec(
 
                 // O = diag(ms)*O
 #pragma unroll
-                for (short ii = 0; ii < D16; ii += NW4) {
-                    lo[ii/NW4] *= ms;
+                for (short ii = 0; ii < D16; ii += NL) {
+                    lo[ii/NL] *= ms;
                 }
             }
 
@@ -3529,13 +3529,13 @@ kernel void kernel_flash_attn_ext_vec(
                     const s4x4_t ms(ss[4*cc + ty]);
 
 #pragma unroll
-                    for (short ii = 0; ii < D16; ii += NW4) {
+                    for (short ii = 0; ii < D16; ii += NL) {
                         const short i = ii + tx;
 
                         v4x4_t mv;
                         deq_v(pv4 + i/nl_v, i%nl_v, mv);
 
-                        lo[ii/NW4] += mv*ms;
+                        lo[ii/NL] += mv*ms;
                     }
                 }
             }
@@ -3557,23 +3557,37 @@ kernel void kernel_flash_attn_ext_vec(
     // [ 5, 13, 21, 29] -> [ 5]
     // [ 6, 14, 22, 30] -> [ 6]
     // [ 7, 15, 23, 31] -> [ 7]
-    for (short ii = 0; ii < D16; ii += NW4) {
-        lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 16);
-        lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0],  8);
+    for (short ii = 0; ii < D16; ii += NL) {
+        lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 16);
+        lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  8);
+      //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  4);
+      //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  2);
+      //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  1);
 
-        lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 16);
-        lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1],  8);
+        lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 16);
+        lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  8);
+      //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  4);
+      //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  2);
+      //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  1);
 
-        lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 16);
-        lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2],  8);
+        lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 16);
+        lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  8);
+      //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  4);
+      //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  2);
+      //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  1);
 
-        lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 16);
-        lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3],  8);
+        lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 16);
+        lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  8);
+      //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  4);
+      //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  2);
+      //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  1);
     }
 
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
     // store results to shared memory
-    for (short i = tiisg; i < D16; i += NW4) {
-        sr4x4[i] = lo[i/NW4];
+    for (short i = tiisg; i < D16; i += NL) {
+        sr4x4[i] = lo[i/NL];
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);

From ec450d3bbf9fdb3cd06b27c00c684fd1861cb0cf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 8 Nov 2024 21:59:46 +0200
Subject: [PATCH 184/396] metal : opt-in compile flag for BF16 (#10218)

* metal : opt-in compile flag for BF16

ggml-ci

* ci : use BF16

ggml-ci

* swift : switch back to v12

* metal : has_float -> use_float

ggml-ci

* metal : fix BF16 check in MSL

ggml-ci
---
 .github/workflows/build.yml | 17 +++++++++--
 Makefile                    |  4 +++
 ci/run.sh                   |  2 +-
 ggml/CMakeLists.txt         |  1 +
 ggml/src/CMakeLists.txt     |  4 +++
 ggml/src/ggml-metal.m       | 59 ++++++++++++++++++++++---------------
 ggml/src/ggml-metal.metal   | 32 ++++++++++----------
 7 files changed, 77 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 02dcee963..1e37a3c79 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -55,7 +55,13 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -113,7 +119,12 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -569,6 +580,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -599,6 +611,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
diff --git a/Makefile b/Makefile
index b9131eae5..dfa32d516 100644
--- a/Makefile
+++ b/Makefile
@@ -878,6 +878,10 @@ ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJ_GGML	+= ggml/src/ggml-metal.o
+
+ifdef GGML_METAL_USE_BF16
+	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
+endif # GGML_METAL_USE_BF16
 ifdef GGML_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
diff --git a/ci/run.sh b/ci/run.sh
index 21b62dd1e..20610e560 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -39,7 +39,7 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 6866a25d3..81b7a02f5 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -153,6 +153,7 @@ option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
 option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
 option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 34b81bd7f..6c5b816d2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -58,6 +58,10 @@ if (GGML_METAL)
         add_compile_definitions(GGML_METAL_NDEBUG)
     endif()
 
+    if (GGML_METAL_USE_BF16)
+        add_compile_definitions(GGML_METAL_USE_BF16)
+    endif()
+
     # copy ggml-common.h and ggml-metal.metal to bin directory
     configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index e19397fd2..10d59cb9f 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -39,6 +39,7 @@ static struct ggml_backend_metal_device_context {
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
     bool has_bfloat;
+    bool use_bfloat;
 
     char name[128];
 } g_ggml_ctx_dev_main = {
@@ -47,6 +48,7 @@ static struct ggml_backend_metal_device_context {
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm        =*/ false,
     /*.has_bfloat              =*/ false,
+    /*.use_bfloat              =*/ false,
     /*.name                    =*/ "",
 };
 
@@ -65,6 +67,12 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
 
+#if defined(GGML_METAL_USE_BF16)
+        ctx->use_bfloat = ctx->has_bfloat;
+#else
+        ctx->use_bfloat = false;
+#endif
+
         strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1);
     }
 
@@ -504,6 +512,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
                 // dictionary of preprocessor macros
                 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
+                if (ctx_dev->use_bfloat) {
+                    [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
+                }
+
                 MTLCompileOptions * options = [MTLCompileOptions new];
                 options.preprocessorMacros = prep;
 
@@ -556,7 +568,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
 
     GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction     ? "true" : "false");
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm            ? "true" : "false");
-    GGML_LOG_INFO("%s: bfloat                = %s\n", __func__, ctx_dev->has_bfloat                  ? "true" : "false");
+    GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat                  ? "true" : "false");
+    GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat                  ? "true" : "false");
     GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
 
     ctx->capture_next_compute = false;
@@ -608,7 +621,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
 
         const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
         const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
-        const bool has_bfloat              = ctx_dev->has_bfloat;
+        const bool use_bfloat              = ctx_dev->use_bfloat;
 
         // simd_sum and simd_max requires MTLGPUFamilyApple7
 
@@ -644,7 +657,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,               diag_mask_inf_8,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,                  get_rows_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,                  get_rows_f16,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16,                 get_rows_bf16,                  has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16,                 get_rows_bf16,                  use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,                 get_rows_q4_0,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,                 get_rows_q4_1,                  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,                 get_rows_q5_0,                  true);
@@ -671,10 +684,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                  ssm_conv_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                  ssm_scan_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,               mul_mv_bf16_f32,                has_simdgroup_reduction && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,          mul_mv_bf16_f32_1row,           has_simdgroup_reduction && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,            mul_mv_bf16_f32_l4,             has_simdgroup_reduction && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,              mul_mv_bf16_bf16,               has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,               mul_mv_bf16_f32,                has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,          mul_mv_bf16_f32_1row,           has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,            mul_mv_bf16_f32_l4,             has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,              mul_mv_bf16_bf16,               has_simdgroup_reduction && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,           mul_mv_f16_f32_1row,            has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,             mul_mv_f16_f32_l4,              has_simdgroup_reduction);
@@ -703,7 +716,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
       //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,        mul_mv_id_f16_f32_1row,         has_simdgroup_reduction);
       //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,          mul_mv_id_f16_f32_l4,           has_simdgroup_reduction);
       //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,             mul_mv_id_f16_f16,              has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32,            mul_mv_id_bf16_f32,             has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32,            mul_mv_id_bf16_f32,             has_simdgroup_reduction && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,            mul_mv_id_q4_0_f32,             has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,            mul_mv_id_q4_1_f32,             has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,            mul_mv_id_q5_0_f32,             has_simdgroup_reduction);
@@ -725,7 +738,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,          mul_mv_id_iq4_xs_f32,           has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                mul_mm_f32_f32,                 has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                mul_mm_f16_f32,                 has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,               mul_mm_bf16_f32,                has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,               mul_mm_bf16_f32,                has_simdgroup_mm && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,               mul_mm_q4_0_f32,                has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,               mul_mm_q4_1_f32,                has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,               mul_mm_q5_0_f32,                has_simdgroup_mm);
@@ -747,7 +760,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,             mul_mm_iq4_xs_f32,              has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,             mul_mm_id_f32_f32,              has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,             mul_mm_id_f16_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,            mul_mm_id_bf16_f32,             has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,            mul_mm_id_bf16_f32,             has_simdgroup_mm && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,            mul_mm_id_q4_0_f32,             has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,            mul_mm_id_q4_1_f32,             has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,            mul_mm_id_q5_0_f32,             has_simdgroup_mm);
@@ -788,12 +801,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,       flash_attn_ext_bf16_h64,        has_simdgroup_mm && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,       flash_attn_ext_bf16_h80,        has_simdgroup_mm && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,       flash_attn_ext_bf16_h96,        has_simdgroup_mm && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,      flash_attn_ext_bf16_h112,       has_simdgroup_mm && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,      flash_attn_ext_bf16_h128,       has_simdgroup_mm && has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,      flash_attn_ext_bf16_h256,       has_simdgroup_mm && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,       flash_attn_ext_bf16_h64,        has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,       flash_attn_ext_bf16_h80,        has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,       flash_attn_ext_bf16_h96,        has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,      flash_attn_ext_bf16_h112,       has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,      flash_attn_ext_bf16_h128,       has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,      flash_attn_ext_bf16_h256,       has_simdgroup_mm && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,       flash_attn_ext_q4_0_h64,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,       flash_attn_ext_q4_0_h80,        has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,       flash_attn_ext_q4_0_h96,        has_simdgroup_mm);
@@ -825,14 +838,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,      flash_attn_ext_q8_0_h128,       has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,      flash_attn_ext_q8_0_h256,       has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,  flash_attn_ext_vec_bf16_h128,   has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,  flash_attn_ext_vec_bf16_h128,   has_simdgroup_reduction && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,  flash_attn_ext_vec_q4_0_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,  flash_attn_ext_vec_q4_1_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,  flash_attn_ext_vec_q5_0_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,  flash_attn_ext_vec_q5_1_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,  flash_attn_ext_vec_q8_0_h128,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,  flash_attn_ext_vec_bf16_h256,   has_simdgroup_reduction && has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,  flash_attn_ext_vec_bf16_h256,   has_simdgroup_reduction && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,  flash_attn_ext_vec_q4_0_h256,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,  flash_attn_ext_vec_q4_1_h256,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,  flash_attn_ext_vec_q5_0_h256,   has_simdgroup_reduction);
@@ -840,11 +853,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,  flash_attn_ext_vec_q8_0_h256,   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,                   cpy_f32_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,                  cpy_f32_bf16,                   has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,                  cpy_f32_bf16,                   use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,                   cpy_f16_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,                   cpy_f16_f16,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32,                  cpy_bf16_f32,                   has_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16,                 cpy_bf16_bf16,                  has_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32,                  cpy_bf16_f32,                   use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16,                 cpy_bf16_bf16,                  use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,                  cpy_f32_q8_0,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,                  cpy_f32_q4_0,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,                  cpy_f32_q4_1,                   true);
@@ -936,9 +949,9 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
 static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) {
     const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
     const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
-    const bool has_bfloat              = ctx_dev->has_bfloat;
+    const bool use_bfloat              = ctx_dev->use_bfloat;
 
-    if (!has_bfloat) {
+    if (!use_bfloat) {
         for (size_t i = 0, n = 3; i < n; ++i) {
             if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
                 return false;
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 89f12724d..7e1517414 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -18,11 +18,11 @@ using namespace metal;
 //   .../usr/bin/metal -dM -E -c                             ggml/src/ggml-metal.metal
 //   .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal
 //
-#if __METAL_VERSION__ < 310
-#define GGML_METAL_NO_BFLOAT
+#if __METAL_VERSION__ < 310 && defined(GGML_METAL_USE_BF16)
+#undef GGML_METAL_USE_BF16
 #endif
 
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 typedef matrix<bfloat, 4, 4> bfloat4x4;
 #endif
 
@@ -41,7 +41,7 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
     reg = (type4x4)(*src);
 }
 
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template <typename type4x4>
 void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
     reg = (type4x4)(*src);
@@ -2082,7 +2082,7 @@ typedef decltype(kernel_mul_mv<half, half4, half, half4>) mul_mv_t;
 template [[host_name("kernel_mul_mv_f32_f32")]]   kernel mul_mv_t kernel_mul_mv<float,  float4,  float,  float4>;
 template [[host_name("kernel_mul_mv_f16_f32")]]   kernel mul_mv_t kernel_mul_mv<half,   half4,   float,  float4>;
 template [[host_name("kernel_mul_mv_f16_f16")]]   kernel mul_mv_t kernel_mul_mv<half,   half4,   half,   half4>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_mul_mv_bf16_f32")]]  kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, float,  float4>;
 template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, bfloat, bfloat4>;
 #endif
@@ -2155,7 +2155,7 @@ kernel void kernel_mul_mv_1row(
 typedef decltype(kernel_mul_mv_1row<half, half4>) mul_mv_1row_t;
 
 template [[host_name("kernel_mul_mv_f16_f32_1row")]]  kernel mul_mv_1row_t kernel_mul_mv_1row<half,   half4>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row<bfloat, bfloat4>;
 #endif
 
@@ -2217,7 +2217,7 @@ kernel void kernel_mul_mv_l4(
 typedef decltype(kernel_mul_mv_l4<half, half4>) mul_mv_l4_t;
 
 template [[host_name("kernel_mul_mv_f16_f32_l4")]]  kernel mul_mv_l4_t kernel_mul_mv_l4<half, half4>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_mul_mv_bf16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4<bfloat, bfloat4>;
 #endif
 
@@ -3249,7 +3249,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h112")]]  kernel flash_attn_ext_
 template [[host_name("kernel_flash_attn_ext_f16_h128")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  128>;
 template [[host_name("kernel_flash_attn_ext_f16_h256")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  256>;
 
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 64>;
 template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 80>;
 template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 96>;
@@ -3648,7 +3648,7 @@ kernel void kernel_flash_attn_ext_vec(
 typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>) flash_attn_ext_vec_t;
 
 template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  128>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,   1, dequantize_bf16, 128>;
 #endif
 template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0,  2, dequantize_q4_0, 128>;
@@ -3658,7 +3658,7 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_
 template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0,  2, dequantize_q8_0, 128>;
 
 template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  256>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,   1, dequantize_bf16, 256>;
 #endif
 template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0,  2, dequantize_q4_0, 256>;
@@ -3715,12 +3715,12 @@ typedef decltype(kernel_cpy<float, float>) kernel_cpy_t;
 
 template [[host_name("kernel_cpy_f32_f32")]]   kernel kernel_cpy_t kernel_cpy<float,  float>;
 template [[host_name("kernel_cpy_f32_f16")]]   kernel kernel_cpy_t kernel_cpy<float,  half>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_cpy_f32_bf16")]]  kernel kernel_cpy_t kernel_cpy<float,  bfloat>;
 #endif
 template [[host_name("kernel_cpy_f16_f32")]]   kernel kernel_cpy_t kernel_cpy<half,   float>;
 template [[host_name("kernel_cpy_f16_f16")]]   kernel kernel_cpy_t kernel_cpy<half,   half>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_cpy_bf16_f32")]]  kernel kernel_cpy_t kernel_cpy<bfloat, float>;
 template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy<bfloat, bfloat>;
 #endif
@@ -6628,7 +6628,7 @@ typedef decltype(kernel_get_rows_f<float>) get_rows_f_t;
 
 template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_f_t kernel_get_rows_f<float>;
 template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_f_t kernel_get_rows_f<half>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f<bfloat>;
 #endif
 
@@ -6662,7 +6662,7 @@ typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, de
 
 template [[host_name("kernel_mul_mm_f32_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32>;
 template [[host_name("kernel_mul_mm_f16_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_mul_mm_bf16_f32")]]    kernel mat_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16>;
 #endif
 template [[host_name("kernel_mul_mm_q4_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0>;
@@ -6693,7 +6693,7 @@ typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
 
 template [[host_name("kernel_mul_mm_id_f32_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<float4x4,      1,     dequantize_f32>;
 template [[host_name("kernel_mul_mm_id_f16_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<half4x4,       1,     dequantize_f16>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_mul_mm_id_bf16_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<bfloat4x4,     1,     dequantize_bf16>;
 #endif
 template [[host_name("kernel_mul_mm_id_q4_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0,    2,     dequantize_q4_0>;
@@ -6919,7 +6919,7 @@ typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float
 
 template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
 template [[host_name("kernel_mul_mv_id_f16_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<half, half4, float, float4>>>;
-#if !defined(GGML_METAL_NO_BFLOAT)
+#if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_mul_mv_id_bf16_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<bfloat, bfloat4, float, float4>>>;
 #endif
 template [[host_name("kernel_mul_mv_id_q8_0_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl>>;

From 8fc393f246c550d2481e53323a47644a94e8d01f Mon Sep 17 00:00:00 2001
From: haopeng <657407891@qq.com>
Date: Sat, 9 Nov 2024 15:06:54 +0800
Subject: [PATCH 185/396] scripts : fix pattern and get n_tokens in one go
 (#10221)

---
 examples/chat-persistent.sh | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
index d9cab9836..9d761ebb8 100755
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
 NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
 NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
 
-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
 
 CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do
 
     printf ' '
 
-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-        ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
         echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
         exit 1
     fi
 
-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
 
     if ((n_tokens > CTX_ROTATE_POINT)); then
         tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"

From e89213492d3e01705739789733f0f2d250b4c449 Mon Sep 17 00:00:00 2001
From: amritahs-ibm <amritahs@linux.vnet.ibm.com>
Date: Sat, 9 Nov 2024 12:47:50 +0530
Subject: [PATCH 186/396] ggml : optimize llamafile cpu matrix multiplication
 for ppc64le (#10156)

This change upstreams llamafile's cpu matrix
multiplication kernels for ppc64le using MMA
builtins for FP32 datatype.

This change results in a consistent 90%
improvement in input processing time, and 20%
to 80% improvement in output processing time,
across various batch sizes.

The patch is tested with Meta-Lllama-3-8B,
Mistral-7B, Llama-2-7B-chat-hf models on a
IBM POWER10 machine.

Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
---
 ggml/src/CMakeLists.txt      |   9 +-
 ggml/src/llamafile/sgemm.cpp | 608 +++++++++++++++++++++++++++++++++++
 2 files changed, 615 insertions(+), 2 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6c5b816d2..a05f8c505 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1265,8 +1265,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
+                   OUTPUT_VARIABLE POWER10_M)
+    string(FIND ${POWER10_M} "POWER10" substring_index)
+    if(${substring_index} GREATER_EQUAL 0)
+       list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+       list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
     else()
         list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
         #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp
index 9eead3f61..da4146ec4 100644
--- a/ggml/src/llamafile/sgemm.cpp
+++ b/ggml/src/llamafile/sgemm.cpp
@@ -106,6 +106,10 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
 inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
+#if defined(__MMA__)
+typedef vector unsigned char vec_t;
+typedef __vector_quad acc_t;
+#endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD
 
@@ -1026,6 +1030,600 @@ class tinyBLAS_Q0_AVX {
 };
 #endif // __AVX__
 
+//PPC Implementation
+#if defined(__MMA__)
+
+#define SAVE_ACC(ACC, ii, jj) \
+   __builtin_mma_disassemble_acc(vec_C, ACC); \
+   for (int I = 0; I < 4; I++) { \
+      for (int J = 0; J < 4; J++) { \
+         *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
+      } \
+   } \
+
+template <typename TA, typename TB, typename TC>
+class tinyBLAS_PPC {
+  public:
+    tinyBLAS_PPC(int64_t k,
+                const TA *A, int64_t lda,
+                const TB *B, int64_t ldb,
+                TC *C, int64_t ldc,
+                int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+       mnpack(0, m, 0, n);
+    }
+
+  private:
+
+    void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
+
+    void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
+        int64_t i, j;
+        float *aoffset = NULL, *boffset = NULL;
+        float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+
+        aoffset = const_cast<float*>(a);
+        boffset = vec;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+                i = (cols >> 3);
+                if (i > 0) {
+                    __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
+                    vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
+                    vector float t1, t2, t3, t4, t5, t6, t7, t8;
+                    do {
+                        C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
+                        C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
+                        C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
+                        C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
+                        C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5);
+                        C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6);
+                        C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7);
+                        C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8);
+                        __builtin_vsx_disassemble_pair(c1, &C1);
+                        __builtin_vsx_disassemble_pair(c2, &C2);
+                        __builtin_vsx_disassemble_pair(c3, &C3);
+                        __builtin_vsx_disassemble_pair(c4, &C4);
+                        __builtin_vsx_disassemble_pair(c5, &C5);
+                        __builtin_vsx_disassemble_pair(c6, &C6);
+                        __builtin_vsx_disassemble_pair(c7, &C7);
+                        __builtin_vsx_disassemble_pair(c8, &C8);
+
+                        t1 = vec_mergeh(c1[0], c2[0]);
+                        t2 = vec_mergeh(c3[0], c4[0]);
+                        t3 = vec_mergeh(c5[0], c6[0]);
+                        t4 = vec_mergeh(c7[0], c8[0]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset);
+                        vec_xst(t6, 0, boffset+4);
+                        vec_xst(t7, 0, boffset+8);
+                        vec_xst(t8, 0, boffset+12);
+
+                        t1 = vec_mergel(c1[0], c2[0]);
+                        t2 = vec_mergel(c3[0], c4[0]);
+                        t3 = vec_mergel(c5[0], c6[0]);
+                        t4 = vec_mergel(c7[0], c8[0]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset+16);
+                        vec_xst(t6, 0, boffset+20);
+                        vec_xst(t7, 0, boffset+24);
+                        vec_xst(t8, 0, boffset+28);
+
+                        t1 = vec_mergeh(c1[1], c2[1]);
+                        t2 = vec_mergeh(c3[1], c4[1]);
+                        t3 = vec_mergeh(c5[1], c6[1]);
+                        t4 = vec_mergeh(c7[1], c8[1]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset+32);
+                        vec_xst(t6, 0, boffset+36);
+                        vec_xst(t7, 0, boffset+40);
+                        vec_xst(t8, 0, boffset+44);
+
+                        t1 = vec_mergel(c1[1], c2[1]);
+                        t2 = vec_mergel(c3[1], c4[1]);
+                        t3 = vec_mergel(c5[1], c6[1]);
+                        t4 = vec_mergel(c7[1], c8[1]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset+48);
+                        vec_xst(t6, 0, boffset+52);
+                        vec_xst(t7, 0, boffset+56);
+                        vec_xst(t8, 0, boffset+60);
+
+                        aoffset1 += 8*lda;
+                        aoffset2 += 8*lda;
+                        aoffset3 += 8*lda;
+                        aoffset4 += 8*lda;
+                        boffset += 64;
+                        i--;
+                    } while(i > 0);
+                }
+                if (cols & 4) {
+                    vector float c1, c2, c3, c4, c5, c6, c7, c8;
+                    vector float t1, t2, t3, t4, t5, t6, t7, t8;
+                    c1 = vec_xl(0, aoffset1);
+                    c2 = vec_xl(0, aoffset2);
+                    c3 = vec_xl(0, aoffset3);
+                    c4 = vec_xl(0, aoffset4);
+                    c5 = vec_xl(0, aoffset5);
+                    c6 = vec_xl(0, aoffset6);
+                    c7 = vec_xl(0, aoffset7);
+                    c8 = vec_xl(0, aoffset8);
+
+                    t1 = vec_mergeh(c1, c2);
+                    t2 = vec_mergeh(c3, c4);
+                    t3 = vec_mergeh(c5, c6);
+                    t4 = vec_mergeh(c7, c8);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t3, t4, 0);
+                    t7 = vec_xxpermdi(t1, t2, 3);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset);
+                    vec_xst(t6, 0, boffset+4);
+                    vec_xst(t7, 0, boffset+8);
+                    vec_xst(t8, 0, boffset+12);
+
+                    t1 = vec_mergel(c1, c2);
+                    t2 = vec_mergel(c3, c4);
+                    t3 = vec_mergel(c5, c6);
+                    t4 = vec_mergel(c7, c8);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t3, t4, 0);
+                    t7 = vec_xxpermdi(t1, t2, 3);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset+16);
+                    vec_xst(t6, 0, boffset+20);
+                    vec_xst(t7, 0, boffset+24);
+                    vec_xst(t8, 0, boffset+28);
+                }
+            j--;
+            } while(j > 0);
+        }
+
+        if (rows & 4) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+            i = (cols >> 3);
+            if (i > 0) {
+                __vector_pair C1, C2, C3, C4;
+                vector float c1[2], c2[2], c3[2], c4[2];
+                vector float t1, t2, t3, t4, t5, t6, t7, t8;
+                do {
+                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
+                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
+                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
+                    C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
+                    __builtin_vsx_disassemble_pair(c1, &C1);
+                    __builtin_vsx_disassemble_pair(c2, &C2);
+                    __builtin_vsx_disassemble_pair(c3, &C3);
+                    __builtin_vsx_disassemble_pair(c4, &C4);
+
+                    t1 = vec_mergeh(c1[0], c2[0]);
+                    t2 = vec_mergeh(c3[0], c4[0]);
+                    t3 = vec_mergel(c1[0], c2[0]);
+                    t4 = vec_mergel(c3[0], c4[0]);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t1, t2, 3);
+                    t7 = vec_xxpermdi(t3, t4, 0);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset);
+                    vec_xst(t6, 0, boffset+4);
+                    vec_xst(t7, 0, boffset+8);
+                    vec_xst(t8, 0, boffset+12);
+
+                    t1 = vec_mergeh(c1[1], c2[1]);
+                    t2 = vec_mergeh(c3[1], c4[1]);
+                    t3 = vec_mergel(c1[1], c2[1]);
+                    t4 = vec_mergel(c3[1], c4[1]);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t1, t2, 3);
+                    t7 = vec_xxpermdi(t3, t4, 0);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset+16);
+                    vec_xst(t6, 0, boffset+20);
+                    vec_xst(t7, 0, boffset+24);
+                    vec_xst(t8, 0, boffset+28);
+
+                    aoffset1 += 8*lda;
+                    aoffset2 += 8*lda;
+                    aoffset3 += 8*lda;
+                    aoffset4 += 8*lda;
+                    boffset += 32;
+                    i--;
+                } while(i > 0);
+            }
+
+            if (cols & 4) {
+                vector float c1, c2, c3, c4;
+                vector float t1, t2, t3, t4;
+                c1 = vec_xl(0, aoffset1);
+                c2 = vec_xl(0, aoffset2);
+                c3 = vec_xl(0, aoffset3);
+                c4 = vec_xl(0, aoffset4);
+
+                t1 = vec_mergeh(c1, c2);
+                t2 = vec_mergeh(c3, c4);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset);
+                vec_xst(t4, 0, boffset+4);
+
+                t1 = vec_mergel(c1, c2);
+                t2 = vec_mergel(c3, c4);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset+8);
+                vec_xst(t4, 0, boffset+12);
+            }
+        }
+        if (rows & 3) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            if (cols & 4) {
+                vector float c1, c2, c3, c4 = {0};
+                vector float t1, t2, t3, t4;
+                c1 = vec_xl(0, aoffset1);
+                c2 = vec_xl(0, aoffset2);
+                c3 = vec_xl(0, aoffset3);
+
+                t1 = vec_mergeh(c1, c2);
+                t2 = vec_mergeh(c3, c4);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset);
+                vec_xst(t4, 0, boffset+4);
+
+                t1 = vec_mergel(c1, c2);
+                t2 = vec_mergel(c3, c4);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset+8);
+                vec_xst(t4, 0, boffset+12);
+            }
+        }
+    }
+
+    void KERNEL_4x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[4], vec_C[4];
+        acc_t acc_0;
+        __builtin_mma_xxsetaccz(&acc_0);
+        for (int l = 0; l < k; l+=4) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+    }
+
+    void KERNEL_4x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[8], vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int64_t l = 0; l < k; l+=4) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[1], (vec_t)vec_B[3]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], (vec_t)vec_B[4]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[2], (vec_t)vec_B[5]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+    }
+
+    void KERNEL_8x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[8], vec_B[4], vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int64_t l = 0; l < k; l+=4) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[3], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[4], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[5], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii+4, jj);
+    }
+
+    void KERNEL_8x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[16], vec_B[16], vec_C[4];
+        acc_t acc_0, acc_1, acc_2, acc_3;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        __builtin_mma_xxsetaccz(&acc_2);
+        __builtin_mma_xxsetaccz(&acc_3);
+        for (int l = 0; l < k; l+=8) {
+            READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
+            READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
+            for(int x = 0; x < 16; x+=2) {
+                __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
+                __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+        SAVE_ACC(&acc_2, ii+4, jj);
+        SAVE_ACC(&acc_3, ii+4, jj+4);
+    }
+
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        int m_rem = MIN(m - m0, 16);
+        int n_rem = MIN(n - n0, 16);
+        if (m_rem >= 16 && n_rem >= 8) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if(m_rem >= 8 && n_rem >= 16) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if (m_rem >= 8 && n_rem >= 8) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if (m_rem >= 4 && n_rem >= 8) {
+            mc = 4;
+            nc = 8;
+            gemm<4,8>(m0, m, n0, n);
+        } else if (m_rem >= 8 && n_rem >= 4) {
+            mc = 8;
+            nc = 4;
+            gemm<8,4>(m0, m, n0, n);
+        } else if (m_rem >= 4 && n_rem >= 4) {
+            mc = 4;
+            nc = 4;
+            gemm<4,4>(m0, m, n0, n);
+        } else if ((m_rem < 4) && (n_rem > 4)) {
+            nc = 4;
+            switch(m_rem) {
+                case 1:
+                    mc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 2:
+                    mc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 3:
+                    mc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                default:
+                    return;
+            }
+        } else if ((m_rem > 4) && (n_rem < 4)) {
+            mc = 4;
+            switch(n_rem) {
+                case 1:
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 2:
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 3:
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                default:
+                    return;
+            }
+        } else {
+            switch((m_rem << 4) | n_rem) {
+                case 0x43:
+                    mc = 4;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x42:
+                    mc = 4;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x41:
+                    mc = 4;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x34:
+                    mc = 3;
+                    nc = 4;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x33:
+                    mc = 3;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x32:
+                    mc = 3;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x31:
+                    mc = 3;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x24:
+                    mc = 2;
+                    nc = 4;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x23:
+                    mc = 2;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x22:
+                    mc = 2;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x21:
+                    mc = 2;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x14:
+                    mc = 1;
+                    nc = 4;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x13:
+                    mc = 1;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x12:
+                    mc = 1;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x11:
+                    mc = 1;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                default:
+                    return;
+            }
+        }
+        mp = m0 + (m - m0) / mc * mc;
+        np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+     void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            vec_t vec_C[4];
+            acc_t acc_0;
+            __builtin_mma_xxsetaccz(&acc_0);
+            vec_t vec_A[4], vec_B[4];
+            for (int l=0; l<k; l+=4) {
+                if (RN >= 4 && RM == 1) {
+                    float* a = const_cast<float*>(A+(ii)*lda+l);
+                    READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+                    vec_A[0] = (vec_t)vec_xl(0,a);
+                    vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
+                    vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
+                    vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
+                } else {
+                    READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
+                    READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
+                }
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
+            }
+            __builtin_mma_disassemble_acc(vec_C, &acc_0);
+            for (int I = 0; I < RM; I++) {
+                for (int J = 0; J < RN; J++) {
+                    *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
+                }
+            }
+       }
+    }
+
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (RM == 4 && RN == 4) {
+            kernel = &tinyBLAS_PPC::KERNEL_4x4;
+        } else if (RM == 4 && RN == 8) {
+            kernel = &tinyBLAS_PPC::KERNEL_4x8;
+        } else if (RM == 8 && RN == 4) {
+            kernel = &tinyBLAS_PPC::KERNEL_8x4;
+        } else if (RM == 8 && RN == 8) {
+            kernel = &tinyBLAS_PPC::KERNEL_8x8;
+        }
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            (this->*kernel)(ii, jj);
+        }
+    }
+
+    const TA *const A;
+    const TB *const B;
+    TC *C;
+    TA *At;
+    TB *Bt;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+#endif
 } // namespace
 
 /**
@@ -1114,6 +1712,16 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             ith, nth};
         tb.matmul(m, n);
         return true;
+#elif defined(__MMA__)
+        if (k % 8)
+            return false;
+        tinyBLAS_PPC<float, float, float> tb{
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
 #else
         return false;
 #endif

From 5b359bb1e3585de45bec79fd6c18934897662cdf Mon Sep 17 00:00:00 2001
From: SXX <sxx1136965276@gmail.com>
Date: Sat, 9 Nov 2024 15:35:46 +0800
Subject: [PATCH 187/396] =?UTF-8?q?ggml:=20fix=20zero=20division=20in=20?=
 =?UTF-8?q?=E2=80=98dne=E2=80=99=20calculation=20in=20CUDA=20COUNT=5FEQUAL?=
 =?UTF-8?q?=20operator=20when=20=E2=80=98ne=E2=80=99=20is=20small=20(#1021?=
 =?UTF-8?q?3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ggml/src/ggml-cuda/count-equal.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu
index ffb053b10..08898115d 100644
--- a/ggml/src/ggml-cuda/count-equal.cu
+++ b/ggml/src/ggml-cuda/count-equal.cu
@@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
-    const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
+    const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
 
     CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
 

From 46323fa9efd5e6c8aeef8d6eb6c332ee0d95eb13 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 9 Nov 2024 11:21:49 +0200
Subject: [PATCH 188/396] metal : hide debug messages from normal log

---
 ggml/src/ggml-metal.m | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 10d59cb9f..c112fd866 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -596,17 +596,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
             ctx->kernels[i].pipeline = nil;
         }
 
-        /*
-            GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
-                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
-                    (int) kernel->pipeline.threadExecutionWidth); \
-        */
 #define GGML_METAL_ADD_KERNEL(e, name, supported) \
         if (supported) { \
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
             id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
             kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
-            GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+            GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
                     (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
                     (int) kernel->pipeline.threadExecutionWidth); \
             [metal_function release]; \

From f018acba22095b8995bf6c5ef815b16a3ce4cf1b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 9 Nov 2024 11:26:34 +0200
Subject: [PATCH 189/396] llama : fix Qwen model type strings

---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 034441e1f..4d89c5222 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2301,6 +2301,7 @@ enum e_model {
     MODEL_1B,
     MODEL_1_3B,
     MODEL_1_4B,
+    MODEL_1_5B,
     MODEL_1_6B,
     MODEL_2B,
     MODEL_2_8B,
@@ -5227,6 +5228,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B:            return "1B";
         case MODEL_1_3B:          return "1.3B";
         case MODEL_1_4B:          return "1.4B";
+        case MODEL_1_5B:          return "1.5B";
         case MODEL_1_6B:          return "1.6B";
         case MODEL_2B:            return "2B";
         case MODEL_2_8B:          return "2.8B";
@@ -5598,6 +5600,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+                    case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
                     case 80: model.type = e_model::MODEL_70B; break;

From bb38cdd8baf37de1fadab3e867c6ba4ae452eff6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 9 Nov 2024 11:52:45 +0200
Subject: [PATCH 190/396] metal : fix F32 accumulation in FA vec kernel
 (#10232)

---
 ggml/src/ggml-metal.metal | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 7e1517414..1f233ba7f 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -3450,7 +3450,7 @@ kernel void kernel_flash_attn_ext_vec(
             {
                 // each simdgroup processes 1 query and 4 keys
                 for (short cc = 0; cc < C/4; ++cc) {
-                    qk_t mqk = 0.0;
+                    qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 };
 
                     device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
@@ -3461,13 +3461,14 @@ kernel void kernel_flash_attn_ext_vec(
                         k4x4_t mk;
                         deq_k(pk + i/nl_k, i%nl_k, mk);
 
-                        mqk +=
-                            dot(mq[ii/NL][0], mk[0]) +
-                            dot(mq[ii/NL][1], mk[1]) +
-                            dot(mq[ii/NL][2], mk[2]) +
-                            dot(mq[ii/NL][3], mk[3]);
+                        mqka[0] += dot(mq[ii/NL][0], mk[0]);
+                        mqka[1] += dot(mq[ii/NL][1], mk[1]);
+                        mqka[2] += dot(mq[ii/NL][2], mk[2]);
+                        mqka[3] += dot(mq[ii/NL][3], mk[3]);
                     }
 
+                    qk_t mqk = mqka[0] + mqka[1] + mqka[2] + mqka[3];
+
                     // simdgroup reduce
                     // [ 0 ..  7] -> [ 0]
                     // [ 8 .. 15] -> [ 8]

From 39a334a9aaf2000f93a899d9f43d889e460640ee Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 9 Nov 2024 11:53:02 +0200
Subject: [PATCH 191/396] metal : fix build and some more comments (#10229)

---
 ggml/src/ggml-metal.m     | 2 ++
 ggml/src/ggml-metal.metal | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index c112fd866..04ec5117f 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3041,6 +3041,8 @@ static void ggml_metal_encode_node(
 
                 bool use_vec_kernel = false;
 
+                // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
+                //       for now avoiding mainly to keep the number of templates/kernels a bit lower
                 if (ne01 >= 4 || (ne00%128 != 0)) {
                     switch (src1->type) {
                         case GGML_TYPE_F16:
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 1f233ba7f..779f45968 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -3356,8 +3356,8 @@ kernel void kernel_flash_attn_ext_vec(
     const short D4  = D/4;
     const short D16 = D/16;
     const short NW  = N_SIMDWIDTH;
-    const short NL  = NW/4;
-    const short SH  = 2*C; // shared memory per simdgroup
+    const short NL  = NW/4; // note: this can be adjusted to support D%64 == 0 and D%32 == 0
+    const short SH  = 2*C;  // shared memory per simdgroup
 
     const short T = D + nsg*SH; // shared memory size per query in (half)
 
@@ -3448,7 +3448,7 @@ kernel void kernel_flash_attn_ext_vec(
 
             // Q*K^T
             {
-                // each simdgroup processes 1 query and 4 keys
+                // each simdgroup processes 1 query and 4 (NW/NL) keys
                 for (short cc = 0; cc < C/4; ++cc) {
                     qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 };
 
@@ -3646,7 +3646,7 @@ kernel void kernel_flash_attn_ext_vec(
     half,  half4,  half4x4, \
                    half4x4
 
-typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>) flash_attn_ext_vec_t;
+typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 128>) flash_attn_ext_vec_t;
 
 template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  128>;
 #if defined(GGML_METAL_USE_BF16)

From 6423c65aa8be1b98f990cf207422505ac5a441a1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 9 Nov 2024 11:53:13 +0200
Subject: [PATCH 192/396] metal : reorder write loop in mul mat kernel + style
 (#10231)

* metal : reorder write loop

* metal : int -> short, style

ggml-ci
---
 ggml/src/ggml-metal.metal | 76 ++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 32 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 779f45968..413661c8a 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -6318,8 +6318,8 @@ kernel void kernel_mul_mm(device const  uchar * src0,
     const uint im = tgpig.z;
 
     // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    short n_rows = (ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    short n_cols = (ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
 
     // a thread shouldn't load data outside of the matrix
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
@@ -6327,9 +6327,10 @@ kernel void kernel_mul_mm(device const  uchar * src0,
 
     simdgroup_T8x8     ma[4];
     simdgroup_float8x8 mb[2];
-    simdgroup_float8x8 c_res[8];
-    for (int i = 0; i < 8; i++){
-        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    simdgroup_float8x8 mc[8];
+
+    for (short i = 0; i < 8; i++){
+        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
     }
 
     short il = (tiitg % THREAD_PER_ROW);
@@ -6340,7 +6341,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
     uint   offset0 = (i12/r2)*nb02 + (i13/r3)*nb03;
     ushort offset1 = il/nl;
 
-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+    device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*nb01 + offset0) + offset1;
     device const float   * y = (device const float   *)(src1
         + nb13 * i13
         + nb12 * i12
@@ -6354,13 +6355,13 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
         #pragma unroll(16)
-        for (int i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
+        for (short i = 0; i < 16; i++) {
+            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
+            +                     (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
+            +                     (tiitg/THREAD_PER_ROW)%8  + (i&7)*8) = temp_a[i/4][i%4];
         }
 
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
 
         il = (il + 2 < nl) ? il + 2 : il % 2;
         x  = (il < 2) ? x + (2+nl-1)/nl : x;
@@ -6369,27 +6370,27 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
         // load matrices from threadgroup memory and conduct outer products
-        threadgroup T     * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+        threadgroup T     * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
+        threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
 
         #pragma unroll(4)
-        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+        for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
             #pragma unroll(4)
-            for (int i = 0; i < 4; i++) {
-                simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
+            for (short i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
             }
             simdgroup_barrier(mem_flags::mem_none);
             #pragma unroll(2)
-            for (int i = 0; i < 2; i++) {
-                simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
+            for (short i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
             }
 
-            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+            lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
+            lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
 
             #pragma unroll(8)
-            for (int i = 0; i < 8; i++){
-                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
+            for (short i = 0; i < 8; i++){
+                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
             }
         }
     }
@@ -6397,25 +6398,36 @@ kernel void kernel_mul_mm(device const  uchar * src0,
     if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
         device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
                                + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
+        for (short i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
         }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+        threadgroup float * temp_str = ((threadgroup float *) shared_memory) \
+                                      + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M;
+        for (short i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
-        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
         if (sgitg == 0) {
-            for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                    *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
+            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
+                device float  * D  = dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*ne0 + im*ne1*ne0;
+                device float4 * D4 = (device float4 *) D;
+
+                threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
+                threadgroup float4 * C4 = (threadgroup float4 *) C;
+
+                int i = 0;
+                for (; i < n_rows/4; i++) {
+                    *(D4 + i) = *(C4 + i);
+                }
+
+                i *= 4;
+                for (; i < n_rows; i++) {
+                    *(D + i) = *(C + i);
                 }
             }
         }

From 160687b3ed002eee83a04de83a9cd752f928ced1 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Sun, 10 Nov 2024 05:37:56 -0600
Subject: [PATCH 193/396] vulkan: Fix newly added tests for permuted mul_mat
 and 1D im2col (#10226)

---
 ggml/src/ggml-vulkan.cpp | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index a8e78c4db..6c4c92262 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -3147,7 +3147,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
     const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
 
-    if (mmp == nullptr) {
+    if (qx_needs_dequant) {
         // Fall back to dequant + f16 mulmat
         mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16);
     }
@@ -3630,9 +3630,19 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
-    if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
+    if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 &&
+        // detect 0213 permutation, and batch size of 1
+        src0->nb[0] <= src0->nb[2] &&
+        src0->nb[2] <= src0->nb[1] &&
+        src0->nb[1] <= src0->nb[3] &&
+        src1->nb[0] <= src1->nb[2] &&
+        src1->nb[2] <= src1->nb[1] &&
+        src1->nb[1] <= src1->nb[3] &&
+        src0->ne[3] == 1 &&
+        src1->ne[3] == 1) {
         ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
-    } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
+    } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
+               !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
         ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
     } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
@@ -3708,7 +3718,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
     const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
 
-    if (mmp == nullptr) {
+    if (qx_needs_dequant) {
         GGML_ABORT("fatal error");
     }
 
@@ -4470,7 +4480,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             const uint32_t OH = is_2D ? dst->ne[2] : 1;
             const uint32_t OW =         dst->ne[1];
 
-            const uint32_t batch = src1->ne[3];
+            const uint32_t batch = src1->ne[is_2D ? 3 : 2];
 
             elements = { OW * KW * KH, OH, batch * IC };
         } break;
@@ -4915,7 +4925,7 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
     const uint32_t OW =         dst->ne[1];
 
     const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+    const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
 
     const uint32_t pelements = OW * KW * KH;
 
@@ -6804,6 +6814,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                 if (a->ne[3] != b->ne[3]) {
                     return false;
                 }
+                if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) ||
+                    !(ggml_vk_dim01_contiguous(op->src[1]) || op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) {
+                    return false;
+                }
+
                 return true;
             } break;
         case GGML_OP_GET_ROWS:

From 505f33274d60676320216b662a97672a76ec600e Mon Sep 17 00:00:00 2001
From: MaggotHATE <clay1326@gmail.com>
Date: Mon, 11 Nov 2024 00:42:25 +0500
Subject: [PATCH 194/396] server : (web UI) Add back sampler settings (#10239)

* Add back samplers to server

* Added tooltips with basic information

* Fixed stretching of input fields.

* use component for settings input, move help msg to tooltips

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 examples/server/public/index.html | 106 +++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 9 deletions(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index bf1d1b794..55639a944 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -200,23 +200,38 @@
             <div class="label">System Message</div>
             <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
           </label>
-          <template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
-            <label class="input input-bordered flex items-center gap-2 mb-2">
-              <b>{{ key }}</b>
-              <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
-            </label>
+          <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
+            <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
           </template>
           <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
-          <div class="collapse collapse-arrow bg-base-200 mb-2">
-            <input type="checkbox" />
-            <div class="collapse-title font-bold">Advanced config</div>
+          <!-- Section: Other sampler settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Other sampler settings</summary>
+            <div class="collapse-content">
+              <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
+                <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+              </template>
+            </div>
+          </details>
+          <!-- Section: Penalties settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Penalties settings</summary>
+            <div class="collapse-content">
+              <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
+                <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+              </template>
+            </div>
+          </details>
+          <!-- Section: Advanced config -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Advanced config</summary>
             <div class="collapse-content">
               <label class="form-control mb-2">
                 <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
                 <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
               </label>
             </div>
-          </div>
+          </details>
         </div>
 
         <!-- action buttons -->
@@ -229,6 +244,21 @@
     </dialog>
   </div>
 
+  <!-- Template to be used by settings modal -->
+  <template id="settings-modal-numeric-input">
+    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
+      <!-- Show help message on hovering on the input label -->
+      <div class="dropdown dropdown-hover">
+        <div tabindex="0" role="button" class="font-bold">{{ configKey }}</div>
+        <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
+          {{ configInfo[configKey] || '(no help message available)' }}
+        </div>
+      </div>
+      <!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
+      <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
+    </label>
+  </template>
+
   <script src="./deps_markdown-it.js"></script>
   <script type="module">
     import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
@@ -245,12 +275,48 @@
       systemMessage: 'You are a helpful assistant.',
       // make sure these default values are in sync with `common.h`
       temperature: 0.8,
+      dynatemp_range: 0.0,
+      dynatemp_exponent: 1.0,
       top_k: 40,
       top_p: 0.95,
       min_p: 0.05,
+      xtc_probability: 0.0,
+      xtc_threshold: 0.1,
+      typical_p: 1.0,
+      repeat_last_n: 64,
+      repeat_penalty: 1.0,
+      presence_penalty: 0.0,
+      frequency_penalty: 0.0,
+      dry_multiplier: 0.0,
+      dry_base: 1.75,
+      dry_allowed_length: 2,
+      dry_penalty_last_n: -1,
       max_tokens: -1,
       custom: '', // custom json-stringified object
     };
+    const CONFIG_INFO = {
+      apiKey: '',
+      systemMessage: 'The starting message that defines how model should behave.',
+      temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
+      dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
+      dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
+      top_k: 'Keeps only k top tokens.',
+      top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
+      min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
+      xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
+      xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
+      typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
+      repeat_last_n: 'Last n tokens to consider for penalizing repetition',
+      repeat_penalty: 'Controls the repetition of token sequences in the generated text',
+      presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
+      frequency_penalty: 'Limits tokens based on how often they appear in the output.',
+      dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
+      dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
+      dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
+      dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
+      max_tokens: 'The maximum number of token per output.',
+      custom: '', // custom json-stringified object
+    };
     // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
     const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
     // list of themes supported by daisyui
@@ -269,6 +335,12 @@
       { props: ["source", "options", "plugins"] }
     );
 
+    // inout field to be used by settings modal
+    const SettingsModalNumericInput = defineComponent({
+      template: document.getElementById('settings-modal-numeric-input').innerHTML,
+      props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
+    });
+
     // coversations is stored in localStorage
     // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
     // convId is a string prefixed with 'conv-'
@@ -359,6 +431,7 @@
     const mainApp = createApp({
       components: {
         VueMarkdown,
+        SettingsModalNumericInput,
       },
       data() {
         return {
@@ -376,6 +449,7 @@
           // const
           themes: THEMES,
           configDefault: {...CONFIG_DEFAULT},
+          configInfo: {...CONFIG_INFO},
         }
       },
       computed: {},
@@ -452,8 +526,22 @@
               stream: true,
               cache_prompt: true,
               temperature: this.config.temperature,
+              dynatemp_range: this.config.dynatemp_range,
+              dynatemp_exponent: this.config.dynatemp_exponent,
               top_k: this.config.top_k,
               top_p: this.config.top_p,
+              min_p: this.config.min_p,
+              typical_p: this.config.typical_p,
+              xtc_probability: this.config.xtc_probability,
+              xtc_threshold: this.config.xtc_threshold,
+              repeat_last_n: this.config.repeat_last_n,
+              repeat_penalty: this.config.repeat_penalty,
+              presence_penalty: this.config.presence_penalty,
+              frequency_penalty: this.config.frequency_penalty,
+              dry_multiplier: this.config.dry_multiplier,
+              dry_base: this.config.dry_base,
+              dry_allowed_length: this.config.dry_allowed_length,
+              dry_penalty_last_n: this.config.dry_penalty_last_n,
               max_tokens: this.config.max_tokens,
               ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
               ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),

From 4b3a9212b602be3d4e2e3ca26efd796cef13c55e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 10 Nov 2024 21:45:25 +0200
Subject: [PATCH 195/396] flake.lock: Update (#10243)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/807e9154dcb16384b1b765ebe9cd2bba2ac287fd?narHash=sha256-l253w0XMT8nWHGXuXqyiIC/bMvh1VRszGXgdpQlfhvU%3D' (2024-10-29)
  → 'github:NixOS/nixpkgs/4aa36568d413aca0ea84a1684d2d46f55dbabad7?narHash=sha256-Zwl8YgTVJTEum%2BL%2B0zVAWvXAGbWAuXHax3KzuejaDyo%3D' (2024-11-05)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index c170c4952..470363a2f 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1730200266,
-        "narHash": "sha256-l253w0XMT8nWHGXuXqyiIC/bMvh1VRszGXgdpQlfhvU=",
+        "lastModified": 1730785428,
+        "narHash": "sha256-Zwl8YgTVJTEum+L+0zVAWvXAGbWAuXHax3KzuejaDyo=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "807e9154dcb16384b1b765ebe9cd2bba2ac287fd",
+        "rev": "4aa36568d413aca0ea84a1684d2d46f55dbabad7",
         "type": "github"
       },
       "original": {

From b141e5f6efbab3a00633df88c4f9425bfe8b78ab Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 11 Nov 2024 08:38:43 +0200
Subject: [PATCH 196/396] server : enable KV cache defrag by default (#10233)

ggml-ci
---
 common/common.h           |  2 +-
 examples/server/README.md | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/common/common.h b/common/common.h
index 727f85baa..6289feaeb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -178,7 +178,7 @@ struct common_params {
     float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx         =     0; // YaRN original context length
-    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
+    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
diff --git a/examples/server/README.md b/examples/server/README.md
index 562494077..c911745f4 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -39,7 +39,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
 | `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
+| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
@@ -64,7 +64,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
-| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
+| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
@@ -99,25 +99,27 @@ The project is under active development, and we are [looking for feedback and co
 
 | Argument | Explanation |
 | -------- | ----------- |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
-| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
+| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
+| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
+| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
 | `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
 | `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
 | `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
-| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
-| `--dry-base N` | DRY sampling base value (default: 1.75) |
-| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
-| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
-| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
+| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
+| `--dry-base N` | set DRY sampling base value (default: 1.75) |
+| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
+| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
+| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
 | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
 | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
 | `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |

From b0cefea58a20020754b7431e3c725625274372a5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 11 Nov 2024 08:39:13 +0200
Subject: [PATCH 197/396] metal : more precise Q*K in FA vec kernel (#10247)

---
 ggml/src/ggml-metal.metal | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 413661c8a..e8b71a9f8 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2942,6 +2942,7 @@ kernel void kernel_flash_attn_ext(
                 half smax = -INFINITY;
 
                 // load the mask in shared memory
+                #pragma unroll(Q)
                 for (short j = 0; j < Q; ++j) {
                     device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*nb31);
 
@@ -2968,7 +2969,7 @@ kernel void kernel_flash_attn_ext(
                         // we can read directly from global memory
                         device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
-#pragma unroll
+                        #pragma unroll(D8)
                         for (short i = 0; i < D8; ++i) {
                             k8x8_t mk;
                             simdgroup_load(mk, pk + i*8, nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
@@ -2989,7 +2990,7 @@ kernel void kernel_flash_attn_ext(
 
                                 simdgroup_barrier(mem_flags::mem_threadgroup);
 
-#pragma unroll
+                                #pragma unroll(4)
                                 for (short k = 0; k < 4; ++k) {
                                     k8x8_t mk;
 
@@ -3067,7 +3068,7 @@ kernel void kernel_flash_attn_ext(
                 s8x8_t mm;
                 simdgroup_load(mm, ss + 2*C, TS, 0, false);
 
-#pragma unroll
+                #pragma unroll(D8)
                 for (short i = 0; i < D8; ++i) {
                     simdgroup_multiply(lo[i], mm, lo[i]);
                 }
@@ -3082,7 +3083,8 @@ kernel void kernel_flash_attn_ext(
                     if (is_same<vd4x4_t, v4x4_t>::value) {
                         // we can read directly from global memory
                         device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
-#pragma unroll
+
+                        #pragma unroll(D8)
                         for (short i = 0; i < D8; ++i) {
                             v8x8_t mv;
                             simdgroup_load(mv, pv + i*8, nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
@@ -3103,7 +3105,7 @@ kernel void kernel_flash_attn_ext(
 
                                 simdgroup_barrier(mem_flags::mem_threadgroup);
 
-#pragma unroll
+                                #pragma unroll(4)
                                 for (short k = 0; k < 4; ++k) {
                                     v8x8_t mv;
 
@@ -3196,6 +3198,7 @@ kernel void kernel_flash_attn_ext(
                 simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
                 simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
 
+                #pragma unroll(D8)
                 for (short i = 0; i < D8; ++i) {
                     o8x8_t t;
 
@@ -3413,6 +3416,7 @@ kernel void kernel_flash_attn_ext_vec(
         // load the queries from shared memory into local memory
         q4x4_t mq[D16/NL];
 
+        #pragma unroll(D16/NL)
         for (short ii = 0; ii < D16; ii += NL) {
             mq[ii/NL] = sq4x4[ii + tx];
         }
@@ -3454,17 +3458,23 @@ kernel void kernel_flash_attn_ext_vec(
 
                     device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
-#pragma unroll
+                    #pragma unroll(D16/NL)
                     for (short ii = 0; ii < D16; ii += NL) {
                         const short i = ii + tx;
 
                         k4x4_t mk;
                         deq_k(pk + i/nl_k, i%nl_k, mk);
 
-                        mqka[0] += dot(mq[ii/NL][0], mk[0]);
-                        mqka[1] += dot(mq[ii/NL][1], mk[1]);
-                        mqka[2] += dot(mq[ii/NL][2], mk[2]);
-                        mqka[3] += dot(mq[ii/NL][3], mk[3]);
+                        // note: this is less precise than the version below
+                        //mqka[0] += dot(mq[ii/NL][0], mk[0]);
+                        //mqka[1] += dot(mq[ii/NL][1], mk[1]);
+                        //mqka[2] += dot(mq[ii/NL][2], mk[2]);
+                        //mqka[3] += dot(mq[ii/NL][3], mk[3]);
+
+                        mqka[0] += dot((float4) mq[ii/NL][0], (float4) mk[0]);
+                        mqka[1] += dot((float4) mq[ii/NL][1], (float4) mk[1]);
+                        mqka[2] += dot((float4) mq[ii/NL][2], (float4) mk[2]);
+                        mqka[3] += dot((float4) mq[ii/NL][3], (float4) mk[3]);
                     }
 
                     qk_t mqk = mqka[0] + mqka[1] + mqka[2] + mqka[3];
@@ -3513,7 +3523,7 @@ kernel void kernel_flash_attn_ext_vec(
                 ss[tiisg] = vs;
 
                 // O = diag(ms)*O
-#pragma unroll
+                #pragma unroll(D16/NL)
                 for (short ii = 0; ii < D16; ii += NL) {
                     lo[ii/NL] *= ms;
                 }
@@ -3523,13 +3533,12 @@ kernel void kernel_flash_attn_ext_vec(
 
             // O = O + (Q*K^T)*V
             {
-#pragma unroll
                 for (short cc = 0; cc < C/4; ++cc) {
                     device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
 
                     const s4x4_t ms(ss[4*cc + ty]);
 
-#pragma unroll
+                    #pragma unroll(D16/NL)
                     for (short ii = 0; ii < D16; ii += NL) {
                         const short i = ii + tx;
 

From 54ef9cfc726a799e6f454ac22c4815d037716eda Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Mon, 11 Nov 2024 11:13:51 -0600
Subject: [PATCH 198/396] vulkan: Throttle the number of shader compiles during
 the build step. (#10222)

Fixes #9582

Spawning too many concurrent copies of glslc leads to "Failed to create pipes"
errors on Linux. This change applies the same throttling we use for
multithreaded pipeline creation.
---
 .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 280 ++++++------------
 1 file changed, 96 insertions(+), 184 deletions(-)

diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index 49759c593..477355c29 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -16,6 +16,7 @@
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
+#include <cassert>
 #include <sys/stat.h>
 #include <sys/types.h>
 
@@ -92,11 +93,11 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
     std::array<char, 128> buffer;
     DWORD bytes_read;
 
-    while (ReadFile(stdout_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
+    while (ReadFile(stdout_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
         stdout_str.append(buffer.data(), bytes_read);
     }
 
-    while (ReadFile(stderr_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
+    while (ReadFile(stderr_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
         stderr_str.append(buffer.data(), bytes_read);
     }
 
@@ -190,7 +191,12 @@ std::string basename(const std::string &path) {
     return path.substr(path.find_last_of("/\\") + 1);
 }
 
-void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
+// variables to track number of compiles in progress
+static uint32_t compile_count = 0;
+static std::mutex compile_count_mutex;
+static std::condition_variable compile_count_cond;
+
+void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
     std::string name = _name + (fp16 ? "" : "_fp32");
     std::string out_fname = join_paths(output_dir, name + ".spv");
     std::string in_path = join_paths(input_dir, in_fname);
@@ -233,6 +239,12 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
     } catch (const std::exception& e) {
         std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
     }
+    {
+        std::lock_guard<std::mutex> guard(compile_count_mutex);
+        assert(compile_count > 0);
+        compile_count--;
+    }
+    compile_count_cond.notify_all();
 }
 
 std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
@@ -241,7 +253,22 @@ std::map<std::string, std::string> merge_maps(const std::map<std::string, std::s
     return result;
 }
 
-void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id) {
+static std::vector<std::future<void>> compiles;
+void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
+    {
+        // wait until fewer than N compiles are in progress.
+        // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors.
+        uint32_t N = 16;
+        std::unique_lock<std::mutex> guard(compile_count_mutex);
+        while (compile_count >= N) {
+            compile_count_cond.wait(guard);
+        }
+        compile_count++;
+    }
+    compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16));
+}
+
+void matmul_shaders(bool fp16, bool matmul_id) {
     std::string load_vec = fp16 ? "8" : "4";
     std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
     std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
@@ -259,19 +286,11 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
     }
 
     // Shaders with f16 B_TYPE
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
-    }));
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
-    }));
+    string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
+    string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
 
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
-    }));
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
-    }));
+    string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
+    string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
 
     for (const auto& tname : type_names) {
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
@@ -279,22 +298,18 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
         std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
         // For aligned matmul loads
         std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
-        tasks.push_back(std::async(std::launch::async, [=] {
-            string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
-        }));
-        tasks.push_back(std::async(std::launch::async, [=] {
-            string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
-        }));
+        string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
+        string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
     }
 }
 
-void process_shaders(std::vector<std::future<void>>& tasks) {
+void process_shaders() {
     std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
     std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
 
     for (const auto& fp16 : {false, true}) {
-        matmul_shaders(tasks, fp16, false);
-        matmul_shaders(tasks, fp16, true);
+        matmul_shaders(fp16, false);
+        matmul_shaders(fp16, true);
     }
 
     for (const auto& tname : type_names) {
@@ -302,201 +317,103 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
-        tasks.push_back(std::async(std::launch::async, [=] {
-            string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
-        }));
-        tasks.push_back(std::async(std::launch::async, [=] {
-            string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
-        }));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
 
-        tasks.push_back(std::async(std::launch::async, [=] {
-            string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
-        }));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
         // Dequant shaders
         if (tname != "f16") {
-            tasks.push_back(std::async(std::launch::async, [=] {
-                string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
-            }));
+            string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
         }
 
         if (!string_ends_with(tname, "_k")) {
             shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
 
             if (tname == "f16") {
-                tasks.push_back(std::async(std::launch::async, [=] {
-                    string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
-                }));
+                string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
             } else {
-                tasks.push_back(std::async(std::launch::async, [=] {
-                    string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
-                }));
+                string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
             }
-            tasks.push_back(std::async(std::launch::async, [=] {
-                string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
-            }));
+            string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
         }
     }
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
+    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
     // Norms
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
+    string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
-    }));
+    string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+    string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
-    }));
+    string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
+    string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    }));
+    string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
+    string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
-    }));
+    string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
+    string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
+    string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
+    string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
-    }));
+    string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
-    }));
+    string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    }));
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
-    }));
+    string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
 
-    tasks.push_back(std::async(std::launch::async, [] {
-        string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
-    }));
+    string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
 
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
+    string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
-    }));
+    string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
 
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
+    string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
-    tasks.push_back(std::async(std::launch::async, [=] {
-        string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    }));
+    string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+
+    for (auto &c : compiles) {
+        c.wait();
+    }
 }
 
 void write_output_files() {
@@ -591,12 +508,7 @@ int main(int argc, char** argv) {
         }
     }
 
-    std::vector<std::future<void>> tasks;
-    process_shaders(tasks);
-
-    for (auto& task : tasks) {
-        task.get();
-    }
+    process_shaders();
 
     write_output_files();
 

From 80dd7ff22fd050fed58b552cc8001aaf968b7ebf Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 13 Nov 2024 00:58:57 -0600
Subject: [PATCH 199/396] vulkan: Optimize contiguous copies (#10254)

* tests: Fix memory bandwidth calculation for perf tests

Add a flops calculation for flash attention.

Add one GGML_OP_CPY perf test.

* vulkan: Optimize contiguous copies

Add a variant of the copy shader for when the tensors are contiguous. Avoid
the complex addressing calculations, and do four elements per invocation
to hide some other overhead.

Apply similar changes to the scale shader, since scale is always contiguous.

Add a "progress bar" for shader compiles.
---
 ggml/src/ggml-vulkan.cpp                      | 76 ++++++++++++++-----
 ggml/src/vulkan-shaders/clamp.comp            |  2 +
 ggml/src/vulkan-shaders/contig_copy.comp      | 42 ++++++++++
 ggml/src/vulkan-shaders/copy.comp             |  2 +
 ggml/src/vulkan-shaders/cos.comp              |  2 +
 .../vulkan-shaders/generic_unary_head.comp    |  3 +-
 ggml/src/vulkan-shaders/pad.comp              |  2 +
 ggml/src/vulkan-shaders/repeat.comp           |  2 +
 ggml/src/vulkan-shaders/scale.comp            | 20 +++--
 ggml/src/vulkan-shaders/sin.comp              |  2 +
 ggml/src/vulkan-shaders/square.comp           |  2 +
 .../src/vulkan-shaders/vulkan-shaders-gen.cpp |  3 +
 tests/test-backend-ops.cpp                    | 13 +++-
 13 files changed, 144 insertions(+), 27 deletions(-)
 create mode 100644 ggml/src/vulkan-shaders/contig_copy.comp

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 6c4c92262..ec31e726a 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -196,6 +196,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
     vk_pipeline pipeline_group_norm_f32;
     vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
     }
     compile_count_cond.notify_all();
 }
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
+    std::cerr << "ggml_vulkan: Compiling shaders";
+
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
+    std::cerr << "Done!" << std::endl;
 }
 
 static vk_device ggml_vk_get_device(size_t idx) {
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return ctx->device->pipeline_cpy_f32_f32;
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+
+    // Choose "contiguous copy" shader if src/dst are contiguous
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
     }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f32_f16;
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
     }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f16_f16;
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
     }
 
-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
     GGML_ABORT("fatal error");
 }
 
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
+    std::array<uint32_t, 3> elements;
+
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }
 
     const vk_op_unary_push_constants pc = {
         (uint32_t)ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         0.0f, 0.0f,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_norm_f32;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_DIV:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
-    case GGML_OP_SCALE:
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_COS:
diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp
index 7071302a4..ae8fa8753 100644
--- a/ggml/src/vulkan-shaders/clamp.comp
+++ b/ggml/src/vulkan-shaders/clamp.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();
 
diff --git a/ggml/src/vulkan-shaders/contig_copy.comp b/ggml/src/vulkan-shaders/contig_copy.comp
new file mode 100644
index 000000000..9acbdd3d2
--- /dev/null
+++ b/ggml/src/vulkan-shaders/contig_copy.comp
@@ -0,0 +1,42 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+#extension GL_EXT_control_flow_attributes : require
+
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    // fast path for when all four iterations are in-bounds
+    if (idx + (num_iter-1)*num_threads < p.ne) {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+#else
+            data_d[p.d_offset + idx] = data_a[idx];
+#endif
+            idx += num_threads;
+        }
+    } else {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+            if (idx >= p.ne) {
+                continue;
+            }
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+#else
+            data_d[p.d_offset + idx] = data_a[idx];
+#endif
+            idx += num_threads;
+        }
+    }
+}
diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp
index c26917c0f..2775068f9 100644
--- a/ggml/src/vulkan-shaders/copy.comp
+++ b/ggml/src/vulkan-shaders/copy.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();
 
diff --git a/ggml/src/vulkan-shaders/cos.comp b/ggml/src/vulkan-shaders/cos.comp
index f9a858cbf..fbd9d272c 100644
--- a/ggml/src/vulkan-shaders/cos.comp
+++ b/ggml/src/vulkan-shaders/cos.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();
 
diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp
index eacdefc7d..4e1fa3af3 100644
--- a/ggml/src/vulkan-shaders/generic_unary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_unary_head.comp
@@ -1,4 +1,5 @@
 #extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
 
 layout (push_constant) uniform parameter
 {
@@ -9,8 +10,6 @@ layout (push_constant) uniform parameter
     float param1; float param2;
 } p;
 
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/vulkan-shaders/pad.comp
index a465cd52b..e87d8b18b 100644
--- a/ggml/src/vulkan-shaders/pad.comp
+++ b/ggml/src/vulkan-shaders/pad.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 
diff --git a/ggml/src/vulkan-shaders/repeat.comp b/ggml/src/vulkan-shaders/repeat.comp
index a86af87e7..c03f737cc 100644
--- a/ggml/src/vulkan-shaders/repeat.comp
+++ b/ggml/src/vulkan-shaders/repeat.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 uint src0_idx_mod(uint idx) {
     const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
     const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp
index 5cd2f668d..5cfee8c3b 100644
--- a/ggml/src/vulkan-shaders/scale.comp
+++ b/ggml/src/vulkan-shaders/scale.comp
@@ -3,12 +3,22 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
-    const uint idx = get_idx();
+    uint idx = get_idx();
 
-    if (idx >= p.ne) {
-        return;
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+
+        data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
+        idx += num_threads;
     }
-
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
 }
diff --git a/ggml/src/vulkan-shaders/sin.comp b/ggml/src/vulkan-shaders/sin.comp
index 7faf9be93..67c48fb9a 100644
--- a/ggml/src/vulkan-shaders/sin.comp
+++ b/ggml/src/vulkan-shaders/sin.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();
 
diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp
index 1fa118c99..2ff48ddc5 100644
--- a/ggml/src/vulkan-shaders/square.comp
+++ b/ggml/src/vulkan-shaders/square.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();
 
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index 477355c29..5c84f473f 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -350,6 +350,9 @@ void process_shaders() {
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
     string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
 
     string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 65be43281..6618d03d1 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -681,6 +681,7 @@ struct test_case {
 
         // run
         int64_t total_time_us = 0;
+        int64_t total_mem = 0;
         int total_runs = 0;
         do {
             int64_t start_time = ggml_time_us();
@@ -688,6 +689,7 @@ struct test_case {
             int64_t end_time = ggml_time_us();
 
             total_time_us += end_time - start_time;
+            total_mem += mem;
             total_runs += n_runs;
         } while (total_time_us < 1000*1000); // run for at least 1 second
 
@@ -717,7 +719,7 @@ struct test_case {
         } else {
             printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
                 op_size(out) / 1024,
-                mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+                total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
         }
         printf("\n");
 
@@ -2740,6 +2742,13 @@ struct test_flash_attn_ext : public test_case {
         return 5e-4;
     }
 
+    uint64_t op_flops(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        // Just counting matmul costs:
+        // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
+        return 2 * 2 * nh * nb * hs * kv;
+    }
+
     test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
                         bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
         : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
@@ -3779,6 +3788,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1,   1, 1, 1}));
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
 
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+
     for (int bs : {1, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {

From 2e82ffa4af29f87e7d3d6dff8060a2a79613b72f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?=
 <alberto.cabrera@codeplay.com>
Date: Wed, 13 Nov 2024 09:40:57 +0000
Subject: [PATCH 200/396] sycl : Fixes to broken builds and test-backend-ops
 (#10257)

* Fixes broken build for the SYCL CUDA backend caused by non-explicit gemm call in outprod (merged in with RWKV6 in
Optimize RWKV6 Operator Naming and Implement Multi-core CPU/ SYCL Acceleration #10133)

* Marks permuted MUL_MAT as unsupported to be able to run test-backend-ops

* Fixes asserts in norm to fix debug builds.
---
 ggml/src/ggml-sycl.cpp         | 4 ++++
 ggml/src/ggml-sycl/norm.cpp    | 6 +++---
 ggml/src/ggml-sycl/outprod.cpp | 3 ++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 255bc64c6..2dba15d23 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -4350,6 +4350,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 if (op->op == GGML_OP_MUL_MAT) {
                     a = op->src[0];
                     b = op->src[1];
+                    if (ggml_is_permuted(a) || ggml_is_permuted(b)) {
+                        // TODO: fix like https://github.com/ggerganov/llama.cpp/pull/10021
+                        return false;
+                    }
                 } else {
                     a = op->src[2];
                     b = op->src[1];
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index b3159b9d1..72d8fdb87 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -8,7 +8,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
 
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     sycl::float2 mean_var = sycl::float2(0.f, 0.f);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -55,7 +54,6 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     int end = start + group_size;
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     start += item_ct1.get_local_id(2);
     int nreduce = nwarps / WARP_SIZE;
 
@@ -144,7 +142,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
     const int tid = item_ct1.get_local_id(2);
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -202,6 +199,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
@@ -244,6 +242,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -290,6 +289,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp
index c2779df0e..e61cdc2ca 100644
--- a/ggml/src/ggml-sycl/outprod.cpp
+++ b/ggml/src/ggml-sycl/outprod.cpp
@@ -1,4 +1,5 @@
 #include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
 #include "outprod.hpp"
 
 
@@ -39,7 +40,7 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr
 
     try {
         // Perform matrix multiplication using oneMKL GEMM
-        oneapi::mkl::blas::gemm(*stream,
+        oneapi::mkl::blas::column_major::gemm(*stream,
             oneapi::mkl::transpose::nontrans, src1_op,
             ne0, ne1, ne01,
             alpha,

From a0ec17b32ec6077f5ca22fe833ebdc9b86795a4d Mon Sep 17 00:00:00 2001
From: Brian <mofosyne@gmail.com>
Date: Wed, 13 Nov 2024 21:10:38 +1100
Subject: [PATCH 201/396] metadata: Detailed Dataset Authorship Metadata
 (#8875)

Converter script can now read these two fields as a detailed base model and dataset source.
This was done so that it will be easier for Hugging Face to integrate detailed metadata as needed.

 -  base_model_sources (List[dict], optional)
 -  dataset_sources (List[dict], optional)

Dataset now represented as:

 - general.dataset.count
 - general.dataset.{id}.name
 - general.dataset.{id}.author
 - general.dataset.{id}.version
 - general.dataset.{id}.organization
 - general.dataset.{id}.description
 - general.dataset.{id}.url
 - general.dataset.{id}.doi
 - general.dataset.{id}.uuid
 - general.dataset.{id}.repo_url

This also adds to base model these metadata:

 - general.base_model.{id}.description
---
 examples/convert_legacy_llama.py |  26 +++++-
 gguf-py/gguf/constants.py        |  14 ++-
 gguf-py/gguf/gguf_writer.py      |  36 +++++++-
 gguf-py/gguf/metadata.py         | 145 +++++++++++++++++++++++++++----
 gguf-py/tests/test_metadata.py   |  37 +++++++-
 5 files changed, 233 insertions(+), 25 deletions(-)

diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py
index 9ab9ab06e..c4ec5c524 100755
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -840,6 +840,8 @@ class OutputFile:
                         self.gguf.add_base_model_version(key, base_model_entry["version"])
                     if "organization" in base_model_entry:
                         self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                    if "description" in base_model_entry:
+                        self.gguf.add_base_model_description(key, base_model_entry["description"])
                     if "url" in base_model_entry:
                         self.gguf.add_base_model_url(key, base_model_entry["url"])
                     if "doi" in base_model_entry:
@@ -849,12 +851,32 @@ class OutputFile:
                     if "repo_url" in base_model_entry:
                         self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
 
+            if metadata.datasets is not None:
+                self.gguf.add_dataset_count(len(metadata.datasets))
+                for key, dataset_entry in enumerate(metadata.datasets):
+                    if "name" in dataset_entry:
+                        self.gguf.add_dataset_name(key, dataset_entry["name"])
+                    if "author" in dataset_entry:
+                        self.gguf.add_dataset_author(key, dataset_entry["author"])
+                    if "version" in dataset_entry:
+                        self.gguf.add_dataset_version(key, dataset_entry["version"])
+                    if "organization" in dataset_entry:
+                        self.gguf.add_dataset_organization(key, dataset_entry["organization"])
+                    if "description" in dataset_entry:
+                        self.gguf.add_dataset_description(key, dataset_entry["description"])
+                    if "url" in dataset_entry:
+                        self.gguf.add_dataset_url(key, dataset_entry["url"])
+                    if "doi" in dataset_entry:
+                        self.gguf.add_dataset_doi(key, dataset_entry["doi"])
+                    if "uuid" in dataset_entry:
+                        self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
+                    if "repo_url" in dataset_entry:
+                        self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
             if metadata.tags is not None:
                 self.gguf.add_tags(metadata.tags)
             if metadata.languages is not None:
                 self.gguf.add_languages(metadata.languages)
-            if metadata.datasets is not None:
-                self.gguf.add_datasets(metadata.datasets)
 
     def add_meta_arch(self, params: Params) -> None:
         # Metadata About The Neural Architecture Itself
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 7ab08b036..bc2b649d1 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -64,15 +64,27 @@ class Keys:
         BASE_MODEL_AUTHOR          = "general.base_model.{id}.author"
         BASE_MODEL_VERSION         = "general.base_model.{id}.version"
         BASE_MODEL_ORGANIZATION    = "general.base_model.{id}.organization"
+        BASE_MODEL_DESCRIPTION     = "general.base_model.{id}.description"
         BASE_MODEL_URL             = "general.base_model.{id}.url" # Model Website/Paper
         BASE_MODEL_DOI             = "general.base_model.{id}.doi"
         BASE_MODEL_UUID            = "general.base_model.{id}.uuid"
         BASE_MODEL_REPO_URL        = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
 
+        # Dataset Source
+        DATASET_COUNT           = "general.dataset.count"
+        DATASET_NAME            = "general.dataset.{id}.name"
+        DATASET_AUTHOR          = "general.dataset.{id}.author"
+        DATASET_VERSION         = "general.dataset.{id}.version"
+        DATASET_ORGANIZATION    = "general.dataset.{id}.organization"
+        DATASET_DESCRIPTION     = "general.dataset.{id}.description"
+        DATASET_URL             = "general.dataset.{id}.url" # Model Website/Paper
+        DATASET_DOI             = "general.dataset.{id}.doi"
+        DATASET_UUID            = "general.dataset.{id}.uuid"
+        DATASET_REPO_URL        = "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...)
+
         # Array based KV stores
         TAGS                       = "general.tags"
         LANGUAGES                  = "general.languages"
-        DATASETS                   = "general.datasets"
 
     class LLM:
         VOCAB_SIZE                        = "{arch}.vocab_size"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 0d8d8a0b0..7a55d1296 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -568,6 +568,9 @@ class GGUFWriter:
     def add_base_model_organization(self, source_id: int, organization: str) -> None:
         self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)
 
+    def add_base_model_description(self, source_id: int, description: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description)
+
     def add_base_model_url(self, source_id: int, url: str) -> None:
         self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
 
@@ -580,15 +583,42 @@ class GGUFWriter:
     def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
         self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
 
+    def add_dataset_count(self, source_count: int) -> None:
+        self.add_uint32(Keys.General.DATASET_COUNT, source_count)
+
+    def add_dataset_name(self, source_id: int, name: str) -> None:
+        self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name)
+
+    def add_dataset_author(self, source_id: int, author: str) -> None:
+        self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author)
+
+    def add_dataset_version(self, source_id: int, version: str) -> None:
+        self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version)
+
+    def add_dataset_organization(self, source_id: int, organization: str) -> None:
+        self.add_string(Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization)
+
+    def add_dataset_description(self, source_id: int, description: str) -> None:
+        self.add_string(Keys.General.DATASET_DESCRIPTION.format(id=source_id), description)
+
+    def add_dataset_url(self, source_id: int, url: str) -> None:
+        self.add_string(Keys.General.DATASET_URL.format(id=source_id), url)
+
+    def add_dataset_doi(self, source_id: int, doi: str) -> None:
+        self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi)
+
+    def add_dataset_uuid(self, source_id: int, uuid: str) -> None:
+        self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid)
+
+    def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None:
+        self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url)
+
     def add_tags(self, tags: Sequence[str]) -> None:
         self.add_array(Keys.General.TAGS, tags)
 
     def add_languages(self, languages: Sequence[str]) -> None:
         self.add_array(Keys.General.LANGUAGES, languages)
 
-    def add_datasets(self, datasets: Sequence[str]) -> None:
-        self.add_array(Keys.General.DATASETS, datasets)
-
     def add_tensor_data_layout(self, layout: str) -> None:
         self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
 
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index db318542a..321cbcd4c 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -41,7 +41,7 @@ class Metadata:
     base_models: Optional[list[dict]] = None
     tags: Optional[list[str]] = None
     languages: Optional[list[str]] = None
-    datasets: Optional[list[str]] = None
+    datasets: Optional[list[dict]] = None
 
     @staticmethod
     def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata:
@@ -91,9 +91,11 @@ class Metadata:
         # Base Models is received here as an array of models
         metadata.base_models     = metadata_override.get("general.base_models",        metadata.base_models)
 
+        # Datasets is received here as an array of datasets
+        metadata.datasets        = metadata_override.get("general.datasets",           metadata.datasets)
+
         metadata.tags            = metadata_override.get(Keys.General.TAGS,            metadata.tags)
         metadata.languages       = metadata_override.get(Keys.General.LANGUAGES,       metadata.languages)
-        metadata.datasets        = metadata_override.get(Keys.General.DATASETS,        metadata.datasets)
 
         # Direct Metadata Override (via direct cli argument)
         if model_name is not None:
@@ -346,12 +348,12 @@ class Metadata:
             use_model_card_metadata("author", "model_creator")
             use_model_card_metadata("basename", "model_type")
 
-            if "base_model" in model_card:
+            if "base_model" in model_card or "base_models" in model_card or "base_model_sources" in model_card:
                 # This represents the parent models that this is based on
                 # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
                 # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
                 metadata_base_models = []
-                base_model_value = model_card.get("base_model", None)
+                base_model_value = model_card.get("base_model", model_card.get("base_models", model_card.get("base_model_sources", None)))
 
                 if base_model_value is not None:
                     if isinstance(base_model_value, str):
@@ -364,18 +366,106 @@ class Metadata:
 
                 for model_id in metadata_base_models:
                     # NOTE: model size of base model is assumed to be similar to the size of the current model
-                    model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
                     base_model = {}
-                    if model_full_name_component is not None:
-                        base_model["name"] = Metadata.id_to_title(model_full_name_component)
-                    if org_component is not None:
-                        base_model["organization"] = Metadata.id_to_title(org_component)
-                    if version is not None:
-                        base_model["version"] = version
-                    if org_component is not None and model_full_name_component is not None:
-                        base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
+                    if isinstance(model_id, str):
+                        if model_id.startswith("http://") or model_id.startswith("https://") or model_id.startswith("ssh://"):
+                            base_model["repo_url"] = model_id
+
+                            # Check if Hugging Face ID is present in URL
+                            if "huggingface.co" in model_id:
+                                match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", model_id)
+                                if match:
+                                    model_id_component = match.group(1)
+                                    model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id_component, total_params)
+
+                                    # Populate model dictionary with extracted components
+                                    if model_full_name_component is not None:
+                                        base_model["name"] = Metadata.id_to_title(model_full_name_component)
+                                    if org_component is not None:
+                                        base_model["organization"] = Metadata.id_to_title(org_component)
+                                    if version is not None:
+                                        base_model["version"] = version
+
+                        else:
+                            # Likely a Hugging Face ID
+                            model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
+
+                            # Populate model dictionary with extracted components
+                            if model_full_name_component is not None:
+                                base_model["name"] = Metadata.id_to_title(model_full_name_component)
+                            if org_component is not None:
+                                base_model["organization"] = Metadata.id_to_title(org_component)
+                            if version is not None:
+                                base_model["version"] = version
+                            if org_component is not None and model_full_name_component is not None:
+                                base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
+
+                    elif isinstance(model_id, dict):
+                        base_model = model_id
+
+                    else:
+                        logger.error(f"base model entry '{str(model_id)}' not in a known format")
+
                     metadata.base_models.append(base_model)
 
+            if "datasets" in model_card or "dataset" in model_card or "dataset_sources" in model_card:
+                # This represents the datasets that this was trained from
+                metadata_datasets = []
+                dataset_value = model_card.get("datasets", model_card.get("dataset", model_card.get("dataset_sources", None)))
+
+                if dataset_value is not None:
+                    if isinstance(dataset_value, str):
+                        metadata_datasets.append(dataset_value)
+                    elif isinstance(dataset_value, list):
+                        metadata_datasets.extend(dataset_value)
+
+                if metadata.datasets is None:
+                    metadata.datasets = []
+
+                for dataset_id in metadata_datasets:
+                    # NOTE: model size of base model is assumed to be similar to the size of the current model
+                    dataset = {}
+                    if isinstance(dataset_id, str):
+                        if dataset_id.startswith(("http://", "https://", "ssh://")):
+                            dataset["repo_url"] = dataset_id
+
+                            # Check if Hugging Face ID is present in URL
+                            if "huggingface.co" in dataset_id:
+                                match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", dataset_id)
+                                if match:
+                                    dataset_id_component = match.group(1)
+                                    dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id_component, total_params)
+
+                                    # Populate dataset dictionary with extracted components
+                                    if dataset_name_component is not None:
+                                        dataset["name"] = Metadata.id_to_title(dataset_name_component)
+                                    if org_component is not None:
+                                        dataset["organization"] = Metadata.id_to_title(org_component)
+                                    if version is not None:
+                                        dataset["version"] = version
+
+                        else:
+                            # Likely a Hugging Face ID
+                            dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id, total_params)
+
+                            # Populate dataset dictionary with extracted components
+                            if dataset_name_component is not None:
+                                dataset["name"] = Metadata.id_to_title(dataset_name_component)
+                            if org_component is not None:
+                                dataset["organization"] = Metadata.id_to_title(org_component)
+                            if version is not None:
+                                dataset["version"] = version
+                            if org_component is not None and dataset_name_component is not None:
+                                dataset["repo_url"] = f"https://huggingface.co/{org_component}/{dataset_name_component}"
+
+                    elif isinstance(dataset_id, dict):
+                        dataset = dataset_id
+
+                    else:
+                        logger.error(f"dataset entry '{str(dataset_id)}' not in a known format")
+
+                    metadata.datasets.append(dataset)
+
             use_model_card_metadata("license", "license")
             use_model_card_metadata("license_name", "license_name")
             use_model_card_metadata("license_link", "license_link")
@@ -386,9 +476,6 @@ class Metadata:
             use_array_model_card_metadata("languages", "languages")
             use_array_model_card_metadata("languages", "language")
 
-            use_array_model_card_metadata("datasets", "datasets")
-            use_array_model_card_metadata("datasets", "dataset")
-
         # Hugging Face Parameter Heuristics
         ####################################
 
@@ -493,6 +580,8 @@ class Metadata:
                     gguf_writer.add_base_model_version(key, base_model_entry["version"])
                 if "organization" in base_model_entry:
                     gguf_writer.add_base_model_organization(key, base_model_entry["organization"])
+                if "description" in base_model_entry:
+                    gguf_writer.add_base_model_description(key, base_model_entry["description"])
                 if "url" in base_model_entry:
                     gguf_writer.add_base_model_url(key, base_model_entry["url"])
                 if "doi" in base_model_entry:
@@ -502,9 +591,29 @@ class Metadata:
                 if "repo_url" in base_model_entry:
                     gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"])
 
+        if self.datasets is not None:
+            gguf_writer.add_dataset_count(len(self.datasets))
+            for key, dataset_entry in enumerate(self.datasets):
+                if "name" in dataset_entry:
+                    gguf_writer.add_dataset_name(key, dataset_entry["name"])
+                if "author" in dataset_entry:
+                    gguf_writer.add_dataset_author(key, dataset_entry["author"])
+                if "version" in dataset_entry:
+                    gguf_writer.add_dataset_version(key, dataset_entry["version"])
+                if "organization" in dataset_entry:
+                    gguf_writer.add_dataset_organization(key, dataset_entry["organization"])
+                if "description" in dataset_entry:
+                    gguf_writer.add_dataset_description(key, dataset_entry["description"])
+                if "url" in dataset_entry:
+                    gguf_writer.add_dataset_url(key, dataset_entry["url"])
+                if "doi" in dataset_entry:
+                    gguf_writer.add_dataset_doi(key, dataset_entry["doi"])
+                if "uuid" in dataset_entry:
+                    gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"])
+                if "repo_url" in dataset_entry:
+                    gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
         if self.tags is not None:
             gguf_writer.add_tags(self.tags)
         if self.languages is not None:
             gguf_writer.add_languages(self.languages)
-        if self.datasets is not None:
-            gguf_writer.add_datasets(self.datasets)
diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py
index 81a2a30ae..40d484f4e 100755
--- a/gguf-py/tests/test_metadata.py
+++ b/gguf-py/tests/test_metadata.py
@@ -182,8 +182,43 @@ class TestMetadataMethod(unittest.TestCase):
         expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
         expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
         expect.languages=['en']
-        expect.datasets=['teknium/OpenHermes-2.5']
+        expect.datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]
+        self.assertEqual(got, expect)
 
+        # Base Model spec is inferred from model id
+        model_card = {'base_models': 'teknium/OpenHermes-2.5'}
+        expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Base Model spec is only url
+        model_card = {'base_models': ['https://huggingface.co/teknium/OpenHermes-2.5']}
+        expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Base Model spec is given directly
+        model_card = {'base_models': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]}
+        expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Dataset spec is inferred from model id
+        model_card = {'datasets': 'teknium/OpenHermes-2.5'}
+        expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Dataset spec is only url
+        model_card = {'datasets': ['https://huggingface.co/teknium/OpenHermes-2.5']}
+        expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Dataset spec is given directly
+        model_card = {'datasets': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]}
+        expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
         self.assertEqual(got, expect)
 
     def test_apply_metadata_heuristic_from_hf_parameters(self):

From 0e712a5acbbdd1593e5aeb86d4f6b896a11b438c Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <iainst0409@gmail.com>
Date: Wed, 13 Nov 2024 19:15:23 +0800
Subject: [PATCH 202/396] server : fix incorrect res in
 validate_model_chat_template (#10272)

* server : fix validate_model_chat_template

* server : fix chat res
---
 examples/server/server.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a6d3a1c95..cac55007a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -655,11 +655,16 @@ struct server_context {
     }
 
     bool validate_model_chat_template() const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
-
-        return res > 0;
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res >= 0) {
+            llama_chat_message chat[] = {{"user", "test"}};
+            std::string tmpl = std::string(model_template.data(), model_template.size());
+            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+            return chat_res > 0;
+        }
+        return false;
     }
 
     void init() {

From ff7fb670d0a62897c5662536aeb53422c549fbe8 Mon Sep 17 00:00:00 2001
From: Alexey Parfenov <zxed@alkatrazstudio.net>
Date: Wed, 13 Nov 2024 11:16:30 +0000
Subject: [PATCH 203/396] server : add missing docs (#10269)

---
 examples/server/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index c911745f4..6f72c6bb8 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -383,6 +383,10 @@ node index.js
 
     `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
 
+    `xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
+
+    `xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
+
     `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
     `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
@@ -411,7 +415,7 @@ node index.js
 
     `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
 
-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
 **Response format**
 

From 1ee9eea094fe5846c7d8d770aa7caa749d246b23 Mon Sep 17 00:00:00 2001
From: Small Grass Forest <zixuanxcl@gmail.com>
Date: Wed, 13 Nov 2024 19:17:10 +0800
Subject: [PATCH 204/396] docs : update bindings list (#10261)

Signed-off-by: tianzixuan <tianzixuan335@hellobike.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0378a674e..6ab6acf12 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)

From 5ea926dad7f62ebccff7b24784bd1e01a06d13ae Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 13 Nov 2024 18:11:54 +0200
Subject: [PATCH 205/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index e82984f49..199237a21 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-89952d649e0c5cabbb9ff8c4906f5a843a789fb2
+8a3d799484d861748f86eb87c8314fa2dbccc254

From fb4a0ec0833c71cff5a1a367ba375447ce6106eb Mon Sep 17 00:00:00 2001
From: Michael Podvitskiy <podvitskiymichael@gmail.com>
Date: Wed, 13 Nov 2024 20:00:35 +0200
Subject: [PATCH 206/396] llama : propagate the results of `graph_compute`
 (#9525)

* llama: propagating the results of `graph_compute` to the user interface

* llama: reverting kv_cache in case of failed compute

* llama: `llama_kv_cache_state` was removed, only the result of `llama_graph_compute` is returned

* llama: restore a kv_cache in case of failed computation

* llama: correct reverting of the entire batch.
also updates `llama_kv_cache_find_slot`, will correctly count the number of `used` cells for recurrent models

* llama: updated comments

* llama : add comments about KV cache state after error

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 include/llama.h |   4 +-
 src/llama.cpp   | 120 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 109 insertions(+), 15 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index ccb48f73c..5e742642e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -797,7 +797,7 @@ extern "C" {
     // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     // Stores the encoder output internally for later use by the decoder cross-attention layers.
     //   0 - success
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);
@@ -805,7 +805,7 @@ extern "C" {
     // Positive return values does not mean a fatal error, but rather a warning.
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
               struct llama_batch   batch);
diff --git a/src/llama.cpp b/src/llama.cpp
index 4d89c5222..97eee26a5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3502,11 +3502,24 @@ static bool llama_kv_cache_init(
     return true;
 }
 
+// a structure holds information about the slot found in llama_kv_cache_find_slot
+struct llama_kv_cache_slot_info {
+    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+    bool found = false;                       // the slot was found
+
+    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
+    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
+
+    operator bool() const { return found; }
+};
+static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
+
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// returns a structure holding information about the slot found
 // Note: On success, it's important that cache.head points
 // to the first cell of the slot.
-static bool llama_kv_cache_find_slot(
+static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
        const struct llama_ubatch & batch) {
     const uint32_t n_tokens = batch.n_tokens;
@@ -3534,7 +3547,7 @@ static bool llama_kv_cache_find_slot(
                     // too big seq_id
                     // TODO: would it be possible to resize the cache instead?
                     LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
-                    return false;
+                    return llama_kv_cache_slot_info_failed;
                 }
                 if (j > 0) {
                     llama_kv_cell & seq = cache.cells[seq_id];
@@ -3669,15 +3682,17 @@ static bool llama_kv_cache_find_slot(
         // allow getting the range of used cells, from head to head + n
         cache.head = min;
         cache.n    = max - min + 1;
+        cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
+            [](const llama_kv_cell& cell){ return !cell.is_empty(); });
 
         // sanity check
-        return cache.n >= n_seqs;
+        return llama_kv_cache_slot_info(cache.n >= n_seqs);
     }
     // otherwise, one cell per token.
 
     if (n_tokens > cache.size) {
         LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
-        return false;
+        return llama_kv_cache_slot_info_failed;
     }
 
     uint32_t n_tested = 0;
@@ -3705,7 +3720,7 @@ static bool llama_kv_cache_find_slot(
 
         if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
-            return false;
+            return llama_kv_cache_slot_info_failed;
         }
     }
 
@@ -3722,7 +3737,7 @@ static bool llama_kv_cache_find_slot(
 
     cache.used += n_tokens;
 
-    return true;
+    return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
 }
 
 // find how many cells are currently in use
@@ -3998,6 +4013,53 @@ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams)
     return cparams.flash_attn ? 256u : 32u;
 }
 
+// saves the kv_cache state for future recovery.
+// used to rollback llama_kv_cache_find_slot changes.
+struct llama_kv_slot_restorer {
+    struct llama_kv_cache_state {
+        uint32_t head = 0;
+        uint32_t n    = 0;
+    } old_state;
+
+    // for non-recurrent models only
+    // list of slots to restore
+    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
+
+    bool do_restore = false;
+
+    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+        old_state.head  = cache.head;
+        old_state.n     = cache.n;
+    }
+
+    // saves a slot information for future restoration
+    void save(const struct llama_kv_cache_slot_info & slot) {
+        if (slot) {
+            do_restore = true;
+            if (slot.boundaries.first != slot.boundaries.second) {
+                slot_boundaries.push_back(slot.boundaries);
+            }
+        }
+    }
+
+    // must be explicitly called to restore the kv_cache state
+    // and rollback changes from all llama_kv_cache_find_slot calls
+    void restore(struct llama_kv_cache & cache) {
+        if (do_restore) {
+            cache.head  = old_state.head;
+            cache.n     = old_state.n;
+
+            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
+                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+            } else {
+                for (auto & slot : slot_boundaries) {
+                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                }
+            }
+        }
+    }
+};
+
 //
 // model loading and saving
 //
@@ -17181,7 +17243,8 @@ static void llama_output_reorder(struct llama_context * ctx) {
     }
 }
 
-static void llama_graph_compute(
+// returns the result of ggml_backend_sched_graph_compute_async execution
+static enum ggml_status llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads,
@@ -17196,15 +17259,20 @@ static void llama_graph_compute(
         set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
     }
 
-    auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
-    if (err != GGML_STATUS_SUCCESS) {
-        LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
+    auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
     }
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+    return status;
 }
 
 // decode a batch of tokens by evaluating the transformer
+// in case of unsuccessful decoding (error or warning),
+// the kv_cache state will be returned to its original state
+// (for non-recurrent models) or cleaned (for recurrent models)
 //
 //   - lctx:      llama context
 //   - batch:     batch to evaluate
@@ -17254,6 +17322,7 @@ static int llama_decode_internal(
     lctx.n_queued_tokens += n_tokens_all;
 
     auto & kv_self = lctx.kv_self;
+    llama_kv_slot_restorer kv_slot_restorer(kv_self);
 
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
@@ -17338,9 +17407,11 @@ static int llama_decode_internal(
                 kv_self.head = 0;
             }
 
-            if (!llama_kv_cache_find_slot(kv_self, ubatch)) {
+            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            if (!slot) {
                 return 1;
             }
+            kv_slot_restorer.save(slot);
 
             if (!kv_self.recurrent) {
                 // a heuristic, to avoid attending the full cache if it is not yet utilized
@@ -17387,7 +17458,19 @@ static int llama_decode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads, threadpool);
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        if (compute_status != GGML_STATUS_SUCCESS) {
+            kv_slot_restorer.restore(kv_self);
+            switch (compute_status) {
+                case GGML_STATUS_ABORTED:
+                    return 2;
+                case GGML_STATUS_ALLOC_FAILED:
+                    return -2;
+                case GGML_STATUS_FAILED:
+                default:
+                    return -3;
+            }
+        }
 
         // update the kv ring buffer
         {
@@ -17624,7 +17707,18 @@ static int llama_encode_internal(
 
     llama_set_inputs(lctx, ubatch);
 
-    llama_graph_compute(lctx, gf, n_threads, threadpool);
+    const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+    switch (compute_status) {
+        case GGML_STATUS_SUCCESS:
+            break;
+        case GGML_STATUS_ABORTED:
+            return 2;
+        case GGML_STATUS_ALLOC_FAILED:
+            return -2;
+        case GGML_STATUS_FAILED:
+        default:
+            return -3;
+    }
 
     // extract embeddings
     if (embd) {

From 66798e42fbe636f1cb6236e4bc30939d23ef7c25 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 13 Nov 2024 14:59:47 -0600
Subject: [PATCH 207/396] vulkan: Use macros to make the mat mul pipeline
 creation more concise (#10259)

Also add vk_matmul_pipeline2 to hold f16/f32 accumulator versions of a
pipeline. This isn't really used yet.
---
 ggml/src/ggml-vulkan.cpp | 568 +++++++++------------------------------
 1 file changed, 126 insertions(+), 442 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index ec31e726a..361fdcbc1 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -106,6 +106,15 @@ struct vk_matmul_pipeline_struct {
 
 typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;
 
+struct vk_matmul_pipeline2 {
+    vk_matmul_pipeline2() {
+        f16acc = std::make_shared<vk_matmul_pipeline_struct>();
+        f32acc = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    vk_matmul_pipeline f32acc;
+    vk_matmul_pipeline f16acc;
+};
+
 struct vk_device_struct;
 typedef std::shared_ptr<vk_device_struct> vk_device;
 typedef std::weak_ptr<vk_device_struct> vk_device_ref;
@@ -161,11 +170,11 @@ struct vk_device_struct {
 
     vk_matmul_pipeline pipeline_matmul_f32;
     vk_matmul_pipeline pipeline_matmul_f32_f16;
-    vk_matmul_pipeline pipeline_matmul_f16;
-    vk_matmul_pipeline pipeline_matmul_f16_f32;
+    vk_matmul_pipeline2 pipeline_matmul_f16;
+    vk_matmul_pipeline2 pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
 
-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
+    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
 
     vk_matmul_pipeline pipeline_matmul_id_f32;
     vk_matmul_pipeline pipeline_matmul_id_f16;
@@ -1210,37 +1219,28 @@ static void ggml_vk_load_shaders(vk_device& device) {
     std::cerr << "ggml_vulkan: Compiling shaders";
 
     // mulmat
-    std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_s = { std::max(device->subgroup_size, 16u),  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
+    std::vector<uint32_t> l_warptile, m_warptile, s_warptile, l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
+    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms;
+    uint32_t l_align, m_align, s_align;
 
-    std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_mmq_m = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_mmq_s = { std::max(device->subgroup_size, 16u),  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
+    l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
+    m_warptile = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
+    s_warptile = { std::max(device->subgroup_size, 16u),  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
 
-    std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
-    std::array<uint32_t, 3> m_wg_denoms = { 64,  64, 1 };
-    std::array<uint32_t, 3> s_wg_denoms = { 32,  32, 1 };
+    l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
+    m_warptile_mmq = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
+    s_warptile_mmq = { std::max(device->subgroup_size, 16u),  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
 
-    uint32_t l_align = 128;
-    uint32_t m_align =  64;
-    uint32_t s_align =  32;
+    l_wg_denoms = {128, 128, 1 };
+    m_wg_denoms = { 64,  64, 1 };
+    s_wg_denoms = { 32,  32, 1 };
+
+    l_align = 128;
+    m_align =  64;
+    s_align =  32;
 
     device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
     device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1258,7 +1258,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
     std::vector<std::future<void>> compiles;
-    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align) {
         {
             // wait until fewer than N compiles are in progress
             uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -1272,411 +1272,95 @@ static void ggml_vk_load_shaders(vk_device& device) {
     };
 
     if (device->fp16) {
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        // Create 6 variants, {s,m,l}x{unaligned,aligned}
+#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
 
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
 
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
 
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
 
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+#undef CREATE_MM
     } else {
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        // Create 6 variants, {s,m,l}x{unaligned,aligned}
+#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
 
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
 
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
 
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
 
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+#undef CREATE_MM
     }
 
     // mul mat vec
@@ -2313,10 +1997,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
         return ctx->device->pipeline_matmul_f32_f16;
     }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
-        return ctx->device->pipeline_matmul_f16_f32;
+        return ctx->device->pipeline_matmul_f16_f32.f32acc;
     }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-        return ctx->device->pipeline_matmul_f16;
+        return ctx->device->pipeline_matmul_f16.f32acc;
     }
 
     if (src1_type != GGML_TYPE_F32) {
@@ -2340,7 +2024,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
             return nullptr;
     }
 
-    return ctx->device->pipeline_dequant_mul_mat_mat[src0_type];
+    return ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
@@ -5070,10 +4754,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             p = ctx->device->pipeline_matmul_f32_f16->a_s;
             shname = "F32_F16_ALIGNED_S";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = ctx->device->pipeline_matmul_f16_f32->a_s;
+            p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_s;
             shname = "F16_F32_ALIGNED_S";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = ctx->device->pipeline_matmul_f16->a_s;
+            p = ctx->device->pipeline_matmul_f16.f32acc->a_s;
             shname = "F16_ALIGNED_S";
         } else {
             GGML_ABORT("fatal error");
@@ -5086,10 +4770,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             p = ctx->device->pipeline_matmul_f32_f16->a_m;
             shname = "F32_F16_ALIGNED_M";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = ctx->device->pipeline_matmul_f16_f32->a_m;
+            p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_m;
             shname = "F16_F32_ALIGNED_M";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = ctx->device->pipeline_matmul_f16->a_m;
+            p = ctx->device->pipeline_matmul_f16.f32acc->a_m;
             shname = "F16_ALIGNED_M";
         } else {
             GGML_ABORT("fatal error");
@@ -5102,10 +4786,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             p = ctx->device->pipeline_matmul_f32_f16->a_l;
             shname = "F32_F16_ALIGNED_L";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = ctx->device->pipeline_matmul_f16_f32->a_l;
+            p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_l;
             shname = "F16_F32_ALIGNED_L";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = ctx->device->pipeline_matmul_f16->a_l;
+            p = ctx->device->pipeline_matmul_f16.f32acc->a_l;
             shname = "F16_ALIGNED_L";
         } else {
             GGML_ABORT("fatal error");
@@ -5125,10 +4809,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
                 p = ctx->device->pipeline_matmul_f32_f16->s;
                 shname = "F32_F16_S";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = ctx->device->pipeline_matmul_f16_f32->s;
+                p = ctx->device->pipeline_matmul_f16_f32.f32acc->s;
                 shname = "F16_F32_S";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = ctx->device->pipeline_matmul_f16->s;
+                p = ctx->device->pipeline_matmul_f16.f32acc->s;
                 shname = "F16_S";
             }
         } else if (shader_size == 1) {
@@ -5139,10 +4823,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
                 p = ctx->device->pipeline_matmul_f32_f16->m;
                 shname = "F32_F16_M";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = ctx->device->pipeline_matmul_f16_f32->m;
+                p = ctx->device->pipeline_matmul_f16_f32.f32acc->m;
                 shname = "F16_F32_M";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = ctx->device->pipeline_matmul_f16->m;
+                p = ctx->device->pipeline_matmul_f16.f32acc->m;
                 shname = "F16_M";
             }
         } else if (shader_size == 2) {
@@ -5153,10 +4837,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
                 p = ctx->device->pipeline_matmul_f32_f16->l;
                 shname = "F32_F16_L";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = ctx->device->pipeline_matmul_f16_f32->l;
+                p = ctx->device->pipeline_matmul_f16_f32.f32acc->l;
                 shname = "F16_F32_L";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = ctx->device->pipeline_matmul_f16->l;
+                p = ctx->device->pipeline_matmul_f16.f32acc->l;
                 shname = "F16_L";
             }
         }
@@ -5485,13 +5169,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     vk_pipeline p;
     std::string shname;
     if (shader_size == 0) {
-        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_s;
+        p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
     } else if (shader_size == 1) {
-        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_m;
+        p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M";
     } else if (shader_size == 2) {
-        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_l;
+        p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L";
     } else {
         GGML_ASSERT(0);
@@ -5501,13 +5185,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 
     if (k != kpad) {
         if (shader_size == 0) {
-            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->s;
+            p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s;
             shname = std::string(ggml_type_name(quant)) + "_S";
         } else if (shader_size == 1) {
-            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->m;
+            p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m;
             shname = std::string(ggml_type_name(quant)) + "_M";
         } else if (shader_size == 2) {
-            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->l;
+            p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l;
             shname = std::string(ggml_type_name(quant)) + "_L";
         } else {
             GGML_ASSERT(0);

From af148c9386da825a60c7038549c121c35ca56b50 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 13 Nov 2024 23:22:55 -0600
Subject: [PATCH 208/396] vulkan: Optimize binary ops (#10270)

Reuse the index calculations across all of src0/src1/dst. Add a shader
variant for when src0/src1 are the same dimensions and additional modulus
for src1 aren't needed. Div/mod are slow, so add "fast" div/mod that
have a fast path when the calculation isn't needed or can be done more
cheaply.
---
 ggml/src/ggml-vulkan.cpp                      | 27 +++++----
 ggml/src/vulkan-shaders/acc.comp              |  9 ++-
 ggml/src/vulkan-shaders/add.comp              | 25 +++++++--
 ggml/src/vulkan-shaders/concat.comp           |  2 +
 ggml/src/vulkan-shaders/div.comp              | 23 ++++++--
 .../vulkan-shaders/generic_binary_head.comp   | 56 +++++++++++--------
 ggml/src/vulkan-shaders/get_rows.comp         |  2 +
 ggml/src/vulkan-shaders/get_rows_quant.comp   |  2 +
 ggml/src/vulkan-shaders/mul.comp              | 23 ++++++--
 9 files changed, 117 insertions(+), 52 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 361fdcbc1..c02c35665 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -192,9 +192,10 @@ struct vk_device_struct {
     vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
     vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
     vk_pipeline pipeline_acc_f32;
-    vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
-    vk_pipeline pipeline_mul_f32;
-    vk_pipeline pipeline_div_f32;
+    vk_pipeline pipeline_add_f32, pipeline_add_f32_norepeat;
+    vk_pipeline pipeline_add_f16_f32_f16, pipeline_add_f16_f32_f16_norepeat;
+    vk_pipeline pipeline_mul_f32, pipeline_mul_f32_norepeat;
+    vk_pipeline pipeline_div_f32, pipeline_div_f32_norepeat;
     vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
     vk_pipeline pipeline_upscale_f32;
     vk_pipeline pipeline_scale_f32;
@@ -1456,13 +1457,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16_norepeat, "add_f16_f32_f16_norepeat", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_mul_f32_norepeat, "mul_f32_norepeat", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_div_f32_norepeat, "div_f32_norepeat", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@@ -3801,20 +3806,20 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_ADD:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_add_f32;
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_f32_norepeat : ctx->device->pipeline_add_f32;
         }
         if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_add_f16_f32_f16;
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_f16_f32_f16_norepeat : ctx->device->pipeline_add_f16_f32_f16;
         }
         return nullptr;
     case GGML_OP_MUL:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_mul_f32;
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_f32_norepeat : ctx->device->pipeline_mul_f32;
         }
         return nullptr;
     case GGML_OP_DIV:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_div_f32;
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_f32_norepeat : ctx->device->pipeline_div_f32;
         }
         return nullptr;
     case GGML_OP_CONCAT:
diff --git a/ggml/src/vulkan-shaders/acc.comp b/ggml/src/vulkan-shaders/acc.comp
index 4c8739efe..4f5a04e71 100644
--- a/ggml/src/vulkan-shaders/acc.comp
+++ b/ggml/src/vulkan-shaders/acc.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_binary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = gl_GlobalInvocationID.x;
     if (idx >= p.ne) {
@@ -15,10 +17,13 @@ void main() {
     const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
     const uint ox = src1_i % p.nb01;
 
+    uint i00, i01, i02, i03;
+    get_indices(idx, i00, i01, i02, i03);
+
     if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
-        data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
+        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
     } else {
-        data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]));
+        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
     }
 }
 
diff --git a/ggml/src/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp
index 3974845d6..da61b76df 100644
--- a/ggml/src/vulkan-shaders/add.comp
+++ b/ggml/src/vulkan-shaders/add.comp
@@ -1,14 +1,29 @@
 #version 450
 
+#extension GL_EXT_shader_16bit_storage : require
+
 #include "types.comp"
 #include "generic_binary_head.comp"
 
+const uint num_threads = 256;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
-    const uint idx = get_idx();
+    uint idx = get_idx();
 
-    if (idx >= p.ne) {
-        return;
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 2;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        uint i00, i01, i02, i03;
+        get_indices(idx, i00, i01, i02, i03);
+
+        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
+
+        idx += num_threads;
     }
-
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
 }
diff --git a/ggml/src/vulkan-shaders/concat.comp b/ggml/src/vulkan-shaders/concat.comp
index c23b6eb1b..683f9ac3c 100644
--- a/ggml/src/vulkan-shaders/concat.comp
+++ b/ggml/src/vulkan-shaders/concat.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_binary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
     const int dim = p.param3;
diff --git a/ggml/src/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp
index 8cfce58b1..e581905b3 100644
--- a/ggml/src/vulkan-shaders/div.comp
+++ b/ggml/src/vulkan-shaders/div.comp
@@ -3,12 +3,25 @@
 #include "types.comp"
 #include "generic_binary_head.comp"
 
+const uint num_threads = 256;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
-    const uint idx = get_idx();
+    uint idx = get_idx();
 
-    if (idx >= p.ne) {
-        return;
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 2;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        uint i00, i01, i02, i03;
+        get_indices(idx, i00, i01, i02, i03);
+
+        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
+
+        idx += num_threads;
     }
-
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
 }
diff --git a/ggml/src/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp
index b6beaff1c..a6555fa27 100644
--- a/ggml/src/vulkan-shaders/generic_binary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_binary_head.comp
@@ -1,4 +1,5 @@
 #extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
 
 layout (push_constant) uniform parameter
 {
@@ -10,43 +11,50 @@ layout (push_constant) uniform parameter
     float param1; float param2; int param3;
 } p;
 
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
+// true if src0/src1 are the same shape and the indices can be reused without additional modulus
+layout(constant_id = 0) const bool norepeat = false;
+
 uint get_idx() {
     return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 
-uint src0_idx(uint idx) {
-    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
+// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
+uint fastmod(uint a, uint b) {
+    if ((b & (b-1)) == 0) {
+        return a & (b-1);
+    }
+    return a % b;
+}
+
+uint fastdiv(uint a, uint b) {
+    return (a < b) ? 0 : (a / b);
+}
+
+void get_indices(uint idx, out uint i00, out uint i01, out uint i02, out uint i03) {
+    i03 = fastdiv(idx, (p.ne02*p.ne01*p.ne00));
     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00);
+    i02 = fastdiv((idx - i03_offset), (p.ne01*p.ne00));
     const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = (idx - i03_offset - i02_offset) / p.ne00;
-    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+    i01 = (idx - i03_offset - i02_offset) / p.ne00;
+    i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+}
+
+uint src0_idx(uint i00, uint i01, uint i02, uint i03) {
     return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
 }
 
-uint src1_idx(uint idx) {
-    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
-    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00);
-    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = (idx - i03_offset - i02_offset) / p.ne00;
-    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
-
-    return (i03 % p.ne13)*p.nb13 + (i02 % p.ne12)*p.nb12 + (i01 % p.ne11)*p.nb11 + (i00 % p.ne10)*p.nb10;
+uint src1_idx(uint i00, uint i01, uint i02, uint i03) {
+    if (norepeat) {
+        return i03*p.nb13 + i02*p.nb12 + i01*p.nb11 + i00*p.nb10;
+    } else {
+        return fastmod(i03, p.ne13)*p.nb13 + fastmod(i02, p.ne12)*p.nb12 + fastmod(i01, p.ne11)*p.nb11 + fastmod(i00, p.ne10)*p.nb10;
+    }
 }
 
-uint dst_idx(uint idx) {
-    const uint i23 = idx / (p.ne22*p.ne21*p.ne20);
-    const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20;
-    const uint i22 = (idx - i23_offset) / (p.ne21*p.ne20);
-    const uint i22_offset = i22*p.ne21*p.ne20;
-    const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
-    const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
-    return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
+uint dst_idx(uint i00, uint i01, uint i02, uint i03) {
+    return i03*p.nb23 + i02*p.nb22 + i01*p.nb21 + i00*p.nb20;
 }
diff --git a/ggml/src/vulkan-shaders/get_rows.comp b/ggml/src/vulkan-shaders/get_rows.comp
index e9ff22efa..a7b81e52c 100644
--- a/ggml/src/vulkan-shaders/get_rows.comp
+++ b/ggml/src/vulkan-shaders/get_rows.comp
@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_binary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint i00 = gl_GlobalInvocationID.x;
     const uint i10 = gl_GlobalInvocationID.y;
diff --git a/ggml/src/vulkan-shaders/get_rows_quant.comp b/ggml/src/vulkan-shaders/get_rows_quant.comp
index 53a9a96f2..8d30b63c1 100644
--- a/ggml/src/vulkan-shaders/get_rows_quant.comp
+++ b/ggml/src/vulkan-shaders/get_rows_quant.comp
@@ -4,6 +4,8 @@
 #include "generic_binary_head.comp"
 #include "dequant_funcs.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint i00 = (gl_GlobalInvocationID.x)*2;
     const uint i10 = gl_GlobalInvocationID.y;
diff --git a/ggml/src/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp
index bfb61c92d..5ce57cbcf 100644
--- a/ggml/src/vulkan-shaders/mul.comp
+++ b/ggml/src/vulkan-shaders/mul.comp
@@ -3,12 +3,25 @@
 #include "types.comp"
 #include "generic_binary_head.comp"
 
+const uint num_threads = 256;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
-    const uint idx = get_idx();
+    uint idx = get_idx();
 
-    if (idx >= p.ne) {
-        return;
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 2;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        uint i00, i01, i02, i03;
+        get_indices(idx, i00, i01, i02, i03);
+
+        data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
+
+        idx += num_threads;
     }
-
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
 }

From 2a82891a853db908679f7b24b04586e6f6393fe0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Nov 2024 11:44:15 +0200
Subject: [PATCH 209/396] speculative : fix out-of-bounds access (#10289)

---
 examples/speculative/speculative.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index a40e755a2..6cafd8a83 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -267,11 +267,12 @@ int main(int argc, char ** argv) {
                         for (size_t i = 0; i < dist_tgt.size; i++) {
                             if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
                                 p_tgt = dist_tgt.data[i].p;
+                                break;
                             }
+                        }
+                        for (size_t i = 0; i < dist_dft.size; i++) {
                             if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
                                 p_dft = dist_dft.data[i].p;
-                            }
-                            if (p_tgt && p_dft) {
                                 break;
                             }
                         }

From 4a8ccb37ad9c9027cbcfd5548c19cdffe48d5197 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 14 Nov 2024 13:00:15 +0100
Subject: [PATCH 210/396] CUDA: no -sm row for very small matrices (#10185)

---
 ggml/src/ggml-cuda.cu | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 357cee660..b5096b3ee 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2978,6 +2978,17 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             {
                 struct ggml_tensor * a = op->src[0];
                 struct ggml_tensor * b = op->src[1];
+                // for small weight matrices the active device can end up without any rows, don't use row split in those cases
+                // this avoids some edge cases (and the performance would not be good anyways)
+                if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+                    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
+                    int64_t row_low;
+                    int64_t row_high;
+                    get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
+                    if (row_low == row_high) {
+                        return false;
+                    }
+                }
                 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                     return false;
                 }

From ae8de6d50a09d49545e0afab2e50cc4acfb280e2 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 14 Nov 2024 18:04:35 +0100
Subject: [PATCH 211/396] ggml : build backends as libraries (#10256)

* ggml : build backends as libraries

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: R0CKSTAR <xiaodong.ye@mthreads.com>
---
 .devops/llama-cli-cuda.Dockerfile             |     9 +-
 .devops/llama-cli-musa.Dockerfile             |     7 +-
 .devops/llama-server-cuda.Dockerfile          |     7 +-
 .devops/llama-server-musa.Dockerfile          |     7 +-
 .devops/nix/package.nix                       |     6 +-
 .github/workflows/build.yml                   |    10 +-
 .gitmodules                                   |     2 +-
 CMakeLists.txt                                |     1 -
 Makefile                                      |   267 +-
 Package.swift                                 |    15 +-
 cmake/llama-config.cmake.in                   |     2 +-
 common/common.cpp                             |     5 -
 docs/build.md                                 |     6 +-
 examples/llama-bench/llama-bench.cpp          |    32 +-
 examples/quantize-stats/quantize-stats.cpp    |    19 +-
 ggml/CMakeLists.txt                           |    10 +-
 ggml/include/ggml-amx.h                       |    10 +-
 ggml/include/ggml-backend.h                   |    14 +
 ggml/include/ggml-blas.h                      |     8 +-
 ggml/include/ggml-cann.h                      |    16 +-
 ggml/include/ggml-cpu.h                       |   104 +-
 ggml/include/ggml-cuda.h                      |    24 +-
 ggml/include/ggml-kompute.h                   |     8 +-
 ggml/include/ggml-metal.h                     |    16 +-
 ggml/include/ggml-rpc.h                       |    14 +-
 ggml/include/ggml-sycl.h                      |    26 +-
 ggml/include/ggml-vulkan.h                    |    18 +-
 ggml/include/ggml.h                           |    43 +-
 ggml/src/CMakeLists.txt                       |  1267 +-
 ggml/src/ggml-aarch64.c                       |  3386 +----
 ggml/src/ggml-aarch64.h                       |    20 -
 ggml/src/ggml-amx/CMakeLists.txt              |   107 +
 ggml/src/ggml-amx/common.h                    |     3 +-
 ggml/src/{ => ggml-amx}/ggml-amx.cpp          |    15 +-
 ggml/src/ggml-amx/mmq.cpp                     |     7 +-
 ggml/src/ggml-backend-reg.cpp                 |   195 +
 ggml/src/ggml-backend.cpp                     |   676 +-
 ggml/src/ggml-blas/CMakeLists.txt             |    91 +
 ggml/src/{ => ggml-blas}/ggml-blas.cpp        |     4 +-
 ggml/src/ggml-cann/CMakeLists.txt             |    46 +
 ggml/src/{ => ggml-cann}/ggml-cann.cpp        |     0
 ggml/src/ggml-cpu/CMakeLists.txt              |   244 +
 ggml/{ => src/ggml-cpu}/cmake/FindSIMD.cmake  |     0
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c          |  3387 +++++
 ggml/src/ggml-cpu/ggml-cpu-aarch64.h          |    27 +
 ggml/src/{ => ggml-cpu}/ggml-cpu-impl.h       |   243 -
 ggml/src/ggml-cpu/ggml-cpu-quants.c           | 10796 ++++++++++++++++
 ggml/src/ggml-cpu/ggml-cpu-quants.h           |    63 +
 ggml/src/{ => ggml-cpu}/ggml-cpu.c            |   248 +-
 ggml/src/ggml-cpu/ggml-cpu.cpp                |   575 +
 ggml/src/{ => ggml-cpu}/llamafile/sgemm.cpp   |     0
 ggml/src/{ => ggml-cpu}/llamafile/sgemm.h     |     0
 ggml/src/ggml-cuda/CMakeLists.txt             |   165 +
 ggml/src/ggml-cuda/common.cuh                 |    50 +-
 ggml/src/ggml-cuda/fattn-common.cuh           |     4 +-
 ggml/src/ggml-cuda/fattn-tile-f16.cu          |     4 +-
 ggml/src/ggml-cuda/fattn-tile-f32.cu          |     4 +-
 ggml/src/ggml-cuda/fattn-vec-f16.cuh          |     4 +-
 ggml/src/ggml-cuda/fattn-vec-f32.cuh          |     4 +-
 ggml/src/ggml-cuda/fattn-wmma-f16.cuh         |     4 +-
 ggml/src/{ => ggml-cuda}/ggml-cuda.cu         |    26 +-
 ggml/src/ggml-cuda/mmq.cuh                    |    22 +-
 ggml/src/ggml-cuda/mmvq.cu                    |     8 +-
 ggml/src/ggml-cuda/sum.cu                     |     4 +-
 ggml/src/ggml-hip/CMakeLists.txt              |   113 +
 ggml/src/ggml-impl.h                          |   280 +-
 ggml/src/ggml-kompute/CMakeLists.txt          |   162 +
 ggml/src/{ => ggml-kompute}/ggml-kompute.cpp  |     0
 ggml/src/{ => ggml-kompute}/kompute           |     0
 .../kompute-shaders/common.comp               |     0
 .../kompute-shaders/op_add.comp               |     0
 .../kompute-shaders/op_addrow.comp            |     0
 .../kompute-shaders/op_cpy_f16_f16.comp       |     0
 .../kompute-shaders/op_cpy_f16_f32.comp       |     0
 .../kompute-shaders/op_cpy_f32_f16.comp       |     0
 .../kompute-shaders/op_cpy_f32_f32.comp       |     0
 .../kompute-shaders/op_diagmask.comp          |     0
 .../kompute-shaders/op_gelu.comp              |     0
 .../kompute-shaders/op_getrows.comp           |     0
 .../kompute-shaders/op_getrows_f16.comp       |     0
 .../kompute-shaders/op_getrows_f32.comp       |     0
 .../kompute-shaders/op_getrows_q4_0.comp      |     0
 .../kompute-shaders/op_getrows_q4_1.comp      |     0
 .../kompute-shaders/op_getrows_q6_k.comp      |     0
 .../kompute-shaders/op_mul.comp               |     0
 .../kompute-shaders/op_mul_mat_f16.comp       |     0
 .../kompute-shaders/op_mul_mat_mat_f32.comp   |     0
 .../kompute-shaders/op_mul_mat_q4_0.comp      |     0
 .../kompute-shaders/op_mul_mat_q4_1.comp      |     0
 .../kompute-shaders/op_mul_mat_q4_k.comp      |     0
 .../kompute-shaders/op_mul_mat_q6_k.comp      |     0
 .../kompute-shaders/op_mul_mat_q8_0.comp      |     0
 .../kompute-shaders/op_mul_mv_q_n.comp        |     0
 .../kompute-shaders/op_mul_mv_q_n_pre.comp    |     0
 .../kompute-shaders/op_norm.comp              |     0
 .../kompute-shaders/op_relu.comp              |     0
 .../kompute-shaders/op_rmsnorm.comp           |     0
 .../kompute-shaders/op_rope_f16.comp          |     0
 .../kompute-shaders/op_rope_f32.comp          |     0
 .../kompute-shaders/op_scale.comp             |     0
 .../kompute-shaders/op_scale_8.comp           |     0
 .../kompute-shaders/op_silu.comp              |     0
 .../kompute-shaders/op_softmax.comp           |     0
 .../kompute-shaders/rope_common.comp          |     0
 ggml/src/ggml-metal/CMakeLists.txt            |   104 +
 ggml/src/{ => ggml-metal}/ggml-metal.m        |     4 +
 ggml/src/{ => ggml-metal}/ggml-metal.metal    |    11 +-
 ggml/src/ggml-musa/CMakeLists.txt             |   111 +
 ggml/src/ggml-quants.c                        | 10506 +--------------
 ggml/src/ggml-quants.h                        |   185 +-
 ggml/src/ggml-rpc/CMakeLists.txt              |    11 +
 ggml/src/{ => ggml-rpc}/ggml-rpc.cpp          |     6 +-
 ggml/src/ggml-sycl/CMakeLists.txt             |    81 +
 ggml/src/{ => ggml-sycl}/ggml-sycl.cpp        |     0
 ggml/src/ggml-threading.cpp                   |    12 +
 ggml/src/ggml-threading.h                     |    12 +
 ggml/src/ggml-vulkan/CMakeLists.txt           |    78 +
 ggml/src/{ => ggml-vulkan}/ggml-vulkan.cpp    |     0
 .../vulkan-shaders/CMakeLists.txt             |     0
 .../{ => ggml-vulkan}/vulkan-shaders/acc.comp |     0
 .../{ => ggml-vulkan}/vulkan-shaders/add.comp |     0
 .../vulkan-shaders/argsort.comp               |     0
 .../vulkan-shaders/clamp.comp                 |     0
 .../vulkan-shaders/concat.comp                |     0
 .../vulkan-shaders/contig_copy.comp           |     0
 .../vulkan-shaders/copy.comp                  |     0
 .../{ => ggml-vulkan}/vulkan-shaders/cos.comp |     0
 .../vulkan-shaders/dequant_f32.comp           |     0
 .../vulkan-shaders/dequant_funcs.comp         |     0
 .../vulkan-shaders/dequant_head.comp          |     0
 .../vulkan-shaders/dequant_iq4_nl.comp        |     0
 .../vulkan-shaders/dequant_q2_k.comp          |     0
 .../vulkan-shaders/dequant_q3_k.comp          |     0
 .../vulkan-shaders/dequant_q4_0.comp          |     0
 .../vulkan-shaders/dequant_q4_1.comp          |     0
 .../vulkan-shaders/dequant_q4_k.comp          |     0
 .../vulkan-shaders/dequant_q5_0.comp          |     0
 .../vulkan-shaders/dequant_q5_1.comp          |     0
 .../vulkan-shaders/dequant_q5_k.comp          |     0
 .../vulkan-shaders/dequant_q6_k.comp          |     0
 .../vulkan-shaders/dequant_q8_0.comp          |     0
 .../vulkan-shaders/diag_mask_inf.comp         |     0
 .../{ => ggml-vulkan}/vulkan-shaders/div.comp |     0
 .../vulkan-shaders/gelu.comp                  |     0
 .../vulkan-shaders/gelu_quick.comp            |     0
 .../vulkan-shaders/generic_binary_head.comp   |     0
 .../vulkan-shaders/generic_head.comp          |     0
 .../vulkan-shaders/generic_unary_head.comp    |     0
 .../vulkan-shaders/get_rows.comp              |     0
 .../vulkan-shaders/get_rows_quant.comp        |     0
 .../vulkan-shaders/group_norm.comp            |     0
 .../vulkan-shaders/im2col.comp                |     0
 .../vulkan-shaders/leaky_relu.comp            |     0
 .../{ => ggml-vulkan}/vulkan-shaders/mul.comp |     0
 .../mul_mat_split_k_reduce.comp               |     0
 .../vulkan-shaders/mul_mat_vec.comp           |     0
 .../vulkan-shaders/mul_mat_vec_base.comp      |     0
 .../vulkan-shaders/mul_mat_vec_nc.comp        |     0
 .../vulkan-shaders/mul_mat_vec_p021.comp      |     0
 .../vulkan-shaders/mul_mat_vec_q2_k.comp      |     0
 .../vulkan-shaders/mul_mat_vec_q3_k.comp      |     0
 .../vulkan-shaders/mul_mat_vec_q4_k.comp      |     0
 .../vulkan-shaders/mul_mat_vec_q5_k.comp      |     0
 .../vulkan-shaders/mul_mat_vec_q6_k.comp      |     0
 .../vulkan-shaders/mul_mm.comp                |     0
 .../vulkan-shaders/norm.comp                  |     0
 .../{ => ggml-vulkan}/vulkan-shaders/pad.comp |     0
 .../vulkan-shaders/pool2d.comp                |     0
 .../vulkan-shaders/relu.comp                  |     0
 .../vulkan-shaders/repeat.comp                |     0
 .../vulkan-shaders/rms_norm.comp              |     0
 .../vulkan-shaders/rope_head.comp             |     0
 .../vulkan-shaders/rope_neox.comp             |     0
 .../vulkan-shaders/rope_norm.comp             |     0
 .../vulkan-shaders/scale.comp                 |     0
 .../vulkan-shaders/silu.comp                  |     0
 .../{ => ggml-vulkan}/vulkan-shaders/sin.comp |     0
 .../vulkan-shaders/soft_max.comp              |     0
 .../vulkan-shaders/square.comp                |     0
 .../vulkan-shaders/sum_rows.comp              |     0
 .../vulkan-shaders/tanh.comp                  |     0
 .../vulkan-shaders/timestep_embedding.comp    |     0
 .../vulkan-shaders/types.comp                 |     0
 .../vulkan-shaders/upscale.comp               |     0
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |     0
 ggml/src/ggml.c                               |   286 +-
 pocs/vdot/vdot.cpp                            |     7 +-
 src/CMakeLists.txt                            |     3 +-
 src/llama.cpp                                 |     7 +-
 tests/test-quantize-fns.cpp                   |    19 +-
 tests/test-quantize-perf.cpp                  |    17 +-
 191 files changed, 17541 insertions(+), 16879 deletions(-)
 create mode 100644 ggml/src/ggml-amx/CMakeLists.txt
 rename ggml/src/{ => ggml-amx}/ggml-amx.cpp (98%)
 create mode 100644 ggml/src/ggml-backend-reg.cpp
 create mode 100644 ggml/src/ggml-blas/CMakeLists.txt
 rename ggml/src/{ => ggml-blas}/ggml-blas.cpp (99%)
 create mode 100644 ggml/src/ggml-cann/CMakeLists.txt
 rename ggml/src/{ => ggml-cann}/ggml-cann.cpp (100%)
 create mode 100644 ggml/src/ggml-cpu/CMakeLists.txt
 rename ggml/{ => src/ggml-cpu}/cmake/FindSIMD.cmake (100%)
 create mode 100644 ggml/src/ggml-cpu/ggml-cpu-aarch64.c
 create mode 100644 ggml/src/ggml-cpu/ggml-cpu-aarch64.h
 rename ggml/src/{ => ggml-cpu}/ggml-cpu-impl.h (52%)
 create mode 100644 ggml/src/ggml-cpu/ggml-cpu-quants.c
 create mode 100644 ggml/src/ggml-cpu/ggml-cpu-quants.h
 rename ggml/src/{ => ggml-cpu}/ggml-cpu.c (99%)
 create mode 100644 ggml/src/ggml-cpu/ggml-cpu.cpp
 rename ggml/src/{ => ggml-cpu}/llamafile/sgemm.cpp (100%)
 rename ggml/src/{ => ggml-cpu}/llamafile/sgemm.h (100%)
 create mode 100644 ggml/src/ggml-cuda/CMakeLists.txt
 rename ggml/src/{ => ggml-cuda}/ggml-cuda.cu (99%)
 create mode 100644 ggml/src/ggml-hip/CMakeLists.txt
 create mode 100644 ggml/src/ggml-kompute/CMakeLists.txt
 rename ggml/src/{ => ggml-kompute}/ggml-kompute.cpp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/common.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_add.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_addrow.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_cpy_f16_f16.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_cpy_f16_f32.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_cpy_f32_f16.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_cpy_f32_f32.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_diagmask.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_gelu.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_getrows.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_getrows_f16.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_getrows_f32.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_getrows_q4_0.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_getrows_q4_1.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_getrows_q6_k.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_f16.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_mat_f32.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_q4_0.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_q4_1.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_q4_k.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_q6_k.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mat_q8_0.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mv_q_n.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_mul_mv_q_n_pre.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_norm.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_relu.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_rmsnorm.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_rope_f16.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_rope_f32.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_scale.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_scale_8.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_silu.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/op_softmax.comp (100%)
 rename ggml/src/{ => ggml-kompute}/kompute-shaders/rope_common.comp (100%)
 create mode 100644 ggml/src/ggml-metal/CMakeLists.txt
 rename ggml/src/{ => ggml-metal}/ggml-metal.m (99%)
 rename ggml/src/{ => ggml-metal}/ggml-metal.metal (99%)
 create mode 100644 ggml/src/ggml-musa/CMakeLists.txt
 create mode 100644 ggml/src/ggml-rpc/CMakeLists.txt
 rename ggml/src/{ => ggml-rpc}/ggml-rpc.cpp (99%)
 create mode 100644 ggml/src/ggml-sycl/CMakeLists.txt
 rename ggml/src/{ => ggml-sycl}/ggml-sycl.cpp (100%)
 create mode 100644 ggml/src/ggml-threading.cpp
 create mode 100644 ggml/src/ggml-threading.h
 create mode 100644 ggml/src/ggml-vulkan/CMakeLists.txt
 rename ggml/src/{ => ggml-vulkan}/ggml-vulkan.cpp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/CMakeLists.txt (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/acc.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/add.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/argsort.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/clamp.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/concat.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/contig_copy.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/copy.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/cos.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_f32.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_funcs.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_head.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_iq4_nl.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q2_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q3_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q4_0.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q4_1.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q4_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q5_0.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q5_1.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q5_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q6_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/dequant_q8_0.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/diag_mask_inf.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/div.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/gelu.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/gelu_quick.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/generic_binary_head.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/generic_head.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/generic_unary_head.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/get_rows.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/get_rows_quant.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/group_norm.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/im2col.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/leaky_relu.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_split_k_reduce.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_base.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_nc.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_p021.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_q2_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_q3_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_q4_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_q5_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mat_vec_q6_k.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/mul_mm.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/norm.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/pad.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/pool2d.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/relu.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/repeat.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/rms_norm.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/rope_head.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/rope_neox.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/rope_norm.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/scale.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/silu.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/sin.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/soft_max.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/square.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/sum_rows.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/tanh.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/timestep_embedding.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/types.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/upscale.comp (100%)
 rename ggml/src/{ => ggml-vulkan}/vulkan-shaders/vulkan-shaders-gen.cpp (100%)

diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile
index b75163b94..3279c8da4 100644
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -23,15 +23,16 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
     cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /
 
 ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
index b5696794f..1edf75cf2 100644
--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -16,15 +16,16 @@ WORKDIR /app
 COPY . .
 
 RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-cli /llama-cli
 
 ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile
index a40e24205..ea07a4d52 100644
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -23,15 +23,16 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
     cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
index 193a6d77c..259877468 100644
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@@ -16,15 +16,16 @@ WORKDIR /app
 COPY . .
 
 RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 5d7d7ea5a..b88e6ca80 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -126,9 +126,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   };
 
   postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
       --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
   '';
 
@@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
       (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIPBLAS" useRocm)
+      (cmakeBool "GGML_HIP" useRocm)
       (cmakeBool "GGML_METAL" useMetalKit)
       (cmakeBool "GGML_VULKAN" useVulkan)
       (cmakeBool "GGML_STATIC" enableStatic)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1e37a3c79..d6a7b66a5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -405,13 +405,13 @@ jobs:
       - name: Build with native CMake HIP support
         id: cmake_build
         run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
           cmake --build build --config Release -j $(nproc)
 
       - name: Build with legacy HIP support
         id: cmake_build_legacy_hip
         run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)
 
   ubuntu-22-cmake-sycl:
@@ -747,7 +747,7 @@ jobs:
         id: clone_kompute
         if: ${{ matrix.build == 'kompute-x64' }}
         run: |
-          git submodule update --init ggml/src/kompute
+          git submodule update --init ggml/src/ggml-kompute/kompute
 
       - name: Download OpenBLAS
         id: get_openblas
@@ -1014,7 +1014,7 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
 
   windows-latest-cmake-hip-release:
@@ -1050,7 +1050,7 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
           md "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
diff --git a/.gitmodules b/.gitmodules
index 5861d59cb..23ce5ff05 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/kompute
+	path = ggml/src/ggml-kompute/kompute
 	url = https://github.com/nomic-ai/kompute.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef0932a7b..93c60ef43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,7 +140,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
 
-
 # At the moment some compile definitions are placed within the ggml/src
 # directory but not exported on the `ggml` target. This could be improved by
 # determining _precisely_ which defines are necessary for the llama-config
diff --git a/Makefile b/Makefile
index dfa32d516..de06cb8b0 100644
--- a/Makefile
+++ b/Makefile
@@ -523,65 +523,54 @@ ifndef GGML_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
+		MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
 		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
 		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 		MK_LDFLAGS  += -framework Accelerate
-		OBJ_GGML    += ggml/src/ggml-blas.o
+		OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
 	endif
 endif # GGML_NO_ACCELERATE
 
-ifdef GGML_MUSA
-	CC := clang
-	CXX := clang++
-	GGML_CUDA := 1
-	MK_CPPFLAGS += -DGGML_USE_MUSA
-endif
-
 ifndef GGML_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
-	ifdef GGML_MUSA
-		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
-		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
-	endif # GGML_MUSA
 endif # GGML_NO_OPENMP
 
 ifdef GGML_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS
 
 ifdef GGML_OPENBLAS64
 	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS64
 
 ifdef GGML_BLIS
 	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_BLIS
 
 ifdef GGML_NVPL
 	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
 	MK_LDFLAGS  += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
-	OBJ_GGML    += ggml/src/ggml-blas.o
+	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_NVPL
 
 ifndef GGML_NO_LLAMAFILE
 	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJ_GGML    += ggml/src/llamafile/sgemm.o
+	OBJ_GGML    += ggml/src/ggml-cpu/llamafile/sgemm.o
 endif
 
 ifndef GGML_NO_AMX
 	MK_CPPFLAGS += -DGGML_USE_AMX
-	OBJ_GGML    += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+	OBJ_GGML    += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
 endif
 
 ifdef GGML_RPC
@@ -601,29 +590,17 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS
 
 ifdef GGML_CUDA
-	ifdef GGML_MUSA
-		ifneq ('', '$(wildcard /opt/musa)')
-			CUDA_PATH ?= /opt/musa
-		else
-			CUDA_PATH ?= /usr/local/musa
-		endif
-
-		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
-		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
+	ifneq ('', '$(wildcard /opt/cuda)')
+		CUDA_PATH ?= /opt/cuda
 	else
-		ifneq ('', '$(wildcard /opt/cuda)')
-			CUDA_PATH ?= /opt/cuda
-		else
-			CUDA_PATH ?= /usr/local/cuda
-		endif
+		CUDA_PATH ?= /usr/local/cuda
+	endif
 
-		MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-		MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-		MK_NVCCFLAGS += -use_fast_math
-	endif # GGML_MUSA
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+	MK_NVCCFLAGS += -use_fast_math
 
-	OBJ_GGML += ggml/src/ggml-cuda.o
+	OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
 	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML += $(OBJ_CUDA_TMPL)
 
@@ -631,11 +608,9 @@ ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
 
-ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
-endif # GGML_MUSA
 
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
@@ -648,11 +623,7 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-	ifdef GGML_MUSA
-		NVCC = $(CCACHE) mcc
-	else
-		NVCC = $(CCACHE) nvcc
-	endif # GGML_MUSA
+	NVCC = $(CCACHE) nvcc
 endif # GGML_CUDA_NVCC
 
 ifdef CUDA_DOCKER_ARCH
@@ -724,15 +695,9 @@ define NVCC_COMPILE
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
-	ifdef GGML_MUSA
-define NVCC_COMPILE
-	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
-endef # NVCC_COMPILE
-	else
 define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
-	endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT
 
 ggml/src/ggml-cuda/%.o: \
@@ -742,8 +707,8 @@ ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-cuda/common.cuh
 	$(NVCC_COMPILE)
 
-ggml/src/ggml-cuda.o: \
-	ggml/src/ggml-cuda.cu \
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
 	ggml/include/ggml-cuda.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h \
@@ -819,7 +784,7 @@ ifdef GGML_HIPBLAS
 	GGML_CUDA_MMV_Y        ?= 1
 	GGML_CUDA_KQUANTS_ITER ?= 2
 
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 
 ifdef GGML_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
@@ -852,12 +817,12 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 
-	OBJ_GGML += ggml/src/ggml-cuda.o
+	OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
 	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML += $(OBJ_CUDA_TMPL)
 
-ggml/src/ggml-cuda.o: \
-	ggml/src/ggml-cuda.cu \
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
 	ggml/include/ggml-cuda.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h \
@@ -874,10 +839,111 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS
 
+ifdef GGML_MUSA
+	ifeq ($(wildcard /opt/musa),)
+		MUSA_PATH ?= /usr/local/musa
+	else
+		MUSA_PATH ?= /opt/musa
+	endif
+	MTGPU_TARGETS ?= mp_21 mp_22
+
+	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
+	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
+	MK_LDFLAGS += -lmusa -lmusart -lmublas
+
+	ifndef GGML_NO_OPENMP
+		# For Ubuntu Focal
+		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+		# For Ubuntu Jammy
+		MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
+		MK_LDFLAGS  += -L/usr/lib/llvm-14/lib
+	endif # GGML_NO_OPENMP
+
+	CC  := $(MUSA_PATH)/bin/clang
+	CXX := $(MUSA_PATH)/bin/clang++
+	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
+
+	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
+
+ifdef GGML_CUDA_FORCE_DMMV
+	MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # GGML_CUDA_FORCE_DMMV
+
+ifdef GGML_CUDA_FORCE_MMQ
+	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+	MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_DMMV_X
+	MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
+else
+	MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # GGML_CUDA_DMMV_X
+
+ifdef GGML_CUDA_MMV_Y
+	MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
+else
+	MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
+endif # GGML_CUDA_MMV_Y
+
+ifdef GGML_CUDA_F16
+	MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+	MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_KQUANTS_ITER
+	MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
+else
+	MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
+endif
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+	OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+endif # GGML_MUSA
+
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJ_GGML	+= ggml/src/ggml-metal.o
+	OBJ_GGML	+= ggml/src/ggml-metal/ggml-metal.o
 
 ifdef GGML_METAL_USE_BF16
 	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
@@ -887,30 +953,30 @@ ifdef GGML_METAL_NDEBUG
 endif
 ifdef GGML_METAL_EMBED_LIBRARY
 	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-	OBJ_GGML   += ggml/src/ggml-metal-embed.o
+	OBJ_GGML    += ggml/src/ggml-metal-embed.o
 endif
 endif # GGML_METAL
 
 ifdef GGML_METAL
-ggml/src/ggml-metal.o: \
-	ggml/src/ggml-metal.m \
+ggml/src/ggml-metal/ggml-metal.o: \
+	ggml/src/ggml-metal/ggml-metal.m \
 	ggml/include/ggml-metal.h \
 	ggml/include/ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef GGML_METAL_EMBED_LIBRARY
 ggml/src/ggml-metal-embed.o: \
-	ggml/src/ggml-metal.metal \
+	ggml/src/ggml-metal/ggml-metal.metal \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
-	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
+	@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
-	@echo ".section __DATA, __ggml_metallib"            >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".globl _ggml_metallib_start"                 >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo "_ggml_metallib_start:"                       >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".globl _ggml_metallib_end"                   >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo "_ggml_metallib_end:"                         >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".section __DATA, __ggml_metallib"                       >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start"                            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:"                                  >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end"                              >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:"                                    >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
 	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
 	@rmdir ${TEMP_ASSEMBLY}
@@ -919,11 +985,16 @@ endif # GGML_METAL
 
 OBJ_GGML += \
 	ggml/src/ggml.o \
-	ggml/src/ggml-cpu.o \
+	ggml/src/ggml-aarch64.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
+	ggml/src/ggml-backend-reg.o \
 	ggml/src/ggml-quants.o \
-	ggml/src/ggml-aarch64.o
+	ggml/src/ggml-threading.o \
+	ggml/src/ggml-cpu/ggml-cpu.o \
+	ggml/src/ggml-cpu/ggml-cpu-cpp.o \
+	ggml/src/ggml-cpu/ggml-cpu-aarch64.o \
+	ggml/src/ggml-cpu/ggml-cpu-quants.o
 
 OBJ_LLAMA = \
 	src/llama.o \
@@ -997,7 +1068,6 @@ $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
-ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 
 ifndef CUDA_DOCKER_ARCH
@@ -1007,7 +1077,6 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )
 
@@ -1051,12 +1120,23 @@ ggml/src/ggml.o: \
 	ggml/include/ggml.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-ggml/src/ggml-cpu.o: \
-	ggml/src/ggml-cpu.c \
+ggml/src/ggml-threading.o: \
+	ggml/src/ggml-threading.cpp \
+	ggml/include/ggml.h
+	$(CXX) $(XXCFLAGS)   -c $< -o $@
+
+ggml/src/ggml-cpu/ggml-cpu.o: \
+	ggml/src/ggml-cpu/ggml-cpu.c \
 	ggml/include/ggml.h \
 	ggml/src/ggml-common.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
+ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
+	ggml/src/ggml-cpu/ggml-cpu.cpp \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h
+	$(CXX) $(CXXFLAGS)   -c $< -o $@
+
 ggml/src/ggml-alloc.o: \
 	ggml/src/ggml-alloc.c \
 	ggml/include/ggml.h \
@@ -1084,22 +1164,22 @@ ggml/src/ggml-aarch64.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@
 
-ggml/src/ggml-blas.o: \
-	ggml/src/ggml-blas.cpp \
+ggml/src/ggml-blas/ggml-blas.o: \
+	ggml/src/ggml-blas/ggml-blas.cpp \
 	ggml/include/ggml-blas.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ifndef GGML_NO_LLAMAFILE
-ggml/src/llamafile/sgemm.o: \
-	ggml/src/llamafile/sgemm.cpp \
-	ggml/src/llamafile/sgemm.h \
+ggml/src/ggml-cpu/llamafile/sgemm.o: \
+	ggml/src/ggml-cpu/llamafile/sgemm.cpp \
+	ggml/src/ggml-cpu/llamafile/sgemm.h \
 	ggml/include/ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@ -I ggml/src -I ggml/src/ggml-cpu
 endif # GGML_NO_LLAMAFILE
 
 ifndef GGML_NO_AMX
-ggml/src/ggml-amx.o: \
-	ggml/src/ggml-amx.cpp \
+ggml/src/ggml-amx/ggml-amx.o: \
+	ggml/src/ggml-amx/ggml-amx.cpp \
 	ggml/include/ggml-amx.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -1250,13 +1330,24 @@ clean:
 	rm -rvf ggml/*.a
 	rm -rvf ggml/*.dll
 	rm -rvf ggml/*.so
-	rm -vrf ggml/src/*.o
-	rm -rvf ggml/src/llamafile/*.o
+	rm -rvf ggml/src/*.o
 	rm -rvf common/build-info.cpp
-	rm -vrf ggml/src/ggml-metal-embed.metal
+	rm -rvf ggml/src/ggml-cpu/*.o
+	rm -rvf ggml/src/ggml-cpu/llamafile/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
+	rm -vrf ggml/src/ggml-blas/*.o
+	rm -vrf ggml/src/ggml-cann/*.o
+	rm -vrf ggml/src/ggml-cpu/*.o
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
-	rm -vrf ggml/src/ggml-amx/*.o
+	rm -vrf ggml/src/ggml-hip/*.o
+	rm -vrf ggml/src/ggml-kompute/*.o
+	rm -vrf ggml/src/ggml-metal/*.o
+	rm -vrf ggml/src/ggml-metal/ggml-metal-embed.metal
+	rm -vrf ggml/src/ggml-rpc/*.o
+	rm -vrf ggml/src/ggml-sycl/*.o
+	rm -vrf ggml/src/ggml-vulkan/*.o
+	rm -vrf ggml/src/ggml-musa/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
diff --git a/Package.swift b/Package.swift
index 0f4f19018..6b68aecde 100644
--- a/Package.swift
+++ b/Package.swift
@@ -10,11 +10,16 @@ var sources = [
     "src/unicode.cpp",
     "src/unicode-data.cpp",
     "ggml/src/ggml.c",
-    "ggml/src/ggml-cpu.c",
+    "ggml/src/ggml-aarch64.c",
     "ggml/src/ggml-alloc.c",
     "ggml/src/ggml-backend.cpp",
+    "ggml/src/ggml-backend-reg.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu.c",
+    "ggml/src/ggml-cpu/ggml-cpu.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+    "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+    "ggml/src/ggml-threading.cpp",
     "ggml/src/ggml-quants.c",
-    "ggml/src/ggml-aarch64.c",
 ]
 
 var resources: [Resource] = []
@@ -22,6 +27,7 @@ var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] =  [
     .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
     .unsafeFlags(["-fno-objc-arc"]),
+    .headerSearchPath("ggml/src"),
     // NOTE: NEW_LAPACK will required iOS version 16.4+
     // We should consider add this in the future when we drop support for iOS 14
     // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
@@ -30,8 +36,9 @@ var cSettings: [CSetting] =  [
 ]
 
 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+sources.append("ggml/src/ggml-common.h")
+sources.append("ggml/src/ggml-metal/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [
diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index f072b76a3..ef68417b4 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -6,7 +6,7 @@ set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
 set(GGML_BLAS       @GGML_BLAS@)
 set(GGML_CUDA       @GGML_CUDA@)
 set(GGML_METAL      @GGML_METAL@)
-set(GGML_HIPBLAS    @GGML_HIPBLAS@)
+set(GGML_HIP        @GGML_HIP@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
 set(GGML_VULKAN @GGML_VULKAN@)
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
diff --git a/common/common.cpp b/common/common.cpp
index 19674af15..ebd16b600 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1967,18 +1967,13 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
     fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n",        ggml_cpu_has_cuda()        ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n",      ggml_cpu_has_vulkan()      ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n",     ggml_cpu_has_kompute()     ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
     fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
     fprintf(stream, "cpu_has_sve: %s\n",         ggml_cpu_has_sve()         ? "true" : "false");
     fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
     fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
     fprintf(stream, "cpu_has_riscv_v: %s\n",     ggml_cpu_has_riscv_v()     ? "true" : "false");
     fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
     fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
     fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
     fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
diff --git a/docs/build.md b/docs/build.md
index 4e362ebc7..95512415a 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -230,7 +230,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build --config Release -- -j 16
   ```
   On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@@ -247,7 +247,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
   HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build -- -j 16
   ```
 
@@ -259,7 +259,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
   ```bash
   set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
   cmake --build build
   ```
   Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 1eddfd0db..8f4e0e206 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -774,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-    static const bool cuda;
-    static const bool vulkan;
-    static const bool kompute;
-    static const bool metal;
-    static const bool sycl;
-    static const bool gpu_blas;
-    static const bool blas;
     static const std::string cpu_info;
     static const std::string gpu_info;
     std::string model_filename;
@@ -793,7 +786,6 @@ struct test {
     std::string cpu_mask;
     bool cpu_strict;
     int poll;
-    bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
     int n_gpu_layers;
@@ -822,7 +814,6 @@ struct test {
         cpu_mask = inst.cpu_mask;
         cpu_strict = inst.cpu_strict;
         poll = inst.poll;
-        has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
@@ -881,8 +872,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
-            "cpu_info", "gpu_info",
+            "cpu_info", "gpu_info", "backends",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
             "n_threads", "cpu_mask", "cpu_strict", "poll",
@@ -908,8 +898,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+        if (field == "f16_kv" || field == "no_kv_offload" ||
             field == "cpu_strict" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
@@ -938,9 +927,7 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
-            cpu_info, gpu_info,
+            cpu_info, gpu_info, get_backend(),
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
             std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
@@ -967,13 +954,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cuda();
-const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
-const bool        test::kompute      = !!ggml_cpu_has_kompute();
-const bool        test::metal        = !!ggml_cpu_has_metal();
-const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
-const bool        test::blas         = !!ggml_cpu_has_blas();
-const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 
@@ -1178,7 +1158,8 @@ struct markdown_printer : public printer {
         fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
-        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
+                              test::get_backend().find("BLAS") != std::string::npos;
         if (!is_cpu_backend) {
             fields.emplace_back("n_gpu_layers");
         }
@@ -1268,9 +1249,6 @@ struct markdown_printer : public printer {
                 value = buf;
             } else if (field == "backend") {
                 value = test::get_backend();
-                if (t.has_rpc) {
-                    value += "+RPC";
-                }
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
                     snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index e372856c6..912caf346 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
     if (use_reference) {
         qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
     }
     qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;
 
     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
                 output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
              &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {
@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                         quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };
@@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        const auto *  qfns = ggml_get_type_traits(type);
-        if (qfns->from_float && qfns->to_float) {
+        const auto *  qfns     = ggml_get_type_traits(type);
+        const auto *  qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n",  ggml_type_name(type));
             }
@@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
                 test_roundtrip_on_layer(
                         layer_name,
                         params.per_layer_stats,
-                        *qfns,
+                        *qfns, *qfns_cpu,
                         params.reference,
                         kv_tensor.second,
                         input_scratch,
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 81b7a02f5..3e5b16f86 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -116,6 +116,7 @@ endif()
 
 # ggml core
 set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
 
 # 3rd party libs / backends
 option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
@@ -141,7 +142,7 @@ option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
 
-option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
+option(GGML_HIP                             "ggml: use HIP"                                   OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
@@ -238,12 +239,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
 
 if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml LIBRARY)
+    install(TARGETS ggml      LIBRARY)
+    install(TARGETS ggml-base LIBRARY)
 endif()
 
+# FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
     install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
         PERMISSIONS
             OWNER_READ
             OWNER_WRITE
diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h
index 22b3f70f4..042d6d919 100644
--- a/ggml/include/ggml-amx.h
+++ b/ggml/include/ggml-amx.h
@@ -9,16 +9,16 @@ extern "C" {
 #endif
 
 // buffer_type API
-GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
 
-GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_amx_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
 
-GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
 
-GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 125413d1b..0a65dbfca 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h
index 25b2e637f..87a81b363 100644
--- a/ggml/include/ggml-blas.h
+++ b/ggml/include/ggml-blas.h
@@ -9,15 +9,15 @@ extern "C" {
 #endif
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_blas_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
 
-GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
-GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
 
 
 #ifdef  __cplusplus
diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h
index 528975493..b469e228d 100644
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@@ -34,7 +34,7 @@ extern "C" {
  */
 #define GGML_CANN_MAX_DEVICES 16
 
-GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
 
 /**
  * @brief Initializes the CANN backend for a specified device.
@@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
  * @param device The index of the device to initialize.
  * @return A pointer to the initialized backend instance, or nullptr on failure.
  */
-GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
 
 /**
  * @brief Checks if a given backend is a CANN backend.
@@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
  * @param backend The backend instance to check.
  * @return True if the backend is a CANN backend, false otherwise.
  */
-GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
 
 /**
  * @brief Retrieves the CANN buffer type for a specified device.
@@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
  * @return A pointer to the buffer type interface for the specified device, or
  * nullptr if the device index is out of range.
  */
-GGML_API ggml_backend_buffer_type_t
+GGML_BACKEND_API ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device);
 
 /**
@@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
  *
  * @return The number of CANN devices available.
  */
-GGML_API int32_t ggml_backend_cann_get_device_count(void);
+GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
 
 /**
  * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
  *
  * @return A pointer to the host buffer type interface.
  */
-GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 
 /**
  * @brief Retrieves the description of a specific CANN device.
@@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
  * @param description Pointer to a buffer where the description will be written.
  * @param description_size Size of the description buffer.
  */
-GGML_API void ggml_backend_cann_get_device_description(
+GGML_BACKEND_API void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size);
 
 /**
@@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
  * @param total Pointer to a variable where the total memory size will be
  * stored.
  */
-GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
                                                   size_t* free,
                                                   size_t* total);
 
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 7f1ee7573..4da62cb2b 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -54,54 +54,77 @@ extern "C" {
         GGML_NUMA_STRATEGY_COUNT
     };
 
-    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
 
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 
-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
+    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
                   const struct ggml_cgraph * cgraph,
                                        int   n_threads, /* = GGML_DEFAULT_N_THREADS */
                     struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
-    // TODO: move to backend interface
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_sve        (void);
-    GGML_API int ggml_cpu_has_matmul_int8(void);
-    // get the sve vector length in bytes
-    GGML_API int ggml_cpu_get_sve_cnt(void);
+    //
+    // system info
+    //
+
+    // x86
+    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
+    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
+    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
+    // ARM
+    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
+    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
+    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
+    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
+    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
+    // other
+    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
 
     // Internal types and functions exposed for tests and benchmarks
 
@@ -115,6 +138,7 @@ extern "C" {
                                        const void * GGML_RESTRICT y, int nr, int nc);
 
     struct ggml_type_traits_cpu {
+        ggml_from_float_t        from_float;
         ggml_from_float_to_mat_t from_float_to_mat;
         ggml_vec_dot_t           vec_dot;
         enum ggml_type           vec_dot_type;
@@ -124,25 +148,25 @@ extern "C" {
         ggml_gemm_t              gemm;
     };
 
-    GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
 
-    GGML_API void ggml_cpu_init(void);
+    GGML_BACKEND_API void ggml_cpu_init(void);
 
     //
     // CPU backend
     //
 
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
-    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
 #ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+    GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 
 #ifdef __cplusplus
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index 305d0b636..22ad2c009 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -7,7 +7,7 @@
 extern "C" {
 #endif
 
-#ifdef GGML_USE_HIPBLAS
+#ifdef GGML_USE_HIP
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
 #elif defined(GGML_USE_MUSA)
@@ -20,27 +20,27 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
 
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 
 // device buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
-GGML_API int  ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
 
-GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
-GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h
index c0c43521b..154aa56a7 100644
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 
-GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
 
-GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
 
-GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index b8d3f678b..669c1f84a 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -39,27 +39,27 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
 
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_DEPRECATED(
-        GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
         "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
 
-GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
 
 // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
 
-GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
index d57967368..ade6c3b0e 100644
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -10,18 +10,18 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS       16
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
 
-GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
 
-GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
 
-GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 
-GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
index af521f599..5ce349a88 100644
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@@ -17,32 +17,32 @@ extern "C" {
 #endif
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
 
-GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
 
 // devide buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
-GGML_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API void ggml_backend_sycl_get_device_description(int device,
+GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
                                                        char *description,
                                                        size_t description_size);
-GGML_API int  ggml_backend_sycl_get_device_count();
-GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_BACKEND_API int  ggml_backend_sycl_get_device_count();
+GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 
 // SYCL doesn't support registering host memory, keep here for reference
-// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
 
-GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
index c03bbfe5e..53cdba072 100644
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@@ -10,21 +10,21 @@ extern "C" {
 #define GGML_VK_NAME "Vulkan"
 #define GGML_VK_MAX_DEVICES 16
 
-GGML_API void ggml_vk_instance_init(void);
+GGML_BACKEND_API void ggml_vk_instance_init(void);
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
 
-GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API int  ggml_backend_vk_get_device_count(void);
-GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
-GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 73ede1813..3b3f6798a 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -176,15 +176,15 @@
 #ifdef GGML_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
+#            define GGML_API __declspec(dllexport) extern
 #        else
-#            define GGML_API __declspec(dllimport)
+#            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default")))
+#        define GGML_API __attribute__ ((visibility ("default"))) extern
 #    endif
 #else
-#    define GGML_API
+#    define GGML_API extern
 #endif
 
 // TODO: support for clang
@@ -1490,7 +1490,7 @@ extern "C" {
         "use ggml_rope_ext_inplace instead");
 
     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_API void ggml_rope_yarn_corr_dims(
         int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // rotary position embedding backward, i.e compute dx from dy
@@ -2384,38 +2384,6 @@ extern "C" {
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
     GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
 
-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx_vnni   (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_avx512_bf16(void);
-    GGML_API int ggml_cpu_has_amx_int8   (void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_metal      (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cuda       (void);
-    GGML_API int ggml_cpu_has_vulkan     (void);
-    GGML_API int ggml_cpu_has_kompute    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_riscv_v    (void);
-    GGML_API int ggml_cpu_has_sycl       (void);
-    GGML_API int ggml_cpu_has_rpc        (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-    GGML_API int ggml_cpu_has_cann       (void);
-    GGML_API int ggml_cpu_has_llamafile  (void);
-
 #ifdef  __cplusplus
 // restrict not standard in C++
 #define GGML_RESTRICT
@@ -2432,7 +2400,6 @@ extern "C" {
         size_t                   type_size;
         bool                     is_quantized;
         ggml_to_float_t          to_float;
-        ggml_from_float_t        from_float;
         ggml_from_float_t        from_float_ref;
     };
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index a05f8c505..71934c679 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1,7 +1,5 @@
 include(CheckCXXCompilerFlag)
 
-unset(GGML_CDEF_PUBLIC)
-
 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
 
 # enable libstdc++ assertions for debug builds
@@ -26,929 +24,6 @@ if (NOT MSVC)
     endif()
 endif()
 
-unset(GGML_EXTRA_LIBS_PRIVATE)
-unset(GGML_EXTRA_LIBS_PUBLIC)
-
-if (APPLE AND GGML_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
-
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (GGML_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-
-    message(STATUS "Metal framework found")
-    set(GGML_HEADERS_METAL ../include/ggml-metal.h)
-    set(GGML_SOURCES_METAL ggml-metal.m)
-
-    list(APPEND GGML_CDEF_PUBLIC GGML_USE_METAL)
-    if (GGML_METAL_NDEBUG)
-        add_compile_definitions(GGML_METAL_NDEBUG)
-    endif()
-
-    if (GGML_METAL_USE_BF16)
-        add_compile_definitions(GGML_METAL_USE_BF16)
-    endif()
-
-    # copy ggml-common.h and ggml-metal.metal to bin directory
-    configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-    if (GGML_METAL_EMBED_LIBRARY)
-        enable_language(ASM)
-
-        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
-
-        set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
-        set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-
-        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-
-        # merge ggml-common.h and ggml-metal.metal into a single file
-        set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
-        set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
-
-        add_custom_command(
-            OUTPUT ${METALLIB_EMBED_ASM}
-            COMMAND echo "Embedding Metal library"
-            COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
-            COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
-            COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
-            COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
-            COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
-            COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
-            DEPENDS ggml-metal.metal ggml-common.h
-            COMMENT "Generate assembly for embedded Metal library"
-        )
-
-        list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
-    else()
-        if (GGML_METAL_SHADER_DEBUG)
-            # custom command to do the following:
-            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-            #
-            # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-            #       disabling fast math is needed in order to pass tests/test-backend-ops
-            # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-            # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-            #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-            set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        else()
-            set(XC_FLAGS -O3)
-        endif()
-
-        # Append macOS metal versioning flags
-        if (GGML_METAL_MACOSX_VERSION_MIN)
-            message(STATUS "Adding  -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
-            list   (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
-        endif()
-
-        if (GGML_METAL_STD)
-            message(STATUS "Adding  -std=${GGML_METAL_STD} flag to metal compilation")
-            list   (APPEND XC_FLAGS -std=${GGML_METAL_STD})
-        endif()
-
-        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
-            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
-            DEPENDS ggml-metal.metal ggml-common.h
-            COMMENT "Compiling Metal kernels"
-            )
-
-        add_custom_target(
-            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            )
-    endif() # GGML_METAL_EMBED_LIBRARY
-
-    list(APPEND GGML_EXTRA_LIBS_PRIVATE
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        )
-endif()
-
-if (GGML_MUSA)
-    set(CMAKE_C_COMPILER clang)
-    set(CMAKE_C_EXTENSIONS OFF)
-    set(CMAKE_CXX_COMPILER clang++)
-    set(CMAKE_CXX_EXTENSIONS OFF)
-
-    set(GGML_CUDA ON)
-
-    list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
-endif()
-
-if (GGML_OPENMP)
-    find_package(OpenMP)
-    if (OpenMP_FOUND)
-        message(STATUS "OpenMP found")
-
-        add_compile_definitions(GGML_USE_OPENMP)
-
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-
-        if (GGML_MUSA)
-            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
-        endif()
-    else()
-        message(WARNING "OpenMP not found")
-    endif()
-endif()
-
-if (GGML_BLAS)
-    if (GGML_STATIC)
-        set(BLA_STATIC ON)
-    endif()
-    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-    #    set(BLA_SIZEOF_INTEGER 8)
-    #endif()
-
-    set(BLA_VENDOR ${GGML_BLAS_VENDOR})
-    find_package(BLAS)
-
-    if (BLAS_FOUND)
-        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
-
-        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${GGML_BLAS_VENDOR} MATCHES "Apple"))
-            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
-            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
-            find_package(PkgConfig REQUIRED)
-            if (${GGML_BLAS_VENDOR} MATCHES "Generic")
-                pkg_check_modules(DepBLAS blas)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS openblas)
-                endif()
-            elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
-                add_compile_definitions(GGML_BLAS_USE_BLIS)
-                pkg_check_modules(DepBLAS blis)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
-                pkg_check_modules(DepBLAS blas-atlas)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
-                pkg_check_modules(DepBLAS flexiblas_api)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
-                add_compile_definitions(GGML_BLAS_USE_MKL)
-                # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS mkl-sdl)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
-                # this doesn't provide pkg-config
-                # suggest to assign BLAS_INCLUDE_DIRS on your own
-                if ("${NVHPC_VERSION}" STREQUAL "")
-                    message(WARNING "Better to set NVHPC_VERSION")
-                else()
-                    set(DepBLAS_FOUND ON)
-                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
-                endif()
-            endif()
-            if (DepBLAS_FOUND)
-                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
-            else()
-                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
-                " detected by pkgconfig, trying to find cblas.h from possible paths...")
-                find_path(BLAS_INCLUDE_DIRS
-                    NAMES cblas.h
-                    HINTS
-                        /usr/include
-                        /usr/local/include
-                        /usr/include/openblas
-                        /opt/homebrew/opt/openblas/include
-                        /usr/local/opt/openblas/include
-                        /usr/include/x86_64-linux-gnu/openblas/include
-                )
-            endif()
-        endif()
-
-        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
-
-        add_compile_options(${BLAS_LINKER_FLAGS})
-
-        list(APPEND GGML_CDEF_PUBLIC GGML_USE_BLAS)
-
-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
-            add_compile_definitions(GGML_BLAS_USE_MKL)
-        endif()
-
-        set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
-        set(GGML_SOURCES_BLAS ggml-blas.cpp)
-
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
-        list(APPEND GGML_EXTRA_INCLUDES     ${BLAS_INCLUDE_DIRS})
-    else()
-        message(WARNING "BLAS not found, please refer to "
-        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-        " to set correct GGML_BLAS_VENDOR")
-    endif()
-endif()
-
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
-endif()
-
-if (GGML_AMX)
-    if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
-    else()
-        set(GGML_AMX OFF)
-        message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
-    endif()
-
-    if (GGML_AMX)
-        message(STATUS "Using AMX")
-
-        list(APPEND GGML_CDEF_PUBLIC GGML_USE_AMX)
-
-        file(GLOB   GGML_HEADERS_AMX "ggml-amx/*.h")
-        list(APPEND GGML_HEADERS_AMX "../include/ggml-amx.h")
-
-        file(GLOB   GGML_SOURCES_AMX "ggml-amx/*.cpp")
-        list(APPEND GGML_SOURCES_AMX "ggml-amx.cpp")
-    endif()
-endif()
-
-if (GGML_CUDA)
-    cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
-
-    if (GGML_MUSA)
-        list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
-        find_package(MUSAToolkit)
-        set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
-    else()
-        find_package(CUDAToolkit)
-    endif()
-
-    if (CUDAToolkit_FOUND)
-        message(STATUS "CUDA found")
-
-        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-            # 52 == lowest CUDA 12 standard
-            # 60 == FP16 CUDA intrinsics
-            # 61 == integer CUDA intrinsics
-            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-            if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-                set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
-            else()
-                set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
-                #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
-            endif()
-        endif()
-        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-        if (GGML_MUSA)
-            set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
-        else()
-            enable_language(CUDA)
-        endif()
-
-        file(GLOB   GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
-        list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
-
-        file(GLOB   GGML_SOURCES_CUDA "ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
-        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "ggml-cuda/template-instances/mmq*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-
-        if (GGML_CUDA_FA_ALL_QUANTS)
-            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-        else()
-            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        endif()
-
-        list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
-
-        add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-        add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
-        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
-
-        if (GGML_CUDA_GRAPHS)
-            add_compile_definitions(GGML_CUDA_USE_GRAPHS)
-        endif()
-
-        if (GGML_CUDA_FORCE_DMMV)
-            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-        endif()
-
-        if (GGML_CUDA_FORCE_MMQ)
-            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-        endif()
-
-        if (GGML_CUDA_FORCE_CUBLAS)
-            add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
-        endif()
-
-        if (GGML_CUDA_NO_VMM)
-            add_compile_definitions(GGML_CUDA_NO_VMM)
-        endif()
-
-        if (DEFINED GGML_CUDA_DMMV_Y)
-            add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
-        endif()
-
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-            add_compile_definitions(GGML_CUDA_F16)
-        endif()
-
-        if (GGML_CUDA_NO_PEER_COPY)
-            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-        endif()
-
-        if (GGML_MUSA)
-            set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
-            foreach(SOURCE ${GGML_SOURCES_CUDA})
-                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
-            endforeach()
-        endif()
-
-        if (GGML_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                if (GGML_MUSA)
-                    list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
-                else()
-                    list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-                endif()
-            endif()
-        else()
-            if (GGML_MUSA)
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
-            else()
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
-            endif()
-        endif()
-
-        if (GGML_CUDA_NO_VMM)
-            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
-        else()
-            if (GGML_MUSA)
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
-            else()
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
-            endif()
-        endif()
-    else()
-        message(WARNING "CUDA not found")
-    endif()
-endif()
-
-if (GGML_HIPBLAS)
-    if (NOT EXISTS $ENV{ROCM_PATH})
-        if (NOT EXISTS /opt/rocm)
-            set(ROCM_PATH /usr)
-        else()
-            set(ROCM_PATH /opt/rocm)
-        endif()
-    else()
-        set(ROCM_PATH $ENV{ROCM_PATH})
-    endif()
-
-    list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
-    list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
-
-    # CMake on Windows doesn't support the HIP language yet
-    if (WIN32)
-        set(CXX_IS_HIPCC TRUE)
-    else()
-        string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
-    endif()
-
-    if (CXX_IS_HIPCC)
-        if (LINUX)
-            if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-                message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-            endif()
-
-            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
-                    " Prefer setting the HIP compiler directly. See README for details.")
-        endif()
-    else()
-        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
-        if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
-        endif()
-        cmake_minimum_required(VERSION 3.21)
-        enable_language(HIP)
-    endif()
-
-    find_package(hip     REQUIRED)
-    find_package(hipblas REQUIRED)
-    find_package(rocblas REQUIRED)
-
-    message(STATUS "HIP and hipBLAS found")
-
-    file(GLOB   GGML_HEADERS_ROCM "ggml-cuda/*.cuh")
-    list(APPEND GGML_HEADERS_ROCM "../include/ggml-cuda.h")
-
-    file(GLOB   GGML_SOURCES_ROCM "ggml-cuda/*.cu")
-    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-    file(GLOB   SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    file(GLOB   SRCS "ggml-cuda/template-instances/mmq*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})
-
-    if (GGML_CUDA_FA_ALL_QUANTS)
-        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    endif()
-
-    list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
-
-    add_compile_definitions(GGML_USE_HIPBLAS)
-    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
-
-    if (GGML_HIP_UMA)
-        add_compile_definitions(GGML_HIP_UMA)
-    endif()
-
-    if (GGML_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
-    if (GGML_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
-
-    if (GGML_CUDA_FORCE_CUBLAS)
-        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
-    endif()
-
-    if (GGML_CUDA_NO_PEER_COPY)
-        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-    endif()
-
-    if (CXX_IS_HIPCC)
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
-    else()
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
-    endif()
-
-    if (GGML_STATIC)
-        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-    endif()
-
-    list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
-endif()
-
-if (GGML_SYCL)
-    if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
-        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
-    endif()
-
-    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
-
-    if (DEFINED ENV{ONEAPI_ROOT})
-        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
-    elseif(SUPPORTS_SYCL)
-        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
-         If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
-         source /opt/intel/oneapi/setvars.sh")
-    else()
-        message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
-    endif()
-    message(STATUS "SYCL found")
-    #todo: AOT
-
-    list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
-
-    if (GGML_SYCL_F16)
-        if (GGML_SYCL_TARGET STREQUAL "AMD")
-            message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
-        endif()
-        add_compile_definitions(GGML_SYCL_F16)
-    endif()
-
-    if (GGML_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
-    endif()
-
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
-
-    if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
-        # INFO: Allowed Sub_group_sizes are not consistent through all
-        # hip targets. For example, 64 is used for certain models, but the backend
-        # does not support it.
-        # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
-        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-    else()
-        add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
-    endif()
-
-    file(GLOB   GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
-    list(APPEND GGML_HEADERS_SYCL "../include/ggml-sycl.h")
-
-    file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
-    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
-
-    find_package(DNNL)
-    message("-- DNNL found:" ${DNNL_FOUND})
-
-    if (GGML_SYCL_TARGET STREQUAL "INTEL")
-        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
-    else()
-        add_compile_definitions(GGML_SYCL_DNNL=0)
-    endif()
-
-    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
-    endif()
-
-    if (WIN32)
-        find_package(IntelSYCL REQUIRED)
-        find_package(MKL REQUIRED)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
-    else()
-        if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-        elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
-        elseif (GGML_SYCL_TARGET STREQUAL "AMD")
-            if (GGML_SYCL_HIP_TARGET STREQUAL "")
-                message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
-            endif()
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
-        endif()
-    endif()
-endif()
-
-if (GGML_RPC)
-    message(STATUS "RPC found")
-
-    list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
-
-    if (WIN32)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
-    endif()
-
-    set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
-    set(GGML_SOURCES_RPC ggml-rpc.cpp)
-endif()
-
-if (GGML_VULKAN)
-    find_package(Vulkan COMPONENTS glslc REQUIRED)
-
-    if (Vulkan_FOUND)
-        message(STATUS "Vulkan found")
-
-        list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
-
-        # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
-        # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
-        if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-            add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
-        endif()
-
-        if (GGML_VULKAN_CHECK_RESULTS)
-            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
-        endif()
-
-        if (GGML_VULKAN_DEBUG)
-            add_compile_definitions(GGML_VULKAN_DEBUG)
-        endif()
-
-        if (GGML_VULKAN_MEMORY_DEBUG)
-            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
-        endif()
-
-        if (GGML_VULKAN_SHADER_DEBUG_INFO)
-            add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
-        endif()
-
-        if (GGML_VULKAN_PERF)
-            add_compile_definitions(GGML_VULKAN_PERF)
-        endif()
-
-        if (GGML_VULKAN_VALIDATE)
-            add_compile_definitions(GGML_VULKAN_VALIDATE)
-        endif()
-
-        if (GGML_VULKAN_RUN_TESTS)
-            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
-        endif()
-
-        add_subdirectory(vulkan-shaders)
-
-        set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
-        set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
-        set (_ggml_vk_source     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
-        set (_ggml_vk_input_dir  ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
-        set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
-
-        file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
-
-        add_custom_command(
-            OUTPUT ${_ggml_vk_header}
-                   ${_ggml_vk_source}
-
-            COMMAND ${_ggml_vk_genshaders_cmd}
-                --glslc      ${Vulkan_GLSLC_EXECUTABLE}
-                --input-dir  ${_ggml_vk_input_dir}
-                --output-dir ${_ggml_vk_output_dir}
-                --target-hpp ${_ggml_vk_header}
-                --target-cpp ${_ggml_vk_source}
-                --no-clean
-
-            DEPENDS ${_ggml_vk_shader_deps}
-            COMMENT "Generate vulkan shaders"
-        )
-
-        set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
-        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
-
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-        message(WARNING "Vulkan not found")
-    endif()
-endif()
-
-if (GGML_KOMPUTE)
-    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
-
-    find_package(Vulkan COMPONENTS glslc REQUIRED)
-    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
-
-    if (NOT glslc_executable)
-        message(FATAL_ERROR "glslc not found")
-    endif()
-
-    function(compile_shader)
-        set(options)
-        set(oneValueArgs)
-        set(multiValueArgs SOURCES)
-        cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        foreach(source ${compile_shader_SOURCES})
-            get_filename_component(filename ${source} NAME)
-            set(spv_file ${filename}.spv)
-            add_custom_command(
-                OUTPUT ${spv_file}
-                DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
-                COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-                COMMENT "Compiling ${source} to ${spv_file}"
-                )
-
-            get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
-            set(FILE_NAME "shader${RAW_FILE_NAME}")
-            string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
-            string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
-            string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
-            set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
-            message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-            if(CMAKE_GENERATOR MATCHES "Visual Studio")
-                add_custom_command(
-                    OUTPUT ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    DEPENDS ${spv_file} xxd
-                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
-                    )
-            else()
-                add_custom_command(
-                    OUTPUT ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    DEPENDS ${spv_file} xxd
-                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
-                    )
-            endif()
-        endforeach()
-    endfunction()
-
-    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
-        message(STATUS "Kompute found")
-        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
-        add_subdirectory(kompute)
-
-        # Compile our shaders
-        compile_shader(SOURCES
-            kompute-shaders/op_scale.comp
-            kompute-shaders/op_scale_8.comp
-            kompute-shaders/op_add.comp
-            kompute-shaders/op_addrow.comp
-            kompute-shaders/op_mul.comp
-            kompute-shaders/op_silu.comp
-            kompute-shaders/op_relu.comp
-            kompute-shaders/op_gelu.comp
-            kompute-shaders/op_softmax.comp
-            kompute-shaders/op_norm.comp
-            kompute-shaders/op_rmsnorm.comp
-            kompute-shaders/op_diagmask.comp
-            kompute-shaders/op_mul_mat_mat_f32.comp
-            kompute-shaders/op_mul_mat_f16.comp
-            kompute-shaders/op_mul_mat_q8_0.comp
-            kompute-shaders/op_mul_mat_q4_0.comp
-            kompute-shaders/op_mul_mat_q4_1.comp
-            kompute-shaders/op_mul_mat_q4_k.comp
-            kompute-shaders/op_mul_mat_q6_k.comp
-            kompute-shaders/op_getrows_f32.comp
-            kompute-shaders/op_getrows_f16.comp
-            kompute-shaders/op_getrows_q4_0.comp
-            kompute-shaders/op_getrows_q4_1.comp
-            kompute-shaders/op_getrows_q6_k.comp
-            kompute-shaders/op_rope_f16.comp
-            kompute-shaders/op_rope_f32.comp
-            kompute-shaders/op_cpy_f16_f16.comp
-            kompute-shaders/op_cpy_f16_f32.comp
-            kompute-shaders/op_cpy_f32_f16.comp
-            kompute-shaders/op_cpy_f32_f32.comp
-        )
-
-        # Create a custom target for our generated shaders
-        add_custom_target(generated_shaders DEPENDS
-            shaderop_scale.h
-            shaderop_scale_8.h
-            shaderop_add.h
-            shaderop_addrow.h
-            shaderop_mul.h
-            shaderop_silu.h
-            shaderop_relu.h
-            shaderop_gelu.h
-            shaderop_softmax.h
-            shaderop_norm.h
-            shaderop_rmsnorm.h
-            shaderop_diagmask.h
-            shaderop_mul_mat_mat_f32.h
-            shaderop_mul_mat_f16.h
-            shaderop_mul_mat_q8_0.h
-            shaderop_mul_mat_q4_0.h
-            shaderop_mul_mat_q4_1.h
-            shaderop_mul_mat_q4_k.h
-            shaderop_mul_mat_q6_k.h
-            shaderop_getrows_f32.h
-            shaderop_getrows_f16.h
-            shaderop_getrows_q4_0.h
-            shaderop_getrows_q4_1.h
-            shaderop_getrows_q6_k.h
-            shaderop_rope_f16.h
-            shaderop_rope_f32.h
-            shaderop_cpy_f16_f16.h
-            shaderop_cpy_f16_f32.h
-            shaderop_cpy_f32_f16.h
-            shaderop_cpy_f32_f32.h
-        )
-
-        # Create a custom command that depends on the generated_shaders
-        add_custom_command(
-            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
-            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
-            DEPENDS generated_shaders
-            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
-        )
-
-        # Add the stamp to the main sources to ensure dependency tracking
-        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp           ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-        set(GGML_HEADERS_KOMPUTE ../include/ggml-kompute.h  ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-
-        list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
-
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-        message(WARNING "Kompute not found")
-    endif()
-endif()
-
-if (GGML_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-
-    message(STATUS "Using memkind for CPU HBM")
-
-    add_compile_definitions(GGML_USE_CPU_HBM)
-
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
-
-if (GGML_CANN)
-    if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
-        set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
-        message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
-    endif()
-
-    if (CANN_INSTALL_DIR)
-        # Only Support Linux.
-        if (GGML_CANN)
-            if (NOT UNIX)
-                set(GGML_CANN OFF)
-                message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
-            endif()
-        endif()
-
-        # Supported platforms: x86-64, arm64
-        if (GGML_CANN)
-            if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
-            elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
-            else()
-                set(GGML_CANN OFF)
-                message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
-            endif()
-        endif()
-
-        # Set header and libs
-        if(GGML_CANN)
-            set(CANN_INCLUDE_DIRS
-                ${CANN_INSTALL_DIR}/include
-                ${CANN_INSTALL_DIR}/include/aclnn
-                ${CANN_INSTALL_DIR}/acllib/include
-            )
-
-            add_subdirectory(ggml-cann/kernels)
-            list(APPEND CANN_LIBRARIES
-                ascendcl
-                nnopbase
-                opapi
-                acl_op_compiler
-                ascendc_kernels
-            )
-
-            set(GGML_HEADERS_CANN "../include/ggml-cann.h")
-            file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
-            list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
-
-            message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
-            message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
-
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
-            list(APPEND GGML_EXTRA_INCLUDES     ${CANN_INCLUDE_DIRS})
-            list(APPEND GGML_EXTRA_LIBDIRS      ${CANN_INSTALL_DIR}/lib64)
-
-            list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
-        endif()
-    else()
-        set(GGML_CANN OFF)
-        message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
-    endif()
-
-    if(NOT GGML_CANN)
-        message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
-    endif()
-endif()
-
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -966,12 +41,6 @@ function(get_flags CCID CCVER)
     elseif (CCID STREQUAL "GNU")
         set(C_FLAGS   -Wdouble-promotion)
         set(CXX_FLAGS -Wno-array-bounds)
-
-        if (NOT GGML_MUSA)
-            if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-                list(APPEND CXX_FLAGS -Wno-format-truncation)
-            endif()
-        endif()
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
@@ -1011,54 +80,6 @@ if (GGML_ALL_WARNINGS)
     endif()
 endif()
 
-set(CUDA_CXX_FLAGS "")
-
-if (GGML_CUDA)
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (GGML_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
-    endif()
-
-    if (GGML_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    endif()
-endif()
-
 if (GGML_LTO)
     include(CheckIPOSupported)
     check_ipo_supported(RESULT result OUTPUT output)
@@ -1116,194 +137,6 @@ if (NOT MSVC)
     endif()
 endif()
 
-set(ARCH_FLAGS "")
-
-if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
-    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-    (NOT CMAKE_OSX_ARCHITECTURES      AND
-     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-
-    message(STATUS "ARM detected")
-
-    if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
-
-        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
-
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-        if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        endif ()
-
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-
-        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
-        endif ()
-
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-        endif ()
-
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
-    else()
-        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
-            # Raspberry Pi 1, Zero
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-                # Android armeabi-v7a
-                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
-            else()
-                # Raspberry Pi 2
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
-            endif()
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-            # Android arm64-v8a
-            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            list(APPEND ARCH_FLAGS -mno-unaligned-access)
-        endif()
-        if (GGML_SVE)
-            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
-        endif()
-    endif()
-elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
-    message(STATUS "x86 detected")
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
-            include(../cmake/FindSIMD.cmake)
-        endif ()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-            if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
-            endif()
-            if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
-            endif()
-            if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
-            endif()
-            if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
-            endif()
-        elseif (GGML_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (GGML_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
-        endif()
-    else()
-        if (GGML_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (GGML_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (GGML_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (GGML_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (GGML_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512dq)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (GGML_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (GGML_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-        if (GGML_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
-        if (GGML_AMX_TILE)
-            list(APPEND ARCH_FLAGS -mamx-tile)
-        endif()
-        if (GGML_AMX_INT8)
-            list(APPEND ARCH_FLAGS -mamx-int8)
-        endif()
-        if (GGML_AMX_BF16)
-            list(APPEND ARCH_FLAGS -mamx-bf16)
-        endif()
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PowerPC detected")
-    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
-                   OUTPUT_VARIABLE POWER10_M)
-    string(FIND ${POWER10_M} "POWER10" substring_index)
-    if(${substring_index} GREATER_EQUAL 0)
-       list(APPEND ARCH_FLAGS -mcpu=power10)
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-       list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
-    else()
-        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-    message(STATUS "loongarch64 detected")
-
-    list(APPEND ARCH_FLAGS -march=loongarch64)
-    if (GGML_LASX)
-        list(APPEND ARCH_FLAGS -mlasx)
-    endif()
-    if (GGML_LSX)
-        list(APPEND ARCH_FLAGS -mlsx)
-    endif()
-else()
-    message(STATUS "Unknown architecture")
-endif()
-
-add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-
-if (GGML_CUDA)
-    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
-    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
-
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-endif()
-
 if (MINGW)
     # Target Windows 8 for PrefetchVirtualMemory
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
@@ -1317,14 +150,14 @@ endif()
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
 # M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
 
 # Somehow in OpenBSD whenever POSIX conformance is specified
 # some string functions rely on locale_t availability,
 # which was introduced in POSIX.1-2008, forcing us to go higher
 if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
     add_compile_definitions(_XOPEN_SOURCE=700)
+else()
+    add_compile_definitions(_XOPEN_SOURCE=600)
 endif()
 
 # Data types, macros and functions related to controlling CPU affinity and
@@ -1367,67 +200,87 @@ if (WIN32)
     endif()
 endif()
 
-#
-# libraries
-#
-
 # ggml
 
-add_library(ggml
+add_library(ggml-base
             ../include/ggml.h
-            ../include/ggml-cpu.h
             ../include/ggml-alloc.h
             ../include/ggml-backend.h
             ../include/ggml-cpp.h
             ggml.c
-            ggml-cpu.c
             ggml-alloc.c
             ggml-backend.cpp
+            ggml-threading.cpp
+            ggml-threading.h
             ggml-quants.c
             ggml-quants.h
-            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
-            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
-            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
-            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
-            ${GGML_SOURCES_AMX}       ${GGML_HEADERS_AMX}
-            ${GGML_SOURCES_CANN}      ${GGML_HEADERS_CANN}
-            ggml-aarch64.c            ggml-aarch64.h
-            )
+            ggml-aarch64.c
+            ggml-aarch64.h)
 
-if (EMSCRIPTEN)
-    set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
-endif()
+target_include_directories(ggml-base PRIVATE .)
 
-target_compile_definitions(ggml PUBLIC    ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
-target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
-target_link_directories   (ggml PRIVATE   ${GGML_EXTRA_LIBDIRS})
-target_compile_features   (ggml PRIVATE c_std_11) # don't bump
+add_library(ggml
+            ggml-backend-reg.cpp)
 
-list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
+target_link_libraries(ggml PUBLIC ggml-base)
+
+function(ggml_add_backend backend)
+    string(TOUPPER "GGML_${backend}" backend_id)
+    if (${backend_id})
+        string(TOLOWER "ggml-${backend}" backend_target)
+        add_subdirectory(${backend_target})
+        # check again in case the backend disabled itself
+        # note that this should NOT be the normal behavior, in case of errors the backend should fail the build
+        # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
+        if (${backend_id})
+            message(STATUS "Including ${backend} backend")
+            if (${BUILD_SHARED_LIBS})
+                target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
+                target_compile_definitions(${backend_target} PUBLIC  GGML_BACKEND_SHARED)
+                install(TARGETS ${backend_target} LIBRARY)
+            endif()
+            target_link_libraries(ggml PUBLIC ${backend_target})
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
+        endif()
+    endif()
+endfunction()
+
+ggml_add_backend(CPU)
+ggml_add_backend(AMX)
+ggml_add_backend(BLAS)
+ggml_add_backend(CANN)
+ggml_add_backend(CUDA)
+ggml_add_backend(HIP)
+ggml_add_backend(Kompute)
+ggml_add_backend(METAL)
+ggml_add_backend(RPC)
+ggml_add_backend(SYCL)
+ggml_add_backend(Vulkan)
+ggml_add_backend(MUSA)
+
+foreach (target ggml-base ggml)
+    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
+    target_compile_features   (${target} PRIVATE c_std_11) # don't bump
+endforeach()
+
+target_link_libraries(ggml-base PRIVATE Threads::Threads)
 
 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
     if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
+        target_link_libraries(ggml-base PRIVATE m)
     endif()
 endif()
 
 if (CMAKE_SYSTEM_NAME MATCHES "Android")
-    list(APPEND GGML_EXTRA_LIBS_PRIVATE dl) # Must be linked explicitly
+    target_link_libraries(ggml-base PRIVATE dl)
 endif()
 
-list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
-list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
-target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
-
 if (BUILD_SHARED_LIBS)
-    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
+    foreach (target ggml-base ggml)
+        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_compile_definitions(${target} PRIVATE GGML_BUILD)
+        target_compile_definitions(${target} PUBLIC  GGML_SHARED)
+    endforeach()
 endif()
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 81f62ff4f..688b17b45 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -1,192 +1,13 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
-#define GGML_COMMON_IMPL_C
+#define GGML_COMMON_DECL_C
 #include "ggml-common.h"
 
-#include "ggml-quants.h"
-#include "ggml-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-cpu-impl.h"
-
-#include <math.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <stdlib.h> // for qsort
-#include <stdio.h>  // for GGML_ASSERT
-
 #include "ggml-aarch64.h"
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Woverlength-strings"
-#elif defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
+#include "ggml-impl.h"
+#include "ggml-quants.h"
+#include <assert.h>
 
 #define UNUSED GGML_UNUSED
 
-// Functions to create the interleaved data layout formats
-
-// interleave 4 block_q4_0s in blocks of blck_size_interleave
-// returns an interleaved block_q4_0x4
-// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
-// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
-//
-// - in                  : an array of block_q4_0 pointers
-// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
-//                         blck_size_interleave bytes
-// - xor_mask            : the mask to convert the nibbles in block_q4_0 quants bytes
-//                         from bias offset form to pure sign form (this saves subtract
-//                         operations durin unpacking)
-//
-#if defined(__AVX__)
-#if defined(__F16C__)
-#if defined(__AVX512F__)
-#define GGML_F32Cx8x2_LOAD(x, y)     _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
-#define GGML_F32Cx16_REPEAT_LOAD(x)  _mm512_cvtph_ps(_mm256_set_m128i(x, x))
-#endif
-// the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
-#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
-#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
-#else
-#if defined(__AVX512F__)
-static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
-    float tmp[16];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
-    }
-
-    return _mm512_loadu_ps(tmp);
-}
-static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
-    float tmp[16];
-    uint16_t tmphalf[8];
-    _mm_storeu_si128((__m128i*)tmphalf, x);
-
-    for (int i = 0; i < 4; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
-        tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
-        tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
-        tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
-    }
-
-    return _mm512_loadu_ps(tmp);
-}
-#endif
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return _mm256_loadu_ps(tmp);
-}
-static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
-    float tmp[8];
-
-    for (int i = 0; i < 4; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-        tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return _mm256_loadu_ps(tmp);
-}
-static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
-    uint16_t tmphalf[8];
-    float tmp[8];
-
-    _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
-    }
-
-    return _mm256_loadu_ps(tmp);
-}
-
-#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
-#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     __avx_repeat_f32cx8_load(x)
-#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     __avx_rearranged_f32cx8_load(x, arrangeMask)
-#if defined(__AVX512F__)
-#define GGML_F32Cx8x2_LOAD(x, y)     __avx512_f32cx8x2_load(x, y)
-#define GGML_F32Cx16_REPEAT_LOAD(x)  __avx512_repeat_f32cx16_load(x)
-#endif
-#endif
-#endif
-
-
-#if defined(__AVX2__) || defined(__AVX512F__)
-#if defined(__AVX512F__)
-// add int16_t pairwise and return as 512 bit int vector
-static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
-    const __m512i ones = _mm512_set1_epi16(1);
-    return _mm512_madd_epi16(ones, x);
-}
-
-static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
-    const __m512i zero = _mm512_setzero_si512();
-    return _mm512_dpbusd_epi32(zero, ax, sy);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m512i dot = _mm512_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_int_32x16(dot);
-#endif
-}
-
-// multiply int8_t, add results pairwise twice and return as 512 bit int vector
-static inline __m512i mul_sum_i8_pairs_int32x16(const __m512i x, const __m512i y) {
-    const __m512i zero = _mm512_setzero_si512();
-    // Get absolute values of x vectors
-    const __m512i ax = _mm512_abs_epi8(x);
-    // Sign the values of the y vectors
-    __mmask64 blt0 = _mm512_movepi8_mask(x);
-    const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
-    return mul_sum_us8_pairs_int32x16(ax, sy);
-}
-#endif
-
-// add int16_t pairwise and return as 256 bit int vector
-static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
-    const __m256i ones = _mm256_set1_epi16(1);
-    return _mm256_madd_epi16(ones, x);
-}
-
-static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
-    const __m256i zero = _mm256_setzero_si256();
-    return _mm256_dpbusd_epi32(zero, ax, sy);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_int32x8(dot);
-#endif
-}
-
-// Integer variant of the function defined in ggml-quants.c
-// multiply int8_t, add results pairwise twice and return as 256 bit int vector
-static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
-#if __AVXVNNIINT8__
-    const __m256i zero = _mm256_setzero_si256();
-    return _mm256_dpbssd_epi32(zero, x, y);
-#else
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return mul_sum_us8_pairs_int32x8(ax, sy);
-#endif
-}
-#endif
-
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
     block_q4_0x4 out;
 
@@ -227,327 +48,6 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
     return out;
 }
 
-void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t srcv[4][8];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
-            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
-
-            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
-            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
-            for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
-
-            const float amax = vmaxvq_f32(amaxv[0]);
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < 8; j++) {
-            float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]);
-            int32x4_t vi = vcvtnq_s32_f32(v);
-            y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3);
-
-            v = vmulq_n_f32(srcv[1][j], id[1]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0);
-            y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1);
-            y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2);
-            y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3);
-
-            v = vmulq_n_f32(srcv[2][j], id[2]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0);
-            y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1);
-            y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2);
-            y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3);
-
-            v = vmulq_n_f32(srcv[3][j], id[3]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0);
-            y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1);
-            y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2);
-            y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#else
-    // scalar
-    const int blck_size_interleave = 4;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
-#endif
-}
-
-void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t srcv[4][8];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
-            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
-
-            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
-            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
-            for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
-
-            const float amax = vmaxvq_f32(amaxv[0]);
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < 4; j++) {
-            float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]);
-            int32x4_t vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3);
-            v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3);
-
-            v = vmulq_n_f32(srcv[1][2 * j], id[1]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3);
-            v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3);
-
-            v = vmulq_n_f32(srcv[2][2 * j], id[2]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3);
-            v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3);
-
-            v = vmulq_n_f32(srcv[3][2 * j], id[3]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3);
-            v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]);
-            vi = vcvtnq_s32_f32(v);
-            y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0);
-            y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1);
-            y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2);
-            y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    float id[4];
-    __m256 srcv[4][4];
-    __m256 idvec[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            // Load elements into 4 AVX vectors
-            __m256 v0 = _mm256_loadu_ps( x + row_iter * k + i * 32 );
-            __m256 v1 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 8 );
-            __m256 v2 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 16 );
-            __m256 v3 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 24 );
-
-            // Compute max(abs(e)) for the block
-            const __m256 signBit = _mm256_set1_ps( -0.0f );
-            __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-            __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-            max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-            max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-            const float maxScalar = _mm_cvtss_f32( max4 );
-
-            // Divided by 127.f to mirror results in quantize_row_q8_0
-            const float d = maxScalar  / 127.f;
-            id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
-
-            // Store the scale for the individual block
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
-
-            // Store the values in blocks of eight values - Aim is to use these later for block interleaving
-            srcv[row_iter][0] = v0;
-            srcv[row_iter][1] = v1;
-            srcv[row_iter][2] = v2;
-            srcv[row_iter][3] = v3;
-            idvec[row_iter] = _mm256_set1_ps(id[row_iter]);
-        }
-
-        // The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
-        for (int j = 0; j < 4; j++) {
-            // Apply the multiplier
-            __m256 v0 = _mm256_mul_ps(srcv[0][j], idvec[0]);
-            __m256 v1 = _mm256_mul_ps(srcv[1][j], idvec[1]);
-            __m256 v2 = _mm256_mul_ps(srcv[2][j], idvec[2]);
-            __m256 v3 = _mm256_mul_ps(srcv[3][j], idvec[3]);
-
-            // Round to nearest integer
-            v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-            v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-            v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-            v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-            // Convert floats to integers
-            __m256i i0 = _mm256_cvtps_epi32( v0 );
-            __m256i i1 = _mm256_cvtps_epi32( v1 );
-            __m256i i2 = _mm256_cvtps_epi32( v2 );
-            __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-            // Convert int32 to int16
-            i0 = _mm256_packs_epi32( i0, i1 );
-            i2 = _mm256_packs_epi32( i2, i3 );
-            // Convert int16 to int8
-            i0 = _mm256_packs_epi16( i0, i2 );
-
-            //  Permute and store the quantized weights in the required order after the pack instruction
-            const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-            i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-            _mm256_storeu_si256((__m256i *)(y[i].qs + 32 * j), i0);
-#else
-            // Since we don't have in AVX some necessary functions,
-            // we split the registers in half and call AVX2 analogs from SSE
-            __m128i ni0 = _mm256_castsi256_si128( i0 );
-            __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-            __m128i ni2 = _mm256_castsi256_si128( i1 );
-            __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-            __m128i ni4 = _mm256_castsi256_si128( i2 );
-            __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-            __m128i ni6 = _mm256_castsi256_si128( i3 );
-            __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-            // Convert int32 to int16
-            ni0 = _mm_packs_epi32( ni0, ni1 );
-            ni2 = _mm_packs_epi32( ni2, ni3 );
-            ni4 = _mm_packs_epi32( ni4, ni5 );
-            ni6 = _mm_packs_epi32( ni6, ni7 );
-            // Convert int16 to int8
-            ni0 = _mm_packs_epi16( ni0, ni2 );
-            ni4 = _mm_packs_epi16( ni4, ni6 );
-            _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
-            _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
-#endif
-        }
-    }
-#else
-    // scalar
-    const int blck_size_interleave = 8;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
-#endif
-}
-
-void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
-    assert(nrow == 4);
-    UNUSED(nrow);
-    if (blck_size_interleave == 4) {
-        quantize_q8_0_4x4(x, vy, n_per_row);
-    } else if (blck_size_interleave == 8) {
-        quantize_q8_0_4x8(x, vy, n_per_row);
-    } else {
-        assert(false);
-    }
-}
-
 static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
     assert(n_per_row % QK4_0 == 0);
     const int nb = n_per_row / QK4_0;
@@ -598,2881 +98,3 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     UNUSED(quant_weights);
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
-
-void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 4;
-    const int blocklen = 4;
-
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-
-        __asm__ __volatile__(
-            "movi v31.16b, #0x4\n"
-            "movi v30.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x22, %x[a_ptr], #0x2\n"
-            "movi v29.16b, #0x0\n"
-            "mov x21, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q28, [%x[b_ptr], #0x0]\n"
-            "ldr q27, [x22, #0x0]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x22, #0x2\n"
-            "ldr q25, [x22, #0x10]\n"
-            "ldr q24, [%x[b_ptr], #0x10]\n"
-            "sub x21, x21, #0x1\n"
-            "add x22, x22, #0x22\n"
-            "ldr q23, [%x[b_ptr], #0x20]\n"
-            "ldr q22, [%x[b_ptr], #0x30]\n"
-            "ld1r { v21.8h }, [x20]\n"
-            "ldr q20, [%x[b_ptr], #-0x8]\n"
-            "sshl v16.16b, v28.16b, v31.16b\n"
-            "and v28.16b, v28.16b, v30.16b\n"
-            "sshl v19.16b, v24.16b, v31.16b\n"
-            "and v24.16b, v24.16b, v30.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "sshl v18.16b, v23.16b, v31.16b\n"
-            "and v23.16b, v23.16b, v30.16b\n"
-            ".inst 0x4f9be21a  // sdot v26.4s, v16.16b, v27.4b[0]\n"
-            "sshl v17.16b, v22.16b, v31.16b\n"
-            "and v22.16b, v22.16b, v30.16b\n"
-            "fcvtl v21.4s, v21.4h\n"
-            "fcvtl v16.4s, v20.4h\n"
-            ".inst 0x4f99e39a  // sdot v26.4s, v28.16b, v25.4b[0]\n"
-            "fmul v16.4s, v16.4s, v21.4s\n"
-            ".inst 0x4fbbe27a  // sdot v26.4s, v19.16b, v27.4b[1]\n"
-            ".inst 0x4fb9e31a  // sdot v26.4s, v24.16b, v25.4b[1]\n"
-            ".inst 0x4f9bea5a  // sdot v26.4s, v18.16b, v27.4b[2]\n"
-            ".inst 0x4f99eafa  // sdot v26.4s, v23.16b, v25.4b[2]\n"
-            ".inst 0x4fbbea3a  // sdot v26.4s, v17.16b, v27.4b[3]\n"
-            ".inst 0x4fb9eada  // sdot v26.4s, v22.16b, v25.4b[3]\n"
-            "scvtf v26.4s, v26.4s, #0x4\n"
-            "fmla v29.4s, v26.4s, v16.4s\n"
-            "cbnz x21, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q29, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
-            );
-        return;
-    }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    float sumf[4];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
-}
-
-void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 4;
-    const int blocklen = 8;
-
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
-        return;
-    }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    float sumf[4];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
-}
-
-void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 8;
-    const int blocklen = 8;
-
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-
-        __asm__ __volatile__(
-            "ptrue p0.b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x10\n"
-            "1:"  // Column loop
-            "add x22, %x[a_ptr], #0x2\n"
-            "mov z31.b, #0x0\n"
-            "mov x21, %x[nb]\n"
-            "2:"  // Block loop
-            "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n"
-            "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n"
-            "mov z28.s, #0x0\n"
-            "mov z27.s, #0x0\n"
-            "ld1rd { z26.d }, p0/Z, [x22]\n"
-            "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n"
-            "sub x20, x22, #0x2\n"
-            "sub x21, x21, #0x1\n"
-            "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n"
-            "ld1rd { z23.d }, p0/Z, [x22, #8]\n"
-            "lsl z22.b, z30.b, #0x4\n"
-            "lsl z16.b, z29.b, #0x4\n"
-            "and z30.b, z30.b, #0xf0\n"
-            "and z29.b, z29.b, #0xf0\n"
-            "ld1rd { z21.d }, p0/Z, [x22, #16]\n"
-            "ld1rd { z20.d }, p0/Z, [x22, #24]\n"
-            "lsl z19.b, z25.b, #0x4\n"
-            "and z25.b, z25.b, #0xf0\n"
-            "ld1rh { z17.h }, p0/Z, [x20]\n"
-            "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n"
-            "sdot z28.s, z22.b, z26.b\n"
-            "sdot z27.s, z16.b, z26.b\n"
-            "lsl z16.b, z24.b, #0x4\n"
-            "add x22, x22, #0x22\n"
-            "and z24.b, z24.b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x90\n"
-            "fcvt z17.s, p0/m, z17.h\n"
-            "fcvt z18.s, p0/m, z18.h\n"
-            "sdot z28.s, z19.b, z23.b\n"
-            "sdot z27.s, z16.b, z23.b\n"
-            "fmul z18.s, z18.s, z17.s\n"
-            "sdot z28.s, z30.b, z21.b\n"
-            "sdot z27.s, z29.b, z21.b\n"
-            "sdot z28.s, z25.b, z20.b\n"
-            "sdot z27.s, z24.b, z20.b\n"
-            "uzp1 z17.s, z28.s, z27.s\n"
-            "uzp2 z16.s, z28.s, z27.s\n"
-            "add z17.s, z17.s, z16.s\n"
-            "asr z17.s, z17.s, #0x4\n"
-            "scvtf z17.s, p0/m, z17.s\n"
-            "fmla z31.s, p0/M, z17.s, z18.s\n"
-            "cbnz x21, 2b\n"
-            "sub %x[nc], %x[nc], #0x8\n"
-            "st1w { z31.s }, p0, [%x[res_ptr]]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x20\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
-        );
-        return;
-    }
-#endif // #if defined(__ARM_FEATURE_SVE)
-#elif defined(__AVX2__)
-    // Lookup table to convert signed nibbles to signed bytes
-    __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
-    signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
-    __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
-    __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-
-    // Permute mask used for easier vector processing at later stages
-    const __m256i m4b = _mm256_set1_epi8(0x0F);
-
-    int64_t b_nb = n / QK4_0;
-
-    const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
-    const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
-
-    // Process Q8_0 blocks one by one
-    for (int64_t y = 0; y < nr; y++) {
-
-        // Pointers to LHS blocks of block_q8_0 format
-        const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
-
-        // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
-        for (int64_t x = 0; x < nc / 8; x++) {
-
-            // Pointers to RHS blocks
-            const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
-
-            // Master FP accumulator
-            __m256 acc_row = _mm256_setzero_ps();
-
-            for (int64_t b = 0; b < nb; b++) {
-                // Load 8 blocks of Q4_0 interleaved as 8 bytes (B0 - B7)
-                const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
-                const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 1);
-                const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 2);
-                const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 3);
-
-                // 4-bit -> 8-bit - Sign is maintained
-                const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_0, m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
-                const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_0, m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
-                const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
-                const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
-
-                const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
-                const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
-                const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
-                const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
-
-                // Load the scale values for the 8 blocks interleaved in block_q4_0x8
-                const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
-
-                // Load and convert to FP32 scale from block_q8_0
-                const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
-
-                // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
-                __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
-                __m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16)));
-
-                lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0 (0-15) A0(0-15)
-                lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0 (16-31) A0(16-31))
-
-                __m256i iacc = _mm256_setzero_si256();
-
-                // Dot product done within 32 bit lanes and accumulated in the same vector
-                // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
-                // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
-                // ...........................................................................
-                // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
-
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
-
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
-
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
-
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
-                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
-
-                // Accumulated values multipled with appropriate scales
-                acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
-            }
-
-            // Accumulated output values permuted so as to be stored in appropriate order post accumulation
-            acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
-            _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
-        }
-    }
-    return;
-#elif defined(__riscv_v_intrinsic)
-    if (__riscv_vlenb() >= QK4_0) {
-        const size_t vl = QK4_0;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
-            for (int l = 0; l < nb; l++) {
-                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
-                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
-                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
-                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
-                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
-                const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
-                const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
-                const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
-                const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
-
-                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
-                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
-                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
-                const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
-                const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
-                const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
-                const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
-
-                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
-                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
-                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
-                const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
-
-                const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
-                const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
-                const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
-                const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
-                const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
-                const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
-                const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
-                const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
-                const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
-                const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
-                const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
-                const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
-                const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
-
-                // vector version needs Zvfhmin extension
-                const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
-                const float b_scales[8] = {
-                    GGML_FP16_TO_FP32(b_ptr[l].d[0]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[1]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[2]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[3]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[4]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[5]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[6]),
-                    GGML_FP16_TO_FP32(b_ptr[l].d[7])
-                };
-                const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
-                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
-                sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
-            }
-            __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
-        }
-        return;
-    }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    {
-        float sumf[8];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
-}
-
-void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 4;
-    const int blocklen = 4;
-
-    assert (n % qk == 0);
-    assert (nr % 4 == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-        size_t res_stride = bs * sizeof(float);
-
-        __asm__ __volatile__(
-            "mov x10, %x[nr]\n"
-            "mov x9, #0x88\n"
-            "cmp x10, #0x10\n"
-            "mul x9, %x[nb], x9\n"
-            "blt 4f\n"
-            "1:"  // Row loop
-            "add x28, %x[b_ptr], #0x8\n"
-            "mov x27, %x[nc]\n"
-            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
-            "2:"  // Column loop
-            "add x25, %x[a_ptr], #0x8\n"
-            "movi v15.16b, #0x0\n"
-            "movi v19.16b, #0x0\n"
-            "mov x24, %x[nb]\n"
-            "add x23, x25, x9\n"
-            "movi v18.16b, #0x0\n"
-            "movi v14.16b, #0x0\n"
-            "add x22, x23, x9\n"
-            "movi v11.16b, #0x0\n"
-            "movi v13.16b, #0x0\n"
-            "add x21, x22, x9\n"
-            "movi v23.16b, #0x0\n"
-            "movi v16.16b, #0x0\n"
-            "movi v25.16b, #0x0\n"
-            "movi v7.16b, #0x0\n"
-            "movi v0.16b, #0x0\n"
-            "movi v4.16b, #0x0\n"
-            "movi v5.16b, #0x0\n"
-            "movi v21.16b, #0x0\n"
-            "movi v8.16b, #0x0\n"
-            "movi v1.16b, #0x0\n"
-            "3:"  // Block loop
-            "ldr q3, [x28, #0x0]\n"
-            "ldr q31, [x25, #0x0]\n"
-            "movi v28.16b, #0x4\n"
-            "movi v10.4s, #0x0\n"
-            "ldr q22, [x28, #0x10]\n"
-            "ldr q6, [x25, #0x10]\n"
-            "movi v29.4s, #0x0\n"
-            "movi v9.4s, #0x0\n"
-            "ldr q27, [x28, #0x20]\n"
-            "ldr q30, [x28, #0x30]\n"
-            "movi v20.4s, #0x0\n"
-            "movi v24.16b, #0xf0\n"
-            "ldr d2, [x25, #-0x8]\n"
-            "ldr d26, [x23, #-0x8]\n"
-            "sshl v12.16b, v3.16b, v28.16b\n"
-            "sub x20, x28, #0x8\n"
-            "ldr d17, [x20, #0x0]\n"
-            "and v3.16b, v3.16b, v24.16b\n"
-            "subs x24, x24, #0x1\n"
-            "add x28, x28, #0x48\n"
-            ".inst 0x4f9fe18a  // sdot v10.4s, v12.16b, v31.4b[0]\n"
-            ".inst 0x4fbfe19d  // sdot v29.4s, v12.16b, v31.4b[1]\n"
-            ".inst 0x4f9fe989  // sdot v9.4s, v12.16b, v31.4b[2]\n"
-            ".inst 0x4fbfe994  // sdot v20.4s, v12.16b, v31.4b[3]\n"
-            "sshl v31.16b, v22.16b, v28.16b\n"
-            "and v22.16b, v22.16b, v24.16b\n"
-            "fcvtl v17.4s, v17.4h\n"
-            "fcvtl v2.4s, v2.4h\n"
-            "fcvtl v26.4s, v26.4h\n"
-            ".inst 0x4f86e3ea  // sdot v10.4s, v31.16b, v6.4b[0]\n"
-            ".inst 0x4fa6e3fd  // sdot v29.4s, v31.16b, v6.4b[1]\n"
-            ".inst 0x4f86ebe9  // sdot v9.4s, v31.16b, v6.4b[2]\n"
-            ".inst 0x4fa6ebf4  // sdot v20.4s, v31.16b, v6.4b[3]\n"
-            "sshl v6.16b, v27.16b, v28.16b\n"
-            "sshl v28.16b, v30.16b, v28.16b\n"
-            "and v27.16b, v27.16b, v24.16b\n"
-            "and v30.16b, v30.16b, v24.16b\n"
-            "ldr q24, [x25, #0x20]\n"
-            ".inst 0x4f98e0ca  // sdot v10.4s, v6.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
-            ".inst 0x4f98e8c9  // sdot v9.4s, v6.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e8d4  // sdot v20.4s, v6.16b, v24.4b[3]\n"
-            "ldr q24, [x25, #0x30]\n"
-            ".inst 0x4f98e38a  // sdot v10.4s, v28.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e39d  // sdot v29.4s, v28.16b, v24.4b[1]\n"
-            ".inst 0x4f98eb89  // sdot v9.4s, v28.16b, v24.4b[2]\n"
-            ".inst 0x4fb8eb94  // sdot v20.4s, v28.16b, v24.4b[3]\n"
-            "ldr q24, [x25, #0x40]\n"
-            ".inst 0x4f98e06a  // sdot v10.4s, v3.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
-            ".inst 0x4f98e869  // sdot v9.4s, v3.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e874  // sdot v20.4s, v3.16b, v24.4b[3]\n"
-            "ldr q24, [x25, #0x50]\n"
-            ".inst 0x4f98e2ca  // sdot v10.4s, v22.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e2dd  // sdot v29.4s, v22.16b, v24.4b[1]\n"
-            ".inst 0x4f98eac9  // sdot v9.4s, v22.16b, v24.4b[2]\n"
-            ".inst 0x4fb8ead4  // sdot v20.4s, v22.16b, v24.4b[3]\n"
-            "ldr q24, [x25, #0x60]\n"
-            ".inst 0x4f98e36a  // sdot v10.4s, v27.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
-            ".inst 0x4f98eb69  // sdot v9.4s, v27.16b, v24.4b[2]\n"
-            ".inst 0x4fb8eb74  // sdot v20.4s, v27.16b, v24.4b[3]\n"
-            "ldr q24, [x25, #0x70]\n"
-            "add x25, x25, #0x88\n"
-            ".inst 0x4f98e3ca  // sdot v10.4s, v30.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e3dd  // sdot v29.4s, v30.16b, v24.4b[1]\n"
-            ".inst 0x4f98ebc9  // sdot v9.4s, v30.16b, v24.4b[2]\n"
-            ".inst 0x4fb8ebd4  // sdot v20.4s, v30.16b, v24.4b[3]\n"
-            "fmul v24.4s, v17.4s, v2.s[0]\n"
-            "scvtf v10.4s, v10.4s, #0x4\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "scvtf v9.4s, v9.4s, #0x4\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "fmla v15.4s, v10.4s, v24.4s\n"
-            "ldr q24, [x23, #0x0]\n"
-            "fmul v10.4s, v17.4s, v2.s[1]\n"
-            "fmla v19.4s, v29.4s, v10.4s\n"
-            "ldr q10, [x23, #0x10]\n"
-            "fmul v29.4s, v17.4s, v2.s[2]\n"
-            "fmul v2.4s, v17.4s, v2.s[3]\n"
-            "fmla v18.4s, v9.4s, v29.4s\n"
-            "movi v9.4s, #0x0\n"
-            "movi v29.4s, #0x0\n"
-            ".inst 0x4f98e189  // sdot v9.4s, v12.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e19d  // sdot v29.4s, v12.16b, v24.4b[1]\n"
-            "fmla v14.4s, v20.4s, v2.4s\n"
-            "movi v20.4s, #0x0\n"
-            "movi v2.4s, #0x0\n"
-            ".inst 0x4f98e994  // sdot v20.4s, v12.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
-            "ldr q24, [x23, #0x20]\n"
-            ".inst 0x4f8ae3e9  // sdot v9.4s, v31.16b, v10.4b[0]\n"
-            ".inst 0x4faae3fd  // sdot v29.4s, v31.16b, v10.4b[1]\n"
-            ".inst 0x4f8aebf4  // sdot v20.4s, v31.16b, v10.4b[2]\n"
-            ".inst 0x4faaebe2  // sdot v2.4s, v31.16b, v10.4b[3]\n"
-            "ldr q10, [x23, #0x30]\n"
-            ".inst 0x4f98e0c9  // sdot v9.4s, v6.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
-            ".inst 0x4f98e8d4  // sdot v20.4s, v6.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
-            "ldr q24, [x23, #0x40]\n"
-            ".inst 0x4f8ae389  // sdot v9.4s, v28.16b, v10.4b[0]\n"
-            ".inst 0x4faae39d  // sdot v29.4s, v28.16b, v10.4b[1]\n"
-            ".inst 0x4f8aeb94  // sdot v20.4s, v28.16b, v10.4b[2]\n"
-            ".inst 0x4faaeb82  // sdot v2.4s, v28.16b, v10.4b[3]\n"
-            "ldr q10, [x23, #0x50]\n"
-            ".inst 0x4f98e069  // sdot v9.4s, v3.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
-            ".inst 0x4f98e874  // sdot v20.4s, v3.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
-            "ldr q24, [x23, #0x60]\n"
-            ".inst 0x4f8ae2c9  // sdot v9.4s, v22.16b, v10.4b[0]\n"
-            ".inst 0x4faae2dd  // sdot v29.4s, v22.16b, v10.4b[1]\n"
-            ".inst 0x4f8aead4  // sdot v20.4s, v22.16b, v10.4b[2]\n"
-            ".inst 0x4faaeac2  // sdot v2.4s, v22.16b, v10.4b[3]\n"
-            "ldr q10, [x23, #0x70]\n"
-            "add x23, x23, #0x88\n"
-            ".inst 0x4f98e369  // sdot v9.4s, v27.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
-            ".inst 0x4f98eb74  // sdot v20.4s, v27.16b, v24.4b[2]\n"
-            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
-            "ldr q24, [x22, #0x0]\n"
-            ".inst 0x4f8ae3c9  // sdot v9.4s, v30.16b, v10.4b[0]\n"
-            ".inst 0x4faae3dd  // sdot v29.4s, v30.16b, v10.4b[1]\n"
-            ".inst 0x4f8aebd4  // sdot v20.4s, v30.16b, v10.4b[2]\n"
-            ".inst 0x4faaebc2  // sdot v2.4s, v30.16b, v10.4b[3]\n"
-            "fmul v10.4s, v17.4s, v26.s[0]\n"
-            "scvtf v9.4s, v9.4s, #0x4\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "scvtf v2.4s, v2.4s, #0x4\n"
-            "fmla v11.4s, v9.4s, v10.4s\n"
-            "ldr q9, [x22, #0x10]\n"
-            "fmul v10.4s, v17.4s, v26.s[1]\n"
-            "fmla v13.4s, v29.4s, v10.4s\n"
-            "ldr d29, [x22, #-0x8]\n"
-            "fmul v10.4s, v17.4s, v26.s[2]\n"
-            "fmul v26.4s, v17.4s, v26.s[3]\n"
-            "fcvtl v29.4s, v29.4h\n"
-            "fmla v23.4s, v20.4s, v10.4s\n"
-            "movi v20.4s, #0x0\n"
-            "movi v10.4s, #0x0\n"
-            "fmla v16.4s, v2.4s, v26.4s\n"
-            "movi v26.4s, #0x0\n"
-            "movi v2.4s, #0x0\n"
-            ".inst 0x4f98e194  // sdot v20.4s, v12.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
-            ".inst 0x4f98e99a  // sdot v26.4s, v12.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
-            "ldr q24, [x22, #0x20]\n"
-            ".inst 0x4f89e3f4  // sdot v20.4s, v31.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
-            ".inst 0x4f89ebfa  // sdot v26.4s, v31.16b, v9.4b[2]\n"
-            ".inst 0x4fa9ebe2  // sdot v2.4s, v31.16b, v9.4b[3]\n"
-            "ldr q9, [x22, #0x30]\n"
-            ".inst 0x4f98e0d4  // sdot v20.4s, v6.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e0ca  // sdot v10.4s, v6.16b, v24.4b[1]\n"
-            ".inst 0x4f98e8da  // sdot v26.4s, v6.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
-            "ldr q24, [x22, #0x40]\n"
-            ".inst 0x4f89e394  // sdot v20.4s, v28.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
-            ".inst 0x4f89eb9a  // sdot v26.4s, v28.16b, v9.4b[2]\n"
-            ".inst 0x4fa9eb82  // sdot v2.4s, v28.16b, v9.4b[3]\n"
-            "ldr q9, [x22, #0x50]\n"
-            ".inst 0x4f98e074  // sdot v20.4s, v3.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e06a  // sdot v10.4s, v3.16b, v24.4b[1]\n"
-            ".inst 0x4f98e87a  // sdot v26.4s, v3.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
-            "ldr q24, [x22, #0x60]\n"
-            ".inst 0x4f89e2d4  // sdot v20.4s, v22.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
-            ".inst 0x4f89eada  // sdot v26.4s, v22.16b, v9.4b[2]\n"
-            ".inst 0x4fa9eac2  // sdot v2.4s, v22.16b, v9.4b[3]\n"
-            "ldr q9, [x22, #0x70]\n"
-            "add x22, x22, #0x88\n"
-            ".inst 0x4f98e374  // sdot v20.4s, v27.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e36a  // sdot v10.4s, v27.16b, v24.4b[1]\n"
-            ".inst 0x4f98eb7a  // sdot v26.4s, v27.16b, v24.4b[2]\n"
-            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
-            "ldr q24, [x21, #0x0]\n"
-            ".inst 0x4f89e3d4  // sdot v20.4s, v30.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e3ca  // sdot v10.4s, v30.16b, v9.4b[1]\n"
-            ".inst 0x4f89ebda  // sdot v26.4s, v30.16b, v9.4b[2]\n"
-            ".inst 0x4fa9ebc2  // sdot v2.4s, v30.16b, v9.4b[3]\n"
-            "fmul v9.4s, v17.4s, v29.s[0]\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "scvtf v10.4s, v10.4s, #0x4\n"
-            "scvtf v26.4s, v26.4s, #0x4\n"
-            "scvtf v2.4s, v2.4s, #0x4\n"
-            "fmla v25.4s, v20.4s, v9.4s\n"
-            "ldr q9, [x21, #0x10]\n"
-            "fmul v20.4s, v17.4s, v29.s[1]\n"
-            "fmla v7.4s, v10.4s, v20.4s\n"
-            "ldr d20, [x21, #-0x8]\n"
-            "fmul v10.4s, v17.4s, v29.s[2]\n"
-            "fmul v29.4s, v17.4s, v29.s[3]\n"
-            "fcvtl v20.4s, v20.4h\n"
-            "fmla v0.4s, v26.4s, v10.4s\n"
-            "movi v26.4s, #0x0\n"
-            "movi v10.4s, #0x0\n"
-            "fmla v4.4s, v2.4s, v29.4s\n"
-            "movi v2.4s, #0x0\n"
-            "movi v29.4s, #0x0\n"
-            ".inst 0x4f98e19a  // sdot v26.4s, v12.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
-            ".inst 0x4f98e982  // sdot v2.4s, v12.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e99d  // sdot v29.4s, v12.16b, v24.4b[3]\n"
-            "ldr q12, [x21, #0x20]\n"
-            "fmul v24.4s, v17.4s, v20.s[0]\n"
-            ".inst 0x4f89e3fa  // sdot v26.4s, v31.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
-            ".inst 0x4f89ebe2  // sdot v2.4s, v31.16b, v9.4b[2]\n"
-            ".inst 0x4fa9ebfd  // sdot v29.4s, v31.16b, v9.4b[3]\n"
-            "ldr q9, [x21, #0x30]\n"
-            "fmul v31.4s, v17.4s, v20.s[1]\n"
-            ".inst 0x4f8ce0da  // sdot v26.4s, v6.16b, v12.4b[0]\n"
-            ".inst 0x4face0ca  // sdot v10.4s, v6.16b, v12.4b[1]\n"
-            ".inst 0x4f8ce8c2  // sdot v2.4s, v6.16b, v12.4b[2]\n"
-            ".inst 0x4face8dd  // sdot v29.4s, v6.16b, v12.4b[3]\n"
-            "ldr q12, [x21, #0x40]\n"
-            "fmul v6.4s, v17.4s, v20.s[2]\n"
-            "fmul v20.4s, v17.4s, v20.s[3]\n"
-            ".inst 0x4f89e39a  // sdot v26.4s, v28.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
-            ".inst 0x4f89eb82  // sdot v2.4s, v28.16b, v9.4b[2]\n"
-            ".inst 0x4fa9eb9d  // sdot v29.4s, v28.16b, v9.4b[3]\n"
-            "ldr q9, [x21, #0x50]\n"
-            ".inst 0x4f8ce07a  // sdot v26.4s, v3.16b, v12.4b[0]\n"
-            ".inst 0x4face06a  // sdot v10.4s, v3.16b, v12.4b[1]\n"
-            ".inst 0x4f8ce862  // sdot v2.4s, v3.16b, v12.4b[2]\n"
-            ".inst 0x4face87d  // sdot v29.4s, v3.16b, v12.4b[3]\n"
-            "ldr q12, [x21, #0x60]\n"
-            ".inst 0x4f89e2da  // sdot v26.4s, v22.16b, v9.4b[0]\n"
-            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
-            ".inst 0x4f89eac2  // sdot v2.4s, v22.16b, v9.4b[2]\n"
-            ".inst 0x4fa9eadd  // sdot v29.4s, v22.16b, v9.4b[3]\n"
-            "ldr q17, [x21, #0x70]\n"
-            "add x21, x21, #0x88\n"
-            ".inst 0x4f8ce37a  // sdot v26.4s, v27.16b, v12.4b[0]\n"
-            ".inst 0x4face36a  // sdot v10.4s, v27.16b, v12.4b[1]\n"
-            ".inst 0x4f8ceb62  // sdot v2.4s, v27.16b, v12.4b[2]\n"
-            ".inst 0x4faceb7d  // sdot v29.4s, v27.16b, v12.4b[3]\n"
-            ".inst 0x4f91e3da  // sdot v26.4s, v30.16b, v17.4b[0]\n"
-            ".inst 0x4fb1e3ca  // sdot v10.4s, v30.16b, v17.4b[1]\n"
-            ".inst 0x4f91ebc2  // sdot v2.4s, v30.16b, v17.4b[2]\n"
-            ".inst 0x4fb1ebdd  // sdot v29.4s, v30.16b, v17.4b[3]\n"
-            "scvtf v26.4s, v26.4s, #0x4\n"
-            "scvtf v10.4s, v10.4s, #0x4\n"
-            "fmla v5.4s, v26.4s, v24.4s\n"
-            "scvtf v2.4s, v2.4s, #0x4\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v21.4s, v10.4s, v31.4s\n"
-            "fmla v8.4s, v2.4s, v6.4s\n"
-            "fmla v1.4s, v29.4s, v20.4s\n"
-            "bgt 3b\n"
-            "mov x20, %x[res_ptr]\n"
-            "subs x27, x27, #0x4\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "str q15, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q19, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q18, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q14, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q11, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q13, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q23, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q16, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q25, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q7, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q0, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q4, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q5, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q21, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q8, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q1, [x20, #0x0]\n"
-            "bne 2b\n"
-            "mov x20, #0x4\n"
-            "sub x10, x10, #0x10\n"
-            "cmp x10, #0x10\n"
-            "mov %x[res_ptr], x26\n"
-            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
-            "bge 1b\n"
-            "4:"  // Row loop skip
-            "cbz x10, 9f\n"
-            "5:"  // Row tail: Row loop
-            "add x24, %x[b_ptr], #0x8\n"
-            "mov x23, %x[nc]\n"
-            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
-            "6:"  // Row tail: Column loop
-            "movi v15.16b, #0x0\n"
-            "movi v19.16b, #0x0\n"
-            "add x25, %x[a_ptr], #0x8\n"
-            "mov x21, %x[nb]\n"
-            "movi v18.16b, #0x0\n"
-            "movi v14.16b, #0x0\n"
-            "7:"  // Row tail: Block loop
-            "ldr q7, [x24, #0x0]\n"
-            "ldr q5, [x25, #0x0]\n"
-            "movi v9.16b, #0x4\n"
-            "movi v4.4s, #0x0\n"
-            "ldr q3, [x24, #0x10]\n"
-            "ldr q2, [x25, #0x10]\n"
-            "movi v1.4s, #0x0\n"
-            "movi v0.4s, #0x0\n"
-            "ldr q13, [x24, #0x20]\n"
-            "ldr q31, [x25, #0x20]\n"
-            "movi v30.4s, #0x0\n"
-            "movi v29.16b, #0xf0\n"
-            "ldr q28, [x24, #0x30]\n"
-            "ldr q27, [x25, #0x30]\n"
-            "sshl v20.16b, v7.16b, v9.16b\n"
-            "sub x20, x24, #0x8\n"
-            "ldr q26, [x25, #0x40]\n"
-            "ldr q25, [x25, #0x50]\n"
-            "sshl v17.16b, v3.16b, v9.16b\n"
-            "and v7.16b, v7.16b, v29.16b\n"
-            "ldr q24, [x25, #0x60]\n"
-            "ldr q16, [x25, #0x70]\n"
-            "sshl v22.16b, v13.16b, v9.16b\n"
-            "and v3.16b, v3.16b, v29.16b\n"
-            "ldr d21, [x20, #0x0]\n"
-            "ldr d12, [x25, #-0x8]\n"
-            ".inst 0x4f85e284  // sdot v4.4s, v20.16b, v5.4b[0]\n"
-            ".inst 0x4fa5e281  // sdot v1.4s, v20.16b, v5.4b[1]\n"
-            ".inst 0x4f85ea80  // sdot v0.4s, v20.16b, v5.4b[2]\n"
-            ".inst 0x4fa5ea9e  // sdot v30.4s, v20.16b, v5.4b[3]\n"
-            "sshl v9.16b, v28.16b, v9.16b\n"
-            "subs x21, x21, #0x1\n"
-            "and v13.16b, v13.16b, v29.16b\n"
-            "and v28.16b, v28.16b, v29.16b\n"
-            "add x25, x25, #0x88\n"
-            "add x24, x24, #0x48\n"
-            "fcvtl v21.4s, v21.4h\n"
-            "fcvtl v12.4s, v12.4h\n"
-            ".inst 0x4f82e224  // sdot v4.4s, v17.16b, v2.4b[0]\n"
-            ".inst 0x4fa2e221  // sdot v1.4s, v17.16b, v2.4b[1]\n"
-            ".inst 0x4f82ea20  // sdot v0.4s, v17.16b, v2.4b[2]\n"
-            ".inst 0x4fa2ea3e  // sdot v30.4s, v17.16b, v2.4b[3]\n"
-            "fmul v11.4s, v21.4s, v12.s[0]\n"
-            "fmul v23.4s, v21.4s, v12.s[1]\n"
-            "fmul v17.4s, v21.4s, v12.s[2]\n"
-            ".inst 0x4f9fe2c4  // sdot v4.4s, v22.16b, v31.4b[0]\n"
-            "fmul v6.4s, v21.4s, v12.s[3]\n"
-            ".inst 0x4fbfe2c1  // sdot v1.4s, v22.16b, v31.4b[1]\n"
-            ".inst 0x4f9feac0  // sdot v0.4s, v22.16b, v31.4b[2]\n"
-            ".inst 0x4fbfeade  // sdot v30.4s, v22.16b, v31.4b[3]\n"
-            ".inst 0x4f9be124  // sdot v4.4s, v9.16b, v27.4b[0]\n"
-            ".inst 0x4fbbe121  // sdot v1.4s, v9.16b, v27.4b[1]\n"
-            ".inst 0x4f9be920  // sdot v0.4s, v9.16b, v27.4b[2]\n"
-            ".inst 0x4fbbe93e  // sdot v30.4s, v9.16b, v27.4b[3]\n"
-            ".inst 0x4f9ae0e4  // sdot v4.4s, v7.16b, v26.4b[0]\n"
-            ".inst 0x4fbae0e1  // sdot v1.4s, v7.16b, v26.4b[1]\n"
-            ".inst 0x4f9ae8e0  // sdot v0.4s, v7.16b, v26.4b[2]\n"
-            ".inst 0x4fbae8fe  // sdot v30.4s, v7.16b, v26.4b[3]\n"
-            ".inst 0x4f99e064  // sdot v4.4s, v3.16b, v25.4b[0]\n"
-            ".inst 0x4fb9e061  // sdot v1.4s, v3.16b, v25.4b[1]\n"
-            ".inst 0x4f99e860  // sdot v0.4s, v3.16b, v25.4b[2]\n"
-            ".inst 0x4fb9e87e  // sdot v30.4s, v3.16b, v25.4b[3]\n"
-            ".inst 0x4f98e1a4  // sdot v4.4s, v13.16b, v24.4b[0]\n"
-            ".inst 0x4fb8e1a1  // sdot v1.4s, v13.16b, v24.4b[1]\n"
-            ".inst 0x4f98e9a0  // sdot v0.4s, v13.16b, v24.4b[2]\n"
-            ".inst 0x4fb8e9be  // sdot v30.4s, v13.16b, v24.4b[3]\n"
-            ".inst 0x4f90e384  // sdot v4.4s, v28.16b, v16.4b[0]\n"
-            ".inst 0x4fb0e381  // sdot v1.4s, v28.16b, v16.4b[1]\n"
-            ".inst 0x4f90eb80  // sdot v0.4s, v28.16b, v16.4b[2]\n"
-            ".inst 0x4fb0eb9e  // sdot v30.4s, v28.16b, v16.4b[3]\n"
-            "scvtf v4.4s, v4.4s, #0x4\n"
-            "scvtf v1.4s, v1.4s, #0x4\n"
-            "scvtf v0.4s, v0.4s, #0x4\n"
-            "fmla v15.4s, v4.4s, v11.4s\n"
-            "scvtf v30.4s, v30.4s, #0x4\n"
-            "fmla v19.4s, v1.4s, v23.4s\n"
-            "fmla v18.4s, v0.4s, v17.4s\n"
-            "fmla v14.4s, v30.4s, v6.4s\n"
-            "bgt 7b\n"
-            "mov x20, %x[res_ptr]\n"
-            "cmp x10, #0x1\n"
-            "str q15, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "cmp x10, #0x2\n"
-            "str q19, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "cmp x10, #0x3\n"
-            "str q18, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "str q14, [x20, #0x0]\n"
-            "8:"  // Row tail: Accumulator store skip
-            "subs x23, x23, #0x4\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "bne 6b\n"
-            "subs x10, x10, #0x4\n"
-            "add %x[a_ptr], %x[a_ptr], x9\n"
-            "mov %x[res_ptr], x22\n"
-            "bgt 5b\n"
-            "9:"  // Row tail: Row loop skip
-            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
-            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
-            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
-        );
-        return;
-    }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                                }
-                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
-}
-
-void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 4;
-    const int blocklen = 8;
-
-    assert (n % qk == 0);
-    assert (nr % 4 == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-        size_t res_stride = bs * sizeof(float);
-
-        __asm__ __volatile__(
-            "mov x10, %x[nr]\n"
-            "mov x9, #0x88\n"
-            "cmp x10, #0x10\n"
-            "mul x9, %x[nb], x9\n"
-            "blt 4f\n"
-            "1:"  // Row loop
-            "add x28, %x[b_ptr], #0x8\n"
-            "mov x27, %x[nc]\n"
-            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
-            "2:"  // Column loop
-            "add x25, %x[a_ptr], #0x8\n"
-            "movi v2.16b, #0x0\n"
-            "movi v10.16b, #0x0\n"
-            "mov x24, %x[nb]\n"
-            "add x23, x25, x9\n"
-            "movi v12.16b, #0x0\n"
-            "movi v28.16b, #0x0\n"
-            "add x22, x23, x9\n"
-            "movi v11.16b, #0x0\n"
-            "movi v13.16b, #0x0\n"
-            "add x21, x22, x9\n"
-            "movi v22.16b, #0x0\n"
-            "movi v23.16b, #0x0\n"
-            "movi v25.16b, #0x0\n"
-            "movi v5.16b, #0x0\n"
-            "movi v7.16b, #0x0\n"
-            "movi v4.16b, #0x0\n"
-            "movi v6.16b, #0x0\n"
-            "movi v30.16b, #0x0\n"
-            "movi v24.16b, #0x0\n"
-            "movi v14.16b, #0x0\n"
-            "3:"  // Block loop
-            "ldr q21, [x28, #0x0]\n"
-            "ldr q16, [x28, #0x10]\n"
-            "movi v1.16b, #0x4\n"
-            "movi v19.4s, #0x0\n"
-            "ldr q27, [x25, #0x0]\n"
-            "ldr q15, [x25, #0x10]\n"
-            "movi v26.4s, #0x0\n"
-            "movi v18.4s, #0x0\n"
-            "ldr q29, [x28, #0x20]\n"
-            "ldr q3, [x28, #0x30]\n"
-            "movi v17.4s, #0x0\n"
-            "movi v0.16b, #0xf0\n"
-            "ldr d20, [x25, #-0x8]\n"
-            "ldr d9, [x23, #-0x8]\n"
-            "sshl v8.16b, v21.16b, v1.16b\n"
-            "sshl v31.16b, v16.16b, v1.16b\n"
-            "and v21.16b, v21.16b, v0.16b\n"
-            "and v16.16b, v16.16b, v0.16b\n"
-            "sub x20, x28, #0x8\n"
-            "subs x24, x24, #0x1\n"
-            "add x28, x28, #0x48\n"
-            ".inst 0x4e88a773  // smmla v19.4s, v27.16b, v8.16b\n"
-            ".inst 0x4e9fa77a  // smmla v26.4s, v27.16b, v31.16b\n"
-            "ldr q27, [x25, #0x20]\n"
-            ".inst 0x4e88a5f2  // smmla v18.4s, v15.16b, v8.16b\n"
-            ".inst 0x4e9fa5f1  // smmla v17.4s, v15.16b, v31.16b\n"
-            "sshl v15.16b, v29.16b, v1.16b\n"
-            "sshl v1.16b, v3.16b, v1.16b\n"
-            "and v29.16b, v29.16b, v0.16b\n"
-            "and v3.16b, v3.16b, v0.16b\n"
-            "ldr q0, [x25, #0x30]\n"
-            "fcvtl v20.4s, v20.4h\n"
-            ".inst 0x4e8fa773  // smmla v19.4s, v27.16b, v15.16b\n"
-            "fcvtl v9.4s, v9.4h\n"
-            ".inst 0x4e81a77a  // smmla v26.4s, v27.16b, v1.16b\n"
-            "ldr q27, [x25, #0x40]\n"
-            ".inst 0x4e8fa412  // smmla v18.4s, v0.16b, v15.16b\n"
-            ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
-            "ldr q0, [x25, #0x50]\n"
-            ".inst 0x4e95a773  // smmla v19.4s, v27.16b, v21.16b\n"
-            ".inst 0x4e90a77a  // smmla v26.4s, v27.16b, v16.16b\n"
-            "ldr q27, [x25, #0x60]\n"
-            ".inst 0x4e95a412  // smmla v18.4s, v0.16b, v21.16b\n"
-            ".inst 0x4e90a411  // smmla v17.4s, v0.16b, v16.16b\n"
-            "ldr q0, [x25, #0x70]\n"
-            "add x25, x25, #0x88\n"
-            ".inst 0x4e9da773  // smmla v19.4s, v27.16b, v29.16b\n"
-            ".inst 0x4e83a77a  // smmla v26.4s, v27.16b, v3.16b\n"
-            "ldr d27, [x20, #0x0]\n"
-            ".inst 0x4e9da412  // smmla v18.4s, v0.16b, v29.16b\n"
-            ".inst 0x4e83a411  // smmla v17.4s, v0.16b, v3.16b\n"
-            "fcvtl v27.4s, v27.4h\n"
-            "uzp1 v0.2d, v19.2d, v26.2d\n"
-            "uzp2 v26.2d, v19.2d, v26.2d\n"
-            "fmul v19.4s, v27.4s, v20.s[0]\n"
-            "scvtf v0.4s, v0.4s, #0x4\n"
-            "scvtf v26.4s, v26.4s, #0x4\n"
-            "fmla v2.4s, v0.4s, v19.4s\n"
-            "ldr q19, [x23, #0x0]\n"
-            "uzp1 v0.2d, v18.2d, v17.2d\n"
-            "uzp2 v18.2d, v18.2d, v17.2d\n"
-            "fmul v17.4s, v27.4s, v20.s[1]\n"
-            "scvtf v0.4s, v0.4s, #0x4\n"
-            "scvtf v18.4s, v18.4s, #0x4\n"
-            "fmla v10.4s, v26.4s, v17.4s\n"
-            "ldr q17, [x23, #0x10]\n"
-            "fmul v26.4s, v27.4s, v20.s[2]\n"
-            "fmul v20.4s, v27.4s, v20.s[3]\n"
-            "fmla v12.4s, v0.4s, v26.4s\n"
-            "ldr d0, [x22, #-0x8]\n"
-            "ldr d26, [x21, #-0x8]\n"
-            "fcvtl v0.4s, v0.4h\n"
-            "fmla v28.4s, v18.4s, v20.4s\n"
-            "movi v20.4s, #0x0\n"
-            "movi v18.4s, #0x0\n"
-            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
-            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
-            "ldr q19, [x23, #0x20]\n"
-            "fcvtl v26.4s, v26.4h\n"
-            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
-            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
-            "ldr q19, [x23, #0x40]\n"
-            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
-            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
-            "ldr q19, [x23, #0x60]\n"
-            ".inst 0x4e9da674  // smmla v20.4s, v19.16b, v29.16b\n"
-            ".inst 0x4e83a672  // smmla v18.4s, v19.16b, v3.16b\n"
-            "uzp1 v19.2d, v20.2d, v18.2d\n"
-            "scvtf v19.4s, v19.4s, #0x4\n"
-            "uzp2 v20.2d, v20.2d, v18.2d\n"
-            "fmul v18.4s, v27.4s, v9.s[0]\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "fmla v11.4s, v19.4s, v18.4s\n"
-            "ldr q18, [x22, #0x0]\n"
-            "fmul v19.4s, v27.4s, v9.s[1]\n"
-            "fmla v13.4s, v20.4s, v19.4s\n"
-            "movi v19.4s, #0x0\n"
-            "movi v20.4s, #0x0\n"
-            ".inst 0x4e88a633  // smmla v19.4s, v17.16b, v8.16b\n"
-            ".inst 0x4e9fa634  // smmla v20.4s, v17.16b, v31.16b\n"
-            "ldr q17, [x23, #0x30]\n"
-            ".inst 0x4e8fa633  // smmla v19.4s, v17.16b, v15.16b\n"
-            ".inst 0x4e81a634  // smmla v20.4s, v17.16b, v1.16b\n"
-            "ldr q17, [x23, #0x50]\n"
-            ".inst 0x4e95a633  // smmla v19.4s, v17.16b, v21.16b\n"
-            ".inst 0x4e90a634  // smmla v20.4s, v17.16b, v16.16b\n"
-            "ldr q17, [x23, #0x70]\n"
-            "add x23, x23, #0x88\n"
-            ".inst 0x4e9da633  // smmla v19.4s, v17.16b, v29.16b\n"
-            ".inst 0x4e83a634  // smmla v20.4s, v17.16b, v3.16b\n"
-            "uzp1 v17.2d, v19.2d, v20.2d\n"
-            "scvtf v17.4s, v17.4s, #0x4\n"
-            "uzp2 v20.2d, v19.2d, v20.2d\n"
-            "fmul v19.4s, v27.4s, v9.s[2]\n"
-            "fmul v9.4s, v27.4s, v9.s[3]\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "fmla v22.4s, v17.4s, v19.4s\n"
-            "ldr q17, [x22, #0x10]\n"
-            "movi v19.4s, #0x0\n"
-            ".inst 0x4e88a653  // smmla v19.4s, v18.16b, v8.16b\n"
-            "fmla v23.4s, v20.4s, v9.4s\n"
-            "movi v20.4s, #0x0\n"
-            "movi v9.4s, #0x0\n"
-            ".inst 0x4e9fa654  // smmla v20.4s, v18.16b, v31.16b\n"
-            "ldr q18, [x22, #0x20]\n"
-            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
-            ".inst 0x4e8fa653  // smmla v19.4s, v18.16b, v15.16b\n"
-            ".inst 0x4e81a654  // smmla v20.4s, v18.16b, v1.16b\n"
-            "ldr q18, [x22, #0x40]\n"
-            ".inst 0x4e95a653  // smmla v19.4s, v18.16b, v21.16b\n"
-            ".inst 0x4e90a654  // smmla v20.4s, v18.16b, v16.16b\n"
-            "ldr q18, [x22, #0x60]\n"
-            ".inst 0x4e9da653  // smmla v19.4s, v18.16b, v29.16b\n"
-            ".inst 0x4e83a654  // smmla v20.4s, v18.16b, v3.16b\n"
-            "movi v18.4s, #0x0\n"
-            ".inst 0x4e9fa632  // smmla v18.4s, v17.16b, v31.16b\n"
-            "ldr q17, [x22, #0x30]\n"
-            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
-            ".inst 0x4e81a632  // smmla v18.4s, v17.16b, v1.16b\n"
-            "ldr q17, [x22, #0x50]\n"
-            ".inst 0x4e95a629  // smmla v9.4s, v17.16b, v21.16b\n"
-            ".inst 0x4e90a632  // smmla v18.4s, v17.16b, v16.16b\n"
-            "ldr q17, [x22, #0x70]\n"
-            "add x22, x22, #0x88\n"
-            ".inst 0x4e9da629  // smmla v9.4s, v17.16b, v29.16b\n"
-            ".inst 0x4e83a632  // smmla v18.4s, v17.16b, v3.16b\n"
-            "uzp1 v17.2d, v19.2d, v20.2d\n"
-            "uzp2 v20.2d, v19.2d, v20.2d\n"
-            "fmul v19.4s, v27.4s, v0.s[0]\n"
-            "scvtf v17.4s, v17.4s, #0x4\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "fmla v25.4s, v17.4s, v19.4s\n"
-            "ldr q19, [x21, #0x0]\n"
-            "fmul v17.4s, v27.4s, v0.s[1]\n"
-            "fmla v5.4s, v20.4s, v17.4s\n"
-            "ldr q17, [x21, #0x10]\n"
-            "uzp1 v20.2d, v9.2d, v18.2d\n"
-            "uzp2 v9.2d, v9.2d, v18.2d\n"
-            "fmul v18.4s, v27.4s, v0.s[2]\n"
-            "fmul v0.4s, v27.4s, v0.s[3]\n"
-            "scvtf v20.4s, v20.4s, #0x4\n"
-            "scvtf v9.4s, v9.4s, #0x4\n"
-            "fmla v7.4s, v20.4s, v18.4s\n"
-            "movi v20.4s, #0x0\n"
-            "movi v18.4s, #0x0\n"
-            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
-            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
-            "ldr q19, [x21, #0x20]\n"
-            "fmla v4.4s, v9.4s, v0.4s\n"
-            "movi v9.4s, #0x0\n"
-            "movi v0.4s, #0x0\n"
-            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
-            "fmul v8.4s, v27.4s, v26.s[0]\n"
-            ".inst 0x4e9fa620  // smmla v0.4s, v17.16b, v31.16b\n"
-            "ldr q17, [x21, #0x30]\n"
-            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
-            "fmul v31.4s, v27.4s, v26.s[1]\n"
-            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
-            "ldr q19, [x21, #0x40]\n"
-            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
-            "fmul v15.4s, v27.4s, v26.s[2]\n"
-            "fmul v27.4s, v27.4s, v26.s[3]\n"
-            ".inst 0x4e81a620  // smmla v0.4s, v17.16b, v1.16b\n"
-            "ldr q1, [x21, #0x50]\n"
-            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
-            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
-            "ldr q26, [x21, #0x60]\n"
-            ".inst 0x4e95a429  // smmla v9.4s, v1.16b, v21.16b\n"
-            ".inst 0x4e90a420  // smmla v0.4s, v1.16b, v16.16b\n"
-            "ldr q21, [x21, #0x70]\n"
-            "add x21, x21, #0x88\n"
-            ".inst 0x4e9da754  // smmla v20.4s, v26.16b, v29.16b\n"
-            ".inst 0x4e83a752  // smmla v18.4s, v26.16b, v3.16b\n"
-            ".inst 0x4e9da6a9  // smmla v9.4s, v21.16b, v29.16b\n"
-            ".inst 0x4e83a6a0  // smmla v0.4s, v21.16b, v3.16b\n"
-            "uzp1 v29.2d, v20.2d, v18.2d\n"
-            "uzp2 v21.2d, v20.2d, v18.2d\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "uzp1 v18.2d, v9.2d, v0.2d\n"
-            "uzp2 v16.2d, v9.2d, v0.2d\n"
-            "scvtf v21.4s, v21.4s, #0x4\n"
-            "fmla v6.4s, v29.4s, v8.4s\n"
-            "scvtf v18.4s, v18.4s, #0x4\n"
-            "scvtf v16.4s, v16.4s, #0x4\n"
-            "fmla v30.4s, v21.4s, v31.4s\n"
-            "fmla v24.4s, v18.4s, v15.4s\n"
-            "fmla v14.4s, v16.4s, v27.4s\n"
-            "bgt 3b\n"
-            "mov x20, %x[res_ptr]\n"
-            "subs x27, x27, #0x4\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "str q2, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q10, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q12, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q28, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q11, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q13, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q22, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q23, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q25, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q5, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q7, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q4, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q6, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q30, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q24, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "str q14, [x20, #0x0]\n"
-            "bne 2b\n"
-            "mov x20, #0x4\n"
-            "sub x10, x10, #0x10\n"
-            "cmp x10, #0x10\n"
-            "mov %x[res_ptr], x26\n"
-            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
-            "bge 1b\n"
-            "4:"  // Row loop skip
-            "cbz x10, 9f\n"
-            "5:"  // Row tail: Row loop
-            "add x24, %x[b_ptr], #0x8\n"
-            "mov x23, %x[nc]\n"
-            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
-            "6:"  // Row tail: Column loop
-            "movi v2.16b, #0x0\n"
-            "movi v10.16b, #0x0\n"
-            "add x25, %x[a_ptr], #0x8\n"
-            "mov x21, %x[nb]\n"
-            "movi v12.16b, #0x0\n"
-            "movi v28.16b, #0x0\n"
-            "7:"  // Row tail: Block loop
-            "ldr q6, [x24, #0x0]\n"
-            "ldr q5, [x24, #0x10]\n"
-            "movi v17.16b, #0x4\n"
-            "movi v8.4s, #0x0\n"
-            "ldr q4, [x25, #0x0]\n"
-            "ldr q13, [x25, #0x10]\n"
-            "movi v27.4s, #0x0\n"
-            "movi v0.4s, #0x0\n"
-            "ldr q31, [x24, #0x20]\n"
-            "ldr q14, [x24, #0x30]\n"
-            "movi v29.4s, #0x0\n"
-            "movi v22.16b, #0xf0\n"
-            "ldr q11, [x25, #0x20]\n"
-            "ldr q23, [x25, #0x30]\n"
-            "sshl v21.16b, v6.16b, v17.16b\n"
-            "sshl v16.16b, v5.16b, v17.16b\n"
-            "ldr q20, [x25, #0x40]\n"
-            "ldr q26, [x25, #0x50]\n"
-            "and v6.16b, v6.16b, v22.16b\n"
-            "and v5.16b, v5.16b, v22.16b\n"
-            "ldr q25, [x25, #0x60]\n"
-            "ldr q3, [x25, #0x70]\n"
-            "sshl v19.16b, v31.16b, v17.16b\n"
-            "sshl v18.16b, v14.16b, v17.16b\n"
-            "ldr d17, [x25, #-0x8]\n"
-            ".inst 0x4e95a488  // smmla v8.4s, v4.16b, v21.16b\n"
-            ".inst 0x4e90a49b  // smmla v27.4s, v4.16b, v16.16b\n"
-            "and v31.16b, v31.16b, v22.16b\n"
-            ".inst 0x4e95a5a0  // smmla v0.4s, v13.16b, v21.16b\n"
-            ".inst 0x4e90a5bd  // smmla v29.4s, v13.16b, v16.16b\n"
-            "and v14.16b, v14.16b, v22.16b\n"
-            "sub x20, x24, #0x8\n"
-            "ldr d16, [x20, #0x0]\n"
-            "subs x21, x21, #0x1\n"
-            "add x25, x25, #0x88\n"
-            "fcvtl v17.4s, v17.4h\n"
-            "add x24, x24, #0x48\n"
-            ".inst 0x4e93a568  // smmla v8.4s, v11.16b, v19.16b\n"
-            ".inst 0x4e92a57b  // smmla v27.4s, v11.16b, v18.16b\n"
-            ".inst 0x4e93a6e0  // smmla v0.4s, v23.16b, v19.16b\n"
-            ".inst 0x4e92a6fd  // smmla v29.4s, v23.16b, v18.16b\n"
-            "fcvtl v16.4s, v16.4h\n"
-            ".inst 0x4e86a688  // smmla v8.4s, v20.16b, v6.16b\n"
-            ".inst 0x4e85a69b  // smmla v27.4s, v20.16b, v5.16b\n"
-            "fmul v23.4s, v16.4s, v17.s[0]\n"
-            "fmul v21.4s, v16.4s, v17.s[1]\n"
-            "fmul v1.4s, v16.4s, v17.s[2]\n"
-            "fmul v20.4s, v16.4s, v17.s[3]\n"
-            ".inst 0x4e86a740  // smmla v0.4s, v26.16b, v6.16b\n"
-            ".inst 0x4e85a75d  // smmla v29.4s, v26.16b, v5.16b\n"
-            ".inst 0x4e9fa728  // smmla v8.4s, v25.16b, v31.16b\n"
-            ".inst 0x4e8ea73b  // smmla v27.4s, v25.16b, v14.16b\n"
-            ".inst 0x4e9fa460  // smmla v0.4s, v3.16b, v31.16b\n"
-            ".inst 0x4e8ea47d  // smmla v29.4s, v3.16b, v14.16b\n"
-            "uzp1 v19.2d, v8.2d, v27.2d\n"
-            "uzp2 v18.2d, v8.2d, v27.2d\n"
-            "scvtf v19.4s, v19.4s, #0x4\n"
-            "uzp1 v17.2d, v0.2d, v29.2d\n"
-            "uzp2 v16.2d, v0.2d, v29.2d\n"
-            "scvtf v18.4s, v18.4s, #0x4\n"
-            "fmla v2.4s, v19.4s, v23.4s\n"
-            "scvtf v17.4s, v17.4s, #0x4\n"
-            "scvtf v16.4s, v16.4s, #0x4\n"
-            "fmla v10.4s, v18.4s, v21.4s\n"
-            "fmla v12.4s, v17.4s, v1.4s\n"
-            "fmla v28.4s, v16.4s, v20.4s\n"
-            "bgt 7b\n"
-            "mov x20, %x[res_ptr]\n"
-            "cmp x10, #0x1\n"
-            "str q2, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "cmp x10, #0x2\n"
-            "str q10, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "cmp x10, #0x3\n"
-            "str q12, [x20, #0x0]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "str q28, [x20, #0x0]\n"
-            "8:"  // Row tail: Accumulator store skip
-            "subs x23, x23, #0x4\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "bne 6b\n"
-            "subs x10, x10, #0x4\n"
-            "add %x[a_ptr], %x[a_ptr], x9\n"
-            "mov %x[res_ptr], x22\n"
-            "bgt 5b\n"
-            "9:"  // Row tail: Row loop skip
-            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
-            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
-            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
-        );
-        return;
-    }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    float sumf[4][4];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                        (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
-}
-
-void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 8;
-    const int blocklen = 8;
-
-    assert (n % qk == 0);
-    assert (nr % 4 == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-        size_t res_stride = bs * sizeof(float);
-
-        __asm__ __volatile__(
-            "mov x20, #0x4\n"
-            "mov x13, %x[nr]\n"
-            "mov z28.s, #-0x4\n"
-            "mov x12, #0x88\n"
-            "ptrue p1.b\n"
-            "whilelt p0.s, XZR, x20\n"
-            "cmp x13, #0x10\n"
-            "mul x12, %x[nb], x12\n"
-            "blt 4f\n"
-            "1:"  // Row loop
-            "add x11, %x[b_ptr], #0x10\n"
-            "mov x10, %x[nc]\n"
-            "add x9, %x[res_ptr], %x[res_stride], LSL #4\n"
-            "2:"  // Column loop
-            "add x28, %x[a_ptr], #0x8\n"
-            "mov z24.b, #0x0\n"
-            "mov z15.b, #0x0\n"
-            "mov x27, %x[nb]\n"
-            "add x26, x28, x12\n"
-            "mov z12.b, #0x0\n"
-            "mov z0.b, #0x0\n"
-            "add x25, x26, x12\n"
-            "mov z13.b, #0x0\n"
-            "mov z1.b, #0x0\n"
-            "add x24, x25, x12\n"
-            "mov z20.b, #0x0\n"
-            "mov z25.b, #0x0\n"
-            "mov z11.b, #0x0\n"
-            "mov z16.b, #0x0\n"
-            "mov z19.b, #0x0\n"
-            "mov z26.b, #0x0\n"
-            "mov z8.b, #0x0\n"
-            "mov z29.b, #0x0\n"
-            "mov z27.b, #0x0\n"
-            "mov z10.b, #0x0\n"
-            "3:"  // Block loop
-            "ld1b { z30.b }, p1/Z, [x11]\n"
-            "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n"
-            "mov z18.s, #0x0\n"
-            "mov z7.s, #0x0\n"
-            "ld1rqb { z3.b }, p1/Z, [x28]\n"
-            "ld1rqb { z5.b }, p1/Z, [x28, #16]\n"
-            "mov z9.s, #0x0\n"
-            "mov z22.s, #0x0\n"
-            "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n"
-            "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n"
-            "sub x20, x11, #0x10\n"
-            "sub x23, x28, #0x8\n"
-            "lsl z31.b, z30.b, #0x4\n"
-            "lsl z6.b, z21.b, #0x4\n"
-            "ld1h { z23.s }, p1/Z, [x20]\n"
-            "sub x22, x26, #0x8\n"
-            "and z30.b, z30.b, #0xf0\n"
-            "and z21.b, z21.b, #0xf0\n"
-            "sub x21, x25, #0x8\n"
-            "sub x20, x24, #0x8\n"
-            "lsl z14.b, z4.b, #0x4\n"
-            "lsl z2.b, z17.b, #0x4\n"
-            "subs x27, x27, #0x1\n"
-            "add x11, x11, #0x90\n"
-            ".inst 0x451f9872  // smmla z18.s, z3.b, z31.b\n"
-            ".inst 0x45069867  // smmla z7.s, z3.b, z6.b\n"
-            "ld1rqb { z3.b }, p1/Z, [x28, #32]\n"
-            "and z4.b, z4.b, #0xf0\n"
-            ".inst 0x451f98a9  // smmla z9.s, z5.b, z31.b\n"
-            ".inst 0x450698b6  // smmla z22.s, z5.b, z6.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x28, #48]\n"
-            "and z17.b, z17.b, #0xf0\n"
-            "fcvt z23.s, p1/m, z23.h\n"
-            ".inst 0x450e9872  // smmla z18.s, z3.b, z14.b\n"
-            ".inst 0x45029867  // smmla z7.s, z3.b, z2.b\n"
-            "ld1rqb { z3.b }, p1/Z, [x28, #64]\n"
-            ".inst 0x450e98a9  // smmla z9.s, z5.b, z14.b\n"
-            ".inst 0x450298b6  // smmla z22.s, z5.b, z2.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x28, #80]\n"
-            "fscale z23.s, p1/m, z23.s, z28.s\n"
-            ".inst 0x451e9872  // smmla z18.s, z3.b, z30.b\n"
-            ".inst 0x45159867  // smmla z7.s, z3.b, z21.b\n"
-            "ld1rqb { z3.b }, p1/Z, [x28, #96]\n"
-            ".inst 0x451e98a9  // smmla z9.s, z5.b, z30.b\n"
-            ".inst 0x451598b6  // smmla z22.s, z5.b, z21.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x28, #112]\n"
-            "add x28, x28, #0x88\n"
-            ".inst 0x45049872  // smmla z18.s, z3.b, z4.b\n"
-            ".inst 0x45119867  // smmla z7.s, z3.b, z17.b\n"
-            "ld1h { z3.s }, p0/Z, [x23]\n"
-            ".inst 0x450498a9  // smmla z9.s, z5.b, z4.b\n"
-            ".inst 0x451198b6  // smmla z22.s, z5.b, z17.b\n"
-            "fcvt z3.s, p1/m, z3.h\n"
-            "uzp1 z5.d, z18.d, z7.d\n"
-            "uzp2 z18.d, z18.d, z7.d\n"
-            "mov z3.q, z3.q[0]\n"
-            "uzp1 z7.d, z9.d, z22.d\n"
-            "uzp2 z22.d, z9.d, z22.d\n"
-            "fmul z9.s, z23.s, z3.s[0]\n"
-            "scvtf z5.s, p1/m, z5.s\n"
-            "scvtf z18.s, p1/m, z18.s\n"
-            "scvtf z7.s, p1/m, z7.s\n"
-            "scvtf z22.s, p1/m, z22.s\n"
-            "fmla z24.s, p1/M, z5.s, z9.s\n"
-            "ld1rqb { z5.b }, p1/Z, [x26]\n"
-            "fmul z9.s, z23.s, z3.s[1]\n"
-            "fmla z15.s, p1/M, z18.s, z9.s\n"
-            "ld1rqb { z18.b }, p1/Z, [x26, #16]\n"
-            "fmul z9.s, z23.s, z3.s[2]\n"
-            "fmul z3.s, z23.s, z3.s[3]\n"
-            "fmla z12.s, p1/M, z7.s, z9.s\n"
-            "mov z9.s, #0x0\n"
-            "ld1h { z7.s }, p0/Z, [x22]\n"
-            ".inst 0x451f98a9  // smmla z9.s, z5.b, z31.b\n"
-            "fmla z0.s, p1/M, z22.s, z3.s\n"
-            "mov z22.s, #0x0\n"
-            "ld1h { z3.s }, p0/Z, [x21]\n"
-            ".inst 0x450698b6  // smmla z22.s, z5.b, z6.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x26, #32]\n"
-            "fcvt z7.s, p1/m, z7.h\n"
-            "fcvt z3.s, p1/m, z3.h\n"
-            ".inst 0x450e98a9  // smmla z9.s, z5.b, z14.b\n"
-            ".inst 0x450298b6  // smmla z22.s, z5.b, z2.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x26, #64]\n"
-            "mov z7.q, z7.q[0]\n"
-            "mov z3.q, z3.q[0]\n"
-            ".inst 0x451e98a9  // smmla z9.s, z5.b, z30.b\n"
-            ".inst 0x451598b6  // smmla z22.s, z5.b, z21.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x26, #96]\n"
-            ".inst 0x450498a9  // smmla z9.s, z5.b, z4.b\n"
-            ".inst 0x451198b6  // smmla z22.s, z5.b, z17.b\n"
-            "uzp1 z5.d, z9.d, z22.d\n"
-            "scvtf z5.s, p1/m, z5.s\n"
-            "uzp2 z22.d, z9.d, z22.d\n"
-            "fmul z9.s, z23.s, z7.s[0]\n"
-            "scvtf z22.s, p1/m, z22.s\n"
-            "fmla z13.s, p1/M, z5.s, z9.s\n"
-            "ld1rqb { z9.b }, p1/Z, [x25]\n"
-            "fmul z5.s, z23.s, z7.s[1]\n"
-            "fmla z1.s, p1/M, z22.s, z5.s\n"
-            "mov z5.s, #0x0\n"
-            "mov z22.s, #0x0\n"
-            ".inst 0x451f9a45  // smmla z5.s, z18.b, z31.b\n"
-            ".inst 0x45069a56  // smmla z22.s, z18.b, z6.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x26, #48]\n"
-            ".inst 0x450e9a45  // smmla z5.s, z18.b, z14.b\n"
-            ".inst 0x45029a56  // smmla z22.s, z18.b, z2.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x26, #80]\n"
-            ".inst 0x451e9a45  // smmla z5.s, z18.b, z30.b\n"
-            ".inst 0x45159a56  // smmla z22.s, z18.b, z21.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x26, #112]\n"
-            "add x26, x26, #0x88\n"
-            ".inst 0x45049a45  // smmla z5.s, z18.b, z4.b\n"
-            ".inst 0x45119a56  // smmla z22.s, z18.b, z17.b\n"
-            "uzp1 z18.d, z5.d, z22.d\n"
-            "scvtf z18.s, p1/m, z18.s\n"
-            "uzp2 z22.d, z5.d, z22.d\n"
-            "fmul z5.s, z23.s, z7.s[2]\n"
-            "fmul z7.s, z23.s, z7.s[3]\n"
-            "scvtf z22.s, p1/m, z22.s\n"
-            "fmla z20.s, p1/M, z18.s, z5.s\n"
-            "ld1rqb { z18.b }, p1/Z, [x25, #16]\n"
-            "ld1h { z5.s }, p0/Z, [x20]\n"
-            "fcvt z5.s, p1/m, z5.h\n"
-            "fmla z25.s, p1/M, z22.s, z7.s\n"
-            "mov z22.s, #0x0\n"
-            "mov z7.s, #0x0\n"
-            ".inst 0x451f9936  // smmla z22.s, z9.b, z31.b\n"
-            ".inst 0x45069927  // smmla z7.s, z9.b, z6.b\n"
-            "ld1rqb { z9.b }, p1/Z, [x25, #32]\n"
-            "mov z5.q, z5.q[0]\n"
-            ".inst 0x450e9936  // smmla z22.s, z9.b, z14.b\n"
-            ".inst 0x45029927  // smmla z7.s, z9.b, z2.b\n"
-            "ld1rqb { z9.b }, p1/Z, [x25, #64]\n"
-            ".inst 0x451e9936  // smmla z22.s, z9.b, z30.b\n"
-            ".inst 0x45159927  // smmla z7.s, z9.b, z21.b\n"
-            "ld1rqb { z9.b }, p1/Z, [x25, #96]\n"
-            ".inst 0x45049936  // smmla z22.s, z9.b, z4.b\n"
-            ".inst 0x45119927  // smmla z7.s, z9.b, z17.b\n"
-            "uzp1 z9.d, z22.d, z7.d\n"
-            "scvtf z9.s, p1/m, z9.s\n"
-            "uzp2 z22.d, z22.d, z7.d\n"
-            "fmul z7.s, z23.s, z3.s[0]\n"
-            "scvtf z22.s, p1/m, z22.s\n"
-            "fmla z11.s, p1/M, z9.s, z7.s\n"
-            "ld1rqb { z9.b }, p1/Z, [x24]\n"
-            "fmul z7.s, z23.s, z3.s[1]\n"
-            "fmla z16.s, p1/M, z22.s, z7.s\n"
-            "mov z22.s, #0x0\n"
-            "mov z7.s, #0x0\n"
-            ".inst 0x451f9a56  // smmla z22.s, z18.b, z31.b\n"
-            ".inst 0x45069a47  // smmla z7.s, z18.b, z6.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x25, #48]\n"
-            ".inst 0x450e9a56  // smmla z22.s, z18.b, z14.b\n"
-            ".inst 0x45029a47  // smmla z7.s, z18.b, z2.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x25, #80]\n"
-            ".inst 0x451e9a56  // smmla z22.s, z18.b, z30.b\n"
-            ".inst 0x45159a47  // smmla z7.s, z18.b, z21.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x25, #112]\n"
-            "add x25, x25, #0x88\n"
-            ".inst 0x45049a56  // smmla z22.s, z18.b, z4.b\n"
-            ".inst 0x45119a47  // smmla z7.s, z18.b, z17.b\n"
-            "uzp1 z18.d, z22.d, z7.d\n"
-            "scvtf z18.s, p1/m, z18.s\n"
-            "uzp2 z7.d, z22.d, z7.d\n"
-            "fmul z22.s, z23.s, z3.s[2]\n"
-            "fmul z3.s, z23.s, z3.s[3]\n"
-            "scvtf z7.s, p1/m, z7.s\n"
-            "fmla z19.s, p1/M, z18.s, z22.s\n"
-            "ld1rqb { z18.b }, p1/Z, [x24, #16]\n"
-            "fmul z22.s, z23.s, z5.s[0]\n"
-            "fmla z26.s, p1/M, z7.s, z3.s\n"
-            "mov z3.s, #0x0\n"
-            "mov z7.s, #0x0\n"
-            ".inst 0x451f9923  // smmla z3.s, z9.b, z31.b\n"
-            ".inst 0x45069927  // smmla z7.s, z9.b, z6.b\n"
-            "ld1rqb { z9.b }, p1/Z, [x24, #32]\n"
-            ".inst 0x450e9923  // smmla z3.s, z9.b, z14.b\n"
-            ".inst 0x45029927  // smmla z7.s, z9.b, z2.b\n"
-            "mov z9.s, #0x0\n"
-            ".inst 0x451f9a49  // smmla z9.s, z18.b, z31.b\n"
-            "mov z31.s, #0x0\n"
-            ".inst 0x45069a5f  // smmla z31.s, z18.b, z6.b\n"
-            "ld1rqb { z6.b }, p1/Z, [x24, #48]\n"
-            "ld1rqb { z18.b }, p1/Z, [x24, #64]\n"
-            ".inst 0x450e98c9  // smmla z9.s, z6.b, z14.b\n"
-            "fmul z14.s, z23.s, z5.s[1]\n"
-            ".inst 0x450298df  // smmla z31.s, z6.b, z2.b\n"
-            "ld1rqb { z6.b }, p1/Z, [x24, #80]\n"
-            "fmul z2.s, z23.s, z5.s[2]\n"
-            "fmul z23.s, z23.s, z5.s[3]\n"
-            ".inst 0x451e9a43  // smmla z3.s, z18.b, z30.b\n"
-            ".inst 0x45159a47  // smmla z7.s, z18.b, z21.b\n"
-            "ld1rqb { z5.b }, p1/Z, [x24, #96]\n"
-            ".inst 0x451e98c9  // smmla z9.s, z6.b, z30.b\n"
-            ".inst 0x451598df  // smmla z31.s, z6.b, z21.b\n"
-            "ld1rqb { z18.b }, p1/Z, [x24, #112]\n"
-            "add x24, x24, #0x88\n"
-            ".inst 0x450498a3  // smmla z3.s, z5.b, z4.b\n"
-            ".inst 0x451198a7  // smmla z7.s, z5.b, z17.b\n"
-            ".inst 0x45049a49  // smmla z9.s, z18.b, z4.b\n"
-            ".inst 0x45119a5f  // smmla z31.s, z18.b, z17.b\n"
-            "uzp1 z18.d, z3.d, z7.d\n"
-            "uzp2 z5.d, z3.d, z7.d\n"
-            "scvtf z18.s, p1/m, z18.s\n"
-            "uzp1 z6.d, z9.d, z31.d\n"
-            "uzp2 z9.d, z9.d, z31.d\n"
-            "scvtf z5.s, p1/m, z5.s\n"
-            "fmla z8.s, p1/M, z18.s, z22.s\n"
-            "scvtf z6.s, p1/m, z6.s\n"
-            "scvtf z9.s, p1/m, z9.s\n"
-            "fmla z29.s, p1/M, z5.s, z14.s\n"
-            "fmla z27.s, p1/M, z6.s, z2.s\n"
-            "fmla z10.s, p1/M, z9.s, z23.s\n"
-            "bgt 3b\n"
-            "mov x20, %x[res_ptr]\n"
-            "subs x10, x10, #0x8\n"
-            "add %x[res_ptr], %x[res_ptr], #0x20\n"
-            "st1w { z24.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z15.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z12.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z0.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z13.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z1.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z20.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z25.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z11.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z16.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z19.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z26.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z8.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z29.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z27.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "st1w { z10.s }, p1, [x20]\n"
-            "bne 2b\n"
-            "mov x20, #0x4\n"
-            "sub x13, x13, #0x10\n"
-            "cmp x13, #0x10\n"
-            "mov %x[res_ptr], x9\n"
-            "madd %x[a_ptr], x20, x12, %x[a_ptr]\n"
-            "bge 1b\n"
-            "4:"  // Row loop skip
-            "cbz x13, 9f\n"
-            "5:"  // Row tail: Row loop
-            "add x25, %x[b_ptr], #0x10\n"
-            "mov x24, %x[nc]\n"
-            "add x23, %x[res_ptr], %x[res_stride], LSL #2\n"
-            "6:"  // Row tail: Column loop
-            "mov z24.b, #0x0\n"
-            "mov z15.b, #0x0\n"
-            "add x28, %x[a_ptr], #0x8\n"
-            "mov x22, %x[nb]\n"
-            "mov z12.b, #0x0\n"
-            "mov z0.b, #0x0\n"
-            "7:"  // Row tail: Block loop
-            "ld1b { z3.b }, p1/Z, [x25]\n"
-            "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
-            "mov z2.s, #0x0\n"
-            "mov z25.s, #0x0\n"
-            "ld1rqb { z26.b }, p1/Z, [x28]\n"
-            "ld1rqb { z21.b }, p1/Z, [x28, #16]\n"
-            "mov z27.s, #0x0\n"
-            "mov z19.s, #0x0\n"
-            "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n"
-            "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n"
-            "sub x21, x25, #0x10\n"
-            "sub x20, x28, #0x8\n"
-            "lsl z20.b, z3.b, #0x4\n"
-            "lsl z4.b, z6.b, #0x4\n"
-            "ld1rqb { z10.b }, p1/Z, [x28, #32]\n"
-            "ld1rqb { z23.b }, p1/Z, [x28, #48]\n"
-            "and z3.b, z3.b, #0xf0\n"
-            "and z6.b, z6.b, #0xf0\n"
-            "ld1rqb { z11.b }, p1/Z, [x28, #64]\n"
-            "ld1rqb { z7.b }, p1/Z, [x28, #80]\n"
-            "lsl z8.b, z29.b, #0x4\n"
-            "lsl z14.b, z16.b, #0x4\n"
-            "ld1rqb { z18.b }, p1/Z, [x28, #96]\n"
-            "ld1rqb { z30.b }, p1/Z, [x28, #112]\n"
-            ".inst 0x45149b42  // smmla z2.s, z26.b, z20.b\n"
-            ".inst 0x45049b59  // smmla z25.s, z26.b, z4.b\n"
-            "and z29.b, z29.b, #0xf0\n"
-            "ld1h { z17.s }, p1/Z, [x21]\n"
-            ".inst 0x45149abb  // smmla z27.s, z21.b, z20.b\n"
-            ".inst 0x45049ab3  // smmla z19.s, z21.b, z4.b\n"
-            "and z16.b, z16.b, #0xf0\n"
-            "ld1h { z4.s }, p0/Z, [x20]\n"
-            "subs x22, x22, #0x1\n"
-            "add x28, x28, #0x88\n"
-            "fcvt z17.s, p1/m, z17.h\n"
-            "add x25, x25, #0x90\n"
-            ".inst 0x45089942  // smmla z2.s, z10.b, z8.b\n"
-            ".inst 0x450e9959  // smmla z25.s, z10.b, z14.b\n"
-            "fcvt z4.s, p1/m, z4.h\n"
-            ".inst 0x45089afb  // smmla z27.s, z23.b, z8.b\n"
-            ".inst 0x450e9af3  // smmla z19.s, z23.b, z14.b\n"
-            "fscale z17.s, p1/m, z17.s, z28.s\n"
-            "mov z4.q, z4.q[0]\n"
-            ".inst 0x45039962  // smmla z2.s, z11.b, z3.b\n"
-            ".inst 0x45069979  // smmla z25.s, z11.b, z6.b\n"
-            "fmul z23.s, z17.s, z4.s[0]\n"
-            "fmul z9.s, z17.s, z4.s[1]\n"
-            "fmul z21.s, z17.s, z4.s[2]\n"
-            "fmul z4.s, z17.s, z4.s[3]\n"
-            ".inst 0x450398fb  // smmla z27.s, z7.b, z3.b\n"
-            ".inst 0x450698f3  // smmla z19.s, z7.b, z6.b\n"
-            ".inst 0x451d9a42  // smmla z2.s, z18.b, z29.b\n"
-            ".inst 0x45109a59  // smmla z25.s, z18.b, z16.b\n"
-            ".inst 0x451d9bdb  // smmla z27.s, z30.b, z29.b\n"
-            ".inst 0x45109bd3  // smmla z19.s, z30.b, z16.b\n"
-            "uzp1 z31.d, z2.d, z25.d\n"
-            "uzp2 z13.d, z2.d, z25.d\n"
-            "scvtf z31.s, p1/m, z31.s\n"
-            "uzp1 z17.d, z27.d, z19.d\n"
-            "uzp2 z18.d, z27.d, z19.d\n"
-            "scvtf z13.s, p1/m, z13.s\n"
-            "fmla z24.s, p1/M, z31.s, z23.s\n"
-            "scvtf z17.s, p1/m, z17.s\n"
-            "scvtf z18.s, p1/m, z18.s\n"
-            "fmla z15.s, p1/M, z13.s, z9.s\n"
-            "fmla z12.s, p1/M, z17.s, z21.s\n"
-            "fmla z0.s, p1/M, z18.s, z4.s\n"
-            "bgt 7b\n"
-            "mov x20, %x[res_ptr]\n"
-            "cmp x13, #0x1\n"
-            "st1w { z24.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "cmp x13, #0x2\n"
-            "st1w { z15.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "cmp x13, #0x3\n"
-            "st1w { z12.s }, p1, [x20]\n"
-            "add x20, x20, %x[res_stride]\n"
-            "ble 8f\n"
-            "st1w { z0.s }, p1, [x20]\n"
-            "8:"  // Row tail: Accumulator store skip
-            "subs x24, x24, #0x8\n"
-            "add %x[res_ptr], %x[res_ptr], #0x20\n"
-            "bne 6b\n"
-            "subs x13, x13, #0x4\n"
-            "add %x[a_ptr], %x[a_ptr], x12\n"
-            "mov %x[res_ptr], x23\n"
-            "bgt 5b\n"
-            "9:"  // Row tail: Row loop skip
-            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
-            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
-            : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
-        );
-        return;
-    }
-#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-#elif defined(__AVX2__) || defined(__AVX512F__)
-    {
-        const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
-        const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
-        int64_t b_nb = n / QK4_0;
-        int64_t y = 0;
-        // Mask to mask out nibbles from packed bytes
-        const __m256i m4b = _mm256_set1_epi8(0x0F);
-        const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
-        // Lookup table to convert signed nibbles to signed bytes
-        __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
-        signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
-        // Permute mask used for easier vector processing at later stages
-        __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
-        int64_t xstart = 0;
-        int anr = nr - nr%16; // Used to align nr with boundary of 16
-    #ifdef __AVX512F__
-        int anc = nc - nc%16; // Used to align nc with boundary of 16
-        // Mask to mask out nibbles from packed bytes expanded to 512 bit length
-        const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
-        // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
-        __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
-
-        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
-        for (; y < anr / 4; y += 4) {
-
-            const block_q8_0x4 * a_ptrs[4];
-
-            a_ptrs[0] = a_ptr_start + (y * nb);
-            for (int i = 0; i < 3; ++i) {
-                a_ptrs[i + 1] = a_ptrs[i] + nb;
-            }
-
-            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
-            for (int64_t x = 0; x < anc / 8; x += 2) {
-
-                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
-                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
-
-                // Master FP accumulators
-                __m512 acc_rows[16];
-                for (int i = 0; i < 16; i++) {
-                    acc_rows[i] = _mm512_setzero_ps();
-                }
-
-                for (int64_t b = 0; b < nb; b++) {
-                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
-                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
-                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
-                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
-                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
-
-                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
-                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
-                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
-                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
-
-                    // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
-                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
-                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
-                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
-
-                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
-                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
-                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
-                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
-
-                    // 4-bit -> 8-bit - Sign is maintained
-                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
-                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
-
-                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
-                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
-
-                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
-                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
-
-                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
-                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
-
-                    // Shuffle pattern one - right side input
-                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
-                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
-
-                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
-                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
-
-                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
-                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
-
-                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
-                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
-
-                    // Shuffle pattern two - right side input
-
-                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
-                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
-
-                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
-                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
-
-                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
-                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
-
-                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
-                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
-
-                    // Scale values - Load the weight scale values of two block_q4_0x8
-                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
-
-                    // Process LHS in pairs of rows
-                    for (int rp = 0; rp < 4; rp++) {
-
-                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
-                        // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
-                        __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
-                        __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
-                        __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
-                        __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
-                        __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
-                        __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
-                        __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
-                        __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
-                        __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
-                        __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
-                        __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
-                        __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
-
-                        __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
-                        __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
-                        __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
-                        __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
-                        __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
-                        __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
-                        __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
-                        __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
-
-                        // Shuffle pattern one - left side input
-
-                        const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
-                        const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
-                        const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
-                        const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
-                        const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
-                        const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
-                        const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
-                        const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
-                        // Shuffle pattern two - left side input
-
-                        const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
-                        const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
-                        const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
-                        const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
-                        const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
-                        const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
-                        const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
-                        const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
-                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
-                        // Resembles MMLAs into 2x2 matrices in ARM Version
-                        __m512i iacc_mat_00_sp1 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
-                        __m512i iacc_mat_01_sp1 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
-                        __m512i iacc_mat_10_sp1 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
-                        __m512i iacc_mat_11_sp1 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
-                        __m512i iacc_mat_00_sp2 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
-                        __m512i iacc_mat_01_sp2 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
-                        __m512i iacc_mat_10_sp2 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
-                        __m512i iacc_mat_11_sp2 =
-                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
-
-                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
-                        __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
-                        __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
-                        __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
-                        __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-
-                        // Straighten out to make 4 row vectors
-                        __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
-                        __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
-                        __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
-                        __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
-
-                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                        const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
-                        const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
-
-                        // Multiply with appropiate scales and accumulate
-                        acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
-                        acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
-                        acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
-                        acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
-                    }
-                }
-
-                // Store the accumulated values
-                for (int i = 0; i < 16; i++) {
-                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-                }
-            }
-        }
-        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
-        for (; y < nr / 4; y ++) {
-
-            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
-
-            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
-            for (int64_t x = 0; x < anc / 8; x += 2) {
-
-                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
-                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
-
-                // Master FP accumulators
-                __m512 acc_rows[4];
-                for (int i = 0; i < 4; i++) {
-                    acc_rows[i] = _mm512_setzero_ps();
-                }
-
-                for (int64_t b = 0; b < nb; b++) {
-                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
-                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
-                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
-                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
-                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
-
-                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
-                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
-                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
-                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
-
-                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
-                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
-                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
-                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
-
-                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
-                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
-                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
-                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
-
-                    // 4-bit -> 8-bit - Sign is maintained
-                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
-                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
-
-                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
-                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
-
-                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
-                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
-
-                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
-                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
-
-                    // Shuffle pattern one - right side input
-                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
-                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
-
-                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
-                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
-
-                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
-                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
-
-                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
-                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
-
-                    // Shuffle pattern two - right side input
-
-                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
-                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
-
-                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
-                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
-
-                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
-                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
-
-                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
-                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
-
-
-                    // Scale values - Load the weight scale values of two block_q4_0x8
-                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
-
-                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
-                    // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
-                    __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
-                    __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
-                    __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
-                    __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
-                    __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
-                    __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
-                    __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
-                    __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
-                    __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
-                    __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
-                    __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
-                    __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
-
-                    __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
-                    __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
-                    __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
-                    __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
-                    __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
-                    __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
-                    __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
-                    __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
-
-                    // Shuffle pattern one - left side input
-
-                    const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
-                    const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
-                    const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
-                    const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
-                    const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
-                    const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
-                    const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
-                    const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
-                    // Shuffle pattern two - left side input
-
-                    const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
-                    const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
-                    const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
-                    const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
-                    const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
-                    const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
-                    const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
-                    const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
-                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
-                    // Resembles MMLAs into 2x2 matrices in ARM Version
-                    __m512i iacc_mat_00_sp1 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
-                    __m512i iacc_mat_01_sp1 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
-                    __m512i iacc_mat_10_sp1 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
-                    __m512i iacc_mat_11_sp1 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
-                    __m512i iacc_mat_00_sp2 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
-                    __m512i iacc_mat_01_sp2 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
-                    __m512i iacc_mat_10_sp2 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
-                    __m512i iacc_mat_11_sp2 =
-                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
-
-                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
-                    __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
-                    __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
-                    __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
-                    __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-
-                    // Straighten out to make 4 row vectors
-                    __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
-                    __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
-                    __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
-                    __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
-
-                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
-                    const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
-
-                    // Multiply with appropiate scales and accumulate
-                    acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
-                    acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
-                    acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
-                    acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
-                }
-
-                // Store the accumulated values
-                for (int i = 0; i < 4; i++) {
-                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-                }
-            }
-        }
-        if (anc != nc) {
-            xstart = anc/8;
-            y = 0;
-        }
-    #endif // __AVX512F__
-
-        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
-
-        for (; y < anr / 4; y += 4) {
-            const block_q8_0x4 * a_ptrs[4];
-
-            a_ptrs[0] = a_ptr_start + (y * nb);
-            for (int i = 0; i < 3; ++i) {
-                a_ptrs[i + 1] = a_ptrs[i] + nb;
-            }
-
-            // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
-            for (int64_t x = xstart; x < nc / 8; x++) {
-
-                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
-
-                // Master FP accumulators
-                __m256 acc_rows[16];
-                for (int i = 0; i < 16; i++) {
-                    acc_rows[i] = _mm256_setzero_ps();
-                }
-
-                for (int64_t b = 0; b < nb; b++) {
-                    // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
-                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
-                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
-                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
-
-                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
-                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
-                    // 4-bit -> 8-bit - Sign is maintained
-                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
-                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
-
-                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
-                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
-
-                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
-                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
-
-                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
-                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
-
-                    // Shuffle pattern one - right side input
-                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
-                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
-
-                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
-                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
-
-                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
-                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
-
-                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
-                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
-
-                    // Shuffle pattern two - right side input
-
-                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
-                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
-
-                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
-                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
-
-                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
-                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
-
-                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
-                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
-
-                    // Scale values - Load the wight scale values of block_q4_0x8
-                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
-
-                    // Process LHS in groups of four
-                    for (int rp = 0; rp < 4; rp++) {
-                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
-                        // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
-                        __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
-                        __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
-                        __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
-                        __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
-                        __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
-                        __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
-                        __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
-                        __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
-                        __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
-                        __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
-                        __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
-                        __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
-
-                        // Shuffle pattern one - left side input
-                        const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
-                        const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
-                        const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
-                        const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
-                        const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
-                        const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
-                        const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
-                        const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
-                        // Shuffle pattern two - left side input
-                        const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
-                        const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
-                        const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
-                        const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
-                        const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
-                        const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
-                        const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
-                        const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
-                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
-                        // Resembles MMLAs into 2x2 matrices in ARM Version
-                        __m256i iacc_mat_00_sp1 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
-                        __m256i iacc_mat_01_sp1 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
-                        __m256i iacc_mat_10_sp1 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
-                        __m256i iacc_mat_11_sp1 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
-                        __m256i iacc_mat_00_sp2 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
-                        __m256i iacc_mat_01_sp2 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
-                        __m256i iacc_mat_10_sp2 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
-                        __m256i iacc_mat_11_sp2 =
-                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
-
-                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
-                        __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
-                        __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
-                        __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
-                        __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-                        // Straighten out to make 4 row vectors
-                        __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
-                        __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
-                        __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
-                        __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
-
-                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                        const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
-
-                        // Multiply with appropiate scales and accumulate
-                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
-                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
-                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
-                        acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32,  255)), acc_rows[rp * 4 + 3]);
-                    }
-                }
-
-                // Store the accumulated values
-                for (int i = 0; i < 16; i++) {
-                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-                }
-            }
-        }
-
-        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
-        for (; y < nr / 4; y ++) {
-
-            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
-
-            // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-            for (int64_t x = xstart; x < nc / 8; x++) {
-
-                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
-
-                // Master FP accumulators
-                __m256 acc_rows[4];
-                for (int i = 0; i < 4; i++) {
-                    acc_rows[i] = _mm256_setzero_ps();
-                }
-
-                for (int64_t b = 0; b < nb; b++) {
-                    // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
-                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
-                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
-                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
-
-                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
-                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
-                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
-                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
-                    // 4-bit -> 8-bit - Sign is maintained
-                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b));  //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
-                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b));  //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
-
-                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b));  //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
-                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b));  //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
-
-                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b));  //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
-                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b));  //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
-
-                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b));  //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
-                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b));  //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
-
-                    // Shuffle pattern one - right side input
-                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
-                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
-
-                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
-                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
-
-                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
-                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
-
-                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
-                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
-
-                    // Shuffle pattern two - right side input
-
-                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
-                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
-
-                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
-                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
-
-                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
-                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
-
-                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
-                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
-
-                    // Scale values - Load the wight scale values of block_q4_0x8
-                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
-
-                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
-                    // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
-                    __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
-                    __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
-                    __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
-                    __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
-                    __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
-                    __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
-                    __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
-                    __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
-                    __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
-                    __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
-                    __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
-                    __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
-
-                    // Shuffle pattern one - left side input
-
-                    const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
-                    const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
-                    const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
-                    const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
-                    const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
-                    const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
-                    const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
-                    const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
-                    // Shuffle pattern two - left side input
-
-                    const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
-                    const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
-                    const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
-                    const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
-                    const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
-                    const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
-                    const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
-                    const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
-                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
-                    // Resembles MMLAs into 2x2 matrices in ARM Version
-                    __m256i iacc_mat_00_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
-                    __m256i iacc_mat_01_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
-                    __m256i iacc_mat_10_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
-                    __m256i iacc_mat_11_sp1 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
-                    __m256i iacc_mat_00_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
-                    __m256i iacc_mat_01_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
-                    __m256i iacc_mat_10_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
-                    __m256i iacc_mat_11_sp2 =
-                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
-
-                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
-                    __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
-                    __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
-                    __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
-                    __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-
-                    // Straighten out to make 4 row vectors
-                    __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
-                    __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
-                    __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
-                    __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
-
-                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
-                    const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
-
-                    // Multiply with appropiate scales and accumulate
-                    acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
-                    acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
-                    acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
-                    acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
-                }
-
-                // Store the accumulated values
-                for (int i = 0; i < 4; i++) {
-                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
-                }
-            }
-        }
-        return;
-    }
-#elif defined(__riscv_v_intrinsic)
-    if (__riscv_vlenb() >= QK4_0) {
-        const size_t vl = QK4_0;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-                vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
-                vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
-                vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
-                vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
-                for (int l = 0; l < nb; l++) {
-                    const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
-                    const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
-                    const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
-                    const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
-                    const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
-                    const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
-                    const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
-
-                    // vector version needs Zvfhmin extension
-                    const float a_scales[4] = {
-                        GGML_FP16_TO_FP32(a_ptr[l].d[0]),
-                        GGML_FP16_TO_FP32(a_ptr[l].d[1]),
-                        GGML_FP16_TO_FP32(a_ptr[l].d[2]),
-                        GGML_FP16_TO_FP32(a_ptr[l].d[3])
-                    };
-                    const float b_scales[8] = {
-                        GGML_FP16_TO_FP32(b_ptr[l].d[0]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[1]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[2]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[3]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[4]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[5]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[6]),
-                        GGML_FP16_TO_FP32(b_ptr[l].d[7])
-                    };
-                    const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
-
-                    const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
-                    const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
-                    const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
-                    const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
-                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
-                    vint16m4_t sumi_l0;
-                    {
-                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
-                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
-                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
-                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
-                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
-                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
-                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
-                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
-
-                        sumi_l0 = sumi_hi_m;
-                    }
-
-                    {
-                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
-                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
-                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
-                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
-                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
-                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
-                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
-                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
-                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
-                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
-                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
-                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
-                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
-
-                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
-                        sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
-                    }
-
-                    const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
-                    const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
-                    const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
-                    const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
-                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
-                    vint16m4_t sumi_l1;
-                    {
-                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
-                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
-                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
-                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
-                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
-                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
-                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
-                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
-
-                        sumi_l1 = sumi_hi_m;
-                    }
-
-                    {
-                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
-                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
-                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
-                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
-                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
-                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
-                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
-                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
-                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
-                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
-                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
-                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
-                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
-
-                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
-                        sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
-                    }
-
-                    const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
-                    const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
-                    const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
-                    const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
-                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
-                    vint16m4_t sumi_l2;
-                    {
-                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
-                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
-                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
-                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
-                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
-                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
-                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
-                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
-
-                        sumi_l2 = sumi_hi_m;
-                    }
-
-                    {
-                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
-                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
-                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
-                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
-                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
-                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
-                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
-                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
-                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
-                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
-                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
-                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
-                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
-
-                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
-                        sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
-                    }
-
-                    const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
-                    const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
-                    const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
-                    const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
-                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
-                    vint16m4_t sumi_l3;
-                    {
-                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
-                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
-                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
-                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
-                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
-                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
-                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
-                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
-
-                        sumi_l3 = sumi_hi_m;
-                    }
-
-                    {
-                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
-                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
-                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
-                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
-                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
-                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
-                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
-                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
-                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
-                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
-                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
-                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
-                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
-
-                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
-                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
-                    }
-                }
-                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
-                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
-                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
-                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
-            }
-        }
-
-        return;
-    }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
-}
diff --git a/ggml/src/ggml-aarch64.h b/ggml/src/ggml-aarch64.h
index 517babaf1..a57868591 100644
--- a/ggml/src/ggml-aarch64.h
+++ b/ggml/src/ggml-aarch64.h
@@ -1,9 +1,5 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
 #pragma once
 
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-
 #include "ggml.h"
 
 // GGML internal header
@@ -12,27 +8,11 @@
 extern "C" {
 #endif
 
-// Quantization
-void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
-
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
-// GEMV
-void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-// GEMM
-void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt
new file mode 100644
index 000000000..d6676f3f6
--- /dev/null
+++ b/ggml/src/ggml-amx/CMakeLists.txt
@@ -0,0 +1,107 @@
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
+        CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
+    message(STATUS "Using AMX")
+
+    file(GLOB   GGML_HEADERS_AMX "*.h")
+    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
+
+    file(GLOB   GGML_SOURCES_AMX "*.cpp")
+
+    add_library(ggml-amx
+                ${GGML_HEADERS_AMX}
+                ${GGML_SOURCES_AMX})
+
+    target_link_libraries(ggml-amx PRIVATE ggml-base)
+    target_include_directories(ggml-amx PRIVATE . ..)
+
+    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
+    # TODO: integrate AMX backend into the CPU backend
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(../ggml-cpu/cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+            endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
+    endif()
+
+    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
+else()
+    set(GGML_AMX OFF PARENT_SCOPE)
+    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
+endif()
diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-amx/common.h
index 2b6c63527..5db8ce30d 100644
--- a/ggml/src/ggml-amx/common.h
+++ b/ggml/src/ggml-amx/common.h
@@ -1,7 +1,8 @@
 #pragma once
 
 #include "ggml.h"
-#include "ggml-cpu-impl.h" // <immintrin.h>
+// hack until AMX is moved into the CPU backend
+#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
 
 #include <algorithm>
 #include <memory>
diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp
similarity index 98%
rename from ggml/src/ggml-amx.cpp
rename to ggml/src/ggml-amx/ggml-amx.cpp
index 144dc9d8a..37da98539 100644
--- a/ggml/src/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@@ -421,9 +421,18 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
 
 #else // if defined(__AMX_INT8__)
 
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
+    return nullptr;
+}
+
+bool ggml_backend_is_amx(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    return false;
+}
+
 ggml_backend_t ggml_backend_amx_init(void) {
     fprintf(stderr, "GGML is not compiled with AMX support!\n");
-    return ggml_backend_t{};
+    return nullptr;
 }
 
 void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
@@ -433,4 +442,8 @@ void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
     GGML_UNUSED(n_threads);
 }
 
+ggml_backend_reg_t ggml_backend_amx_reg(void) {
+    return nullptr;
+}
+
 #endif
diff --git a/ggml/src/ggml-amx/mmq.cpp b/ggml/src/ggml-amx/mmq.cpp
index 239d15121..529bee25b 100644
--- a/ggml/src/ggml-amx/mmq.cpp
+++ b/ggml/src/ggml-amx/mmq.cpp
@@ -496,19 +496,20 @@ inline void from_float(const float * x, char * vy, int64_t k);
 
 template <>
 inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
-    quantize_row_q8_0(x, vy, k);
+    // FIXME: using unoptimized reference impl until moved to CPU backend
+    quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
 }
 
 template <>
 inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
-    quantize_row_q8_1(x, vy, k);
+    quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
 }
 
 template <>
 inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
 #if 1
     // TODO: this is reference impl!
-    quantize_row_q8_K(x, vy, k);
+    quantize_row_q8_K_ref(x, (block_q8_K *)vy, k);
 #else
     quantize_row_q8_K_vnni(x, vy, k);
 #endif
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
new file mode 100644
index 000000000..63e9d8201
--- /dev/null
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -0,0 +1,195 @@
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include <cstring>
+#include <vector>
+
+// Backend registry
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
+#ifdef GGML_USE_BLAS
+#include "ggml-blas.h"
+#endif
+
+#ifdef GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
+#ifdef GGML_USE_AMX
+#  include "ggml-amx.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+#include "ggml-kompute.h"
+#endif
+
+struct ggml_backend_registry {
+    std::vector<ggml_backend_reg_t> backends;
+    std::vector<ggml_backend_dev_t> devices;
+
+    ggml_backend_registry() {
+#ifdef GGML_USE_CUDA
+        register_backend(ggml_backend_cuda_reg());
+#endif
+#ifdef GGML_USE_METAL
+        register_backend(ggml_backend_metal_reg());
+#endif
+#ifdef GGML_USE_SYCL
+        register_backend(ggml_backend_sycl_reg());
+#endif
+#ifdef GGML_USE_VULKAN
+        register_backend(ggml_backend_vk_reg());
+#endif
+#ifdef GGML_USE_CANN
+        register_backend(ggml_backend_cann_reg());
+#endif
+#ifdef GGML_USE_BLAS
+        register_backend(ggml_backend_blas_reg());
+#endif
+#ifdef GGML_USE_RPC
+        register_backend(ggml_backend_rpc_reg());
+#endif
+#ifdef GGML_USE_AMX
+        register_backend(ggml_backend_amx_reg());
+#endif
+#ifdef GGML_USE_KOMPUTE
+        register_backend(ggml_backend_kompute_reg());
+#endif
+
+        register_backend(ggml_backend_cpu_reg());
+    }
+
+    void register_backend(ggml_backend_reg_t reg) {
+        if (!reg) {
+            return;
+        }
+
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+#endif
+        backends.push_back(reg);
+        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+            register_device(ggml_backend_reg_dev_get(reg, i));
+        }
+    }
+
+    void register_device(ggml_backend_dev_t device) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+#endif
+        devices.push_back(device);
+    }
+};
+
+static ggml_backend_registry & get_reg() {
+    static ggml_backend_registry reg;
+    return reg;
+}
+
+// Internal API
+void ggml_backend_register(ggml_backend_reg_t reg) {
+    get_reg().register_backend(reg);
+}
+
+void ggml_backend_device_register(ggml_backend_dev_t device) {
+    get_reg().register_device(device);
+}
+
+// Backend (reg) enumeration
+size_t ggml_backend_reg_count() {
+    return get_reg().backends.size();
+}
+
+ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
+    GGML_ASSERT(index < ggml_backend_reg_count());
+    return get_reg().backends[index];
+}
+
+ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
+        if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
+            return reg;
+        }
+    }
+    return NULL;
+}
+
+// Device enumeration
+size_t ggml_backend_dev_count() {
+    return get_reg().devices.size();
+}
+
+ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
+    GGML_ASSERT(index < ggml_backend_dev_count());
+    return get_reg().devices[index];
+}
+
+ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
+            return dev;
+        }
+    }
+    return NULL;
+}
+
+ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == type) {
+            return dev;
+        }
+    }
+    return NULL;
+}
+
+// Convenience functions
+ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
+    ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
+    if (!dev) {
+        return NULL;
+    }
+    return ggml_backend_dev_init(dev, params);
+}
+
+ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
+    if (!dev) {
+        return NULL;
+    }
+    return ggml_backend_dev_init(dev, params);
+}
+
+ggml_backend_t ggml_backend_init_best(void) {
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+    if (!dev) {
+        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    }
+    if (!dev) {
+        return NULL;
+    }
+    return ggml_backend_dev_init(dev, NULL);
+}
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 0b8ebac53..e48877ba8 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -525,197 +525,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
     return reg->iface.get_proc_address(reg, name);
 }
 
-// Backend registry
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef GGML_USE_BLAS
-#include "ggml-blas.h"
-#endif
-
-#ifdef GGML_USE_RPC
-#include "ggml-rpc.h"
-#endif
-
-#ifndef __AMX_INT8__
-#undef GGML_USE_AMX
-#endif
-
-#ifdef GGML_USE_AMX
-#  include "ggml-amx.h"
-#endif
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#ifdef GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
-
-#include "ggml-cpu.h"
-
-struct ggml_backend_registry {
-    std::vector<ggml_backend_reg_t> backends;
-    std::vector<ggml_backend_dev_t> devices;
-
-    ggml_backend_registry() {
-#ifdef GGML_USE_CUDA
-        register_backend(ggml_backend_cuda_reg());
-#endif
-#ifdef GGML_USE_METAL
-        register_backend(ggml_backend_metal_reg());
-#endif
-#ifdef GGML_USE_SYCL
-        register_backend(ggml_backend_sycl_reg());
-#endif
-#ifdef GGML_USE_VULKAN
-        register_backend(ggml_backend_vk_reg());
-#endif
-#ifdef GGML_USE_CANN
-        register_backend(ggml_backend_cann_reg());
-#endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
-#ifdef GGML_USE_RPC
-        register_backend(ggml_backend_rpc_reg());
-#endif
-#ifdef GGML_USE_AMX
-        register_backend(ggml_backend_amx_reg());
-#endif
-#ifdef GGML_USE_KOMPUTE
-        register_backend(ggml_backend_kompute_reg());
-#endif
-
-        register_backend(ggml_backend_cpu_reg());
-    }
-
-    void register_backend(ggml_backend_reg_t reg) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
-            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
-        backends.push_back(reg);
-        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
-            register_device(ggml_backend_reg_dev_get(reg, i));
-        }
-    }
-
-    void register_device(ggml_backend_dev_t device) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
-#endif
-        devices.push_back(device);
-    }
-};
-
-static ggml_backend_registry & get_reg() {
-    static ggml_backend_registry reg;
-    return reg;
-}
-
-// Internal API
-void ggml_backend_register(ggml_backend_reg_t reg) {
-    get_reg().register_backend(reg);
-}
-
-void ggml_backend_device_register(ggml_backend_dev_t device) {
-    get_reg().register_device(device);
-}
-
-// Backend (reg) enumeration
-size_t ggml_backend_reg_count() {
-    return get_reg().backends.size();
-}
-
-ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
-    GGML_ASSERT(index < ggml_backend_reg_count());
-    return get_reg().backends[index];
-}
-
-ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
-    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
-        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
-            return reg;
-        }
-    }
-    return NULL;
-}
-
-// Device enumeration
-size_t ggml_backend_dev_count() {
-    return get_reg().devices.size();
-}
-
-ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
-    GGML_ASSERT(index < ggml_backend_dev_count());
-    return get_reg().devices[index];
-}
-
-ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
-    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
-            return dev;
-        }
-    }
-    return NULL;
-}
-
-ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
-    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        if (ggml_backend_dev_type(dev) == type) {
-            return dev;
-        }
-    }
-    return NULL;
-}
-
-// Convenience functions
-ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
-    ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
-    if (!dev) {
-        return NULL;
-    }
-    return ggml_backend_dev_init(dev, params);
-}
-
-ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
-    ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
-    if (!dev) {
-        return NULL;
-    }
-    return ggml_backend_dev_init(dev, params);
-}
-
-ggml_backend_t ggml_backend_init_best(void) {
-    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-    if (!dev) {
-        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    }
-    if (!dev) {
-        return NULL;
-    }
-    return ggml_backend_dev_init(dev, NULL);
-}
-
 // multi-buffer buffer
 
 struct ggml_backend_multi_buffer_context {
@@ -1640,7 +1449,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
 
     struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
@@ -2036,17 +1845,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
     return true;
 }
 
-
-
-#include "ggml-backend.h"
-#include "ggml-backend-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-impl.h"
-#include <cctype>
-#include <string>
-
-// ggml-backend interface
-
 // CPU backend - buffer
 
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -2120,7 +1918,9 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
     /* .reset           = */ NULL,
 };
 
-// CPU backend - buffer type
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
 
 static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
@@ -2161,7 +1961,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
         /* .context = */ NULL,
     };
 
@@ -2184,478 +1984,14 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
         /* .context = */ NULL,
     };
 
     return &ggml_backend_cpu_buffer_type;
 }
 
-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buft);
-}
-
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context  = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
-    static ggml_backend_buffer_type_t bufts[] = {
-#ifdef GGML_USE_CPU_HBM
-        ggml_backend_cpu_hbm_buffer_type(),
-#endif
-        NULL
-    };
-
-    return bufts;
-
-    GGML_UNUSED(device);
-}
-
-// CPU backend - backend (stream)
-
-struct ggml_backend_cpu_context {
-    int                 n_threads;
-    ggml_threadpool_t   threadpool;
-
-    uint8_t *           work_data;
-    size_t              work_size;
-
-    ggml_abort_callback abort_callback;
-    void *              abort_callback_data;
-};
-
-static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
-    return "CPU";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-    delete[] cpu_ctx->work_data;
-    delete cpu_ctx;
-    delete backend;
-}
-
-struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
-    struct ggml_cgraph cgraph;
-};
-
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
-
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
-        if (cpu_plan->cplan.work_data == NULL) {
-            delete cpu_plan;
-            return NULL;
-        }
-    }
-
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return cpu_plan;
-}
-
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    delete[] cpu_plan->cplan.work_data;
-    delete cpu_plan;
-
-    GGML_UNUSED(backend);
-}
-
-static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
-    GGML_UNUSED(backend);
-}
-
-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-
-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        cpu_ctx->work_data = new uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return ggml_graph_compute(cgraph, &cplan);
-}
-
-static const struct ggml_backend_i ggml_backend_cpu_i = {
-    /* .get_name                = */ ggml_backend_cpu_get_name,
-    /* .free                    = */ ggml_backend_cpu_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_cpu_guid(void) {
-    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
-    return &guid;
-}
-
-ggml_backend_t ggml_backend_cpu_init(void) {
-    // initialize CPU backend now to avoid slowing the first graph computation
-    ggml_cpu_init();
-
-    struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
-    ctx->threadpool          = NULL;
-    ctx->work_data           = NULL;
-    ctx->work_size           = 0;
-    ctx->abort_callback      = NULL;
-    ctx->abort_callback_data = NULL;
-
-    ggml_backend_t cpu_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_cpu_guid(),
-        /* .interface = */ ggml_backend_cpu_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    if (cpu_backend == NULL) {
-        delete ctx;
-        return NULL;
-    }
-
-    return cpu_backend;
-}
-
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
-}
-
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->n_threads = n_threads;
-}
-
-void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-
-    if (ctx->threadpool && ctx->threadpool != threadpool) {
-        // already had a different threadpool, pause/suspend it before switching
-        ggml_threadpool_pause(ctx->threadpool);
-    }
-    ctx->threadpool = threadpool;
-}
-
-void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->abort_callback = abort_callback;
-    ctx->abort_callback_data = abort_callback_data;
-}
-
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
-
-// CPU backend - device
-
-struct ggml_backend_cpu_device_context {
-    std::string description = "CPU";
-
-    ggml_backend_cpu_device_context() {
-#ifdef __APPLE__
-        size_t len = 0;
-        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
-            description.resize(len);
-            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
-        }
-#elif defined(__linux__)
-        FILE * f = fopen("/proc/cpuinfo", "r");
-        if (f) {
-            char buf[1024];
-            while (fgets(buf, sizeof(buf), f)) {
-                if (strncmp(buf, "model name", 10) == 0) {
-                    char * p = strchr(buf, ':');
-                    if (p) {
-                        p++;
-                        while (std::isspace(*p)) {
-                            p++;
-                        }
-                        while (std::isspace(p[strlen(p) - 1])) {
-                            p[strlen(p) - 1] = '\0';
-                        }
-                        description = p;
-                        break;
-                    }
-                }
-            }
-            fclose(f);
-        }
-#elif defined(_WIN32)
-        HKEY hKey;
-        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                        0,
-                        KEY_READ,
-                        &hKey) == ERROR_SUCCESS) {
-            DWORD cpu_brand_size = 0;
-            if (RegQueryValueExA(hKey,
-                                TEXT("ProcessorNameString"),
-                                NULL,
-                                NULL,
-                                NULL,
-                                &cpu_brand_size) == ERROR_SUCCESS) {
-                description.resize(cpu_brand_size);
-                if (RegQueryValueExA(hKey,
-                                    TEXT("ProcessorNameString"),
-                                    NULL,
-                                    NULL,
-                                    (LPBYTE)&description[0], // NOLINT
-                                    &cpu_brand_size) == ERROR_SUCCESS) {
-                    if (description.find('\0') != std::string::npos) {
-                        description.resize(description.find('\0'));
-                    }
-                }
-            }
-            RegCloseKey(hKey);
-        }
-#endif
-    }
-};
-
-static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
-    return "CPU";
-
-    GGML_UNUSED(dev);
-}
-
-static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
-    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
-
-    return ctx->description.c_str();
-}
-
-static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    GGML_UNUSED(dev);
-}
-
-static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_cpu_device_get_name(dev);
-    props->description = ggml_backend_cpu_device_get_description(dev);
-    props->type        = ggml_backend_cpu_device_get_type(dev);
-    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
-    props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ true,
-        /* .events                = */ false,
-    };
-}
-
-static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
-    return ggml_backend_cpu_init();
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(params);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(dev);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(max_tensor_size);
-}
-
-static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_CPY:
-            return
-                op->type != GGML_TYPE_IQ2_XXS &&
-                op->type != GGML_TYPE_IQ2_XS  &&
-                op->type != GGML_TYPE_IQ1_S   &&
-                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
-        case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
-        case GGML_OP_ROPE_BACK:
-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
-        case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
-        case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
-        default:
-            return true;
-    }
-
-    GGML_UNUSED(dev);
-}
-
-static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(dev);
-}
-
-static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
-    /* .get_name             = */ ggml_backend_cpu_device_get_name,
-    /* .get_description      = */ ggml_backend_cpu_device_get_description,
-    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
-    /* .get_type             = */ ggml_backend_cpu_device_get_type,
-    /* .get_props            = */ ggml_backend_cpu_device_get_props,
-    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
-    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
-    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// CPU backend - backend (reg)
-
-static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
-    return "CPU";
-
-    GGML_UNUSED(reg);
-}
-
-static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
-}
-
-static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    static ggml_backend_cpu_device_context ctx;
-    static ggml_backend_device ggml_backend_cpu_device = {
-        /* .iface   = */ ggml_backend_cpu_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ &ctx,
-    };
-
-    return &ggml_backend_cpu_device;
-}
-
-static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
-        return (void *)ggml_backend_cpu_set_n_threads;
-    }
-    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
-        return (void *)ggml_backend_cpu_get_extra_bufts;
-    }
-
-    return NULL;
-
-    GGML_UNUSED(reg);
-}
-
-static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
-    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
-    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
-};
-
-ggml_backend_reg_t ggml_backend_cpu_reg(void) {
-    static struct ggml_backend_reg ggml_backend_cpu_reg = {
-        /* .iface   = */ ggml_backend_cpu_reg_i,
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_reg;
-}
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
new file mode 100644
index 000000000..ffe39e8bd
--- /dev/null
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -0,0 +1,91 @@
+if (GGML_STATIC)
+    set(BLA_STATIC ON)
+endif()
+#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+#    set(BLA_SIZEOF_INTEGER 8)
+#endif()
+
+set(BLA_VENDOR ${GGML_BLAS_VENDOR})
+find_package(BLAS)
+
+if (BLAS_FOUND)
+    message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+
+    add_library(ggml-blas
+                ggml-blas.cpp
+                )
+
+    target_link_libraries(ggml-blas PRIVATE ggml-base)
+    target_include_directories(ggml-blas PRIVATE . ..)
+
+    if (${GGML_BLAS_VENDOR} MATCHES "Apple")
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
+    elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_package(PkgConfig REQUIRED)
+        if (${GGML_BLAS_VENDOR} MATCHES "Generic")
+            pkg_check_modules(DepBLAS blas)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
+            # As of openblas v0.3.22, the 64-bit is named openblas64.pc
+            pkg_check_modules(DepBLAS openblas64)
+            if (NOT DepBLAS_FOUND)
+                pkg_check_modules(DepBLAS openblas)
+            endif()
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
+            add_compile_definitions(GGML_BLAS_USE_BLIS)
+            pkg_check_modules(DepBLAS blis)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
+            pkg_check_modules(DepBLAS blas-atlas)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
+            pkg_check_modules(DepBLAS flexiblas_api)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+            add_compile_definitions(GGML_BLAS_USE_MKL)
+            # all Intel* libraries share the same include path
+            pkg_check_modules(DepBLAS mkl-sdl)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
+            # this doesn't provide pkg-config
+            # suggest to assign BLAS_INCLUDE_DIRS on your own
+            if ("${NVHPC_VERSION}" STREQUAL "")
+                message(WARNING "Better to set NVHPC_VERSION")
+            else()
+                set(DepBLAS_FOUND ON)
+                set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
+            endif()
+        endif()
+        if (DepBLAS_FOUND)
+            set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
+        else()
+            message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
+            " detected by pkgconfig, trying to find cblas.h from possible paths...")
+            find_path(BLAS_INCLUDE_DIRS
+                NAMES cblas.h
+                HINTS
+                    /usr/include
+                    /usr/local/include
+                    /usr/include/openblas
+                    /opt/homebrew/opt/openblas/include
+                    /usr/local/opt/openblas/include
+                    /usr/include/x86_64-linux-gnu/openblas/include
+            )
+        endif()
+    endif()
+
+    message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
+    #add_compile_options(${BLAS_LINKER_FLAGS})
+    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
+
+    if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+        add_compile_definitions(GGML_BLAS_USE_MKL)
+    endif()
+
+    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
+    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
+else()
+    message(ERROR "BLAS not found, please refer to "
+                  "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                  " to set correct GGML_BLAS_VENDOR")
+endif()
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
similarity index 99%
rename from ggml/src/ggml-blas.cpp
rename to ggml/src/ggml-blas/ggml-blas.cpp
index 8d96220b9..648c9d875 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include <cstring>
 
-#if defined(GGML_USE_ACCELERATE)
+#if defined(GGML_BLAS_USE_ACCELERATE)
 #   include <Accelerate/Accelerate.h>
 #elif defined(GGML_BLAS_USE_MKL)
 #   include <mkl.h>
@@ -320,7 +320,7 @@ static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
 }
 
 static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
-    #if defined(GGML_USE_ACCELERATE)
+    #if defined(GGML_BLAS_USE_ACCELERATE)
         return "Accelerate";
     #elif defined(GGML_BLAS_USE_MKL)
         return "MKL";
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
new file mode 100644
index 000000000..c8e15c6d4
--- /dev/null
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -0,0 +1,46 @@
+if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
+    set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
+    message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
+endif()
+
+if (CANN_INSTALL_DIR)
+    # Only Support Linux.
+    if (NOT UNIX)
+        message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}")
+    endif()
+
+    # Supported platforms: x86-64, arm64
+    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
+    else()
+        message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
+
+    # Set header and libs
+    set(CANN_INCLUDE_DIRS
+        ${CANN_INSTALL_DIR}/include
+        ${CANN_INSTALL_DIR}/include/aclnn
+        ${CANN_INSTALL_DIR}/acllib/include
+    )
+
+    add_subdirectory(kernels)
+    list(APPEND CANN_LIBRARIES
+        ascendcl
+        nnopbase
+        opapi
+        acl_op_compiler
+        ascendc_kernels
+    )
+
+    file(GLOB GGML_SOURCES_CANN "*.cpp")
+
+    add_library(ggml-cann ${GGML_SOURCES_CANN})
+    target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
+    target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
+    target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
+
+    message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
+    message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
+else()
+    message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
+endif()
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
similarity index 100%
rename from ggml/src/ggml-cann.cpp
rename to ggml/src/ggml-cann/ggml-cann.cpp
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
new file mode 100644
index 000000000..4d96f425e
--- /dev/null
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -0,0 +1,244 @@
+add_library(ggml-cpu
+            ggml-cpu.c
+            ggml-cpu.cpp
+            ggml-cpu-aarch64.c
+            ggml-cpu-aarch64.h
+            ggml-cpu-quants.c
+            ggml-cpu-quants.h
+            )
+
+target_link_libraries(ggml-cpu PRIVATE ggml-base)
+target_include_directories(ggml-cpu PRIVATE . ..)
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+
+        target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+        # FIXME: should be replaced with a compiler id check
+        #if (GGML_MUSA)
+        #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
+        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
+        #endif()
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+if (GGML_LLAMAFILE)
+    message(STATUS "Using llamafile")
+
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    target_sources(ggml-cpu PRIVATE
+                    llamafile/sgemm.cpp
+                    llamafile/sgemm.h)
+endif()
+
+if (GGML_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+
+    message(STATUS "Using memkind for CPU HBM")
+
+    add_compile_definitions(GGML_USE_CPU_HBM)
+
+    target_link_libraries(ggml-cpu PUBLIC memkind)
+endif()
+
+if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES      AND
+     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+    message(STATUS "ARM detected")
+
+    if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        endif()
+        if (GGML_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
+    endif()
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+    message(STATUS "x86 detected")
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+            endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
+                   OUTPUT_VARIABLE POWER10_M)
+    string(FIND ${POWER10_M} "POWER10" substring_index)
+    if(${substring_index} GREATER_EQUAL 0)
+       list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+       list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    else()
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (GGML_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (GGML_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (EMSCRIPTEN)
+    set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
+endif()
diff --git a/ggml/cmake/FindSIMD.cmake b/ggml/src/ggml-cpu/cmake/FindSIMD.cmake
similarity index 100%
rename from ggml/cmake/FindSIMD.cmake
rename to ggml/src/ggml-cpu/cmake/FindSIMD.cmake
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
new file mode 100644
index 000000000..0ad9fe40a
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -0,0 +1,3387 @@
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-License-Identifier: MIT
+//
+
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu/ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#include "ggml-cpu-aarch64.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// Functions to create the interleaved data layout formats
+
+// interleave 4 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x4
+// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
+// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
+//
+// - in                  : an array of block_q4_0 pointers
+// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
+//                         blck_size_interleave bytes
+// - xor_mask            : the mask to convert the nibbles in block_q4_0 quants bytes
+//                         from bias offset form to pure sign form (this saves subtract
+//                         operations durin unpacking)
+//
+#if defined(__AVX__)
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+#define GGML_F32Cx8x2_LOAD(x, y)     _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
+#define GGML_F32Cx16_REPEAT_LOAD(x)  _mm512_cvtph_ps(_mm256_set_m128i(x, x))
+#endif
+// the  _mm256_cvt intrinsics require F16C
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
+#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
+#else
+#if defined(__AVX512F__)
+static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
+    float tmp[16];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
+    }
+
+    return _mm512_loadu_ps(tmp);
+}
+static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
+    float tmp[16];
+    uint16_t tmphalf[8];
+    _mm_storeu_si128((__m128i*)tmphalf, x);
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
+    }
+
+    return _mm512_loadu_ps(tmp);
+}
+#endif
+static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+        tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
+    uint16_t tmphalf[8];
+    float tmp[8];
+
+    _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+
+#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
+#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     __avx_repeat_f32cx8_load(x)
+#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     __avx_rearranged_f32cx8_load(x, arrangeMask)
+#if defined(__AVX512F__)
+#define GGML_F32Cx8x2_LOAD(x, y)     __avx512_f32cx8x2_load(x, y)
+#define GGML_F32Cx16_REPEAT_LOAD(x)  __avx512_repeat_f32cx16_load(x)
+#endif
+#endif
+#endif
+
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+#if defined(__AVX512F__)
+// add int16_t pairwise and return as 512 bit int vector
+static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
+    const __m512i ones = _mm512_set1_epi16(1);
+    return _mm512_madd_epi16(ones, x);
+}
+
+static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+    const __m512i zero = _mm512_setzero_si512();
+    return _mm512_dpbusd_epi32(zero, ax, sy);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m512i dot = _mm512_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_int_32x16(dot);
+#endif
+}
+
+// multiply int8_t, add results pairwise twice and return as 512 bit int vector
+static inline __m512i mul_sum_i8_pairs_int32x16(const __m512i x, const __m512i y) {
+    const __m512i zero = _mm512_setzero_si512();
+    // Get absolute values of x vectors
+    const __m512i ax = _mm512_abs_epi8(x);
+    // Sign the values of the y vectors
+    __mmask64 blt0 = _mm512_movepi8_mask(x);
+    const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
+    return mul_sum_us8_pairs_int32x16(ax, sy);
+}
+#endif
+
+// add int16_t pairwise and return as 256 bit int vector
+static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    return _mm256_madd_epi16(ones, x);
+}
+
+static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_epi32(zero, ax, sy);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_int32x8(dot);
+#endif
+}
+
+// Integer variant of the function defined in ggml-quants.c
+// multiply int8_t, add results pairwise twice and return as 256 bit int vector
+static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
+#if __AVXVNNIINT8__
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbssd_epi32(zero, x, y);
+#else
+    // Get absolute values of x vectors
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return mul_sum_us8_pairs_int32x8(ax, sy);
+#endif
+}
+#endif
+
+static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t srcv[4][8];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+            for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
+
+            const float amax = vmaxvq_f32(amaxv[0]);
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < 8; j++) {
+            float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]);
+            int32x4_t vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[1][j], id[1]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[2][j], id[2]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[3][j], id[3]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#else
+    // scalar
+    const int blck_size_interleave = 4;
+    float srcv[4][QK8_0];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+
+            for (int j = 0; j < QK8_0; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                amax = MAX(amax, fabsf(srcv[row_iter][j]));
+            }
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < QK8_0 * 4; j++) {
+            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+            src_offset += (j % blck_size_interleave);
+
+            float x0 = srcv[src_id][src_offset] * id[src_id];
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+#endif
+}
+
+static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t srcv[4][8];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+            for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
+
+            const float amax = vmaxvq_f32(amaxv[0]);
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < 4; j++) {
+            float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]);
+            int32x4_t vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[1][2 * j], id[1]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[2][2 * j], id[2]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[3][2 * j], id[3]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    float id[4];
+    __m256 srcv[4][4];
+    __m256 idvec[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            // Load elements into 4 AVX vectors
+            __m256 v0 = _mm256_loadu_ps( x + row_iter * k + i * 32 );
+            __m256 v1 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 8 );
+            __m256 v2 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 16 );
+            __m256 v3 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 24 );
+
+            // Compute max(abs(e)) for the block
+            const __m256 signBit = _mm256_set1_ps( -0.0f );
+            __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+            __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+            max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+            max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+            const float maxScalar = _mm_cvtss_f32( max4 );
+
+            // Divided by 127.f to mirror results in quantize_row_q8_0
+            const float d = maxScalar  / 127.f;
+            id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
+
+            // Store the scale for the individual block
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+
+            // Store the values in blocks of eight values - Aim is to use these later for block interleaving
+            srcv[row_iter][0] = v0;
+            srcv[row_iter][1] = v1;
+            srcv[row_iter][2] = v2;
+            srcv[row_iter][3] = v3;
+            idvec[row_iter] = _mm256_set1_ps(id[row_iter]);
+        }
+
+        // The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
+        for (int j = 0; j < 4; j++) {
+            // Apply the multiplier
+            __m256 v0 = _mm256_mul_ps(srcv[0][j], idvec[0]);
+            __m256 v1 = _mm256_mul_ps(srcv[1][j], idvec[1]);
+            __m256 v2 = _mm256_mul_ps(srcv[2][j], idvec[2]);
+            __m256 v3 = _mm256_mul_ps(srcv[3][j], idvec[3]);
+
+            // Round to nearest integer
+            v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+            v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+            v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+            v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+            // Convert floats to integers
+            __m256i i0 = _mm256_cvtps_epi32( v0 );
+            __m256i i1 = _mm256_cvtps_epi32( v1 );
+            __m256i i2 = _mm256_cvtps_epi32( v2 );
+            __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+            // Convert int32 to int16
+            i0 = _mm256_packs_epi32( i0, i1 );
+            i2 = _mm256_packs_epi32( i2, i3 );
+            // Convert int16 to int8
+            i0 = _mm256_packs_epi16( i0, i2 );
+
+            //  Permute and store the quantized weights in the required order after the pack instruction
+            const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+            i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+            _mm256_storeu_si256((__m256i *)(y[i].qs + 32 * j), i0);
+#else
+            // Since we don't have in AVX some necessary functions,
+            // we split the registers in half and call AVX2 analogs from SSE
+            __m128i ni0 = _mm256_castsi256_si128( i0 );
+            __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+            __m128i ni2 = _mm256_castsi256_si128( i1 );
+            __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+            __m128i ni4 = _mm256_castsi256_si128( i2 );
+            __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+            __m128i ni6 = _mm256_castsi256_si128( i3 );
+            __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+            // Convert int32 to int16
+            ni0 = _mm_packs_epi32( ni0, ni1 );
+            ni2 = _mm_packs_epi32( ni2, ni3 );
+            ni4 = _mm_packs_epi32( ni4, ni5 );
+            ni6 = _mm_packs_epi32( ni6, ni7 );
+            // Convert int16 to int8
+            ni0 = _mm_packs_epi16( ni0, ni2 );
+            ni4 = _mm_packs_epi16( ni4, ni6 );
+            _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
+            _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
+#endif
+        }
+    }
+#else
+    // scalar
+    const int blck_size_interleave = 8;
+    float srcv[4][QK8_0];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+
+            for (int j = 0; j < QK8_0; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                amax = MAX(amax, fabsf(srcv[row_iter][j]));
+            }
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < QK8_0 * 4; j++) {
+            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+            src_offset += (j % blck_size_interleave);
+
+            float x0 = srcv[src_id][src_offset] * id[src_id];
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+#endif
+}
+
+void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
+    assert(nrow == 4);
+    UNUSED(nrow);
+    if (blck_size_interleave == 4) {
+        quantize_q8_0_4x4(x, vy, n_per_row);
+    } else if (blck_size_interleave == 8) {
+        quantize_q8_0_4x8(x, vy, n_per_row);
+    } else {
+        assert(false);
+    }
+}
+
+void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+
+        __asm__ __volatile__(
+            "movi v31.16b, #0x4\n"
+            "movi v30.16b, #0xf0\n"
+            "add %x[b_ptr], %x[b_ptr], #0x8\n"
+            "1:"  // Column loop
+            "add x22, %x[a_ptr], #0x2\n"
+            "movi v29.16b, #0x0\n"
+            "mov x21, %x[nb]\n"
+            "2:"  // Block loop
+            "ldr q28, [%x[b_ptr], #0x0]\n"
+            "ldr q27, [x22, #0x0]\n"
+            "movi v26.4s, #0x0\n"
+            "sub x20, x22, #0x2\n"
+            "ldr q25, [x22, #0x10]\n"
+            "ldr q24, [%x[b_ptr], #0x10]\n"
+            "sub x21, x21, #0x1\n"
+            "add x22, x22, #0x22\n"
+            "ldr q23, [%x[b_ptr], #0x20]\n"
+            "ldr q22, [%x[b_ptr], #0x30]\n"
+            "ld1r { v21.8h }, [x20]\n"
+            "ldr q20, [%x[b_ptr], #-0x8]\n"
+            "sshl v16.16b, v28.16b, v31.16b\n"
+            "and v28.16b, v28.16b, v30.16b\n"
+            "sshl v19.16b, v24.16b, v31.16b\n"
+            "and v24.16b, v24.16b, v30.16b\n"
+            "add %x[b_ptr], %x[b_ptr], #0x48\n"
+            "sshl v18.16b, v23.16b, v31.16b\n"
+            "and v23.16b, v23.16b, v30.16b\n"
+            ".inst 0x4f9be21a  // sdot v26.4s, v16.16b, v27.4b[0]\n"
+            "sshl v17.16b, v22.16b, v31.16b\n"
+            "and v22.16b, v22.16b, v30.16b\n"
+            "fcvtl v21.4s, v21.4h\n"
+            "fcvtl v16.4s, v20.4h\n"
+            ".inst 0x4f99e39a  // sdot v26.4s, v28.16b, v25.4b[0]\n"
+            "fmul v16.4s, v16.4s, v21.4s\n"
+            ".inst 0x4fbbe27a  // sdot v26.4s, v19.16b, v27.4b[1]\n"
+            ".inst 0x4fb9e31a  // sdot v26.4s, v24.16b, v25.4b[1]\n"
+            ".inst 0x4f9bea5a  // sdot v26.4s, v18.16b, v27.4b[2]\n"
+            ".inst 0x4f99eafa  // sdot v26.4s, v23.16b, v25.4b[2]\n"
+            ".inst 0x4fbbea3a  // sdot v26.4s, v17.16b, v27.4b[3]\n"
+            ".inst 0x4fb9eada  // sdot v26.4s, v22.16b, v25.4b[3]\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "fmla v29.4s, v26.4s, v16.4s\n"
+            "cbnz x21, 2b\n"
+            "sub %x[nc], %x[nc], #0x4\n"
+            "str q29, [%x[res_ptr], #0x0]\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "cbnz %x[nc], 1b\n"
+            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+            : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
+            );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+
+        __asm__ __volatile__(
+            "movi v2.16b, #0x4\n"
+            "movi v1.16b, #0xf0\n"
+            "add %x[b_ptr], %x[b_ptr], #0x8\n"
+            "1:"  // Column loop
+            "add x23, %x[a_ptr], #0x2\n"
+            "movi v0.16b, #0x0\n"
+            "mov x22, %x[nb]\n"
+            "2:"  // Block loop
+            "ldr q31, [%x[b_ptr], #0x0]\n"
+            "ldr q30, [%x[b_ptr], #0x10]\n"
+            "mov x21, x23\n"
+            "movi v29.4s, #0x0\n"
+            "ldr q28, [%x[b_ptr], #0x20]\n"
+            "ldr q27, [%x[b_ptr], #0x30]\n"
+            "movi v26.4s, #0x0\n"
+            "sub x20, x23, #0x2\n"
+            "ld1r { v25.8h }, [x20]\n"
+            "ldr q24, [%x[b_ptr], #-0x8]\n"
+            "sub x22, x22, #0x1\n"
+            "add x23, x23, #0x22\n"
+            "ld1r { v23.2d }, [x21], #0x8\n"
+            "sshl v22.16b, v31.16b, v2.16b\n"
+            "sshl v16.16b, v30.16b, v2.16b\n"
+            "add %x[b_ptr], %x[b_ptr], #0x48\n"
+            "ld1r { v21.2d }, [x21], #0x8\n"
+            "sshl v20.16b, v28.16b, v2.16b\n"
+            "sshl v19.16b, v27.16b, v2.16b\n"
+            "ld1r { v18.2d }, [x21], #0x8\n"
+            "ld1r { v17.2d }, [x21], #0x8\n"
+            "and v31.16b, v31.16b, v1.16b\n"
+            "and v30.16b, v30.16b, v1.16b\n"
+            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
+            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
+            "and v28.16b, v28.16b, v1.16b\n"
+            "and v27.16b, v27.16b, v1.16b\n"
+            "fcvtl v25.4s, v25.4h\n"
+            "fcvtl v16.4s, v24.4h\n"
+            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
+            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
+            "fmul v16.4s, v16.4s, v25.4s\n"
+            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
+            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
+            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
+            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
+            "addp v29.4s, v29.4s, v26.4s\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "fmla v0.4s, v29.4s, v16.4s\n"
+            "cbnz x22, 2b\n"
+            "sub %x[nc], %x[nc], #0x4\n"
+            "str q0, [%x[res_ptr], #0x0]\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "cbnz %x[nc], 1b\n"
+            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+
+        __asm__ __volatile__(
+            "ptrue p0.b\n"
+            "add %x[b_ptr], %x[b_ptr], #0x10\n"
+            "1:"  // Column loop
+            "add x22, %x[a_ptr], #0x2\n"
+            "mov z31.b, #0x0\n"
+            "mov x21, %x[nb]\n"
+            "2:"  // Block loop
+            "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n"
+            "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n"
+            "mov z28.s, #0x0\n"
+            "mov z27.s, #0x0\n"
+            "ld1rd { z26.d }, p0/Z, [x22]\n"
+            "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n"
+            "sub x20, x22, #0x2\n"
+            "sub x21, x21, #0x1\n"
+            "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n"
+            "ld1rd { z23.d }, p0/Z, [x22, #8]\n"
+            "lsl z22.b, z30.b, #0x4\n"
+            "lsl z16.b, z29.b, #0x4\n"
+            "and z30.b, z30.b, #0xf0\n"
+            "and z29.b, z29.b, #0xf0\n"
+            "ld1rd { z21.d }, p0/Z, [x22, #16]\n"
+            "ld1rd { z20.d }, p0/Z, [x22, #24]\n"
+            "lsl z19.b, z25.b, #0x4\n"
+            "and z25.b, z25.b, #0xf0\n"
+            "ld1rh { z17.h }, p0/Z, [x20]\n"
+            "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n"
+            "sdot z28.s, z22.b, z26.b\n"
+            "sdot z27.s, z16.b, z26.b\n"
+            "lsl z16.b, z24.b, #0x4\n"
+            "add x22, x22, #0x22\n"
+            "and z24.b, z24.b, #0xf0\n"
+            "add %x[b_ptr], %x[b_ptr], #0x90\n"
+            "fcvt z17.s, p0/m, z17.h\n"
+            "fcvt z18.s, p0/m, z18.h\n"
+            "sdot z28.s, z19.b, z23.b\n"
+            "sdot z27.s, z16.b, z23.b\n"
+            "fmul z18.s, z18.s, z17.s\n"
+            "sdot z28.s, z30.b, z21.b\n"
+            "sdot z27.s, z29.b, z21.b\n"
+            "sdot z28.s, z25.b, z20.b\n"
+            "sdot z27.s, z24.b, z20.b\n"
+            "uzp1 z17.s, z28.s, z27.s\n"
+            "uzp2 z16.s, z28.s, z27.s\n"
+            "add z17.s, z17.s, z16.s\n"
+            "asr z17.s, z17.s, #0x4\n"
+            "scvtf z17.s, p0/m, z17.s\n"
+            "fmla z31.s, p0/M, z17.s, z18.s\n"
+            "cbnz x21, 2b\n"
+            "sub %x[nc], %x[nc], #0x8\n"
+            "st1w { z31.s }, p0, [%x[res_ptr]]\n"
+            "add %x[res_ptr], %x[res_ptr], #0x20\n"
+            "cbnz %x[nc], 1b\n"
+            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+            : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+        );
+        return;
+    }
+#endif // #if defined(__ARM_FEATURE_SVE)
+#elif defined(__AVX2__)
+    // Lookup table to convert signed nibbles to signed bytes
+    __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+    signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+    __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+    __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+
+    // Permute mask used for easier vector processing at later stages
+    const __m256i m4b = _mm256_set1_epi8(0x0F);
+
+    int64_t b_nb = n / QK4_0;
+
+    const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+    const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
+
+    // Process Q8_0 blocks one by one
+    for (int64_t y = 0; y < nr; y++) {
+
+        // Pointers to LHS blocks of block_q8_0 format
+        const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
+
+        // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
+        for (int64_t x = 0; x < nc / 8; x++) {
+
+            // Pointers to RHS blocks
+            const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+            // Master FP accumulator
+            __m256 acc_row = _mm256_setzero_ps();
+
+            for (int64_t b = 0; b < nb; b++) {
+                // Load 8 blocks of Q4_0 interleaved as 8 bytes (B0 - B7)
+                const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 1);
+                const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 2);
+                const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 3);
+
+                // 4-bit -> 8-bit - Sign is maintained
+                const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_0, m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
+                const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_0, m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
+                const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
+                const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
+
+                const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
+                const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
+                const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
+                const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
+
+                // Load the scale values for the 8 blocks interleaved in block_q4_0x8
+                const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
+
+                // Load and convert to FP32 scale from block_q8_0
+                const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
+
+                // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
+                __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
+                __m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16)));
+
+                lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0 (0-15) A0(0-15)
+                lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0 (16-31) A0(16-31))
+
+                __m256i iacc = _mm256_setzero_si256();
+
+                // Dot product done within 32 bit lanes and accumulated in the same vector
+                // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
+                // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
+                // ...........................................................................
+                // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
+
+                // Accumulated values multipled with appropriate scales
+                acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
+            }
+
+            // Accumulated output values permuted so as to be stored in appropriate order post accumulation
+            acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
+            _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
+        }
+    }
+    return;
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+            for (int l = 0; l < nb; l++) {
+                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+                const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+                const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+                const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
+                const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                // vector version needs Zvfhmin extension
+                const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
+                const float b_scales[8] = {
+                    GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                };
+                const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
+                sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
+            }
+            __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+    {
+        float sumf[8];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
+void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x10, %x[nr]\n"
+            "mov x9, #0x88\n"
+            "cmp x10, #0x10\n"
+            "mul x9, %x[nb], x9\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x28, %x[b_ptr], #0x8\n"
+            "mov x27, %x[nc]\n"
+            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x25, %x[a_ptr], #0x8\n"
+            "movi v15.16b, #0x0\n"
+            "movi v19.16b, #0x0\n"
+            "mov x24, %x[nb]\n"
+            "add x23, x25, x9\n"
+            "movi v18.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "add x22, x23, x9\n"
+            "movi v11.16b, #0x0\n"
+            "movi v13.16b, #0x0\n"
+            "add x21, x22, x9\n"
+            "movi v23.16b, #0x0\n"
+            "movi v16.16b, #0x0\n"
+            "movi v25.16b, #0x0\n"
+            "movi v7.16b, #0x0\n"
+            "movi v0.16b, #0x0\n"
+            "movi v4.16b, #0x0\n"
+            "movi v5.16b, #0x0\n"
+            "movi v21.16b, #0x0\n"
+            "movi v8.16b, #0x0\n"
+            "movi v1.16b, #0x0\n"
+            "3:"  // Block loop
+            "ldr q3, [x28, #0x0]\n"
+            "ldr q31, [x25, #0x0]\n"
+            "movi v28.16b, #0x4\n"
+            "movi v10.4s, #0x0\n"
+            "ldr q22, [x28, #0x10]\n"
+            "ldr q6, [x25, #0x10]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v9.4s, #0x0\n"
+            "ldr q27, [x28, #0x20]\n"
+            "ldr q30, [x28, #0x30]\n"
+            "movi v20.4s, #0x0\n"
+            "movi v24.16b, #0xf0\n"
+            "ldr d2, [x25, #-0x8]\n"
+            "ldr d26, [x23, #-0x8]\n"
+            "sshl v12.16b, v3.16b, v28.16b\n"
+            "sub x20, x28, #0x8\n"
+            "ldr d17, [x20, #0x0]\n"
+            "and v3.16b, v3.16b, v24.16b\n"
+            "subs x24, x24, #0x1\n"
+            "add x28, x28, #0x48\n"
+            ".inst 0x4f9fe18a  // sdot v10.4s, v12.16b, v31.4b[0]\n"
+            ".inst 0x4fbfe19d  // sdot v29.4s, v12.16b, v31.4b[1]\n"
+            ".inst 0x4f9fe989  // sdot v9.4s, v12.16b, v31.4b[2]\n"
+            ".inst 0x4fbfe994  // sdot v20.4s, v12.16b, v31.4b[3]\n"
+            "sshl v31.16b, v22.16b, v28.16b\n"
+            "and v22.16b, v22.16b, v24.16b\n"
+            "fcvtl v17.4s, v17.4h\n"
+            "fcvtl v2.4s, v2.4h\n"
+            "fcvtl v26.4s, v26.4h\n"
+            ".inst 0x4f86e3ea  // sdot v10.4s, v31.16b, v6.4b[0]\n"
+            ".inst 0x4fa6e3fd  // sdot v29.4s, v31.16b, v6.4b[1]\n"
+            ".inst 0x4f86ebe9  // sdot v9.4s, v31.16b, v6.4b[2]\n"
+            ".inst 0x4fa6ebf4  // sdot v20.4s, v31.16b, v6.4b[3]\n"
+            "sshl v6.16b, v27.16b, v28.16b\n"
+            "sshl v28.16b, v30.16b, v28.16b\n"
+            "and v27.16b, v27.16b, v24.16b\n"
+            "and v30.16b, v30.16b, v24.16b\n"
+            "ldr q24, [x25, #0x20]\n"
+            ".inst 0x4f98e0ca  // sdot v10.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8c9  // sdot v9.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8d4  // sdot v20.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x30]\n"
+            ".inst 0x4f98e38a  // sdot v10.4s, v28.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e39d  // sdot v29.4s, v28.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb89  // sdot v9.4s, v28.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb94  // sdot v20.4s, v28.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x40]\n"
+            ".inst 0x4f98e06a  // sdot v10.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e869  // sdot v9.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e874  // sdot v20.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x50]\n"
+            ".inst 0x4f98e2ca  // sdot v10.4s, v22.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e2dd  // sdot v29.4s, v22.16b, v24.4b[1]\n"
+            ".inst 0x4f98eac9  // sdot v9.4s, v22.16b, v24.4b[2]\n"
+            ".inst 0x4fb8ead4  // sdot v20.4s, v22.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x60]\n"
+            ".inst 0x4f98e36a  // sdot v10.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb69  // sdot v9.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb74  // sdot v20.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x70]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x4f98e3ca  // sdot v10.4s, v30.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e3dd  // sdot v29.4s, v30.16b, v24.4b[1]\n"
+            ".inst 0x4f98ebc9  // sdot v9.4s, v30.16b, v24.4b[2]\n"
+            ".inst 0x4fb8ebd4  // sdot v20.4s, v30.16b, v24.4b[3]\n"
+            "fmul v24.4s, v17.4s, v2.s[0]\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v15.4s, v10.4s, v24.4s\n"
+            "ldr q24, [x23, #0x0]\n"
+            "fmul v10.4s, v17.4s, v2.s[1]\n"
+            "fmla v19.4s, v29.4s, v10.4s\n"
+            "ldr q10, [x23, #0x10]\n"
+            "fmul v29.4s, v17.4s, v2.s[2]\n"
+            "fmul v2.4s, v17.4s, v2.s[3]\n"
+            "fmla v18.4s, v9.4s, v29.4s\n"
+            "movi v9.4s, #0x0\n"
+            "movi v29.4s, #0x0\n"
+            ".inst 0x4f98e189  // sdot v9.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e19d  // sdot v29.4s, v12.16b, v24.4b[1]\n"
+            "fmla v14.4s, v20.4s, v2.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v2.4s, #0x0\n"
+            ".inst 0x4f98e994  // sdot v20.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x20]\n"
+            ".inst 0x4f8ae3e9  // sdot v9.4s, v31.16b, v10.4b[0]\n"
+            ".inst 0x4faae3fd  // sdot v29.4s, v31.16b, v10.4b[1]\n"
+            ".inst 0x4f8aebf4  // sdot v20.4s, v31.16b, v10.4b[2]\n"
+            ".inst 0x4faaebe2  // sdot v2.4s, v31.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x30]\n"
+            ".inst 0x4f98e0c9  // sdot v9.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8d4  // sdot v20.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x40]\n"
+            ".inst 0x4f8ae389  // sdot v9.4s, v28.16b, v10.4b[0]\n"
+            ".inst 0x4faae39d  // sdot v29.4s, v28.16b, v10.4b[1]\n"
+            ".inst 0x4f8aeb94  // sdot v20.4s, v28.16b, v10.4b[2]\n"
+            ".inst 0x4faaeb82  // sdot v2.4s, v28.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x50]\n"
+            ".inst 0x4f98e069  // sdot v9.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e874  // sdot v20.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x60]\n"
+            ".inst 0x4f8ae2c9  // sdot v9.4s, v22.16b, v10.4b[0]\n"
+            ".inst 0x4faae2dd  // sdot v29.4s, v22.16b, v10.4b[1]\n"
+            ".inst 0x4f8aead4  // sdot v20.4s, v22.16b, v10.4b[2]\n"
+            ".inst 0x4faaeac2  // sdot v2.4s, v22.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x70]\n"
+            "add x23, x23, #0x88\n"
+            ".inst 0x4f98e369  // sdot v9.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb74  // sdot v20.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x0]\n"
+            ".inst 0x4f8ae3c9  // sdot v9.4s, v30.16b, v10.4b[0]\n"
+            ".inst 0x4faae3dd  // sdot v29.4s, v30.16b, v10.4b[1]\n"
+            ".inst 0x4f8aebd4  // sdot v20.4s, v30.16b, v10.4b[2]\n"
+            ".inst 0x4faaebc2  // sdot v2.4s, v30.16b, v10.4b[3]\n"
+            "fmul v10.4s, v17.4s, v26.s[0]\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "fmla v11.4s, v9.4s, v10.4s\n"
+            "ldr q9, [x22, #0x10]\n"
+            "fmul v10.4s, v17.4s, v26.s[1]\n"
+            "fmla v13.4s, v29.4s, v10.4s\n"
+            "ldr d29, [x22, #-0x8]\n"
+            "fmul v10.4s, v17.4s, v26.s[2]\n"
+            "fmul v26.4s, v17.4s, v26.s[3]\n"
+            "fcvtl v29.4s, v29.4h\n"
+            "fmla v23.4s, v20.4s, v10.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v10.4s, #0x0\n"
+            "fmla v16.4s, v2.4s, v26.4s\n"
+            "movi v26.4s, #0x0\n"
+            "movi v2.4s, #0x0\n"
+            ".inst 0x4f98e194  // sdot v20.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+            ".inst 0x4f98e99a  // sdot v26.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x20]\n"
+            ".inst 0x4f89e3f4  // sdot v20.4s, v31.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebfa  // sdot v26.4s, v31.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebe2  // sdot v2.4s, v31.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x30]\n"
+            ".inst 0x4f98e0d4  // sdot v20.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0ca  // sdot v10.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8da  // sdot v26.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x40]\n"
+            ".inst 0x4f89e394  // sdot v20.4s, v28.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+            ".inst 0x4f89eb9a  // sdot v26.4s, v28.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eb82  // sdot v2.4s, v28.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x50]\n"
+            ".inst 0x4f98e074  // sdot v20.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e06a  // sdot v10.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e87a  // sdot v26.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x60]\n"
+            ".inst 0x4f89e2d4  // sdot v20.4s, v22.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+            ".inst 0x4f89eada  // sdot v26.4s, v22.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eac2  // sdot v2.4s, v22.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x70]\n"
+            "add x22, x22, #0x88\n"
+            ".inst 0x4f98e374  // sdot v20.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e36a  // sdot v10.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb7a  // sdot v26.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x21, #0x0]\n"
+            ".inst 0x4f89e3d4  // sdot v20.4s, v30.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ca  // sdot v10.4s, v30.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebda  // sdot v26.4s, v30.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebc2  // sdot v2.4s, v30.16b, v9.4b[3]\n"
+            "fmul v9.4s, v17.4s, v29.s[0]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "fmla v25.4s, v20.4s, v9.4s\n"
+            "ldr q9, [x21, #0x10]\n"
+            "fmul v20.4s, v17.4s, v29.s[1]\n"
+            "fmla v7.4s, v10.4s, v20.4s\n"
+            "ldr d20, [x21, #-0x8]\n"
+            "fmul v10.4s, v17.4s, v29.s[2]\n"
+            "fmul v29.4s, v17.4s, v29.s[3]\n"
+            "fcvtl v20.4s, v20.4h\n"
+            "fmla v0.4s, v26.4s, v10.4s\n"
+            "movi v26.4s, #0x0\n"
+            "movi v10.4s, #0x0\n"
+            "fmla v4.4s, v2.4s, v29.4s\n"
+            "movi v2.4s, #0x0\n"
+            "movi v29.4s, #0x0\n"
+            ".inst 0x4f98e19a  // sdot v26.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+            ".inst 0x4f98e982  // sdot v2.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e99d  // sdot v29.4s, v12.16b, v24.4b[3]\n"
+            "ldr q12, [x21, #0x20]\n"
+            "fmul v24.4s, v17.4s, v20.s[0]\n"
+            ".inst 0x4f89e3fa  // sdot v26.4s, v31.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebe2  // sdot v2.4s, v31.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebfd  // sdot v29.4s, v31.16b, v9.4b[3]\n"
+            "ldr q9, [x21, #0x30]\n"
+            "fmul v31.4s, v17.4s, v20.s[1]\n"
+            ".inst 0x4f8ce0da  // sdot v26.4s, v6.16b, v12.4b[0]\n"
+            ".inst 0x4face0ca  // sdot v10.4s, v6.16b, v12.4b[1]\n"
+            ".inst 0x4f8ce8c2  // sdot v2.4s, v6.16b, v12.4b[2]\n"
+            ".inst 0x4face8dd  // sdot v29.4s, v6.16b, v12.4b[3]\n"
+            "ldr q12, [x21, #0x40]\n"
+            "fmul v6.4s, v17.4s, v20.s[2]\n"
+            "fmul v20.4s, v17.4s, v20.s[3]\n"
+            ".inst 0x4f89e39a  // sdot v26.4s, v28.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+            ".inst 0x4f89eb82  // sdot v2.4s, v28.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eb9d  // sdot v29.4s, v28.16b, v9.4b[3]\n"
+            "ldr q9, [x21, #0x50]\n"
+            ".inst 0x4f8ce07a  // sdot v26.4s, v3.16b, v12.4b[0]\n"
+            ".inst 0x4face06a  // sdot v10.4s, v3.16b, v12.4b[1]\n"
+            ".inst 0x4f8ce862  // sdot v2.4s, v3.16b, v12.4b[2]\n"
+            ".inst 0x4face87d  // sdot v29.4s, v3.16b, v12.4b[3]\n"
+            "ldr q12, [x21, #0x60]\n"
+            ".inst 0x4f89e2da  // sdot v26.4s, v22.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+            ".inst 0x4f89eac2  // sdot v2.4s, v22.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eadd  // sdot v29.4s, v22.16b, v9.4b[3]\n"
+            "ldr q17, [x21, #0x70]\n"
+            "add x21, x21, #0x88\n"
+            ".inst 0x4f8ce37a  // sdot v26.4s, v27.16b, v12.4b[0]\n"
+            ".inst 0x4face36a  // sdot v10.4s, v27.16b, v12.4b[1]\n"
+            ".inst 0x4f8ceb62  // sdot v2.4s, v27.16b, v12.4b[2]\n"
+            ".inst 0x4faceb7d  // sdot v29.4s, v27.16b, v12.4b[3]\n"
+            ".inst 0x4f91e3da  // sdot v26.4s, v30.16b, v17.4b[0]\n"
+            ".inst 0x4fb1e3ca  // sdot v10.4s, v30.16b, v17.4b[1]\n"
+            ".inst 0x4f91ebc2  // sdot v2.4s, v30.16b, v17.4b[2]\n"
+            ".inst 0x4fb1ebdd  // sdot v29.4s, v30.16b, v17.4b[3]\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "fmla v5.4s, v26.4s, v24.4s\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "fmla v21.4s, v10.4s, v31.4s\n"
+            "fmla v8.4s, v2.4s, v6.4s\n"
+            "fmla v1.4s, v29.4s, v20.4s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x27, x27, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "str q15, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q19, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q18, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q14, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q11, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q13, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q23, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q16, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q25, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q7, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q0, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q4, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q5, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q21, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q8, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q1, [x20, #0x0]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x10, x10, #0x10\n"
+            "cmp x10, #0x10\n"
+            "mov %x[res_ptr], x26\n"
+            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x10, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x24, %x[b_ptr], #0x8\n"
+            "mov x23, %x[nc]\n"
+            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "movi v15.16b, #0x0\n"
+            "movi v19.16b, #0x0\n"
+            "add x25, %x[a_ptr], #0x8\n"
+            "mov x21, %x[nb]\n"
+            "movi v18.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ldr q7, [x24, #0x0]\n"
+            "ldr q5, [x25, #0x0]\n"
+            "movi v9.16b, #0x4\n"
+            "movi v4.4s, #0x0\n"
+            "ldr q3, [x24, #0x10]\n"
+            "ldr q2, [x25, #0x10]\n"
+            "movi v1.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            "ldr q13, [x24, #0x20]\n"
+            "ldr q31, [x25, #0x20]\n"
+            "movi v30.4s, #0x0\n"
+            "movi v29.16b, #0xf0\n"
+            "ldr q28, [x24, #0x30]\n"
+            "ldr q27, [x25, #0x30]\n"
+            "sshl v20.16b, v7.16b, v9.16b\n"
+            "sub x20, x24, #0x8\n"
+            "ldr q26, [x25, #0x40]\n"
+            "ldr q25, [x25, #0x50]\n"
+            "sshl v17.16b, v3.16b, v9.16b\n"
+            "and v7.16b, v7.16b, v29.16b\n"
+            "ldr q24, [x25, #0x60]\n"
+            "ldr q16, [x25, #0x70]\n"
+            "sshl v22.16b, v13.16b, v9.16b\n"
+            "and v3.16b, v3.16b, v29.16b\n"
+            "ldr d21, [x20, #0x0]\n"
+            "ldr d12, [x25, #-0x8]\n"
+            ".inst 0x4f85e284  // sdot v4.4s, v20.16b, v5.4b[0]\n"
+            ".inst 0x4fa5e281  // sdot v1.4s, v20.16b, v5.4b[1]\n"
+            ".inst 0x4f85ea80  // sdot v0.4s, v20.16b, v5.4b[2]\n"
+            ".inst 0x4fa5ea9e  // sdot v30.4s, v20.16b, v5.4b[3]\n"
+            "sshl v9.16b, v28.16b, v9.16b\n"
+            "subs x21, x21, #0x1\n"
+            "and v13.16b, v13.16b, v29.16b\n"
+            "and v28.16b, v28.16b, v29.16b\n"
+            "add x25, x25, #0x88\n"
+            "add x24, x24, #0x48\n"
+            "fcvtl v21.4s, v21.4h\n"
+            "fcvtl v12.4s, v12.4h\n"
+            ".inst 0x4f82e224  // sdot v4.4s, v17.16b, v2.4b[0]\n"
+            ".inst 0x4fa2e221  // sdot v1.4s, v17.16b, v2.4b[1]\n"
+            ".inst 0x4f82ea20  // sdot v0.4s, v17.16b, v2.4b[2]\n"
+            ".inst 0x4fa2ea3e  // sdot v30.4s, v17.16b, v2.4b[3]\n"
+            "fmul v11.4s, v21.4s, v12.s[0]\n"
+            "fmul v23.4s, v21.4s, v12.s[1]\n"
+            "fmul v17.4s, v21.4s, v12.s[2]\n"
+            ".inst 0x4f9fe2c4  // sdot v4.4s, v22.16b, v31.4b[0]\n"
+            "fmul v6.4s, v21.4s, v12.s[3]\n"
+            ".inst 0x4fbfe2c1  // sdot v1.4s, v22.16b, v31.4b[1]\n"
+            ".inst 0x4f9feac0  // sdot v0.4s, v22.16b, v31.4b[2]\n"
+            ".inst 0x4fbfeade  // sdot v30.4s, v22.16b, v31.4b[3]\n"
+            ".inst 0x4f9be124  // sdot v4.4s, v9.16b, v27.4b[0]\n"
+            ".inst 0x4fbbe121  // sdot v1.4s, v9.16b, v27.4b[1]\n"
+            ".inst 0x4f9be920  // sdot v0.4s, v9.16b, v27.4b[2]\n"
+            ".inst 0x4fbbe93e  // sdot v30.4s, v9.16b, v27.4b[3]\n"
+            ".inst 0x4f9ae0e4  // sdot v4.4s, v7.16b, v26.4b[0]\n"
+            ".inst 0x4fbae0e1  // sdot v1.4s, v7.16b, v26.4b[1]\n"
+            ".inst 0x4f9ae8e0  // sdot v0.4s, v7.16b, v26.4b[2]\n"
+            ".inst 0x4fbae8fe  // sdot v30.4s, v7.16b, v26.4b[3]\n"
+            ".inst 0x4f99e064  // sdot v4.4s, v3.16b, v25.4b[0]\n"
+            ".inst 0x4fb9e061  // sdot v1.4s, v3.16b, v25.4b[1]\n"
+            ".inst 0x4f99e860  // sdot v0.4s, v3.16b, v25.4b[2]\n"
+            ".inst 0x4fb9e87e  // sdot v30.4s, v3.16b, v25.4b[3]\n"
+            ".inst 0x4f98e1a4  // sdot v4.4s, v13.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e1a1  // sdot v1.4s, v13.16b, v24.4b[1]\n"
+            ".inst 0x4f98e9a0  // sdot v0.4s, v13.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e9be  // sdot v30.4s, v13.16b, v24.4b[3]\n"
+            ".inst 0x4f90e384  // sdot v4.4s, v28.16b, v16.4b[0]\n"
+            ".inst 0x4fb0e381  // sdot v1.4s, v28.16b, v16.4b[1]\n"
+            ".inst 0x4f90eb80  // sdot v0.4s, v28.16b, v16.4b[2]\n"
+            ".inst 0x4fb0eb9e  // sdot v30.4s, v28.16b, v16.4b[3]\n"
+            "scvtf v4.4s, v4.4s, #0x4\n"
+            "scvtf v1.4s, v1.4s, #0x4\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "fmla v15.4s, v4.4s, v11.4s\n"
+            "scvtf v30.4s, v30.4s, #0x4\n"
+            "fmla v19.4s, v1.4s, v23.4s\n"
+            "fmla v18.4s, v0.4s, v17.4s\n"
+            "fmla v14.4s, v30.4s, v6.4s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x10, #0x1\n"
+            "str q15, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x2\n"
+            "str q19, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x3\n"
+            "str q18, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "str q14, [x20, #0x0]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x23, x23, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "bne 6b\n"
+            "subs x10, x10, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x9\n"
+            "mov %x[res_ptr], x22\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
+void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x10, %x[nr]\n"
+            "mov x9, #0x88\n"
+            "cmp x10, #0x10\n"
+            "mul x9, %x[nb], x9\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x28, %x[b_ptr], #0x8\n"
+            "mov x27, %x[nc]\n"
+            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x25, %x[a_ptr], #0x8\n"
+            "movi v2.16b, #0x0\n"
+            "movi v10.16b, #0x0\n"
+            "mov x24, %x[nb]\n"
+            "add x23, x25, x9\n"
+            "movi v12.16b, #0x0\n"
+            "movi v28.16b, #0x0\n"
+            "add x22, x23, x9\n"
+            "movi v11.16b, #0x0\n"
+            "movi v13.16b, #0x0\n"
+            "add x21, x22, x9\n"
+            "movi v22.16b, #0x0\n"
+            "movi v23.16b, #0x0\n"
+            "movi v25.16b, #0x0\n"
+            "movi v5.16b, #0x0\n"
+            "movi v7.16b, #0x0\n"
+            "movi v4.16b, #0x0\n"
+            "movi v6.16b, #0x0\n"
+            "movi v30.16b, #0x0\n"
+            "movi v24.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "3:"  // Block loop
+            "ldr q21, [x28, #0x0]\n"
+            "ldr q16, [x28, #0x10]\n"
+            "movi v1.16b, #0x4\n"
+            "movi v19.4s, #0x0\n"
+            "ldr q27, [x25, #0x0]\n"
+            "ldr q15, [x25, #0x10]\n"
+            "movi v26.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            "ldr q29, [x28, #0x20]\n"
+            "ldr q3, [x28, #0x30]\n"
+            "movi v17.4s, #0x0\n"
+            "movi v0.16b, #0xf0\n"
+            "ldr d20, [x25, #-0x8]\n"
+            "ldr d9, [x23, #-0x8]\n"
+            "sshl v8.16b, v21.16b, v1.16b\n"
+            "sshl v31.16b, v16.16b, v1.16b\n"
+            "and v21.16b, v21.16b, v0.16b\n"
+            "and v16.16b, v16.16b, v0.16b\n"
+            "sub x20, x28, #0x8\n"
+            "subs x24, x24, #0x1\n"
+            "add x28, x28, #0x48\n"
+            ".inst 0x4e88a773  // smmla v19.4s, v27.16b, v8.16b\n"
+            ".inst 0x4e9fa77a  // smmla v26.4s, v27.16b, v31.16b\n"
+            "ldr q27, [x25, #0x20]\n"
+            ".inst 0x4e88a5f2  // smmla v18.4s, v15.16b, v8.16b\n"
+            ".inst 0x4e9fa5f1  // smmla v17.4s, v15.16b, v31.16b\n"
+            "sshl v15.16b, v29.16b, v1.16b\n"
+            "sshl v1.16b, v3.16b, v1.16b\n"
+            "and v29.16b, v29.16b, v0.16b\n"
+            "and v3.16b, v3.16b, v0.16b\n"
+            "ldr q0, [x25, #0x30]\n"
+            "fcvtl v20.4s, v20.4h\n"
+            ".inst 0x4e8fa773  // smmla v19.4s, v27.16b, v15.16b\n"
+            "fcvtl v9.4s, v9.4h\n"
+            ".inst 0x4e81a77a  // smmla v26.4s, v27.16b, v1.16b\n"
+            "ldr q27, [x25, #0x40]\n"
+            ".inst 0x4e8fa412  // smmla v18.4s, v0.16b, v15.16b\n"
+            ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
+            "ldr q0, [x25, #0x50]\n"
+            ".inst 0x4e95a773  // smmla v19.4s, v27.16b, v21.16b\n"
+            ".inst 0x4e90a77a  // smmla v26.4s, v27.16b, v16.16b\n"
+            "ldr q27, [x25, #0x60]\n"
+            ".inst 0x4e95a412  // smmla v18.4s, v0.16b, v21.16b\n"
+            ".inst 0x4e90a411  // smmla v17.4s, v0.16b, v16.16b\n"
+            "ldr q0, [x25, #0x70]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x4e9da773  // smmla v19.4s, v27.16b, v29.16b\n"
+            ".inst 0x4e83a77a  // smmla v26.4s, v27.16b, v3.16b\n"
+            "ldr d27, [x20, #0x0]\n"
+            ".inst 0x4e9da412  // smmla v18.4s, v0.16b, v29.16b\n"
+            ".inst 0x4e83a411  // smmla v17.4s, v0.16b, v3.16b\n"
+            "fcvtl v27.4s, v27.4h\n"
+            "uzp1 v0.2d, v19.2d, v26.2d\n"
+            "uzp2 v26.2d, v19.2d, v26.2d\n"
+            "fmul v19.4s, v27.4s, v20.s[0]\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "fmla v2.4s, v0.4s, v19.4s\n"
+            "ldr q19, [x23, #0x0]\n"
+            "uzp1 v0.2d, v18.2d, v17.2d\n"
+            "uzp2 v18.2d, v18.2d, v17.2d\n"
+            "fmul v17.4s, v27.4s, v20.s[1]\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "fmla v10.4s, v26.4s, v17.4s\n"
+            "ldr q17, [x23, #0x10]\n"
+            "fmul v26.4s, v27.4s, v20.s[2]\n"
+            "fmul v20.4s, v27.4s, v20.s[3]\n"
+            "fmla v12.4s, v0.4s, v26.4s\n"
+            "ldr d0, [x22, #-0x8]\n"
+            "ldr d26, [x21, #-0x8]\n"
+            "fcvtl v0.4s, v0.4h\n"
+            "fmla v28.4s, v18.4s, v20.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
+            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
+            "ldr q19, [x23, #0x20]\n"
+            "fcvtl v26.4s, v26.4h\n"
+            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
+            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
+            "ldr q19, [x23, #0x40]\n"
+            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
+            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
+            "ldr q19, [x23, #0x60]\n"
+            ".inst 0x4e9da674  // smmla v20.4s, v19.16b, v29.16b\n"
+            ".inst 0x4e83a672  // smmla v18.4s, v19.16b, v3.16b\n"
+            "uzp1 v19.2d, v20.2d, v18.2d\n"
+            "scvtf v19.4s, v19.4s, #0x4\n"
+            "uzp2 v20.2d, v20.2d, v18.2d\n"
+            "fmul v18.4s, v27.4s, v9.s[0]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v11.4s, v19.4s, v18.4s\n"
+            "ldr q18, [x22, #0x0]\n"
+            "fmul v19.4s, v27.4s, v9.s[1]\n"
+            "fmla v13.4s, v20.4s, v19.4s\n"
+            "movi v19.4s, #0x0\n"
+            "movi v20.4s, #0x0\n"
+            ".inst 0x4e88a633  // smmla v19.4s, v17.16b, v8.16b\n"
+            ".inst 0x4e9fa634  // smmla v20.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x23, #0x30]\n"
+            ".inst 0x4e8fa633  // smmla v19.4s, v17.16b, v15.16b\n"
+            ".inst 0x4e81a634  // smmla v20.4s, v17.16b, v1.16b\n"
+            "ldr q17, [x23, #0x50]\n"
+            ".inst 0x4e95a633  // smmla v19.4s, v17.16b, v21.16b\n"
+            ".inst 0x4e90a634  // smmla v20.4s, v17.16b, v16.16b\n"
+            "ldr q17, [x23, #0x70]\n"
+            "add x23, x23, #0x88\n"
+            ".inst 0x4e9da633  // smmla v19.4s, v17.16b, v29.16b\n"
+            ".inst 0x4e83a634  // smmla v20.4s, v17.16b, v3.16b\n"
+            "uzp1 v17.2d, v19.2d, v20.2d\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "uzp2 v20.2d, v19.2d, v20.2d\n"
+            "fmul v19.4s, v27.4s, v9.s[2]\n"
+            "fmul v9.4s, v27.4s, v9.s[3]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v22.4s, v17.4s, v19.4s\n"
+            "ldr q17, [x22, #0x10]\n"
+            "movi v19.4s, #0x0\n"
+            ".inst 0x4e88a653  // smmla v19.4s, v18.16b, v8.16b\n"
+            "fmla v23.4s, v20.4s, v9.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v9.4s, #0x0\n"
+            ".inst 0x4e9fa654  // smmla v20.4s, v18.16b, v31.16b\n"
+            "ldr q18, [x22, #0x20]\n"
+            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
+            ".inst 0x4e8fa653  // smmla v19.4s, v18.16b, v15.16b\n"
+            ".inst 0x4e81a654  // smmla v20.4s, v18.16b, v1.16b\n"
+            "ldr q18, [x22, #0x40]\n"
+            ".inst 0x4e95a653  // smmla v19.4s, v18.16b, v21.16b\n"
+            ".inst 0x4e90a654  // smmla v20.4s, v18.16b, v16.16b\n"
+            "ldr q18, [x22, #0x60]\n"
+            ".inst 0x4e9da653  // smmla v19.4s, v18.16b, v29.16b\n"
+            ".inst 0x4e83a654  // smmla v20.4s, v18.16b, v3.16b\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e9fa632  // smmla v18.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x22, #0x30]\n"
+            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
+            ".inst 0x4e81a632  // smmla v18.4s, v17.16b, v1.16b\n"
+            "ldr q17, [x22, #0x50]\n"
+            ".inst 0x4e95a629  // smmla v9.4s, v17.16b, v21.16b\n"
+            ".inst 0x4e90a632  // smmla v18.4s, v17.16b, v16.16b\n"
+            "ldr q17, [x22, #0x70]\n"
+            "add x22, x22, #0x88\n"
+            ".inst 0x4e9da629  // smmla v9.4s, v17.16b, v29.16b\n"
+            ".inst 0x4e83a632  // smmla v18.4s, v17.16b, v3.16b\n"
+            "uzp1 v17.2d, v19.2d, v20.2d\n"
+            "uzp2 v20.2d, v19.2d, v20.2d\n"
+            "fmul v19.4s, v27.4s, v0.s[0]\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v25.4s, v17.4s, v19.4s\n"
+            "ldr q19, [x21, #0x0]\n"
+            "fmul v17.4s, v27.4s, v0.s[1]\n"
+            "fmla v5.4s, v20.4s, v17.4s\n"
+            "ldr q17, [x21, #0x10]\n"
+            "uzp1 v20.2d, v9.2d, v18.2d\n"
+            "uzp2 v9.2d, v9.2d, v18.2d\n"
+            "fmul v18.4s, v27.4s, v0.s[2]\n"
+            "fmul v0.4s, v27.4s, v0.s[3]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "fmla v7.4s, v20.4s, v18.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
+            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
+            "ldr q19, [x21, #0x20]\n"
+            "fmla v4.4s, v9.4s, v0.4s\n"
+            "movi v9.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
+            "fmul v8.4s, v27.4s, v26.s[0]\n"
+            ".inst 0x4e9fa620  // smmla v0.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x21, #0x30]\n"
+            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
+            "fmul v31.4s, v27.4s, v26.s[1]\n"
+            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
+            "ldr q19, [x21, #0x40]\n"
+            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
+            "fmul v15.4s, v27.4s, v26.s[2]\n"
+            "fmul v27.4s, v27.4s, v26.s[3]\n"
+            ".inst 0x4e81a620  // smmla v0.4s, v17.16b, v1.16b\n"
+            "ldr q1, [x21, #0x50]\n"
+            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
+            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
+            "ldr q26, [x21, #0x60]\n"
+            ".inst 0x4e95a429  // smmla v9.4s, v1.16b, v21.16b\n"
+            ".inst 0x4e90a420  // smmla v0.4s, v1.16b, v16.16b\n"
+            "ldr q21, [x21, #0x70]\n"
+            "add x21, x21, #0x88\n"
+            ".inst 0x4e9da754  // smmla v20.4s, v26.16b, v29.16b\n"
+            ".inst 0x4e83a752  // smmla v18.4s, v26.16b, v3.16b\n"
+            ".inst 0x4e9da6a9  // smmla v9.4s, v21.16b, v29.16b\n"
+            ".inst 0x4e83a6a0  // smmla v0.4s, v21.16b, v3.16b\n"
+            "uzp1 v29.2d, v20.2d, v18.2d\n"
+            "uzp2 v21.2d, v20.2d, v18.2d\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "uzp1 v18.2d, v9.2d, v0.2d\n"
+            "uzp2 v16.2d, v9.2d, v0.2d\n"
+            "scvtf v21.4s, v21.4s, #0x4\n"
+            "fmla v6.4s, v29.4s, v8.4s\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "scvtf v16.4s, v16.4s, #0x4\n"
+            "fmla v30.4s, v21.4s, v31.4s\n"
+            "fmla v24.4s, v18.4s, v15.4s\n"
+            "fmla v14.4s, v16.4s, v27.4s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x27, x27, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "str q2, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q10, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q12, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q28, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q11, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q13, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q22, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q23, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q25, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q5, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q7, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q4, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q6, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q30, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q24, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q14, [x20, #0x0]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x10, x10, #0x10\n"
+            "cmp x10, #0x10\n"
+            "mov %x[res_ptr], x26\n"
+            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x10, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x24, %x[b_ptr], #0x8\n"
+            "mov x23, %x[nc]\n"
+            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "movi v2.16b, #0x0\n"
+            "movi v10.16b, #0x0\n"
+            "add x25, %x[a_ptr], #0x8\n"
+            "mov x21, %x[nb]\n"
+            "movi v12.16b, #0x0\n"
+            "movi v28.16b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ldr q6, [x24, #0x0]\n"
+            "ldr q5, [x24, #0x10]\n"
+            "movi v17.16b, #0x4\n"
+            "movi v8.4s, #0x0\n"
+            "ldr q4, [x25, #0x0]\n"
+            "ldr q13, [x25, #0x10]\n"
+            "movi v27.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            "ldr q31, [x24, #0x20]\n"
+            "ldr q14, [x24, #0x30]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v22.16b, #0xf0\n"
+            "ldr q11, [x25, #0x20]\n"
+            "ldr q23, [x25, #0x30]\n"
+            "sshl v21.16b, v6.16b, v17.16b\n"
+            "sshl v16.16b, v5.16b, v17.16b\n"
+            "ldr q20, [x25, #0x40]\n"
+            "ldr q26, [x25, #0x50]\n"
+            "and v6.16b, v6.16b, v22.16b\n"
+            "and v5.16b, v5.16b, v22.16b\n"
+            "ldr q25, [x25, #0x60]\n"
+            "ldr q3, [x25, #0x70]\n"
+            "sshl v19.16b, v31.16b, v17.16b\n"
+            "sshl v18.16b, v14.16b, v17.16b\n"
+            "ldr d17, [x25, #-0x8]\n"
+            ".inst 0x4e95a488  // smmla v8.4s, v4.16b, v21.16b\n"
+            ".inst 0x4e90a49b  // smmla v27.4s, v4.16b, v16.16b\n"
+            "and v31.16b, v31.16b, v22.16b\n"
+            ".inst 0x4e95a5a0  // smmla v0.4s, v13.16b, v21.16b\n"
+            ".inst 0x4e90a5bd  // smmla v29.4s, v13.16b, v16.16b\n"
+            "and v14.16b, v14.16b, v22.16b\n"
+            "sub x20, x24, #0x8\n"
+            "ldr d16, [x20, #0x0]\n"
+            "subs x21, x21, #0x1\n"
+            "add x25, x25, #0x88\n"
+            "fcvtl v17.4s, v17.4h\n"
+            "add x24, x24, #0x48\n"
+            ".inst 0x4e93a568  // smmla v8.4s, v11.16b, v19.16b\n"
+            ".inst 0x4e92a57b  // smmla v27.4s, v11.16b, v18.16b\n"
+            ".inst 0x4e93a6e0  // smmla v0.4s, v23.16b, v19.16b\n"
+            ".inst 0x4e92a6fd  // smmla v29.4s, v23.16b, v18.16b\n"
+            "fcvtl v16.4s, v16.4h\n"
+            ".inst 0x4e86a688  // smmla v8.4s, v20.16b, v6.16b\n"
+            ".inst 0x4e85a69b  // smmla v27.4s, v20.16b, v5.16b\n"
+            "fmul v23.4s, v16.4s, v17.s[0]\n"
+            "fmul v21.4s, v16.4s, v17.s[1]\n"
+            "fmul v1.4s, v16.4s, v17.s[2]\n"
+            "fmul v20.4s, v16.4s, v17.s[3]\n"
+            ".inst 0x4e86a740  // smmla v0.4s, v26.16b, v6.16b\n"
+            ".inst 0x4e85a75d  // smmla v29.4s, v26.16b, v5.16b\n"
+            ".inst 0x4e9fa728  // smmla v8.4s, v25.16b, v31.16b\n"
+            ".inst 0x4e8ea73b  // smmla v27.4s, v25.16b, v14.16b\n"
+            ".inst 0x4e9fa460  // smmla v0.4s, v3.16b, v31.16b\n"
+            ".inst 0x4e8ea47d  // smmla v29.4s, v3.16b, v14.16b\n"
+            "uzp1 v19.2d, v8.2d, v27.2d\n"
+            "uzp2 v18.2d, v8.2d, v27.2d\n"
+            "scvtf v19.4s, v19.4s, #0x4\n"
+            "uzp1 v17.2d, v0.2d, v29.2d\n"
+            "uzp2 v16.2d, v0.2d, v29.2d\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "fmla v2.4s, v19.4s, v23.4s\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "scvtf v16.4s, v16.4s, #0x4\n"
+            "fmla v10.4s, v18.4s, v21.4s\n"
+            "fmla v12.4s, v17.4s, v1.4s\n"
+            "fmla v28.4s, v16.4s, v20.4s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x10, #0x1\n"
+            "str q2, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x2\n"
+            "str q10, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x3\n"
+            "str q12, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "str q28, [x20, #0x0]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x23, x23, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "bne 6b\n"
+            "subs x10, x10, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x9\n"
+            "mov %x[res_ptr], x22\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                        (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                            }
+                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
+void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x20, #0x4\n"
+            "mov x13, %x[nr]\n"
+            "mov z28.s, #-0x4\n"
+            "mov x12, #0x88\n"
+            "ptrue p1.b\n"
+            "whilelt p0.s, XZR, x20\n"
+            "cmp x13, #0x10\n"
+            "mul x12, %x[nb], x12\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x11, %x[b_ptr], #0x10\n"
+            "mov x10, %x[nc]\n"
+            "add x9, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x28, %x[a_ptr], #0x8\n"
+            "mov z24.b, #0x0\n"
+            "mov z15.b, #0x0\n"
+            "mov x27, %x[nb]\n"
+            "add x26, x28, x12\n"
+            "mov z12.b, #0x0\n"
+            "mov z0.b, #0x0\n"
+            "add x25, x26, x12\n"
+            "mov z13.b, #0x0\n"
+            "mov z1.b, #0x0\n"
+            "add x24, x25, x12\n"
+            "mov z20.b, #0x0\n"
+            "mov z25.b, #0x0\n"
+            "mov z11.b, #0x0\n"
+            "mov z16.b, #0x0\n"
+            "mov z19.b, #0x0\n"
+            "mov z26.b, #0x0\n"
+            "mov z8.b, #0x0\n"
+            "mov z29.b, #0x0\n"
+            "mov z27.b, #0x0\n"
+            "mov z10.b, #0x0\n"
+            "3:"  // Block loop
+            "ld1b { z30.b }, p1/Z, [x11]\n"
+            "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n"
+            "mov z18.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            "ld1rqb { z3.b }, p1/Z, [x28]\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #16]\n"
+            "mov z9.s, #0x0\n"
+            "mov z22.s, #0x0\n"
+            "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n"
+            "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n"
+            "sub x20, x11, #0x10\n"
+            "sub x23, x28, #0x8\n"
+            "lsl z31.b, z30.b, #0x4\n"
+            "lsl z6.b, z21.b, #0x4\n"
+            "ld1h { z23.s }, p1/Z, [x20]\n"
+            "sub x22, x26, #0x8\n"
+            "and z30.b, z30.b, #0xf0\n"
+            "and z21.b, z21.b, #0xf0\n"
+            "sub x21, x25, #0x8\n"
+            "sub x20, x24, #0x8\n"
+            "lsl z14.b, z4.b, #0x4\n"
+            "lsl z2.b, z17.b, #0x4\n"
+            "subs x27, x27, #0x1\n"
+            "add x11, x11, #0x90\n"
+            ".inst 0x451f9872  // smmla z18.s, z3.b, z31.b\n"
+            ".inst 0x45069867  // smmla z7.s, z3.b, z6.b\n"
+            "ld1rqb { z3.b }, p1/Z, [x28, #32]\n"
+            "and z4.b, z4.b, #0xf0\n"
+            ".inst 0x451f98a9  // smmla z9.s, z5.b, z31.b\n"
+            ".inst 0x450698b6  // smmla z22.s, z5.b, z6.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #48]\n"
+            "and z17.b, z17.b, #0xf0\n"
+            "fcvt z23.s, p1/m, z23.h\n"
+            ".inst 0x450e9872  // smmla z18.s, z3.b, z14.b\n"
+            ".inst 0x45029867  // smmla z7.s, z3.b, z2.b\n"
+            "ld1rqb { z3.b }, p1/Z, [x28, #64]\n"
+            ".inst 0x450e98a9  // smmla z9.s, z5.b, z14.b\n"
+            ".inst 0x450298b6  // smmla z22.s, z5.b, z2.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #80]\n"
+            "fscale z23.s, p1/m, z23.s, z28.s\n"
+            ".inst 0x451e9872  // smmla z18.s, z3.b, z30.b\n"
+            ".inst 0x45159867  // smmla z7.s, z3.b, z21.b\n"
+            "ld1rqb { z3.b }, p1/Z, [x28, #96]\n"
+            ".inst 0x451e98a9  // smmla z9.s, z5.b, z30.b\n"
+            ".inst 0x451598b6  // smmla z22.s, z5.b, z21.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #112]\n"
+            "add x28, x28, #0x88\n"
+            ".inst 0x45049872  // smmla z18.s, z3.b, z4.b\n"
+            ".inst 0x45119867  // smmla z7.s, z3.b, z17.b\n"
+            "ld1h { z3.s }, p0/Z, [x23]\n"
+            ".inst 0x450498a9  // smmla z9.s, z5.b, z4.b\n"
+            ".inst 0x451198b6  // smmla z22.s, z5.b, z17.b\n"
+            "fcvt z3.s, p1/m, z3.h\n"
+            "uzp1 z5.d, z18.d, z7.d\n"
+            "uzp2 z18.d, z18.d, z7.d\n"
+            "mov z3.q, z3.q[0]\n"
+            "uzp1 z7.d, z9.d, z22.d\n"
+            "uzp2 z22.d, z9.d, z22.d\n"
+            "fmul z9.s, z23.s, z3.s[0]\n"
+            "scvtf z5.s, p1/m, z5.s\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "scvtf z7.s, p1/m, z7.s\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z24.s, p1/M, z5.s, z9.s\n"
+            "ld1rqb { z5.b }, p1/Z, [x26]\n"
+            "fmul z9.s, z23.s, z3.s[1]\n"
+            "fmla z15.s, p1/M, z18.s, z9.s\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #16]\n"
+            "fmul z9.s, z23.s, z3.s[2]\n"
+            "fmul z3.s, z23.s, z3.s[3]\n"
+            "fmla z12.s, p1/M, z7.s, z9.s\n"
+            "mov z9.s, #0x0\n"
+            "ld1h { z7.s }, p0/Z, [x22]\n"
+            ".inst 0x451f98a9  // smmla z9.s, z5.b, z31.b\n"
+            "fmla z0.s, p1/M, z22.s, z3.s\n"
+            "mov z22.s, #0x0\n"
+            "ld1h { z3.s }, p0/Z, [x21]\n"
+            ".inst 0x450698b6  // smmla z22.s, z5.b, z6.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x26, #32]\n"
+            "fcvt z7.s, p1/m, z7.h\n"
+            "fcvt z3.s, p1/m, z3.h\n"
+            ".inst 0x450e98a9  // smmla z9.s, z5.b, z14.b\n"
+            ".inst 0x450298b6  // smmla z22.s, z5.b, z2.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x26, #64]\n"
+            "mov z7.q, z7.q[0]\n"
+            "mov z3.q, z3.q[0]\n"
+            ".inst 0x451e98a9  // smmla z9.s, z5.b, z30.b\n"
+            ".inst 0x451598b6  // smmla z22.s, z5.b, z21.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x26, #96]\n"
+            ".inst 0x450498a9  // smmla z9.s, z5.b, z4.b\n"
+            ".inst 0x451198b6  // smmla z22.s, z5.b, z17.b\n"
+            "uzp1 z5.d, z9.d, z22.d\n"
+            "scvtf z5.s, p1/m, z5.s\n"
+            "uzp2 z22.d, z9.d, z22.d\n"
+            "fmul z9.s, z23.s, z7.s[0]\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z13.s, p1/M, z5.s, z9.s\n"
+            "ld1rqb { z9.b }, p1/Z, [x25]\n"
+            "fmul z5.s, z23.s, z7.s[1]\n"
+            "fmla z1.s, p1/M, z22.s, z5.s\n"
+            "mov z5.s, #0x0\n"
+            "mov z22.s, #0x0\n"
+            ".inst 0x451f9a45  // smmla z5.s, z18.b, z31.b\n"
+            ".inst 0x45069a56  // smmla z22.s, z18.b, z6.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #48]\n"
+            ".inst 0x450e9a45  // smmla z5.s, z18.b, z14.b\n"
+            ".inst 0x45029a56  // smmla z22.s, z18.b, z2.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #80]\n"
+            ".inst 0x451e9a45  // smmla z5.s, z18.b, z30.b\n"
+            ".inst 0x45159a56  // smmla z22.s, z18.b, z21.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #112]\n"
+            "add x26, x26, #0x88\n"
+            ".inst 0x45049a45  // smmla z5.s, z18.b, z4.b\n"
+            ".inst 0x45119a56  // smmla z22.s, z18.b, z17.b\n"
+            "uzp1 z18.d, z5.d, z22.d\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "uzp2 z22.d, z5.d, z22.d\n"
+            "fmul z5.s, z23.s, z7.s[2]\n"
+            "fmul z7.s, z23.s, z7.s[3]\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z20.s, p1/M, z18.s, z5.s\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #16]\n"
+            "ld1h { z5.s }, p0/Z, [x20]\n"
+            "fcvt z5.s, p1/m, z5.h\n"
+            "fmla z25.s, p1/M, z22.s, z7.s\n"
+            "mov z22.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            ".inst 0x451f9936  // smmla z22.s, z9.b, z31.b\n"
+            ".inst 0x45069927  // smmla z7.s, z9.b, z6.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x25, #32]\n"
+            "mov z5.q, z5.q[0]\n"
+            ".inst 0x450e9936  // smmla z22.s, z9.b, z14.b\n"
+            ".inst 0x45029927  // smmla z7.s, z9.b, z2.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x25, #64]\n"
+            ".inst 0x451e9936  // smmla z22.s, z9.b, z30.b\n"
+            ".inst 0x45159927  // smmla z7.s, z9.b, z21.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x25, #96]\n"
+            ".inst 0x45049936  // smmla z22.s, z9.b, z4.b\n"
+            ".inst 0x45119927  // smmla z7.s, z9.b, z17.b\n"
+            "uzp1 z9.d, z22.d, z7.d\n"
+            "scvtf z9.s, p1/m, z9.s\n"
+            "uzp2 z22.d, z22.d, z7.d\n"
+            "fmul z7.s, z23.s, z3.s[0]\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z11.s, p1/M, z9.s, z7.s\n"
+            "ld1rqb { z9.b }, p1/Z, [x24]\n"
+            "fmul z7.s, z23.s, z3.s[1]\n"
+            "fmla z16.s, p1/M, z22.s, z7.s\n"
+            "mov z22.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            ".inst 0x451f9a56  // smmla z22.s, z18.b, z31.b\n"
+            ".inst 0x45069a47  // smmla z7.s, z18.b, z6.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #48]\n"
+            ".inst 0x450e9a56  // smmla z22.s, z18.b, z14.b\n"
+            ".inst 0x45029a47  // smmla z7.s, z18.b, z2.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #80]\n"
+            ".inst 0x451e9a56  // smmla z22.s, z18.b, z30.b\n"
+            ".inst 0x45159a47  // smmla z7.s, z18.b, z21.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #112]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x45049a56  // smmla z22.s, z18.b, z4.b\n"
+            ".inst 0x45119a47  // smmla z7.s, z18.b, z17.b\n"
+            "uzp1 z18.d, z22.d, z7.d\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "uzp2 z7.d, z22.d, z7.d\n"
+            "fmul z22.s, z23.s, z3.s[2]\n"
+            "fmul z3.s, z23.s, z3.s[3]\n"
+            "scvtf z7.s, p1/m, z7.s\n"
+            "fmla z19.s, p1/M, z18.s, z22.s\n"
+            "ld1rqb { z18.b }, p1/Z, [x24, #16]\n"
+            "fmul z22.s, z23.s, z5.s[0]\n"
+            "fmla z26.s, p1/M, z7.s, z3.s\n"
+            "mov z3.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            ".inst 0x451f9923  // smmla z3.s, z9.b, z31.b\n"
+            ".inst 0x45069927  // smmla z7.s, z9.b, z6.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x24, #32]\n"
+            ".inst 0x450e9923  // smmla z3.s, z9.b, z14.b\n"
+            ".inst 0x45029927  // smmla z7.s, z9.b, z2.b\n"
+            "mov z9.s, #0x0\n"
+            ".inst 0x451f9a49  // smmla z9.s, z18.b, z31.b\n"
+            "mov z31.s, #0x0\n"
+            ".inst 0x45069a5f  // smmla z31.s, z18.b, z6.b\n"
+            "ld1rqb { z6.b }, p1/Z, [x24, #48]\n"
+            "ld1rqb { z18.b }, p1/Z, [x24, #64]\n"
+            ".inst 0x450e98c9  // smmla z9.s, z6.b, z14.b\n"
+            "fmul z14.s, z23.s, z5.s[1]\n"
+            ".inst 0x450298df  // smmla z31.s, z6.b, z2.b\n"
+            "ld1rqb { z6.b }, p1/Z, [x24, #80]\n"
+            "fmul z2.s, z23.s, z5.s[2]\n"
+            "fmul z23.s, z23.s, z5.s[3]\n"
+            ".inst 0x451e9a43  // smmla z3.s, z18.b, z30.b\n"
+            ".inst 0x45159a47  // smmla z7.s, z18.b, z21.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x24, #96]\n"
+            ".inst 0x451e98c9  // smmla z9.s, z6.b, z30.b\n"
+            ".inst 0x451598df  // smmla z31.s, z6.b, z21.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x24, #112]\n"
+            "add x24, x24, #0x88\n"
+            ".inst 0x450498a3  // smmla z3.s, z5.b, z4.b\n"
+            ".inst 0x451198a7  // smmla z7.s, z5.b, z17.b\n"
+            ".inst 0x45049a49  // smmla z9.s, z18.b, z4.b\n"
+            ".inst 0x45119a5f  // smmla z31.s, z18.b, z17.b\n"
+            "uzp1 z18.d, z3.d, z7.d\n"
+            "uzp2 z5.d, z3.d, z7.d\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "uzp1 z6.d, z9.d, z31.d\n"
+            "uzp2 z9.d, z9.d, z31.d\n"
+            "scvtf z5.s, p1/m, z5.s\n"
+            "fmla z8.s, p1/M, z18.s, z22.s\n"
+            "scvtf z6.s, p1/m, z6.s\n"
+            "scvtf z9.s, p1/m, z9.s\n"
+            "fmla z29.s, p1/M, z5.s, z14.s\n"
+            "fmla z27.s, p1/M, z6.s, z2.s\n"
+            "fmla z10.s, p1/M, z9.s, z23.s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x10, x10, #0x8\n"
+            "add %x[res_ptr], %x[res_ptr], #0x20\n"
+            "st1w { z24.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z15.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z12.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z0.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z13.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z1.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z20.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z25.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z11.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z16.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z19.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z26.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z8.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z29.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z27.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z10.s }, p1, [x20]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x13, x13, #0x10\n"
+            "cmp x13, #0x10\n"
+            "mov %x[res_ptr], x9\n"
+            "madd %x[a_ptr], x20, x12, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x13, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x25, %x[b_ptr], #0x10\n"
+            "mov x24, %x[nc]\n"
+            "add x23, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "mov z24.b, #0x0\n"
+            "mov z15.b, #0x0\n"
+            "add x28, %x[a_ptr], #0x8\n"
+            "mov x22, %x[nb]\n"
+            "mov z12.b, #0x0\n"
+            "mov z0.b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ld1b { z3.b }, p1/Z, [x25]\n"
+            "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
+            "mov z2.s, #0x0\n"
+            "mov z25.s, #0x0\n"
+            "ld1rqb { z26.b }, p1/Z, [x28]\n"
+            "ld1rqb { z21.b }, p1/Z, [x28, #16]\n"
+            "mov z27.s, #0x0\n"
+            "mov z19.s, #0x0\n"
+            "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n"
+            "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n"
+            "sub x21, x25, #0x10\n"
+            "sub x20, x28, #0x8\n"
+            "lsl z20.b, z3.b, #0x4\n"
+            "lsl z4.b, z6.b, #0x4\n"
+            "ld1rqb { z10.b }, p1/Z, [x28, #32]\n"
+            "ld1rqb { z23.b }, p1/Z, [x28, #48]\n"
+            "and z3.b, z3.b, #0xf0\n"
+            "and z6.b, z6.b, #0xf0\n"
+            "ld1rqb { z11.b }, p1/Z, [x28, #64]\n"
+            "ld1rqb { z7.b }, p1/Z, [x28, #80]\n"
+            "lsl z8.b, z29.b, #0x4\n"
+            "lsl z14.b, z16.b, #0x4\n"
+            "ld1rqb { z18.b }, p1/Z, [x28, #96]\n"
+            "ld1rqb { z30.b }, p1/Z, [x28, #112]\n"
+            ".inst 0x45149b42  // smmla z2.s, z26.b, z20.b\n"
+            ".inst 0x45049b59  // smmla z25.s, z26.b, z4.b\n"
+            "and z29.b, z29.b, #0xf0\n"
+            "ld1h { z17.s }, p1/Z, [x21]\n"
+            ".inst 0x45149abb  // smmla z27.s, z21.b, z20.b\n"
+            ".inst 0x45049ab3  // smmla z19.s, z21.b, z4.b\n"
+            "and z16.b, z16.b, #0xf0\n"
+            "ld1h { z4.s }, p0/Z, [x20]\n"
+            "subs x22, x22, #0x1\n"
+            "add x28, x28, #0x88\n"
+            "fcvt z17.s, p1/m, z17.h\n"
+            "add x25, x25, #0x90\n"
+            ".inst 0x45089942  // smmla z2.s, z10.b, z8.b\n"
+            ".inst 0x450e9959  // smmla z25.s, z10.b, z14.b\n"
+            "fcvt z4.s, p1/m, z4.h\n"
+            ".inst 0x45089afb  // smmla z27.s, z23.b, z8.b\n"
+            ".inst 0x450e9af3  // smmla z19.s, z23.b, z14.b\n"
+            "fscale z17.s, p1/m, z17.s, z28.s\n"
+            "mov z4.q, z4.q[0]\n"
+            ".inst 0x45039962  // smmla z2.s, z11.b, z3.b\n"
+            ".inst 0x45069979  // smmla z25.s, z11.b, z6.b\n"
+            "fmul z23.s, z17.s, z4.s[0]\n"
+            "fmul z9.s, z17.s, z4.s[1]\n"
+            "fmul z21.s, z17.s, z4.s[2]\n"
+            "fmul z4.s, z17.s, z4.s[3]\n"
+            ".inst 0x450398fb  // smmla z27.s, z7.b, z3.b\n"
+            ".inst 0x450698f3  // smmla z19.s, z7.b, z6.b\n"
+            ".inst 0x451d9a42  // smmla z2.s, z18.b, z29.b\n"
+            ".inst 0x45109a59  // smmla z25.s, z18.b, z16.b\n"
+            ".inst 0x451d9bdb  // smmla z27.s, z30.b, z29.b\n"
+            ".inst 0x45109bd3  // smmla z19.s, z30.b, z16.b\n"
+            "uzp1 z31.d, z2.d, z25.d\n"
+            "uzp2 z13.d, z2.d, z25.d\n"
+            "scvtf z31.s, p1/m, z31.s\n"
+            "uzp1 z17.d, z27.d, z19.d\n"
+            "uzp2 z18.d, z27.d, z19.d\n"
+            "scvtf z13.s, p1/m, z13.s\n"
+            "fmla z24.s, p1/M, z31.s, z23.s\n"
+            "scvtf z17.s, p1/m, z17.s\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "fmla z15.s, p1/M, z13.s, z9.s\n"
+            "fmla z12.s, p1/M, z17.s, z21.s\n"
+            "fmla z0.s, p1/M, z18.s, z4.s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x13, #0x1\n"
+            "st1w { z24.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x13, #0x2\n"
+            "st1w { z15.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x13, #0x3\n"
+            "st1w { z12.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "st1w { z0.s }, p1, [x20]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x24, x24, #0x8\n"
+            "add %x[res_ptr], %x[res_ptr], #0x20\n"
+            "bne 6b\n"
+            "subs x13, x13, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x12\n"
+            "mov %x[res_ptr], x23\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+        );
+        return;
+    }
+#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#elif defined(__AVX2__) || defined(__AVX512F__)
+    {
+        const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+        const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
+        int64_t b_nb = n / QK4_0;
+        int64_t y = 0;
+        // Mask to mask out nibbles from packed bytes
+        const __m256i m4b = _mm256_set1_epi8(0x0F);
+        const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
+        // Lookup table to convert signed nibbles to signed bytes
+        __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+        signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+        // Permute mask used for easier vector processing at later stages
+        __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
+        int64_t xstart = 0;
+        int anr = nr - nr%16; // Used to align nr with boundary of 16
+    #ifdef __AVX512F__
+        int anc = nc - nc%16; // Used to align nc with boundary of 16
+        // Mask to mask out nibbles from packed bytes expanded to 512 bit length
+        const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
+        // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
+        __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
+
+        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < anr / 4; y += 4) {
+
+            const block_q8_0x4 * a_ptrs[4];
+
+            a_ptrs[0] = a_ptr_start + (y * nb);
+            for (int i = 0; i < 3; ++i) {
+                a_ptrs[i + 1] = a_ptrs[i] + nb;
+            }
+
+            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = 0; x < anc / 8; x += 2) {
+
+                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+                // Master FP accumulators
+                __m512 acc_rows[16];
+                for (int i = 0; i < 16; i++) {
+                    acc_rows[i] = _mm512_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+                    // Scale values - Load the weight scale values of two block_q4_0x8
+                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                    // Process LHS in pairs of rows
+                    for (int rp = 0; rp < 4; rp++) {
+
+                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                        // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                        __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                        __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                        __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                        __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                        __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                        __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                        __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                        __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                        __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                        __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                        __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                        __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                        __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                        __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                        __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                        __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                        __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                        __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                        __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                        __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                        // Shuffle pattern one - left side input
+
+                        const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                        const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                        const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                        const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                        const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                        const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                        const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                        const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                        // Shuffle pattern two - left side input
+
+                        const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                        const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                        const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                        const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                        const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                        const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                        const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                        const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                        // Resembles MMLAs into 2x2 matrices in ARM Version
+                        __m512i iacc_mat_00_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                        __m512i iacc_mat_01_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                        __m512i iacc_mat_10_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                        __m512i iacc_mat_11_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                        __m512i iacc_mat_00_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                        __m512i iacc_mat_01_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                        __m512i iacc_mat_10_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                        __m512i iacc_mat_11_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                        __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                        __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                        __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                        __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                        // Straighten out to make 4 row vectors
+                        __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+                        __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+                        __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+                        __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
+
+                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                        const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
+                        const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                        // Multiply with appropiate scales and accumulate
+                        acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
+                        acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
+                        acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                        acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+                    }
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 16; i++) {
+                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < nr / 4; y ++) {
+
+            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = 0; x < anc / 8; x += 2) {
+
+                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+                // Master FP accumulators
+                __m512 acc_rows[4];
+                for (int i = 0; i < 4; i++) {
+                    acc_rows[i] = _mm512_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+
+                    // Scale values - Load the weight scale values of two block_q4_0x8
+                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                    // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                    __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
+                    __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                    __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                    __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
+                    __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                    __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                    __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
+                    __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                    __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                    __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
+                    __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                    __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                    __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                    __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                    __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                    __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                    __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                    __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                    __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                    __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                    // Shuffle pattern one - left side input
+
+                    const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                    const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                    const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                    const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                    const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                    const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                    const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                    const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                    // Shuffle pattern two - left side input
+
+                    const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                    const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                    const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                    const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                    const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                    const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                    const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                    const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                    // Resembles MMLAs into 2x2 matrices in ARM Version
+                    __m512i iacc_mat_00_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                    __m512i iacc_mat_01_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                    __m512i iacc_mat_10_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                    __m512i iacc_mat_11_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                    __m512i iacc_mat_00_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                    __m512i iacc_mat_01_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                    __m512i iacc_mat_10_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                    __m512i iacc_mat_11_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                    __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                    __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                    __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                    __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                    // Straighten out to make 4 row vectors
+                    __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+                    __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+                    __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+                    __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
+
+                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
+                    const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                    // Multiply with appropiate scales and accumulate
+                    acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
+                    acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
+                    acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                    acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 4; i++) {
+                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        if (anc != nc) {
+            xstart = anc/8;
+            y = 0;
+        }
+    #endif // __AVX512F__
+
+        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+
+        for (; y < anr / 4; y += 4) {
+            const block_q8_0x4 * a_ptrs[4];
+
+            a_ptrs[0] = a_ptr_start + (y * nb);
+            for (int i = 0; i < 3; ++i) {
+                a_ptrs[i + 1] = a_ptrs[i] + nb;
+            }
+
+            // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = xstart; x < nc / 8; x++) {
+
+                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+                // Master FP accumulators
+                __m256 acc_rows[16];
+                for (int i = 0; i < 16; i++) {
+                    acc_rows[i] = _mm256_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
+
+                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
+
+                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
+
+                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
+
+                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
+
+                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
+
+                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+
+                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+
+                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+
+                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+
+                    // Scale values - Load the wight scale values of block_q4_0x8
+                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
+
+                    // Process LHS in groups of four
+                    for (int rp = 0; rp < 4; rp++) {
+                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                        // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
+                        __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                        __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
+                        __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
+                        __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                        __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
+                        __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
+                        __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                        __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
+                        __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
+                        __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                        __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
+                        __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
+
+                        // Shuffle pattern one - left side input
+                        const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                        const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                        const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                        const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                        const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                        const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                        const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                        const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                        // Shuffle pattern two - left side input
+                        const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                        const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                        const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                        const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                        const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                        const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                        const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                        const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                        // Resembles MMLAs into 2x2 matrices in ARM Version
+                        __m256i iacc_mat_00_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                        __m256i iacc_mat_01_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                        __m256i iacc_mat_10_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                        __m256i iacc_mat_11_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                        __m256i iacc_mat_00_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                        __m256i iacc_mat_01_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                        __m256i iacc_mat_10_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                        __m256i iacc_mat_11_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+
+                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                        __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                        __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                        __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                        __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+                        // Straighten out to make 4 row vectors
+                        __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
+                        __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
+                        __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
+                        __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
+
+                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                        const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
+
+                        // Multiply with appropiate scales and accumulate
+                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
+                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
+                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                        acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32,  255)), acc_rows[rp * 4 + 3]);
+                    }
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 16; i++) {
+                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+
+        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < nr / 4; y ++) {
+
+            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+            // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+            for (int64_t x = xstart; x < nc / 8; x++) {
+
+                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+                // Master FP accumulators
+                __m256 acc_rows[4];
+                for (int i = 0; i < 4; i++) {
+                    acc_rows[i] = _mm256_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b));  //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b));  //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
+
+                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b));  //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b));  //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
+
+                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b));  //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b));  //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
+
+                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b));  //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b));  //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
+
+                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
+
+                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
+
+                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+
+                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+
+                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+
+                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+
+                    // Scale values - Load the wight scale values of block_q4_0x8
+                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
+
+                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                    // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
+                    __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
+                    __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
+                    __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
+                    __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
+                    __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
+                    __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
+                    __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
+                    __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
+                    __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
+                    __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
+                    __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
+                    __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
+
+                    // Shuffle pattern one - left side input
+
+                    const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                    const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                    const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                    const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                    const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                    const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                    const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                    const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                    // Shuffle pattern two - left side input
+
+                    const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                    const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                    const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                    const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                    const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                    const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                    const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                    const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                    // Resembles MMLAs into 2x2 matrices in ARM Version
+                    __m256i iacc_mat_00_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                    __m256i iacc_mat_01_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                    __m256i iacc_mat_10_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                    __m256i iacc_mat_11_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                    __m256i iacc_mat_00_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                    __m256i iacc_mat_01_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                    __m256i iacc_mat_10_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                    __m256i iacc_mat_11_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+
+                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                    __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                    __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                    __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                    __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                    // Straighten out to make 4 row vectors
+                    __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
+                    __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
+                    __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
+                    __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
+
+                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                    const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
+
+                    // Multiply with appropiate scales and accumulate
+                    acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
+                    acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
+                    acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                    acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 4; i++) {
+                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        return;
+    }
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+                vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                for (int l = 0; l < nb; l++) {
+                    const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                    const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                    const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                    const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                    const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                    const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                    const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                    // vector version needs Zvfhmin extension
+                    const float a_scales[4] = {
+                        GGML_FP16_TO_FP32(a_ptr[l].d[0]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[1]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[2]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[3])
+                    };
+                    const float b_scales[8] = {
+                        GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                    };
+                    const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+
+                    const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
+                    const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
+                    const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
+                    const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l0;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l0 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
+                        sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
+                    const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
+                    const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
+                    const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l1;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l1 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
+                        sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
+                    const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
+                    const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
+                    const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l2;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l2 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
+                        sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
+                    const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
+                    const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
+                    const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l3;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l3 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+                    }
+                }
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+            }
+        }
+
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                            }
+                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
new file mode 100644
index 000000000..203802f07
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "ggml.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
+
+// GEMV
+void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+// GEMM
+void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/ggml/src/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
similarity index 52%
rename from ggml/src/ggml-cpu-impl.h
rename to ggml/src/ggml-cpu/ggml-cpu-impl.h
index 5b45155b0..27a530b22 100644
--- a/ggml/src/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -27,80 +27,6 @@ extern "C" {
 
 #endif
 
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───┐
- *     0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───────────────────┐
- *     0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- *       ┌sign
- *       │
- *       │  ┌exponent
- *       │  │
- *       │  │    ┌mantissa
- *       │  │    │
- *       │┌─┴─┐┌─┴──────┐
- *     0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This is binary identical with Google Brain float conversion.
- * Floats shall round to nearest even, and NANs shall be quiet.
- * Subnormals aren't flushed to zero, except perhaps when used.
- * This code should vectorize nicely if using modern compilers.
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
-    ggml_bf16_t h;
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.f = s;
-    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
-        h.bits = (u.i >> 16) | 64; /* force to quiet */
-        return h;
-    }
-    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
-    return h;
-}
-
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __FMA__
@@ -388,28 +314,6 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 
 #endif // defined(__ARM_NEON)
 
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
@@ -462,153 +366,6 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
 }
 #endif
 
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
-
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif // __ARM_FEATURE_SVE
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
new file mode 100644
index 000000000..7fa2897c2
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -0,0 +1,10796 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-quants.h"
+#include "ggml-cpu-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = _mm_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = _mm_sign_epi8(y, x);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = _mm_maddubs_epi16(ax, sy);
+    const __m128i ones = _mm_set1_epi16(1);
+    return _mm_madd_epi16(ones, dot);
+}
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    return _mm256_and_si256(lowMask, bytes);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_float(dot);
+#endif
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+#if __AVXVNNIINT8__
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#else
+    // Get absolute values of x vectors
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return mul_sum_us8_pairs_float(ax, sy);
+#endif
+}
+
+static inline __m128i packNibbles( __m256i bytes )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+#if __AVX512F__
+    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
+    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
+    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
+#else
+    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
+    __m256i high = _mm256_andnot_si256( lowByte, bytes );
+    __m256i low = _mm256_and_si256( lowByte, bytes );
+    high = _mm256_srli_epi16( high, 4 );
+    bytes = _mm256_or_si256( low, high );
+
+    // Compress uint16_t lanes into bytes
+    __m128i r0 = _mm256_castsi256_si128( bytes );
+    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
+    return _mm_packus_epi16( r0, r1 );
+#endif
+}
+#elif defined(__AVX__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
+    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
+    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
+    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytesl = _mm_or_si128(bytesl, bit_mask);
+    bytesh = _mm_or_si128(bytesh, bit_mask);
+    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
+    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
+    return MM256_SET_M128I(bytesh, bytesl);
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    // Load 16 bytes from memory
+    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
+    __m128i tmph = _mm_srli_epi16(tmpl, 4);
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    tmpl = _mm_and_si128(lowMask, tmpl);
+    tmph = _mm_and_si128(lowMask, tmph);
+    return MM256_SET_M128I(tmph, tmpl);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
+    const __m128i ones = _mm_set1_epi16(1);
+    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
+    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
+    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+    const __m128i axl = _mm256_castsi256_si128(ax);
+    const __m128i axh = _mm256_extractf128_si256(ax, 1);
+    const __m128i syl = _mm256_castsi256_si128(sy);
+    const __m128i syh = _mm256_extractf128_si256(sy, 1);
+    // Perform multiplication and create 16-bit values
+    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
+    const __m128i doth = _mm_maddubs_epi16(axh, syh);
+    return sum_i16_pairs_float(doth, dotl);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+    const __m128i xl = _mm256_castsi256_si128(x);
+    const __m128i xh = _mm256_extractf128_si256(x, 1);
+    const __m128i yl = _mm256_castsi256_si128(y);
+    const __m128i yh = _mm256_extractf128_si256(y, 1);
+    // Get absolute values of x vectors
+    const __m128i axl = _mm_sign_epi8(xl, xl);
+    const __m128i axh = _mm_sign_epi8(xh, xh);
+    // Sign the values of the y vectors
+    const __m128i syl = _mm_sign_epi8(yl, xl);
+    const __m128i syh = _mm_sign_epi8(yh, xh);
+    // Perform multiplication and create 16-bit values
+    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
+    const __m128i doth = _mm_maddubs_epi16(axh, syh);
+    return sum_i16_pairs_float(doth, dotl);
+}
+
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+#endif
+#elif defined(__SSSE3__)
+// horizontally add 4x4 floats
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =_mm_hadd_ps(a, b);
+    __m128 res_1 =_mm_hadd_ps(c, d);
+    __m128 res =_mm_hadd_ps(res_0, res_1);
+    res =_mm_hadd_ps(res, res);
+    res =_mm_hadd_ps(res, res);
+
+    return _mm_cvtss_f32(res);
+}
+#endif // __AVX__ || __AVX2__ || __AVX512F__
+#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+
+#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
+#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
+
+// precomputed tables for expanding 8bits to 8 bytes:
+static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
+static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
+#endif
+
+#if defined(__loongarch_asx)
+
+#ifdef __clang__
+#define VREGS_PREFIX "$vr"
+#define XREGS_PREFIX "$xr"
+#else // GCC
+#define VREGS_PREFIX "$f"
+#define XREGS_PREFIX "$f"
+#endif
+#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+// Convert __m128i to __m256i
+static inline __m256i ____m256i(__m128i in) {
+    __m256i out = __lasx_xvldi(0);
+    __asm__ volatile (
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " XREGS_PREFIX"\\i    \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[in], " VREGS_PREFIX "\\j  \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        : [out] "+f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+// Convert two __m128i to __m256i
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
+    __m256i out;
+    __asm__ volatile (
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[hi], " VREGS_PREFIX "\\i    \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[lo], " VREGS_PREFIX "\\j  \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        ".ifnc %[out], %[hi]                 \n\t"
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " XREGS_PREFIX "\\i   \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[hi], " VREGS_PREFIX "\\j  \n\t"
+        "    xvori.b $xr\\i, $xr\\j, 0       \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        ".endif                              \n\t"
+        : [out] "=f" (out), [hi] "+f" (inhi)
+        : [lo] "f" (inlo)
+    );
+    return out;
+}
+// Convert __m256i low part to __m128i
+static inline __m128i lasx_extracti128_lo(__m256i in) {
+    __m128i out;
+    __asm__ volatile (
+        ".ifnc %[out], %[in]                 \n\t"
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
+        "    vori.b $vr\\i, $vr\\j, 0        \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        ".endif                              \n\t"
+        : [out] "=f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+// Convert __m256i high part to __m128i
+static inline __m128i lasx_extracti128_hi(__m256i in) {
+    __m128i out;
+    __asm__ volatile (
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x11  \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        : [out] "=f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+
+static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) {
+    v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7};
+    return (__m256i)__ret;
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+
+static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
+    v4i64 __ret = {d, c, b, a};
+    return (__m256i)__ret;
+}
+
+static __m256i lasx_insertf128( __m128i x, __m128i y) {
+    return lasx_set_q(x, y);
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or  with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return __lsx_vshuf_b(a, zero, tmp2);
+}
+
+static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
+    __m256i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lasx_xvreplgr2vr_b(f);
+    zero = __lasx_xvldi(0);
+    tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or  with 0x10 prepare for positive
+    mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones
+    return __lasx_xvshuf_b(a, zero, tmp2);
+}
+
+static __m256i lasx_extu8_16(__m128i a) {
+    __m128i zero = __lsx_vldi(0);
+    __m128i vlo = __lsx_vilvl_b(zero, a);
+    __m128i vhi = __lsx_vilvh_b(zero, a);
+    return lasx_set_q(vhi, vlo);
+}
+
+static __m256i lasx_ext8_16(__m128i a) {
+     __m128i sign = __lsx_vslti_b(a, 0);
+     __m128i vlo = __lsx_vilvl_b(sign, a);
+     __m128i vhi = __lsx_vilvh_b(sign, a);
+     return lasx_set_q(vhi, vlo);
+}
+
+static __m256i lasx_ext16_32(__m128i a) {
+    __m256i tmp1;
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
+    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
+    return tmp1;
+}
+
+static __m128i lasx_extracti128( __m256i a, int pos) {
+    __m128i ret;
+    if( pos == 0)
+    {
+       ret = lasx_extracti128_lo(a);
+    } else {
+       ret = lasx_extracti128_hi(a);
+    }
+    return ret;
+}
+
+static __m128 lasx_extractf128( __m256 a, int pos) {
+    __m128 ret;
+    if( pos == 0)
+    {
+       ret = (__m128)lasx_extracti128_lo((__m256i)a);
+    } else {
+       ret = (__m128)lasx_extracti128_hi((__m256i)a);
+    }
+    return ret;
+}
+
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_h(b, a);
+    __m128i tmp2 = __lsx_vpickod_h(b, a);
+    return __lsx_vadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_w(b, a);
+    __m128i tmp2 = __lsx_vpickod_w(b, a);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
+    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
+    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
+
+    return __lsx_vfadd_s(tmp1, tmp2);
+}
+
+static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
+    __m256i tmp1, tmp2;
+    tmp1 = __lasx_xvmulwev_h_b(a, b);
+    tmp2 = __lasx_xvmulwod_h_b(a, b);
+    return __lasx_xvsadd_h(tmp1, tmp2);
+}
+
+static __m256i lasx_madd_h(__m256i a, __m256i b) {
+    __m256i tmp1, tmp2;
+    tmp1 = __lasx_xvmulwev_w_h(a, b);
+    tmp2 = __lasx_xvmulwod_w_h(a, b);
+    return __lasx_xvadd_w(tmp1, tmp2);
+}
+
+static __m256i lasx_packs_w(__m256i a, __m256i b) {
+    __m256i tmp, tmp1;
+    tmp = __lasx_xvsat_w(a, 15);
+    tmp1 = __lasx_xvsat_w(b, 15);
+    return __lasx_xvpickev_h(tmp1, tmp);
+}
+
+static __m256i lasx_packs_h(__m256i a, __m256i b) {
+    __m256i tmp, tmp1;
+    tmp = __lasx_xvsat_h(a, 7);
+    tmp1 = __lasx_xvsat_h(b, 7);
+    return __lasx_xvpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = __lsx_vsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = __lsx_vsigncov_b(x, y);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = lsx_maddubs_h(ax, sy);
+    const __m128i ones = __lsx_vreplgr2vr_h(1);
+    return lsx_madd_h(ones, dot);
+}
+
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = lasx_extractf128(x, 1);
+    ft_union tmp;
+    res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
+    res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
+    res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
+    tmp.i = __lsx_vpickve2gr_w(res, 0);
+    return tmp.f;
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+
+    __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11);
+    __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00);
+
+    __m128i  tmp1_128 = lasx_extracti128_lo(tmp1);
+    __m128i  tmp2_128 = lasx_extracti128_lo(tmp2);
+
+    __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128);
+
+    __m128i ev = __lsx_vpickev_w(sum128, sum128);
+    __m128i od = __lsx_vpickod_w(sum128, sum128);
+    __m128i sum64 = __lsx_vadd_w(ev, od);
+
+    int sum64_1, sum64_2;
+    sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
+    sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
+
+    return  sum64_1 + sum64_2;
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    __m128i ev = __lsx_vpickev_w(a, a);
+    __m128i od = __lsx_vpickod_w(a, a);
+    __m128i sum64 = __lsx_vadd_w(ev, od);
+
+    int sum64_1, sum64_2;
+    sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
+    sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
+
+    return  sum64_1 + sum64_2;
+}
+
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = lasx_set_d(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+
+    __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask);
+    const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe);
+    bytes = __lasx_xvor_v(bytes, bit_mask);
+    return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
+    const __m128i lo = __lsx_vld((const __m128i *)rsi, 0);
+    __m128i hi = __lsx_vsrli_h(lo, 4);
+    return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    __m256i v = __lasx_xvpackod_h(x, x);
+    __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v);
+    return __lasx_xvffint_s_w(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = lasx_maddubs_h(ax, sy);
+    return sum_i16_pairs_float(dot);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+
+    // Get absolute values of x vectors
+    const __m256i ax = __lasx_xvsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = __lasx_xvsigncov_b(x, y);
+
+    return mul_sum_us8_pairs_float(ax, sy);
+}
+
+static inline __m128i packNibbles( __m256i bytes ) {
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF);
+     __m256i high = __lasx_xvandn_v(lowByte, bytes);
+    __m256i low = __lasx_xvand_v(lowByte, bytes);
+    high = __lasx_xvsrli_h(high, 4);
+    bytes = __lasx_xvor_v(low, high);
+    // Compress uint16_t lanes into bytes
+    __m128i *r0 = (__m128i *)&bytes;
+    __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11);
+    __m128i *r1 = (__m128i *)&tmp_h128;
+
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp, tmp2, tmp3;
+
+    tmp = __lsx_vmax_h(zero, *r0);
+    tmp2 = __lsx_vsat_hu(tmp, 7);
+
+    tmp = __lsx_vmax_h(zero, *r1);
+    tmp3 = __lsx_vsat_hu(tmp, 7);
+    return  __lsx_vpickev_b(tmp3, tmp2);
+}
+#endif  //__loongarch_asx
+
+void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q4_0_ref(x, y, k);
+}
+
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q4_1_ref(x, y, k);
+}
+
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q5_0_ref(x, y, k);
+}
+
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q5_1_ref(x, y, k);
+}
+
+void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+                                            // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v  = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+    }
+
+#elif defined(__loongarch_asx)
+    for (int i = 0; i < nb; i++) {
+        ft_union fi;
+        __m256 v0 = (__m256)__lasx_xvld( x , 0);
+        __m256 v1 = (__m256)__lasx_xvld( x , 32);
+        __m256 v2 = (__m256)__lasx_xvld( x , 64);
+        __m256 v3 = (__m256)__lasx_xvld( x , 96);
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
+
+        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) );
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
+        __m128 tmp = max4;
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
+        fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
+        const float max_scalar = fi.f;
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
+
+        // Apply the multiplier
+        v0 = __lasx_xvfmul_s( v0, mul );
+        v1 = __lasx_xvfmul_s( v1, mul );
+        v2 = __lasx_xvfmul_s( v2, mul );
+        v3 = __lasx_xvfmul_s( v3, mul );
+
+        // Round to nearest integer
+        __m256i i0 = __lasx_xvftintrne_w_s( v0 );
+        __m256i i1 = __lasx_xvftintrne_w_s( v1 );
+        __m256i i2 = __lasx_xvftintrne_w_s( v2 );
+        __m256i i3 = __lasx_xvftintrne_w_s( v3 );
+
+        __m128i ni0 = lasx_extracti128( i0, 0 );
+        __m128i ni1 = lasx_extracti128( i0, 1);
+        __m128i ni2 = lasx_extracti128( i1, 0);
+        __m128i ni3 = lasx_extracti128( i1, 1);
+        __m128i ni4 = lasx_extracti128( i2, 0);
+        __m128i ni5 = lasx_extracti128( i2, 1);
+        __m128i ni6 = lasx_extracti128( i3, 0);
+        __m128i ni7 = lasx_extracti128( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = lsx_packs_w( ni0, ni1 );
+        ni2 = lsx_packs_w( ni2, ni3 );
+        ni4 = lsx_packs_w( ni4, ni5 );
+        ni6 = lsx_packs_w( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = lsx_packs_h( ni0, ni2 );
+        ni4 = lsx_packs_h( ni4, ni6 );
+
+        __lsx_vst(ni0, (__m128i *)(y[i].qs +  0), 0);
+        __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
+
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_ref(x, y, k);
+#endif
+}
+
+void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK8_1 == 0);
+    const int nb = k / QK8_1;
+
+    block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        int32x4_t accv = vdupq_n_s32(0);
+
+        for (int j = 0; j < 8; j++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+
+            accv = vaddq_s32(accv, vi);
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
+    }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        v128_t accv = wasm_i32x4_splat(0);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+
+            accv = wasm_i32x4_add(accv, vi);
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(
+                d * (wasm_i32x4_extract_lane(accv, 0) +
+                     wasm_i32x4_extract_lane(accv, 1) +
+                     wasm_i32x4_extract_lane(accv, 2) +
+                     wasm_i32x4_extract_lane(accv, 3)));
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float max_scalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Compute the sum of the quants and set y[i].s
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+                                            // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Compute the sum of the quants and set y[i].s
+        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
+        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0, vl);
+        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d  = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+
+        // compute sum for y[i].s
+        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
+        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+
+        // set y[i].s
+        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
+        y[i].s = GGML_FP32_TO_FP16(sum*d);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vector int accv = vec_splats(0);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v  = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+
+            accv = vec_add(accv, vi[j]);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+
+        accv = vec_add(accv, vec_sld(accv, accv, 4));
+        accv = vec_add(accv, vec_sld(accv, accv, 8));
+        y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+    }
+
+#elif defined(__loongarch_asx)
+    for (int i = 0; i < nb; i++) {
+        ft_union ft;
+        __m256 v0 = (__m256)__lasx_xvld( x , 0 );
+        __m256 v1 = (__m256)__lasx_xvld( x , 32 );
+        __m256 v2 = (__m256)__lasx_xvld( x , 64 );
+        __m256 v3 = (__m256)__lasx_xvld( x , 96 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
+
+        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
+        __m128 tmp = max4;
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
+        ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
+        const float max_scalar = ft.f;
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = __lasx_xvreplfr2vr_s( id );
+
+        // Apply the multiplier
+        v0 = __lasx_xvfmul_s( v0, mul );
+        v1 = __lasx_xvfmul_s( v1, mul );
+        v2 = __lasx_xvfmul_s( v2, mul );
+        v3 = __lasx_xvfmul_s( v3, mul );
+
+        // Round to nearest integer
+        __m256i i0 = __lasx_xvftintrne_w_s( v0 );
+        __m256i i1 = __lasx_xvftintrne_w_s( v1 );
+        __m256i i2 = __lasx_xvftintrne_w_s( v2 );
+        __m256i i3 = __lasx_xvftintrne_w_s( v3 );
+
+        __m128i ni0 = lasx_extracti128(i0, 0);
+        __m128i ni1 = lasx_extracti128( i0, 1);
+        __m128i ni2 = lasx_extracti128( i1, 0);
+        __m128i ni3 = lasx_extracti128( i1, 1);
+        __m128i ni4 = lasx_extracti128( i2, 0 );
+        __m128i ni5 = lasx_extracti128( i2, 1);
+        __m128i ni6 = lasx_extracti128( i3, 0);
+        __m128i ni7 = lasx_extracti128( i3, 1);
+
+        // Compute the sum of the quants and set y[i].s
+        const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
+        const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
+
+        // Convert int32 to int16
+        ni0 = lsx_packs_w( ni0, ni1 );
+        ni2 = lsx_packs_w( ni2, ni3 );
+        ni4 = lsx_packs_w( ni4, ni5 );
+        ni6 = lsx_packs_w( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = lsx_packs_h( ni0, ni2 );
+        ni4 = lsx_packs_h( ni4, ni6 );
+
+        __lsx_vst(ni0, (__m128i *)(y[i].qs +  0), 0);
+        __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_1_ref(x, y, k);
+#endif
+}
+
+//
+// 2-6 bit quantization in super-blocks
+//
+
+//
+// ===================== Helper functions
+//
+static inline int nearest_int(float fval) {
+    assert(fabsf(fval) <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
+        const float * restrict qw) {
+    float max = 0;
+    float amax = 0;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.f;
+    }
+    float iscale = -nmax / max;
+    if (rmse_type == 0) {
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
+        }
+        return 1/iscale;
+    }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
+    float sumlx = 0;
+    float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 0; i < n; ++i) {
+#else
+    for (int i = 0; i < n; ++i) {
+#endif
+        int l = nearest_int(iscale * x[i]);
+        l = MAX(-nmax, MIN(nmax-1, l));
+        L[i] = l + nmax;
+        float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
+        sumlx += w*x[i]*l;
+        suml2 += w*l*l;
+    }
+    float scale = suml2 ? sumlx/suml2 : 0.0f;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
+    float best = scale * sumlx;
+    for (int is = -9; is <= 9; ++is) {
+        if (is == 0) {
+            continue;
+        }
+        iscale = -(nmax + 0.1f*is) / max;
+        sumlx = suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = MAX(-nmax, MIN(nmax-1, l));
+            float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
+            sumlx += w*x[i]*l;
+            suml2 += w*l*l;
+        }
+        if (suml2 > 0 && sumlx*sumlx > best*suml2) {
+            for (int i = 0; i < n; ++i) {
+                int l = nearest_int(iscale * x[i]);
+                L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
+            }
+            scale = sumlx/suml2; best = scale*sumlx;
+        }
+    }
+    return scale;
+}
+
+static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+    float max = 0;
+    float amax = 0;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) { L[i] = 0; }
+        return 0.f;
+    }
+    float iscale = -nmax / max;
+    if (do_rmse) {
+        float sumlx = 0;
+        float suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = MAX(-nmax, MIN(nmax-1, l));
+            L[i] = l;
+            float w = x[i]*x[i];
+            sumlx += w*x[i]*l;
+            suml2 += w*l*l;
+        }
+        for (int itry = 0; itry < 5; ++itry) {
+            int n_changed = 0;
+            for (int i = 0; i < n; ++i) {
+                float w = x[i]*x[i];
+                float slx = sumlx - w*x[i]*L[i];
+                if (slx > 0) {
+                    float sl2 = suml2 - w*L[i]*L[i];
+                    int new_l = nearest_int(x[i] * sl2 / slx);
+                    new_l = MAX(-nmax, MIN(nmax-1, new_l));
+                    if (new_l != L[i]) {
+                        slx += w*x[i]*new_l;
+                        sl2 += w*new_l*new_l;
+                        if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
+                            L[i] = new_l; sumlx = slx; suml2 = sl2;
+                            ++n_changed;
+                        }
+                    }
+                }
+            }
+            if (!n_changed) {
+                break;
+            }
+        }
+        for (int i = 0; i < n; ++i) {
+            L[i] += nmax;
+        }
+        return sumlx / suml2;
+    }
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale * x[i]);
+        l = MAX(-nmax, MIN(nmax-1, l));
+        L[i] = l + nmax;
+    }
+    return 1/iscale;
+}
+
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
+    float min = x[0];
+    float max = x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+    }
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = 0;
+        return 0.f;
+    }
+    if (min > 0) min = 0;
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    for (int itry = 0; itry < ntry; ++itry) {
+        float sumlx = 0; int suml2 = 0;
+        bool did_change = false;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            if (l != L[i]) {
+                L[i] = l;
+                did_change = true;
+            }
+            sumlx += (x[i] - min)*l;
+            suml2 += l*l;
+        }
+        scale = sumlx/suml2;
+        float sum = 0;
+        for (int i = 0; i < n; ++i) {
+            sum += x[i] - scale*L[i];
+        }
+        min = alpha*min + (1 - alpha)*sum/n;
+        if (min > 0) min = 0;
+        iscale = 1/scale;
+        if (!did_change) break;
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
+    for (int i = 1; i < n; ++i) {
+#endif
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+    if (j < 4) {
+        *d = q[j] & 63; *m = q[j + 4] & 63;
+    } else {
+        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+
+//========================- 2-bit (de)-quantization
+
+void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+    quantize_row_q2_K_ref(x, vy, k);
+}
+
+//========================= 3-bit (de)-quantization
+
+void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+    quantize_row_q3_K_ref(x, vy, k);
+}
+
+// ====================== 4-bit (de)-quantization
+
+void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q4_K * restrict y = vy;
+    quantize_row_q4_K_ref(x, y, k);
+}
+
+// ====================== 5-bit (de)-quantization
+
+void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q5_K * restrict y = vy;
+    quantize_row_q5_K_ref(x, y, k);
+}
+
+// ====================== 6-bit (de)-quantization
+
+void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q6_K * restrict y = vy;
+    quantize_row_q6_K_ref(x, y, k);
+}
+
+// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
+
+void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq1_0 * restrict y = vy;
+    quantize_row_tq1_0_ref(x, y, k);
+}
+
+void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq2_0 * restrict y = vy;
+    quantize_row_tq2_0_ref(x, y, k);
+}
+
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+//===================================== Q8_K ==============================================
+
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q8_K_ref(x, y, k);
+}
+
+//===================================== Dot products =================================
+
+//
+// Helper functions
+//
+#if __AVX__ || __AVX2__ || __AVX512F__
+
+// shuffles to pick the required scales in dot products
+static inline __m256i get_scale_shuffle_q3k(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+    };
+    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
+}
+static inline __m256i get_scale_shuffle_k4(int i) {
+    static const uint8_t k_shuffle[256] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
+        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
+        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
+    };
+    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
+}
+static inline __m128i get_scale_shuffle(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
+        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
+        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
+    };
+    return _mm_loadu_si128((const __m128i*)k_shuffle + i);
+}
+#elif defined(__loongarch_asx)
+// shuffles to pick the required scales in dot products
+static inline __m256i get_scale_shuffle_q3k(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+    };
+    return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
+}
+static inline __m256i get_scale_shuffle_k4(int i) {
+    static const uint8_t k_shuffle[256] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
+        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
+        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
+    };
+    return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
+}
+static inline __m128i get_scale_shuffle(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
+        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
+        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
+    };
+    return __lsx_vld((const __m128i*)k_shuffle + i, 0);
+}
+#endif
+
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_0 * restrict vx0 = vx;
+        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_0 * restrict b_x0 = &vx0[i];
+            const block_q4_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+            const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // sub 8
+            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
+            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
+            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
+            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            float32x4_t scale = vld1q_f32(_scale);
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s,      vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__ARM_FEATURE_SVE)
+    svfloat32_t sumv0 = svdup_n_f32(0.0f);
+    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
+
+    // VLA Implementation using switch case
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating higher lanes for 4 float32 elements
+                const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
+                    const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
+                    const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
+                    const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
+
+                    // sub 8
+                    const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
+                    const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
+                    const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
+                    const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
+
+                    // load y
+                    const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
+                    const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
+                    const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+                                    svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
+                                    svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+                                    svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
+                                    svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                // predicate for activating higher lanes for 16 int8 elements
+                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+                // predicate for activating lower lanes for  16 int8 elements
+                const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+                    // sub 8
+                    const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
+                    const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating higher lanes for 32 int8 elements
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+
+                // predicate for activating higher lanes for 16 int8 elements
+                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+                // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
+                const svbool_t pl16 = svnot_b_z(ph32, ph16);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+                    // sub 8
+                    const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
+                    const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(ph32, y0->qs);
+                    const svint8_t qy1 = svld1_s8(ph32, y1->qs);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
+                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
+                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
+            } break;
+        default:
+            assert(false && "Unsupported vector length");
+            break;
+    }
+
+#elif defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q4_0 * restrict x0 = &x[ib + 0];
+        const block_q4_0 * restrict x1 = &x[ib + 1];
+        const block_q8_0 * restrict y0 = &y[ib + 0];
+        const block_q8_0 * restrict y1 = &y[ib + 1];
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+        const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // sub 8
+        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
+        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
+        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
+        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        // dot product into int32x4_t
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+        const __m256i off = _mm256_set1_epi8( 8 );
+        qx = _mm256_sub_epi8( qx, off );
+
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    const __m128i mone = _mm_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
+        const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
+        const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
+        const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
+        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+#elif defined(__SSSE3__)
+    // set constants
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    const __m128i off = _mm_set1_epi8(8);
+
+    // Initialize accumulator with zeros
+    __m128 acc_0 = _mm_setzero_ps();
+    __m128 acc_1 = _mm_setzero_ps();
+    __m128 acc_2 = _mm_setzero_ps();
+    __m128 acc_3 = _mm_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
+        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
+        bx_1 = _mm_sub_epi8(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+
+        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+
+        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
+        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        bx_2 = _mm_sub_epi8(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
+        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
+        bx_3 = _mm_sub_epi8(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = _mm_cvtepi32_ps(i32_0);
+        __m128 p1 = _mm_cvtepi32_ps(i32_1);
+        __m128 p2 = _mm_cvtepi32_ps(i32_2);
+        __m128 p3 = _mm_cvtepi32_ps(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
+        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
+        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
+        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
+
+        // Acummulate
+        acc_0 = _mm_add_ps(p0_d, acc_0);
+        acc_1 = _mm_add_ps(p1_d, acc_1);
+        acc_2 = _mm_add_ps(p2_d, acc_2);
+        acc_3 = _mm_add_ps(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__riscv_v_intrinsic)
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (; ib < nb; ++ib) {
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        // subtract offset
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector signed char v8 = vec_splats((signed char)0x8);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_sub(q4x0, v8);
+        q4x1 = vec_sub(q4x1, v8);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi0 = vec_sum4s(qv1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+        const __m256i off = __lasx_xvreplgr2vr_b( 8 );
+        qx = __lasx_xvsub_b( qx, off );
+
+        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = __lasx_xvfmadd_s( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__loongarch_sx)
+    // set constants
+    const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
+    const __m128i off = __lsx_vreplgr2vr_b(8);
+
+    // Initialize accumulator with zeros
+    __m128 acc_0 = __lsx_vldi(0);
+    __m128 acc_1 = __lsx_vldi(0);
+    __m128 acc_2 = __lsx_vldi(0);
+    __m128 acc_3 = __lsx_vldi(0);
+
+    for (; ib + 1 < nb; ib += 2) {
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
+
+        __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
+        __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
+        bx_0 = __lsx_vsub_b(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
+        __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0);
+        bx_1 = __lsx_vsub_b(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+
+        const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
+
+        __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
+        __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0);
+        bx_2 = __lsx_vsub_b(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
+        __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0);
+        bx_3 = __lsx_vsub_b(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = __lsx_vffint_s_w(i32_0);
+        __m128 p1 = __lsx_vffint_s_w(i32_1);
+        __m128 p2 = __lsx_vffint_s_w(i32_2);
+        __m128 p3 = __lsx_vffint_s_w(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 );
+        __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
+        __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
+        __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
+
+        // Acummulate
+        acc_0 = __lsx_vfadd_s(p0_d, acc_0);
+        acc_1 = __lsx_vfadd_s(p1_d, acc_1);
+        acc_2 = __lsx_vfadd_s(p2_d, acc_2);
+        acc_3 = __lsx_vfadd_s(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >>   4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_1 * restrict vx0 = vx;
+        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+        const block_q8_1 * restrict vy0 = vy;
+        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+        float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_1 * restrict b_x0 = &vx0[i];
+            const block_q4_1 * restrict b_x1 = &vx1[i];
+            const block_q8_1 * restrict b_y0 = &vy0[i];
+            const block_q8_1 * restrict b_y1 = &vy1[i];
+
+            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                    GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+            summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            // mmla into int32x4_t
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
+                                   GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
+                                   GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
+                                   GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+            float32x4_t scale = vld1q_f32(_scale);
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+        sumv2 = vaddq_f32(sumv2, summs0);
+
+        vst1_f32(s,      vget_low_f32 (sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
+
+    int ib = 0;
+    float sumf = 0;
+
+    // TODO: add WASM SIMD
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    float summs = 0;
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q4_1 * restrict x0 = &x[ib + 0];
+        const block_q4_1 * restrict x1 = &x[ib + 1];
+        const block_q8_1 * restrict y0 = &y[ib + 0];
+        const block_q8_1 * restrict y1 = &y[ib + 1];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        // dot product into int32x4_t
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
+#elif defined(__AVX2__) || defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        const __m256 d0v = _mm256_set1_ps( d0 );
+        const __m256 d1v = _mm256_set1_ps( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs );
+
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+        // Accumulate d0*d1*x*y
+#if defined(__AVX2__)
+        acc = _mm256_fmadd_ps( d0d1, xy, acc );
+#else
+        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
+#endif
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (; ib < nb; ++ib) {
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
+        vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
+
+        vector signed int vsumi0 = v0;
+
+        vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
+        vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
+        const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0);
+
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+        // Accumulate d0*d1*x*y
+        acc = __lasx_xvfmadd_s( d0d1, xy, acc );
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F);
+            const int v1 = (x[ib].qs[j] >>   4);
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    uint32_t qh0;
+    uint32_t qh1;
+
+    uint64_t tmp0[4];
+    uint64_t tmp1[4];
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q5_0 * restrict x0 = &x[ib];
+        const block_q5_0 * restrict x1 = &x[ib + 1];
+        const block_q8_0 * restrict y0 = &y[ib];
+        const block_q8_0 * restrict y1 = &y[ib + 1];
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+        // extract the 5th bit via lookup table ((!b) << 4)
+        memcpy(&qh0, x0->qh, sizeof(qh0));
+        memcpy(&qh1, x1->qh, sizeof(qh1));
+
+        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
+        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
+        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
+        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
+
+        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
+        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
+        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
+        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
+
+        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
+        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
+        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
+        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
+        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
+        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
+        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
+        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    uint32_t qh;
+    uint64_t tmp[4];
+
+    // TODO: check if unrolling this is better
+    for (; ib < nb; ++ib) {
+        const block_q5_0 * restrict x0 = &x[ib];
+        const block_q8_0 * restrict y0 = &y[ib];
+
+        const v128_t m4b  = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_1[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
+        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
+        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+    }
+
+    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
+        qx = _mm256_or_si256(qx, bxhi);
+
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps(d, q, acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8((char)0xF0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_andnot_si128(bxhil, mask);
+        bxhih = _mm_andnot_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
+
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    uint32_t qh;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    // These temporary registers are for masking and shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
+    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
+    for (; ib < nb; ++ib) {
+        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
+
+        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+
+        // ((qh & (1u << (j + 16))) >> (j + 12));
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
+        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
+
+        // narrowing
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
+        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
+
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
+        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
+
+        // load
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
+
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
+        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+
+        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
+        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
+        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
+        qx = __lasx_xvor_v(qx, bxhi);
+
+        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = __lasx_xvfmadd_s(d, q, acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#endif
+    for (; ib < nb; ++ib) {
+        uint32_t qh;
+        memcpy(&qh, x[ib].qh, sizeof(qh));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
+
+            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
+            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
+
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    float summs0 = 0.0f;
+    float summs1 = 0.0f;
+
+    uint32_t qh0;
+    uint32_t qh1;
+
+    uint64_t tmp0[4];
+    uint64_t tmp1[4];
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q5_1 * restrict x0 = &x[ib];
+        const block_q5_1 * restrict x1 = &x[ib + 1];
+        const block_q8_1 * restrict y0 = &y[ib];
+        const block_q8_1 * restrict y1 = &y[ib + 1];
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+        summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+        summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
+
+        // extract the 5th bit via lookup table ((b) << 4)
+        memcpy(&qh0, x0->qh, sizeof(qh0));
+        memcpy(&qh1, x1->qh, sizeof(qh1));
+
+        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
+        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
+        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
+        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
+
+        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
+        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
+        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
+        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
+
+        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
+        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
+        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
+        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // add high bit
+        const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
+        const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
+        const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
+        const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    float summs = 0.0f;
+
+    uint32_t qh;
+    uint64_t tmp[4];
+
+    // TODO: check if unrolling this is better
+    for (; ib < nb; ++ib) {
+        const block_q5_1 * restrict x0 = &x[ib];
+        const block_q8_1 * restrict y0 = &y[ib];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // add high bit
+        const v128_t v0lf = wasm_v128_or(v0l, qhl);
+        const v128_t v0hf = wasm_v128_or(v0h, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv,
+                wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+    }
+
+    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
+        qx = _mm256_or_si256(qx, bxhi);
+
+        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
+
+        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8(0x10);
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_and_si128(bxhil, mask);
+        bxhih = _mm_and_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
+
+        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
+
+        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    uint32_t qh;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    // temporary registers for shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
+    for (; ib < nb; ++ib) {
+        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
+
+        // load qh
+        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
+
+        // ((qh >> (j +  0)) << 4) & 0x10;
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
+
+        // ((qh >> (j + 12))     ) & 0x10;
+        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
+
+        // narrowing
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
+        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
+
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
+        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
+
+        // load
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
+
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
+        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+
+        vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
+        vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
+        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
+
+        vector signed int vsumi0 = v0;
+
+        vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
+        vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d));
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
+        qx = __lasx_xvor_v(qx, bxhi);
+
+        const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
+
+        acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc);
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#endif
+    for (; ib < nb; ++ib) {
+        uint32_t qh;
+        memcpy(&qh, x[ib].qh, sizeof(qh));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
+            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
+
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q8_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q8_0 * restrict vx0 = vx;
+        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q8_0 * restrict b_x0 = &vx0[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+
+            const block_q8_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
+            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
+            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
+            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                   GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32x4_t scale = vld1q_f32(_scale);
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                       l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__ARM_FEATURE_SVE)
+    svfloat32_t sumv0 = svdup_n_f32(0.0f);
+    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
+
+    //VLA Implemenation for SVE
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating lanes for 16 Int8 elements
+                const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
+                const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
+                    const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
+                    const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
+                    const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
+
+                    // load y
+                    const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
+                    const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
+                    const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
+                    const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
+
+                    sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
+                                    svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
+                                    svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                //printf("sve256");
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+                    const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating high 256 bit
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+                // predicate for activating low 256 bit
+                const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
+
+                // predicate for activating high lanes for 8 float32 elements
+                const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
+                // predicate for activating low lanes for 8 float32 elements
+                const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
+
+                svfloat32_t sumv00 = svdup_n_f32(0.0f);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
+                    // and add them to make one 64 element vector
+                    // load x
+                    const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
+                          svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
+
+                    qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
+
+                    // load y
+                    const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
+                          svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
+
+                    qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
+
+                    // scale creation
+                    const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
+                    const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
+
+                    // duplicate deq1 in first half of vector and deq2 in second half of vector
+                    const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
+
+                    const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
+
+                    sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), sumv00);
+                break;
+            }
+        default:
+            assert(false && "Unsupported vector length");
+            break;
+    }
+#elif defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q8_0 * restrict x0 = &x[ib + 0];
+        const block_q8_0 * restrict x1 = &x[ib + 1];
+        const block_q8_0 * restrict y0 = &y[ib + 0];
+        const block_q8_0 * restrict y1 = &y[ib + 1];
+
+        const int8x16_t x0_0 = vld1q_s8(x0->qs);
+        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
+        const int8x16_t x1_0 = vld1q_s8(x1->qs);
+        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
+
+        // load y
+        const int8x16_t y0_0 = vld1q_s8(y0->qs);
+        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
+        const int8x16_t y1_0 = vld1q_s8(y1->qs);
+        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__) || defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+        __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        // Multiply q with scale and accumulate
+#if defined(__AVX2__)
+        acc = _mm256_fmadd_ps( d, q, acc );
+#else
+        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
+#endif
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    size_t vl = __riscv_vsetvl_e8m1(qk);
+
+    for (; ib < nb; ++ib) {
+        // load elements
+        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[ib].qs, vl);
+        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
+
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
+
+        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+    }
+#elif defined(__POWER9_VECTOR__)
+    const vector signed int v0 = vec_splats((int32_t)0);
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char q8x0 = vec_xl( 0, x[ib].qs);
+        vector signed char q8x1 = vec_xl(16, x[ib].qs);
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed short qv0 = vec_mule(q8x0, q8y0);
+        vector signed short qv1 = vec_mulo(q8x0, q8y0);
+        vector signed short qv2 = vec_mule(q8x1, q8y1);
+        vector signed short qv3 = vec_mulo(q8x1, q8y1);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi1 = vec_sum4s(qv1, vsumi1);
+        vsumi0 = vec_sum4s(qv2, vsumi0);
+        vsumi1 = vec_sum4s(qv3, vsumi1);
+
+        vsumi0 = vec_add(vsumi0, vsumi1);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        // Compute combined scale for the block
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+        __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
+        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        // Multiply q with scale and accumulate
+        acc = __lasx_xvfmadd_s( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi = 0;
+
+        for (int j = 0; j < qk; j++) {
+            sumi += x[ib].qs[j]*y[ib].qs[j];
+        }
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq1_0 * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+    float sumf = 0.0f;
+
+    uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
+
+    const uint8x16_t shift = vld1q_u8(k_shift);
+
+    for (int i = 0; i < nb; ++i) {
+#if defined(__ARM_FEATURE_DOTPROD)
+        int32x4_t sumi0 = vdupq_n_s32(0);
+        int32x4_t sumi1 = vdupq_n_s32(0);
+#else
+        int16x8_t sumi0 = vdupq_n_s16(0);
+        int16x8_t sumi1 = vdupq_n_s16(0);
+#endif
+
+        // first 32 bytes of 5 elements
+        {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
+            uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
+            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
+            uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
+            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
+            uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9));
+            uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27));
+            uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27));
+            uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81));
+            uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81));
+
+            // multiply by 3 and keep the 2 bits above 8 bits
+            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
+            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
+            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
+            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
+            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
+            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
+            int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6));
+            int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6));
+            int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6));
+            int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6));
+
+            const int8x16_t qy0 = vld1q_s8(y[i].qs +   0);
+            const int8x16_t qy1 = vld1q_s8(y[i].qs +  16);
+            const int8x16_t qy2 = vld1q_s8(y[i].qs +  32);
+            const int8x16_t qy3 = vld1q_s8(y[i].qs +  48);
+            const int8x16_t qy4 = vld1q_s8(y[i].qs +  64);
+            const int8x16_t qy5 = vld1q_s8(y[i].qs +  80);
+            const int8x16_t qy6 = vld1q_s8(y[i].qs +  96);
+            const int8x16_t qy7 = vld1q_s8(y[i].qs + 112);
+            const int8x16_t qy8 = vld1q_s8(y[i].qs + 128);
+            const int8x16_t qy9 = vld1q_s8(y[i].qs + 144);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
+            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
+            sumi0 = vdotq_s32(sumi0, sqx8, qy8);
+            sumi1 = vdotq_s32(sumi1, sqx9, qy9);
+#else
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9));
+#endif
+        }
+
+        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
+        {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + 32);
+            uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3));
+            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9));
+            uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27));
+            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh));
+            qx5 = vmulq_u8(qx5, shift);
+
+            // multiply by 3 and keep the 2 bits above 8 bits
+            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
+            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
+            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
+            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
+            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
+            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
+
+            const int8x16_t qy0 = vld1q_s8(y[i].qs + 160);
+            const int8x16_t qy1 = vld1q_s8(y[i].qs + 176);
+            const int8x16_t qy2 = vld1q_s8(y[i].qs + 192);
+            const int8x16_t qy3 = vld1q_s8(y[i].qs + 208);
+            const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
+            const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+#else
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+#endif
+        }
+
+        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
+        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumi0 = vaddq_s32(sumi0, sumi1);
+        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
+
+        sumf += d * (float) vaddvq_s32(sumi0);
+#else
+        sumi0 = vaddq_s16(sumi0, sumi1);
+        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
+
+        sumf += d * (float) vaddlvq_s16(sumi0);
+#endif
+    }
+
+    *s = sumf;
+
+#elif defined(__AVX2__)
+    __m256 sumf = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+        // 16-bit sums
+        __m256i sumi0 = _mm256_setzero_si256();
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+
+        // first 32 bytes of 5 elements
+        {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
+            // 8-bit multiplies with shifts, masks and adds
+            __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
+            __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
+            __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
+            __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
+
+            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
+
+            // Cancel the +1 from avg so that it behaves like a halving add
+            qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
+            qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
+            qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
+            qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
+            qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
+            // Multiply by 3 and get the top 2 bits
+            qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
+            qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
+            qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
+            qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
+            qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
+            qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
+            qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
+            qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
+            qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
+            qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
+
+            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs +   0));
+            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  32));
+            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  64));
+            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  96));
+            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
+
+            qx0 = _mm256_maddubs_epi16(qx0, qy0);
+            qx1 = _mm256_maddubs_epi16(qx1, qy1);
+            qx2 = _mm256_maddubs_epi16(qx2, qy2);
+            qx3 = _mm256_maddubs_epi16(qx3, qy3);
+            qx4 = _mm256_maddubs_epi16(qx4, qy4);
+
+            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
+            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
+            sumi2 = _mm256_add_epi16(sumi2, qx4);
+        }
+
+        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
+        {
+            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
+            __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
+            __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
+            __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
+            __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
+            __m256i qx01 = MM256_SET_M128I(qx1, qx0);
+            __m256i qx23 = MM256_SET_M128I(qx3, qx2);
+
+            // avx2 does not have 8-bit multiplies, so 16-bit it is.
+            qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
+            qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
+            __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
+
+            __m256i qx45 = MM256_SET_M128I(qx5, qx4);
+
+            // Cancel the +1 from avg so that it behaves like a halving add
+            qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
+            qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
+            qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
+            // Multiply by 3 and get the top 2 bits
+            qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
+            qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
+            qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
+            qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
+            qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
+            qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
+
+            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
+            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
+            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
+
+            qx01 = _mm256_maddubs_epi16(qx01, qy01);
+            qx23 = _mm256_maddubs_epi16(qx23, qy23);
+            qx45 = _mm256_maddubs_epi16(qx45, qy45);
+
+            sumi0 = _mm256_add_epi16(sumi0, qx01);
+            sumi1 = _mm256_add_epi16(sumi1, qx23);
+            sumi2 = _mm256_add_epi16(sumi2, qx45);
+        }
+
+        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+
+        sumi0 = _mm256_sub_epi16(sumi0, ysum);
+        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
+        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
+
+        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
+    }
+
+    *s = hsum_float_8(sumf);
+
+#else
+    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
+
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        int sum = 0;
+
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
+            for (size_t l = 0; l < 5; ++l) {
+                for (size_t m = 0; m < 32; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
+                    uint16_t xi = ((uint16_t) q * 3) >> 8;
+                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
+                }
+            }
+        }
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
+            for (size_t l = 0; l < 5; ++l) {
+                for (size_t m = 0; m < 16; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
+                    uint16_t xi = ((uint16_t) q * 3) >> 8;
+                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
+                }
+            }
+        }
+
+        for (size_t l = 0; l < 4; ++l) {
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[l];
+                uint16_t xi = ((uint16_t) q * 3) >> 8;
+                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
+            }
+        }
+
+        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+    }
+
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq2_0 * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+    float sumf = 0.0f;
+
+    const uint8x16_t m3 = vdupq_n_u8(3);
+
+    for (int i = 0; i < nb; ++i) {
+#if defined(__ARM_FEATURE_DOTPROD)
+        int32x4_t sumi0 = vdupq_n_s32(0);
+        int32x4_t sumi1 = vdupq_n_s32(0);
+#else
+        int16x8_t sumi0 = vdupq_n_s16(0);
+        int16x8_t sumi1 = vdupq_n_s16(0);
+#endif
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
+            uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
+            uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
+            uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
+            uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
+            uint8x16_t qx5 = vshrq_n_u8(qx1, 4);
+            uint8x16_t qx6 = vshrq_n_u8(qx0, 6);
+            uint8x16_t qx7 = vshrq_n_u8(qx1, 6);
+
+            int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3));
+            int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3));
+            int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3));
+            int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3));
+            int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3));
+            int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3));
+            int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3));
+            int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3));
+
+            const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 +   0);
+            const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 +  16);
+            const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 +  32);
+            const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 +  48);
+            const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 +  64);
+            const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 +  80);
+            const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 +  96);
+            const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
+            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
+#else
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
+#endif
+        }
+
+        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
+        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumi0 = vaddq_s32(sumi0, sumi1);
+        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
+
+        sumf += d * (float) vaddvq_s32(sumi0);
+#else
+        sumi0 = vaddq_s16(sumi0, sumi1);
+        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
+
+        sumf += d * (float) vaddlvq_s16(sumi0);
+#endif
+    }
+
+    *s = sumf;
+
+#elif defined(__AVX2__)
+    __m256 sumf = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+        // 16-bit sums, because 256*127 still fits
+        __m256i sumi0 = _mm256_setzero_si256();
+        __m256i sumi1 = _mm256_setzero_si256();
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
+            __m256i qx1 = _mm256_srli_epi16(qx0, 2);
+            __m256i qx2 = _mm256_srli_epi16(qx0, 4);
+            __m256i qx3 = _mm256_srli_epi16(qx0, 6);
+
+            // 0, 1, 2 (should not be 3)
+            qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
+            qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
+            qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
+            qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
+
+            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 +  0));
+            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
+            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
+            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
+
+            qx0 = _mm256_maddubs_epi16(qx0, qy0);
+            qx1 = _mm256_maddubs_epi16(qx1, qy1);
+            qx2 = _mm256_maddubs_epi16(qx2, qy2);
+            qx3 = _mm256_maddubs_epi16(qx3, qy3);
+
+            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
+            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
+        }
+
+        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+
+        sumi0 = _mm256_add_epi16(sumi0, sumi1);
+        sumi0 = _mm256_sub_epi16(sumi0, ysum);
+        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
+
+        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
+    }
+
+    *s = hsum_float_8(sumf);
+
+#else
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        int32_t sumi = 0;
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            for (size_t l = 0; l < 4; ++l) {
+                for (size_t k = 0; k < 32; ++k) {
+                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
+                }
+            }
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        sumf += (float) sumi * d;
+    }
+
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q2_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#ifdef __ARM_NEON
+    const uint8x16_t m3 = vdupq_n_u8(0x3);
+    const uint8x16_t m4 = vdupq_n_u8(0xF);
+
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    ggml_int8x16x2_t q2bytes;
+    uint8_t aux[16];
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * restrict sc = x[i].scales;
+
+        const uint8x16_t mins_and_scales = vld1q_u8(sc);
+        const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
+        vst1q_u8(aux, scales);
+
+        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
+        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
+                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
+        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
+                                       vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1])));
+        sum += dmin * vaddvq_s32(vaddq_s32(s0, s1));
+
+        int isum = 0;
+        int is = 0;
+
+// We use this macro instead of a function call because for some reason
+// the code runs 2-3% slower, even if the function is declared inline
+#define MULTIPLY_ACCUM_WITH_SCALE(index)\
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
+
+#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
+        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
+        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
+        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
+        MULTIPLY_ACCUM_WITH_SCALE((index));
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
+
+            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
+            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
+            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
+
+            MULTIPLY_ACCUM_WITH_SCALE(0);
+
+            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
+            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
+            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
+
+            is += 8;
+        }
+
+        sum += d * isum;
+    }
+
+    *s = sum;
+
+#elif defined __AVX2__
+
+    const __m256i m3 = _mm256_set1_epi8(3);
+    const __m128i m4 = _mm_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
+        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
+        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
+        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
+
+        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
+        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
+            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
+            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
+            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
+
+            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
+            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
+            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
+            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
+
+            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
+            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
+            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
+            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
+
+            p0 = _mm256_add_epi32(p0, p1);
+            p2 = _mm256_add_epi32(p2, p3);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
+        }
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(0x3);
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(0x2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // load mins and scales from block_q2_K.scales[QK_K/16]
+        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
+        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
+        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
+        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
+
+        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
+        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
+        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
+
+        // sumf += -dmin * summs in 32bits*8
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
+
+        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
+        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
+        const __m128i scales[2] = { scales_0, scales_1 };
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
+            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
+            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
+            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
+            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
+            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
+            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
+            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
+            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
+            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
+            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
+            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
+            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
+
+            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
+            __m128i shuffle = _mm_set1_epi16(0x0100);
+            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
+
+            p0 = _mm_add_epi32(p0, p1);
+            p2 = _mm_add_epi32(p2, p3);
+            p4 = _mm_add_epi32(p4, p5);
+            p6 = _mm_add_epi32(p6, p7);
+
+            // isum in 32bits*4*2
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
+        }
+
+        // sumf += dall * isum - dmin * summs in 32bits
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const  int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        size_t vl = 16;
+
+        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
+        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
+
+        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
+
+        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
+        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
+        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
+        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+
+        sumf  += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
+
+        vl = 32;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
+
+        uint8_t is=0;
+        int isum=0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load Q2
+            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
+
+            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
+            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
+
+            // duplicate scale elements for product
+            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
+            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
+            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
+            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
+
+            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
+            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
+            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
+            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
+
+            // load Q8
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
+            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
+
+            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
+            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
+            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
+            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
+
+            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
+
+            q2+=32;  q8+=128;  is=8;
+
+        }
+
+        sumf += dall * isum;
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
+        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
+
+        q2xmins = vec_sr(q2xmins, v4);
+        vector signed short q2xmins0 = vec_unpackh(q2xmins);
+        vector signed short q2xmins1 = vec_unpackl(q2xmins);
+
+        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
+        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
+        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+        vector signed int vsumi4 = v0;
+        vector signed int vsumi5 = v0;
+        vector signed int vsumi6 = v0;
+        vector signed int vsumi7 = v0;
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
+            q2 += 32;
+
+            vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+            vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
+            vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
+            vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
+            vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+            vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
+            vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
+            vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
+            vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
+            vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
+            vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
+            vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
+            vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
+            vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
+            vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
+
+            vector signed short vscales_07 = vec_unpackh(vscales);
+            vector signed int vscales_03 = vec_unpackh(vscales_07);
+            vector signed int vscales_47 = vec_unpackl(vscales_07);
+            vector signed int vs0 = vec_splat(vscales_03, 0);
+            vector signed int vs1 = vec_splat(vscales_03, 1);
+            vector signed int vs2 = vec_splat(vscales_03, 2);
+            vector signed int vs3 = vec_splat(vscales_03, 3);
+            vector signed int vs4 = vec_splat(vscales_47, 0);
+            vector signed int vs5 = vec_splat(vscales_47, 1);
+            vector signed int vs6 = vec_splat(vscales_47, 2);
+            vector signed int vs7 = vec_splat(vscales_47, 3);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
+            vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
+            vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
+            vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
+            vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+
+    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+    const __m128i m4 = __lsx_vreplgr2vr_b(0xF);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
+        const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
+        const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
+        const __m256i mins = lasx_ext8_16(mins8);
+        const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
+
+        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
+
+        const __m256i all_scales = lasx_ext8_16(scales8);
+        const __m128i l_scales = lasx_extracti128(all_scales, 0);
+        const __m128i h_scales = lasx_extracti128(all_scales, 1);
+        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32;
+
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            const __m256i q2_0 = __lasx_xvand_v(q2bits, m3);
+            const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3);
+            const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3);
+            const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3);
+
+            __m256i p0 = lasx_maddubs_h(q2_0, q8_0);
+            __m256i p1 = lasx_maddubs_h(q2_1, q8_1);
+            __m256i p2 = lasx_maddubs_h(q2_2, q8_2);
+            __m256i p3 = lasx_maddubs_h(q2_3, q8_3);
+
+            p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0);
+            p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1);
+            p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2);
+            p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3);
+
+            p0 = __lasx_xvadd_w(p0, p1);
+            p2 = __lasx_xvadd_w(p2, p3);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2));
+        }
+
+        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const  int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        int summs = 0;
+        for (int j = 0; j < 16; ++j) {
+            summs += y[i].bsums[j] * (sc[j] >> 4);
+        }
+
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        int isum = 0;
+        int is = 0;
+        int d;
+        for (int k = 0; k < QK_K/128; ++k) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                d = sc[is++] & 0xF;
+                int isuml = 0;
+                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
+                isum += d * isuml;
+                d = sc[is++] & 0xF;
+                isuml = 0;
+                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
+                isum += d * isuml;
+                shift += 2;
+                q8 += 32;
+            }
+            q2 += 32;
+        }
+        sumf += dall * isum - dmin * summs;
+    }
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    const block_q3_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#ifdef __ARM_NEON
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const uint8x16_t m3b = vdupq_n_u8(0x3);
+    const int32x4_t  vzero = vdupq_n_s32(0);
+
+    const uint8x16_t m0 = vdupq_n_u8(1);
+    const uint8x16_t m1 = vshlq_n_u8(m0, 1);
+    const uint8x16_t m2 = vshlq_n_u8(m0, 2);
+    const uint8x16_t m3 = vshlq_n_u8(m0, 3);
+    const int8_t m32 = 32;
+
+    ggml_int8x16x4_t q3bytes;
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
+
+        ggml_uint8x16x4_t q3h;
+
+        int32_t isum = 0;
+
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= m32;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
+            q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
+            q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1);
+            q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1);
+
+            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0]));
+            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1]));
+            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
+            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
+
+            scale += 4;
+
+            q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
+            q3h.val[1] = vbicq_u8(m2, qhbits.val[1]);
+            q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1);
+            q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1);
+
+            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0]));
+            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1]));
+            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
+            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
+
+            scale += 4;
+
+            if (j == 0) {
+                qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4);
+                qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4);
+            }
+
+        }
+        sum += d * isum;
+
+    }
+
+    *s = sum;
+
+#elif defined __AVX2__
+
+    const __m256i m3 = _mm256_set1_epi8(3);
+    const __m256i mone = _mm256_set1_epi8(1);
+    const __m128i m32 = _mm_set1_epi8(32);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    uint32_t aux[3];
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        __m128i scales128 = _mm_set_epi32(
+                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+        scales128 = _mm_sub_epi8(scales128, m32);
+        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
+        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
+
+        // high bit
+        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
+
+        // integer accumulator
+        __m256i sumi = _mm256_setzero_si256();
+
+        int bit = 0;
+        int is  = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load low 2 bits
+            const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
+
+            // prepare low and high bits
+            const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
+            const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
+            const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
+            const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
+            const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            // load Q8 quants
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
+            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+            // and 2 if the high bit was set)
+            __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
+            __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
+            __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
+            __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
+
+            __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
+            __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
+            __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
+            __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
+
+            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
+            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
+            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+
+            // multiply with scales
+            p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
+            p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
+            p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
+            p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
+
+            // accumulate
+            p16_0 = _mm256_add_epi32(p16_0, p16_1);
+            p16_2 = _mm256_add_epi32(p16_2, p16_3);
+            sumi  = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
+
+        }
+
+        // multiply with block scale and accumulate
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i mone = _mm_set1_epi8(1);
+    const __m128i m32 = _mm_set1_epi8(32);
+    const __m128i m2 = _mm_set1_epi8(2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    const uint32_t *aux;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // Set up scales
+        aux = (const uint32_t *)x[i].scales;
+        __m128i scales128 = _mm_set_epi32(
+                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+        scales128 = _mm_sub_epi8(scales128, m32);
+        const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
+        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
+        const __m128i scales[2] = { scales_0, scales_1 };
+
+        // high bit *128*2 from block_q3_K.hmask[QK_K/8]
+        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
+        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
+
+        // integer accumulator
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
+            const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
+            const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
+
+            // prepare low and high bits
+            const int bit = j << 2;
+
+            const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
+            const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
+            const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
+            const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
+
+            const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
+            const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
+            const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
+            const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
+
+            const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
+            const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
+            const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
+            const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
+
+            const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
+            const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
+            const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
+            const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
+
+            // load Q8 quants from block_q8_K.qs[QK_K]
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
+            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+            // and 2 if the high bit was set)
+            __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
+            __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
+            __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
+            __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
+            __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
+            __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
+            __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
+            __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
+
+            __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
+            __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
+            __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
+            __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
+            __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
+            __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
+            __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
+
+            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
+            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
+            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
+            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
+
+            // multiply with scales
+            __m128i shuffle = _mm_set1_epi16(0x0100);
+            p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
+
+            // accumulate
+            p16_0 = _mm_add_epi32(p16_0, p16_1);
+            p16_2 = _mm_add_epi32(p16_2, p16_3);
+            p16_4 = _mm_add_epi32(p16_4, p16_5);
+            p16_6 = _mm_add_epi32(p16_6, p16_7);
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
+
+        }
+
+        // multiply with block scale and accumulate
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const  int8_t * restrict q8 = y[i].qs;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+
+        size_t vl = 32;
+        uint8_t m =  1;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
+
+        int sum_t = 0;
+
+        for (int j = 0; j < QK_K; j += 128) {
+
+            vl = 32;
+
+            // load Q3
+            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
+
+            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
+            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
+            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
+            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
+
+            // compute mask for subtraction
+            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
+            m <<= 1;
+
+            // load Q8 and take product with Q3
+            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            // retrieve lane to multiply with scale
+            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
+            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
+            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
+            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
+            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
+            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
+            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
+            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
+
+            sum_t +=  __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q3 += 32;    q8 += 128;   scale += 8;
+
+        }
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        sumf += d*sum_t;
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowMask1 = vec_splats((int8_t)0xf);
+    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector signed char v1 = vec_splats((signed char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        UNUSED(kmask1);
+        UNUSED(kmask2);
+
+        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+        vector signed char u1 = vec_and(u0, lowMask1);
+        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+        vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
+        vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
+        vector signed char u31 = vec_and(u3, lowMask2);
+
+        u1 = vec_or(u1, u30);
+        u2 = vec_or(vec_sr(u0, v4), u31);
+
+        vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
+
+        vscales = vec_sub(vscales, off);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+        vector signed int vsumi4 = v0;
+        vector signed int vsumi5 = v0;
+        vector signed int vsumi6 = v0;
+        vector signed int vsumi7 = v0;
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
+            q3 += 32;
+
+            //the low 2 bits
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
+            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
+            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
+            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
+            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
+
+            //the 3rd bit
+            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
+            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
+            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
+            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
+            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
+            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
+            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
+            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
+            qxhs0 = vec_sr(qxhs0, v4);
+            qxhs1 = vec_sr(qxhs1, v4);
+
+            vector signed char q3x00 = vec_sub(qxs00, qxh00);
+            vector signed char q3x01 = vec_sub(qxs01, qxh01);
+            vector signed char q3x02 = vec_sub(qxs02, qxh02);
+            vector signed char q3x03 = vec_sub(qxs03, qxh03);
+            vector signed char q3x10 = vec_sub(qxs10, qxh10);
+            vector signed char q3x11 = vec_sub(qxs11, qxh11);
+            vector signed char q3x12 = vec_sub(qxs12, qxh12);
+            vector signed char q3x13 = vec_sub(qxs13, qxh13);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short vscales_h = vec_unpackh(vscales);
+            vector signed short vs0 = vec_splat(vscales_h, 0);
+            vector signed short vs1 = vec_splat(vscales_h, 1);
+            vector signed short vs2 = vec_splat(vscales_h, 2);
+            vector signed short vs3 = vec_splat(vscales_h, 3);
+            vector signed short vs4 = vec_splat(vscales_h, 4);
+            vector signed short vs5 = vec_splat(vscales_h, 5);
+            vector signed short vs6 = vec_splat(vscales_h, 6);
+            vector signed short vs7 = vec_splat(vscales_h, 7);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
+            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
+            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
+            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
+            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
+            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
+
+            vsumi0 = vec_msum(qv00, vs0, vsumi0);
+            vsumi1 = vec_msum(qv01, vs2, vsumi1);
+            vsumi2 = vec_msum(qv02, vs4, vsumi2);
+            vsumi3 = vec_msum(qv03, vs6, vsumi3);
+            vsumi4 = vec_msum(qv10, vs1, vsumi4);
+            vsumi5 = vec_msum(qv11, vs3, vsumi5);
+            vsumi6 = vec_msum(qv12, vs5, vsumi6);
+            vsumi7 = vec_msum(qv13, vs7, vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+
+    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+    const __m256i mone = __lasx_xvreplgr2vr_b(1);
+    const __m128i m32 = __lsx_vreplgr2vr_b(32);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    uint32_t aux[3];
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        __m128i scales128 = lsx_set_w(
+                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+        scales128 = __lsx_vsub_b(scales128, m32);
+        const __m256i all_scales = lasx_ext8_16(scales128);
+        const __m128i l_scales = lasx_extracti128(all_scales, 0);
+        const __m128i h_scales = lasx_extracti128(all_scales, 1);
+        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+        // high bit
+        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
+
+        // integer accumulator
+        __m256i sumi = __lasx_xvldi(0);
+
+        int bit = 0;
+        int is  = 0;
+        __m256i xvbit;
+
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load low 2 bits
+            const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            // prepare low and high bits
+            const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
+            const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
+            const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
+            const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
+            const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            // load Q8 quants
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h,
+            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+            // and 2 if the high bit was set)
+            __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0);
+            __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
+            __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2);
+            __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3);
+
+            __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
+            __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
+            __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2);
+            __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3);
+
+            p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+            p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+            p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
+            p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
+
+            // multiply with scales
+            p16_0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
+            p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
+            p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
+            p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
+
+            // accumulate
+            p16_0 = __lasx_xvadd_w(p16_0, p16_1);
+            p16_2 = __lasx_xvadd_w(p16_2, p16_3);
+            sumi  = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
+        }
+        // multiply with block scale and accumulate
+        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+    // scalar version
+    // This function is written like this so the compiler can manage to vectorize most of it
+    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
+    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
+    // The ideal situation would be if we could just write the code once, and the compiler would
+    // automatically produce the best possible set of machine instructions, instead of us having to manually
+    // write vectorized versions for AVX, ARM_NEON, etc.
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    uint32_t auxs[4];
+    const int8_t * scales = (const int8_t*)auxs;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict hm = x[i].hmask;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            q3 += 32;
+        }
+        a = aux8;
+
+        memcpy(auxs, x[i].scales, 12);
+        uint32_t tmp = auxs[2];
+        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+        for (int j = 0; j < QK_K/16; ++j) {
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+
+#endif
+
+}
+
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    uint32_t utmp[4];
+
+#ifdef __ARM_NEON
+    const uint8x16_t m4b = vdupq_n_u8(0xf);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x2_t q8bytes;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
+
+        memcpy(utmp, x[i].scales, 12);
+
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[0] &= kmask1;
+
+        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
+        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
+                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
+        sumf -= dmin * vaddvq_s32(prod);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        int32_t sumi1 = 0;
+        int32_t sumi2 = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
+
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
+            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
+            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
+
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
+
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
+            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
+            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
+
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+
+            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
+        }
+
+        sumf += d * (sumi1 + sumi2);
+
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+    __m128 acc_m = _mm_setzero_ps();
+
+   for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
+        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
+        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
+
+        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+            const __m256i q4l = _mm256_and_si256(q4bits, m4);
+            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
+
+            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
+            p16l = _mm256_madd_epi16(scale_l, p16l);
+
+            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
+            p16h = _mm256_madd_epi16(scale_h, p16h);
+            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
+
+            sumi = _mm256_add_epi32(sumi, sumj);
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
+    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
+
+    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
+
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(0x2);
+
+    __m256 acc = _mm256_setzero_ps();
+    __m128 acc_m = _mm_setzero_ps();
+
+   for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
+        const __m128i scales = _mm_cvtepu8_epi16(utmps);
+        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
+
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
+        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
+        const __m128i prod = _mm_madd_epi16(mins, q8s);
+        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        __m128i shuffle = _mm_set1_epi16(0x0100);
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+
+            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
+            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
+            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
+            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
+
+            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
+            p16l = _mm_madd_epi16(scale_l, p16l);
+            sumi_0 = _mm_add_epi32(sumi_0, p16l);
+            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
+            p16l = _mm_madd_epi16(scale_l, p16l);
+            sumi_1 = _mm_add_epi32(sumi_1, p16l);
+
+            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
+            p16h = _mm_madd_epi16(scale_h, p16h);
+            sumi_0 = _mm_add_epi32(sumi_0, p16h);
+            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
+            p16h = _mm_madd_epi16(scale_h, p16h);
+            sumi_1 = _mm_add_epi32(sumi_1, p16h);
+
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
+
+    }
+
+    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
+    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
+
+    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
+
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        size_t vl = 8;
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums   = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8  = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t  prod   = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        vl = 32;
+
+        int32_t sum_1 = 0;
+        int32_t sum_2 = 0;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q4
+            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+            // load Q8 and multiply it with lower Q4 nibble
+            vint8m1_t  q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t  q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
+            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
+
+            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
+
+            // load Q8 and multiply it with upper Q4 nibble
+            vint8m1_t  q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t  q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
+            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
+
+            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
+
+            q4 += 32;    q8 += 64;
+
+        }
+
+        sumf += d*(sum_1 + sum_2);
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v2 = vec_splats((uint8_t)2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        UNUSED(kmask1);
+        UNUSED(kmask2);
+        UNUSED(kmask3);
+        UNUSED(utmp);
+
+        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+        vector signed char u3 = vec_sr(u2, v4);
+
+        vector signed char u30 = u1;
+        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+        u1 = vec_and(u0, lowMask1);
+        u2 = vec_or(u30, u31);
+
+        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
+
+        vector signed short vscales = vec_unpackh(utmps);
+        vector signed short q4xmins = vec_unpackl(utmps);
+        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
+        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
+
+        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; j+=2) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
+            q4 += 64;
+
+            vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+            vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
+            vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+            vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
+            vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
+            vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
+            vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
+            vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y20 = vec_xl( 64, q8);
+            vector signed char q8y30 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
+            vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
+            vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
+            vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
+            vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
+            vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
+            vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
+            vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
+
+            vector signed int vscales_h = vec_unpackh(vscales);
+            vector signed int vs0 = vec_splat(vscales_h, 0);
+            vector signed int vs1 = vec_splat(vscales_h, 1);
+            vector signed int vs2 = vec_splat(vscales_h, 2);
+            vector signed int vs3 = vec_splat(vscales_h, 3);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
+
+            vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+    GGML_UNUSED(kmask1);
+    GGML_UNUSED(kmask2);
+    GGML_UNUSED(kmask3);
+
+    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+    __m128 acc_m = (__m128)__lsx_vldi(0);
+
+   for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
+        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
+        const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
+        acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
+
+        const __m128i sc128  = lasx_extracti128(mins_and_scales, 0);
+        const __m256i scales = lasx_insertf128(sc128, sc128);
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+            const __m256i q4l = __lasx_xvand_v(q4bits, m4);
+            const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4);
+
+            const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            __m256i p16l = lasx_maddubs_h(q4l, q8l);
+            p16l = lasx_madd_h(scale_l, p16l);
+
+            const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            __m256i p16h = lasx_maddubs_h(q4h, q8h);
+            p16h = lasx_madd_h(scale_h, p16h);
+            const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
+
+            sumi = __lasx_xvadd_w(sumi, sumj);
+        }
+
+        __m256 vd = __lasx_xvreplfr2vr_s(d);
+        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
+    }
+
+    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
+    __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
+    acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
+
+
+    ft_union fi;
+    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
+    *s = hsum_float_8(acc) + fi.f ;
+#else
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].qs;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        for (int j = 0; j < QK_K/64; ++j) {
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
+            a += 32;
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
+            a += 32; q4 += 32;
+        }
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            int32_t scale = scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf -= dmin * sumi;
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy,  size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    uint32_t utmp[4];
+
+#ifdef __ARM_NEON
+    const uint8x16_t m4b = vdupq_n_u8(0xf);
+    const uint8x16_t mone = vdupq_n_u8(1);
+    const uint8x16_t mtwo = vdupq_n_u8(2);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x4_t q5bytes;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
+        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
+        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
+                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
+        int32_t sumi_mins = vaddvq_s32(prod);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
+
+        ggml_uint8x16x4_t q5h;
+
+        int32_t sumi = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
+            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
+            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
+            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
+            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
+            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
+
+            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
+            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
+            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
+            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
+
+            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
+            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
+        }
+
+        sumf += d * sumi - dmin * sumi_mins;
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+    const __m128i mzero = _mm_setzero_si128();
+    const __m256i mone  = _mm256_set1_epi8(1);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0.f;
+
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
+        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
+        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
+        summs += dmin * _mm_extract_epi32(hsum, 0);
+
+        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
+
+        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
+        __m256i hmask = mone;
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        int bit = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
+
+            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
+            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
+            const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
+            hmask = _mm256_slli_epi16(hmask, 1);
+
+            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
+            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
+            const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
+            hmask = _mm256_slli_epi16(hmask, 1);
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
+            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
+
+            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
+            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
+
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i mzero = _mm_setzero_si128();
+    const __m128i mone  = _mm_set1_epi8(1);
+    const __m128i m2 = _mm_set1_epi8(2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0.f;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
+        const __m128i scales = _mm_cvtepu8_epi16(utmps);
+        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
+
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
+        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
+        const __m128i prod = _mm_madd_epi16(mins, q8s);
+        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
+        summs += dmin * _mm_extract_epi32(hsum, 0);
+
+        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
+        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
+        __m128i hmask = mone;
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        int bit = 0;
+
+        __m128i shuffle = _mm_set1_epi16(0x0100);
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+
+            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
+            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
+
+            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
+            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
+            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
+            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
+            __m128i q5_0  = _mm_add_epi8(q5l_0, q5h_0);
+            __m128i q5_1  = _mm_add_epi8(q5l_1, q5h_1);
+            hmask = _mm_slli_epi16(hmask, 1);
+
+            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
+            p16_0 = _mm_madd_epi16(scale_0, p16_0);
+            p16_1 = _mm_madd_epi16(scale_0, p16_1);
+
+            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
+            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
+            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
+            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
+            q5_0  = _mm_add_epi8(q5l_0, q5h_0);
+            q5_1  = _mm_add_epi8(q5l_1, q5h_1);
+            hmask = _mm_slli_epi16(hmask, 1);
+
+            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
+            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
+            p16_2 = _mm_madd_epi16(scale_1, p16_2);
+            p16_3 = _mm_madd_epi16(scale_1, p16_3);
+
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
+
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+    float sums = 0.0;
+
+    size_t vl;
+
+    for (int i = 0; i < nb; ++i) {
+
+        vl = 8;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        vl = 32;
+        int32_t aux32 = 0;
+        int is = 0;
+
+        uint8_t m = 1;
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q5 and Q8
+            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
+            vint8m1_t  q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t  q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
+
+            // compute mask for addition
+            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
+            m <<= 1;
+
+            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
+            m <<= 1;
+
+            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
+            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
+
+            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
+            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
+
+            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
+            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
+
+            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
+            q5 += 32;    q8 += 64;
+
+        }
+
+        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
+        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
+
+    }
+
+    *s = sumf+sums;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        UNUSED(kmask1);
+        UNUSED(kmask2);
+        UNUSED(kmask3);
+        UNUSED(utmp);
+
+        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+        vector signed char u3 = vec_sr(u2, v4);
+
+        vector signed char u30 = u1;
+        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+        u1 = vec_and(u0, lowMask1);
+        u2 = vec_or(u30, u31);
+
+        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed short vscales = vec_unpackh(utmps);
+
+        vector signed short q5xmins = vec_unpackl(utmps);
+        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
+        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
+
+        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q5, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
+            q5 += 32;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+
+            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
+            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
+            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
+            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
+            qxhs0 = vec_sr(qxhs0, v2);
+            qxhs1 = vec_sr(qxhs1, v2);
+
+            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
+            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
+            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
+            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl(16, q8);
+            vector signed char q8y01 = vec_xl(32, q8);
+            vector signed char q8y11 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
+            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
+            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
+            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
+
+            vector signed int vscales_h = vec_unpackh(vscales);
+            vector signed int vs0 = vec_splat(vscales_h, 0);
+            vector signed int vs1 = vec_splat(vscales_h, 1);
+            vscales = vec_sld(vscales, vscales, 12);
+
+            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+    GGML_UNUSED(kmask1);
+    GGML_UNUSED(kmask2);
+    GGML_UNUSED(kmask3);
+
+    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+    const __m128i mzero = __lsx_vldi(0);
+    const __m256i mone  = __lasx_xvreplgr2vr_b(1);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0.f;
+
+   for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
+        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
+        const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
+        const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero);
+        summs += dmin * __lsx_vpickve2gr_w(hsum, 0);    //TODO check
+
+        const __m128i sc128  = lasx_extracti128(mins_and_scales, 0);
+        const __m256i scales = lasx_insertf128(sc128, sc128);
+
+        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
+        __m256i hmask = mone;
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        int bit = 0;
+        __m256i xvbit;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit++);
+            const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
+            const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
+            const __m256i q5_0  = __lasx_xvadd_b(q5l_0, q5h_0);
+            hmask = __lasx_xvslli_h(hmask, 1);
+
+            xvbit = __lasx_xvreplgr2vr_h(bit++);
+            const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
+            const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
+            const __m256i q5_1  = __lasx_xvadd_b(q5l_1, q5h_1);
+            hmask = __lasx_xvslli_h(hmask, 1);
+
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0);
+            __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1);
+
+            p16_0 = lasx_madd_h(scale_0, p16_0);
+            p16_1 = lasx_madd_h(scale_1, p16_1);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+
+        }
+
+        __m256 vd = __lasx_xvreplfr2vr_s(d);
+        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
+#else
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K/64; ++j) {
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
+            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
+            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
+            a += 32; m <<= 1;
+            q4 += 32;
+        }
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            int32_t scale = scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf -= dmin * sumi;
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q6_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#ifdef __ARM_NEON
+    float sum = 0;
+
+    const uint8x16_t m4b = vdupq_n_u8(0xF);
+    const int32x4_t  vzero = vdupq_n_s32(0);
+    //const int8x16_t  m32s = vdupq_n_s8(32);
+
+    const uint8x16_t mone = vdupq_n_u8(3);
+
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const int8x16_t scales = vld1q_s8(scale);
+        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
+
+        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
+                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
+                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
+                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
+        int32_t isum_mins = vaddvq_s32(prod);
+
+        int32_t isum = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
+            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
+            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
+            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[1], 2);
+            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+
+            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
+            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
+            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
+            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
+            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
+            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
+            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
+            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+
+            scale += 4;
+
+            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            shifted = vshrq_n_u8(qhbits.val[0], 4);
+            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[1], 4);
+            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[0], 6);
+            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[1], 6);
+            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+
+            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
+            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
+            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
+            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
+            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
+            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
+            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
+            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+            scale += 4;
+        }
+        //sum += isum * d_all * y[i].d;
+        sum += d_all * y[i].d * (isum - 32 * isum_mins);
+
+    }
+    *s = sum;
+
+#elif defined __AVX2__
+
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+    const __m256i m2 = _mm256_set1_epi8(3);
+    const __m256i m32s = _mm256_set1_epi8(32);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;
+
+            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
+
+            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
+            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
+            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
+            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
+
+            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
+            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
+            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
+            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
+            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
+            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
+            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
+
+            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
+            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
+            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
+            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
+
+            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
+            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
+            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+
+            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
+            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
+            p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
+            p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
+
+        }
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i m15 = _mm_set1_epi8(15);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // handle the q6_k -32 offset separately using bsums
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
+        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
+        const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
+        const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
+        const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
+            const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
+
+            const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
+            const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
+            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
+            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
+            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
+
+            const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+
+            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
+            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
+            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
+            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
+            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
+            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
+            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
+            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
+
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
+            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
+            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
+            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
+            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
+            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
+            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
+
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;
+
+            p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
+            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
+            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
+            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
+            p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
+            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
+
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
+
+        }
+
+        sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
+        sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
+        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        size_t vl;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        int sum_t = 0;
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            vl = 32;
+
+            // load qh
+            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
+
+            // load Q6
+            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
+            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
+
+            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
+            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
+            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
+            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
+
+            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
+            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
+
+            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
+            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
+            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
+            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
+
+            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
+            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
+            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
+            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
+
+            // load Q8 and take product
+            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
+            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
+            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
+            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
+            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
+            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
+            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
+            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
+
+            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q6 += 64;   qh += 32;   q8 += 128;   is=8;
+
+        }
+
+        sumf += d * sum_t;
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+        vector signed int vsumi4 = v0;
+        vector signed int vsumi5 = v0;
+        vector signed int vsumi6 = v0;
+        vector signed int vsumi7 = v0;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict qs = x[i].scales;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q6, 0, 0);
+            __builtin_prefetch(qh, 0, 0);
+            __builtin_prefetch(q8, 0, 0);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
+            q6 += 64;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+            vector signed char qxs20 = vec_and(qxs2, lowMask);
+            vector signed char qxs21 = vec_sr(qxs2, v4);
+            vector signed char qxs30 = vec_and(qxs3, lowMask);
+            vector signed char qxs31 = vec_sr(qxs3, v4);
+
+            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
+            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
+            qh += 32;
+
+            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
+            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
+            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
+            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
+            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
+            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
+            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
+            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
+
+            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
+            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
+            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
+            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
+            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
+            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
+            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
+            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y20 = vec_xl( 32, q8);
+            vector signed char q8y30 = vec_xl( 48, q8);
+            vector signed char q8y01 = vec_xl( 64, q8);
+            vector signed char q8y11 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
+            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
+            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
+            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
+            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
+            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
+            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
+            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
+
+            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
+            qs += 8;
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vector signed short vs2 = vec_splat(vscales, 2);
+            vector signed short vs3 = vec_splat(vscales, 3);
+            vector signed short vs4 = vec_splat(vscales, 4);
+            vector signed short vs5 = vec_splat(vscales, 5);
+            vector signed short vs6 = vec_splat(vscales, 6);
+            vector signed short vs7 = vec_splat(vscales, 7);
+
+            vsumi0 = vec_msum(qv00, vs0, vsumi0);
+            vsumi1 = vec_msum(qv01, vs4, vsumi1);
+            vsumi2 = vec_msum(qv10, vs1, vsumi2);
+            vsumi3 = vec_msum(qv11, vs5, vsumi3);
+            vsumi4 = vec_msum(qv20, vs2, vsumi4);
+            vsumi5 = vec_msum(qv21, vs6, vsumi5);
+            vsumi6 = vec_msum(qv30, vs3, vsumi6);
+            vsumi7 = vec_msum(qv31, vs7, vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+
+    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+    const __m256i m2 = __lasx_xvreplgr2vr_b(3);
+    const __m256i m32s = __lasx_xvreplgr2vr_b(32);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0);
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3));
+            is += 4;
+
+            const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+            const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+            const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
+
+            const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4);
+            const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4);
+            const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4);
+            const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4);
+
+            const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0);
+            const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1);
+            const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2);
+            const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3);
+
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0);
+            __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1);
+            __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2);
+            __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3);
+
+            __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
+            __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
+            __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2);
+            __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3);
+
+            p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+            p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+            p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
+            p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
+
+            p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
+            p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
+            p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2);
+            p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
+        }
+
+        acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            }
+            a  += 128;
+            q4 += 64;
+            qh += 32;
+        }
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            int scale = x[i].scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
+#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
+static const int8_t keven_signs_q2xs[1024] = {
+     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
+     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
+     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
+     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
+     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
+     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
+     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
+     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
+     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
+     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
+     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
+     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
+     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
+     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
+     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
+     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
+     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
+     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
+     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
+     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
+     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
+     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
+     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
+     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
+     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
+     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
+     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
+     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
+     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
+     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
+     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
+     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+#endif
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_xxs * restrict x = vx;
+    const block_q8_K    * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    ggml_int8x16x4_t q2u;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        float sumf1 = 0, sumf2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
+            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
+            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
+            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
+            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >>  7) & 127))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
+            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
+            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
+            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
+            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
+            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
+            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
+        }
+        sumf += d*(sumf1 + sumf2);
+    }
+    *s = 0.25f * sumf;
+
+#elif defined(__AVX2__)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
+                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[1] >> 28;
+            const uint16_t ls2 = aux32[3] >> 28;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
+            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = aux32[1] >> 28;
+            const uint16_t ls2 = aux32[3] >> 28;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    const vector int v0 = vec_splats((int32_t)0);
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            uint32_t aux32[4];
+            const uint8_t * aux8 = (const uint8_t *)aux32;
+
+            memcpy(aux32, q2, 4*sizeof(uint32_t));
+            q2 += 8;
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = aux32[1] >> 28;
+            const uint16_t ls1 = aux32[3] >> 28;
+
+            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+
+            const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+            const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+            const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
+                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
+            const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
+            const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[1] >> 28;
+            const uint16_t ls2 = aux32[3] >> 28;
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#else
+
+    uint32_t aux32[2];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(aux32, q2, 2*sizeof(uint32_t));
+            q2 += 4;
+            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
+                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.125f * sumf;
+#endif
+}
+
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_xs * restrict x = vx;
+    const block_q8_K   * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    ggml_int8x16x4_t q2u;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
+
+    int32x4x4_t scales32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        const uint8x8_t scales8 = vld1_u8(x[i].scales);
+        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
+        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
+        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
+        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
+        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
+        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
+        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
+        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
+        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
+        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
+        int32x4_t sumi = vdupq_n_s32(0);
+        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
+            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
+            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
+            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
+            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
+            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
+            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
+            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
+            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
+            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
+            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
+            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
+            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
+            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
+            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
+            q2 += 8;
+        }
+        sumf += d*vaddvq_s32(sumi);
+    }
+    *s = 0.125f * sumf;
+
+#elif defined(__AVX2__)
+
+    const __m256i mone = _mm256_set1_epi8(1);
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
+    const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
+    const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
+    const __m256i m511 = _mm256_set1_epi16(511);
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    uint64_t aux64;
+
+    // somewhat hacky, but gives a significant boost in performance
+    __m256i aux_gindex;
+    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        __m128i stmp = _mm_set1_epi64x(aux64);
+        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2);  q2 += 16;
+            aux_gindex = _mm256_and_si256(q2_data, m511);
+
+            const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
+            const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
+            const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
+
+            const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
+            const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
+
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+
+            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+            const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+            const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+
+            const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
+            const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
+            const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+            const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
+
+            __m256i signs;
+            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
+
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const __m256i dot3  = _mm256_maddubs_epi16(q2_3, q8s_3);
+            const __m256i dot4  = _mm256_maddubs_epi16(q2_4, q8s_4);
+
+            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
+            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
+            const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
+            const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
+
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    const __m128i mone = _mm_set1_epi8(1);
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+    const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+    const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+    const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+    const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+    const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+    const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+    const __m128i m511 = _mm_set1_epi16(511);
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    uint64_t aux64;
+
+    // somewhat hacky, but gives a significant boost in performance
+    __m256i aux_gindex;
+    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        __m128i stmp = _mm_set1_epi64x(aux64);
+        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+            const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1);  q2 += 16;
+            aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+
+            const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+            const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+            const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+            const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+            const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+            const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+
+            const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+            const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+            const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+            const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+            const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+            const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+            const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+            const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+            // AVX2 full_signs_1 is full_sign_bits_0 here
+            // AVX2 full_signs_2 is full_sign_bits_1 here
+            __m128i signs_0, signs_1;
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const __m128i dot3_0  = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+            const __m128i dot3_1  = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+            const __m128i dot4_0  = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+            const __m128i dot4_1  = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+
+            __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+            const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+            const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+            const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+            const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__loongarch_asx)
+
+    const __m256i mone = __lasx_xvreplgr2vr_b(1);
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0);
+    const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0);
+    const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0);
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0);
+    const __m256i m511 = __lasx_xvreplgr2vr_h(511);
+    const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
+    const __m128i m1 = __lsx_vreplgr2vr_b(1);
+
+    uint64_t aux64;
+
+    // somewhat hacky, but gives a significant boost in performance
+    __m256i aux_gindex;
+    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        __m128i stmp = __lsx_vreplgr2vr_d(aux64);
+        stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4));
+        const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1);
+
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0);  q2 += 16;
+            aux_gindex = __lasx_xvand_v(q2_data, m511);
+
+            const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9);
+            const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13);
+            const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper);
+
+            const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting);
+            const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits);
+
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+
+            const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+            const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+            const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+            const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+
+            const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0);
+            const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1);
+            const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l);
+            const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h);
+
+            __m256i signs;
+            signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1);
+
+            signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2);
+
+            signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3);
+
+            signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4);
+
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const __m256i dot3  = lasx_maddubs_h(q2_3, q8s_3);
+            const __m256i dot4  = lasx_maddubs_h(q2_4, q8s_4);
+
+            const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0)));
+            const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1)));
+            const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2)));
+            const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3)));
+
+            sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1));
+            sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2));
+            sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3));
+            sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4));
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+#elif defined(__POWER9_VECTOR__)
+    const vector int v0 = vec_splats((int32_t)0);
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const uint8_t  * restrict sc = x[i].scales;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
+            q2 += 8;
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
+            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
+            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
+            sc += 2;
+
+            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
+            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
+            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
+
+            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
+#else
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const uint8_t  * restrict sc = x[i].scales;
+        const int8_t   * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
+            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 2; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
+                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls1;
+            sumi = 0;
+            for (int l = 2; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
+                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls2;
+            q2 += 4;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.125f * sumf;
+#endif
+}
+
+void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
+    const uint8x16_t        mask2 = vld1q_u8(k_mask2);
+    const uint8x16_t m1 = vdupq_n_u8(1);
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    uint8x16x2_t vs;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
+            qs += 8;
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vceqq_u8(vs.val[0], mask2);
+            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+
+            q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
+            q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vceqq_u8(vs.val[0], mask2);
+            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+
+            signs += 4;
+
+            q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
+            q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
+
+            const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
+            const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
+            const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
+            const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
+
+            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
+            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >>  4));
+            sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
+            sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >>  4));
+        }
+        sumf += d*(sumi1 + sumi2);
+    }
+
+    *s = 0.125f * sumf;
+
+#elif defined(__AVX2__)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+    uint64_t aux64;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            qs += 8;
+
+            __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
+
+            aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    uint64_t aux64;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+        const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
+        const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                  iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                  iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                  iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
+            qs += 8;
+
+            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    const vector int v0 = vec_splats((int32_t)0);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
+    const vector unsigned char mask1 = vec_xl(16, k_mask1);
+    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t *  restrict q2 = x[i].qs;
+        const uint8_t *  restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const uint8_t *  restrict sc = x[i].scales;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
+            q2 += 8;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
+            vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
+            vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
+            vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
+            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
+            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
+            sc += 2;
+
+            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
+            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
+            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
+
+            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+
+    const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
+    const __m128i m1 = __lsx_vreplgr2vr_b(1);
+
+    const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
+    const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
+    uint64_t aux64;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        __m128i tmp1;
+        memcpy(&aux64, x[i].scales, 8);
+        tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0);
+        tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1);
+        const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1);
+        const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            qs += 8;
+
+            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+            aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+
+            const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0)));
+            const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1)));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#else
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const int8_t  * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = qs + QK_K/8;
+
+        int bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
+            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
+            int sumi1 = 0, sumi2 = 0;
+            for (int l = 0; l < 2; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            for (int l = 2; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += ls1 * sumi1 + ls2 * sumi2;
+            qs += 4;
+            signs += 4;
+        }
+
+        sumf += d * bsum;
+    }
+
+    *s = 0.125f * sumf;
+
+#endif
+
+}
+
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_xxs * restrict x = vx;
+    const block_q8_K    * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    ggml_int8x16x4_t q3s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t   * restrict q8 = y[i].qs;
+        float sumf1 = 0, sumf2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
+            q3 += 16;
+            q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >>  7) & 127))));
+            q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
+            q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
+            q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
+            q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
+            q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
+            q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
+            q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
+            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
+            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
+        }
+        sumf += d*(sumf1 + sumf2);
+    }
+    *s = 0.5f * sumf;
+
+#elif defined(__AVX2__)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+            q3 += 8;
+            const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
+            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    const vector int v0 = vec_splats((int32_t)0);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
+        const int8_t  * restrict q8 = y[i].qs;
+
+#pragma GCC unroll 1
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
+            vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
+            vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
+            vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            q3 += 16;
+
+            vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >>  0) & 127]), (uint64_t)(signs64[(signs[0] >>  7) & 127])};
+            vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
+            vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >>  0) & 127]), (uint64_t)(signs64[(signs[1] >>  7) & 127])};
+            vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
+
+            vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
+            vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
+            vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
+            vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
+            const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
+            signs += 2;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.25f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+
+            const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
+            const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#else
+
+    uint32_t aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
+            const uint32_t ls = 2*(aux32 >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
+                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
+                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            q3 += 8;
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.25f * sumf;
+#endif
+}
+
+void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    typedef union {
+        uint16x8_t vec_index;
+        uint16_t   index[8];
+    } vec_index_t;
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
+
+    const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
+    const uint8x16_t        mask2 = vld1q_u8(k_mask2);
+
+    const int16x8_t  hshift = vld1q_s16(k_shift);
+    const uint16x8_t m256   = vdupq_n_u16(256);
+    const uint8x16_t m1     = vdupq_n_u8(1);
+
+    uint8x16x2_t vs;
+    ggml_int8x16x4_t q3s;
+    ggml_int8x16x4_t q8b;
+    vec_index_t idx;
+
+    uint32_t scales32[2];
+    const uint8_t * scales8 = (const uint8_t *)scales32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(scales32, x[i].scales, 4);
+        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
+        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
+            idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                                        iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                                        iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
+            idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                                        iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                                        iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
+
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
+            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
+
+            q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
+            q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
+            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
+
+            signs += 4;
+
+            q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
+            q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
+
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
+
+            sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
+            sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
+        }
+        sumf += d*(sumi1 + sumi2);
+    }
+    *s = sumf;
+
+#elif defined(__AVX2__)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    const __m256i idx_mask  = _mm256_set1_epi32(256);
+
+    typedef union {
+        __m256i  vec[2];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
+            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
+            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
+            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
+
+            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
+
+            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
+    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
+    const __m128i idx_mask  = _mm_set1_epi32(256);
+
+    typedef union {
+        __m128i  vec[4];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
+            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
+            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
+            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = idx.vec[0];
+            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
+            idx.vec[3] = idx.vec[2];
+
+            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
+            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
+            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
+            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
+
+            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
+            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
+            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
+            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
+
+            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
+            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
+
+            __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    const vector int v0 = vec_splats((int32_t)0);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
+    const vector unsigned char mask1 = vec_xl(16, k_mask1);
+    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        const uint8_t *  restrict q3 = x[i].qs;
+        const uint8_t *  restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
+        const uint8_t *  restrict sc = x[i].scales;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
+                                             iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
+            vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
+                                             iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
+            vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
+                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
+            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
+                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
+            q3 += 16;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
+            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
+            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
+            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
+            sc ++;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
+    const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
+
+    __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8);
+    const __m256i idx_mask  = __lasx_xvreplgr2vr_w(256);
+
+    typedef union {
+        __m256i  vec[2];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16;
+            idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]);
+            idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
+            idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
+            idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
+
+            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = lasx_set_w(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = lasx_set_w(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+            aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = hsum_float_8(accumf);
+
+#else
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint8_t * restrict signs = x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
+            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            qs += 8;
+            signs += 4;
+            bsum += sumi * ls1;
+            sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            qs += 8;
+            signs += 4;
+            bsum += sumi * ls2;
+        }
+        sumf += d * bsum;
+    }
+    *s = sumf;
+#endif
+}
+
+#if defined(__AVX2__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+#elif defined(__loongarch_asx)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = __lasx_xvsigncov_b(x, x);
+    const __m256i sy = __lasx_xvsigncov_b(x, y);
+    __m256i tmp1, tmp2, tmp3;
+    tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy);
+    tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy);
+    tmp3 = __lasx_xvadd_h(tmp1, tmp2);
+    return __lasx_xvsat_h(tmp3, 15);
+}
+#endif
+
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+
+    ggml_int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        int sumi1 = 0, sumi2 = 0, sumi3 = 0;
+
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
+            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
+            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
+            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
+            qs += 8;
+
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
+
+            const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            sumi1 += vaddvq_s32(p1) * ls1;
+            sumi2 += vaddvq_s32(p2) * ls2;
+            sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
+
+        }
+
+        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    __m256 accum = _mm256_setzero_ps();
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m256i sumi = _mm256_setzero_si256();
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
+                                                    iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
+                                                    iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+            qs += 8;
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
+        accum1 += d * sumi1;
+
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#elif defined __AVX__
+    __m256 accum = _mm256_setzero_ps();
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+            const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
+            const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+            const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
+            qs += 8;
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
+        accum1 += d * sumi1;
+
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
+    const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi8 = vec_splats((int32_t)0);
+
+        const uint8_t  * restrict q1 = x[i].qs;
+        const uint16_t * restrict qh = x[i].qh;
+        const int8_t   * restrict q8 = y[i].qs;
+        const int16_t  * restrict qs = y[i].bsums;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q1, 0, 1);
+            __builtin_prefetch(qh, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
+            q1 += 8;
+
+            vector signed char q1x0 = (vector signed char)aux64x2_0;
+            vector signed char q1x1 = (vector signed char)aux64x2_1;
+            vector signed char q1x2 = (vector signed char)aux64x2_2;
+            vector signed char q1x3 = (vector signed char)aux64x2_3;
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
+            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+            vector signed short vscales = vec_sld(vscales23, vscales01, 8);
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+
+            vector signed short q8ysums = vec_xl_len(qs, 8);
+            qs += 4;
+            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);
+
+            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
+            qh += 2;
+            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);
+
+            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
+
+            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    __m256 accum = (__m256)__lasx_xvldi(0);
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m256i sumi = __lasx_xvldi(0);
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0);
+            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1);
+            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2);
+            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3);
+
+            __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0);
+            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1);
+            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2);
+            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3);
+
+            qs += 8;
+            const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+
+            __m256i tmp1, tmp5, tmp6;
+            tmp1 = __lasx_xvreplgr2vr_h(ls1);
+            tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1);
+            const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6);
+
+            tmp1 = __lasx_xvreplgr2vr_h(ls2);
+            tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1);
+            const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
+        accum1 += d * sumi1;
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#else
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        int sumi = 0, sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
+            const int delta = qh[ib] & 0x8000 ? -1 : 1;
+            int lsum = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
+                for (int j = 0; j < 8; ++j) {
+                    lsum += q8[j] * grid[j];
+                }
+                q8 += 8;
+            }
+            sumi  += ls * lsum;
+            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
+            qs += 4;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+    }
+
+    *s = sumf;
+
+#endif
+}
+
+void ggml_vec_dot_iq1_m_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_m * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    iq1m_scale_t scale;
+
+#if defined __ARM_NEON
+    const int32x4_t mask  = vdupq_n_s32(0x7);
+    const int32x4_t mone  = vdupq_n_s32(1);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x4_t deltas;
+    deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+    deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+    deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+    deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+    ggml_int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+
+    uint32_t aux32;
+    const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        int32x4_t sumi1 = mzero;
+        int32x4_t sumi2 = mzero;
+
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+            const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+            const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+            const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+            aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+            const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+            const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+            const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+            int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+
+            scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+            sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+            sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+            qs += 8; qh += 4;
+
+        }
+
+        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m256i mask = _mm256_set1_epi16(0x7);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m256i q1b_1 = _mm256_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+            );
+            const __m256i q1b_2 = _mm256_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+            );
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+
+            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+            qs += 8; qh += 4;
+        }
+
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+
+        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#elif defined __AVX__
+    const __m128i mask = _mm_set1_epi16(0x7);
+    const __m128i mone = _mm_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q1b_1_0 = _mm_set_epi64x(
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
+            const __m128i q1b_1_1 = _mm_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
+            const __m128i q1b_2_0 = _mm_set_epi64x(
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
+            const __m128i q1b_2_1 = _mm_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+
+            const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
+            const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
+            const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
+            const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
+
+            __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
+            __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
+            __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
+            __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
+
+            scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
+            scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
+            scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
+            scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
+            const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
+            const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
+            const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
+            const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
+
+            qs += 8; qh += 4;
+        }
+
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+
+        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
+        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#else
+
+    int sum1[2], sum2[2], delta[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            delta[0] = qh[0] & 0x08 ? -1 : 1;
+            delta[1] = qh[0] & 0x80 ? -1 : 1;
+            delta[2] = qh[1] & 0x08 ? -1 : 1;
+            delta[3] = qh[1] & 0x80 ? -1 : 1;
+            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+                int lsum1 = 0, lsum2 = 0;
+                for (int j = 0; j < 8; ++j) {
+                    lsum1 += q8[j] * grid[j];
+                    lsum2 += q8[j];
+                }
+                q8 += 8;
+                sum1[l/2] += lsum1;
+                sum2[l/2] += lsum2*delta[l];
+            }
+
+            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+
+            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+            qs += 4;
+            qh += 2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+    }
+
+    *s = sumf;
+
+#endif
+}
+
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK4_NL == 0);
+    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
+
+    const block_iq4_nl * restrict x = vx;
+    const block_q8_0   * restrict y = vy;
+
+    const int nb = n / QK4_NL;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t q4b;
+    int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    for (; ib + 1 < nb; ib += 2) {
+
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0]    = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1]    = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2]    = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3]    = vld1q_s8(y[ib + 1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+    }
+
+#elif defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
+        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
+        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+                _mm256_cvtepi32_ps(p_1), accum1);
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+                _mm256_cvtepi32_ps(p_2), accum2);
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+    const __m128i mone = _mm_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
+        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
+        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi1 = vec_sum4s(qv1, vsumi1);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined (__loongarch_asx)
+
+    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+    const __m128i m4b  = __lsx_vreplgr2vr_b(0x0f);
+    const __m256i mone = __lasx_xvreplgr2vr_h(1);
+
+    __m256 accum1 = (__m256)__lasx_xvldi(0);
+    __m256 accum2 = (__m256)__lasx_xvldi(0);
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0);
+        const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0);
+        const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0);
+        const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0);
+        const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)),
+                                              lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b)));
+        const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)),
+                                              lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = lasx_madd_h(p16_1, mone);
+        const __m256i p_2 = lasx_madd_h(p16_2, mone);
+        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+                __lasx_xvffint_s_w(p_1), accum1);
+        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+                __lasx_xvffint_s_w(p_2), accum2);
+    }
+
+    sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        int sumi1 = 0, sumi2 = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            sumi1 += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * restrict x = vx;
+    const block_q8_K   * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    ggml_uint8x16x2_t q4bits;
+    ggml_int8x16x4_t q4b;
+    ggml_int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        const int8_t  * q8 = y[ibl].qs;
+        const uint8_t * q4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/64; ++ib) {
+
+            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
+            q8b    = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;
+            h >>= 4;
+            sumi1 += vaddvq_s32(prod_1) * ls1;
+            sumi2 += vaddvq_s32(prod_2) * ls2;
+
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
+            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
+            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
+            sumi1 = _mm256_add_epi32(p_1, sumi1);
+            sumi2 = _mm256_add_epi32(p_2, sumi2);
+        }
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+            const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+            const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+            const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+            const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+            const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+            const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+            const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
+            const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
+            const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
+            const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
+            sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
+            sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
+            sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
+            sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
+        }
+        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
+        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+        vector float vyd = vec_splats(y[ibl].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        uint16_t h = x[ibl].scales_h;
+
+        const uint8_t * restrict q4 = x[ibl].qs;
+        const uint8_t * restrict sc = x[ibl].scales_l;
+        const int8_t  * restrict q8 = y[ibl].qs;
+
+        for (int ib = 0; ib < QK_K/64; ib ++ ) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            q4 += 32;
+
+            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
+            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
+            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
+            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
+
+            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
+            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
+            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
+            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
+            const uint16_t ls1 = (uint16_t)(((sc[0] >>  4) | ((h << 2) & 0x30)) - 32);
+            h >>= 4;
+            sc ++;
+
+            vector signed short vscales01 = vec_splats((int16_t)ls0);
+            vector signed short vscales23 = vec_splats((int16_t)ls1);
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+    const __m128i m4b  = __lsx_vreplgr2vr_b(0x0f);
+
+    __m256 accum = (__m256)__lasx_xvldi(0);
+    __m256i tmp1;
+    __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask;
+
+    mask_8f = __lsx_vreplgr2vr_b(0x8f);
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        __m128i zero = __lsx_vldi(0);
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0);  qs += 16;
+            const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0);  qs += 16;
+            const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp3 = __lsx_vand_v(tmp0, mask);
+            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
+
+            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp4 = __lsx_vand_v(tmp0, mask);
+            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
+
+            const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4);
+
+            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp3 = __lsx_vand_v(tmp0, mask);
+            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
+
+            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp4 = __lsx_vand_v(tmp0, mask);
+            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
+
+            const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4);
+
+            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            __m256i tmp5, tmp6;
+            tmp1 = __lasx_xvreplgr2vr_h(ls1);
+            tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1);
+            const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6);
+            tmp1 = __lasx_xvreplgr2vr_h(ls2);
+            tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1);
+            const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6);
+            sumi1 = __lasx_xvadd_w(p_1, sumi1);
+            sumi2 = __lasx_xvadd_w(p_2, sumi2);
+        }
+        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
+#else
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
+// ============================ 4-bit non-linear quants
+
+void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
+    assert(k % QK4_NL == 0);
+    quantize_row_iq4_nl_ref(x, y, k);
+}
+
+void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq4_xs(x, y, 1, k, NULL);
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/ggml-cpu-quants.h
new file mode 100644
index 000000000..e33d9d473
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML CPU internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+// Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
similarity index 99%
rename from ggml/src/ggml-cpu.c
rename to ggml/src/ggml-cpu/ggml-cpu.c
index de1de18ec..4c45146a1 100644
--- a/ggml/src/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1,13 +1,15 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
-#include "ggml-aarch64.h"
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-cpu-aarch64.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "ggml-quants.h"
+#include "ggml-cpu-quants.h"
+#include "ggml-threading.h"
 #include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -42,7 +44,7 @@
 #endif
 
 #ifdef GGML_USE_LLAMAFILE
-#include <llamafile/sgemm.h>
+#include "llamafile/sgemm.h"
 #endif
 
 #if defined(_MSC_VER)
@@ -104,9 +106,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
 // precomputed quick gelu table for f16 (128 KB)
 static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 
-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
-
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
@@ -261,11 +260,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
     },
     [GGML_TYPE_F16] = {
+        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type             = GGML_TYPE_F16,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q4_0] = {
+        .from_float               = quantize_row_q4_0,
         .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -275,6 +276,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
 #endif
     },
     [GGML_TYPE_Q4_1] = {
+        .from_float               = quantize_row_q4_1,
         .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type             = GGML_TYPE_Q8_1,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -283,27 +285,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
 #endif
     },
-    [4] = { // GGML_TYPE_Q4_2
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-        .nrows                    = 1,
-    },
-    [5] = { // GGML_TYPE_Q4_3
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-        .nrows                    = 1,
-    },
     [GGML_TYPE_Q5_0] = {
+        .from_float               = quantize_row_q5_0,
         .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q5_1] = {
+        .from_float               = quantize_row_q5_1,
         .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type             = GGML_TYPE_Q8_1,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q8_0] = {
+        .from_float               = quantize_row_q8_0,
         .from_float_to_mat        = quantize_mat_q8_0,
         .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
@@ -314,85 +309,106 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
 #endif
     },
     [GGML_TYPE_Q8_1] = {
+        .from_float               = quantize_row_q8_1,
         .vec_dot_type             = GGML_TYPE_Q8_1,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q2_K] = {
+        .from_float               = quantize_row_q2_K,
         .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q3_K] = {
+        .from_float               = quantize_row_q3_K,
         .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q4_K] = {
+        .from_float               = quantize_row_q4_K,
         .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q5_K] = {
+        .from_float               = quantize_row_q5_K,
         .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q6_K] = {
+        .from_float               = quantize_row_q6_K,
         .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_XXS] = {
+        .from_float               = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_XS] = {
+        .from_float               = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ3_XXS] = {
+        // NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
+        //.from_float               = quantize_row_iq3_xxs,
         .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ3_S] = {
+        //.from_float               = quantize_row_iq3_s,
         .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_S] = {
+        //.from_float               = quantize_row_iq2_s,
         .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ1_S] = {
+        .from_float               = NULL,
         .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ1_M] = {
+        .from_float               = NULL,
         .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ4_NL] = {
+        .from_float               = quantize_row_iq4_nl,
         .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ4_XS] = {
+        .from_float               = quantize_row_iq4_xs,
         .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_Q8_K] = {
+        .from_float               = quantize_row_q8_K,
+    },
     [GGML_TYPE_BF16] = {
+        .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type             = GGML_TYPE_BF16,
         .nrows                    = 1,
     },
     [GGML_TYPE_Q4_0_4_4] = {
+        .from_float               = NULL,
         .vec_dot                  = NULL,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
@@ -401,6 +417,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .gemm                     = ggml_gemm_q4_0_4x4_q8_0,
     },
     [GGML_TYPE_Q4_0_4_8] = {
+        .from_float               = NULL,
         .vec_dot                  = NULL,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
@@ -409,6 +426,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .gemm                     = ggml_gemm_q4_0_4x8_q8_0,
     },
     [GGML_TYPE_Q4_0_8_8] = {
+        .from_float               = NULL,
         .vec_dot                  = NULL,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
@@ -417,11 +435,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .gemm                     = ggml_gemm_q4_0_8x8_q8_0,
     },
     [GGML_TYPE_TQ1_0] = {
+        .from_float               = quantize_row_tq1_0,
         .vec_dot                  = ggml_vec_dot_tq1_0_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_TQ2_0] = {
+        .from_float               = quantize_row_tq2_0,
         .vec_dot                  = ggml_vec_dot_tq2_0_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
@@ -2250,22 +2270,7 @@ struct ggml_state {
     struct ggml_numa_nodes numa;
 };
 
-// global state
 static struct ggml_state g_state = {0};
-static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
-
-// TODO: move to threading file
-// critical section via spin lock
-void ggml_critical_section_start(void) {
-    while (atomic_flag_test_and_set(&g_state_critical)) {
-        // spin
-        sched_yield();
-    }
-}
-
-void ggml_critical_section_end(void) {
-    atomic_flag_clear(&g_state_critical);
-}
 
 static void ggml_barrier(struct ggml_threadpool * tp) {
     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
@@ -2997,8 +3002,8 @@ static void ggml_compute_forward_dup_f16(
                         id += ne00 * (ne01 - ir1);
                     }
                 }
-            } else if (ggml_get_type_traits(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
+            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
@@ -3278,8 +3283,8 @@ static void ggml_compute_forward_dup_bf16(
                         id += ne00 * (ne01 - ir1);
                     }
                 }
-            } else if (ggml_get_type_traits(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
+            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
@@ -3594,8 +3599,8 @@ static void ggml_compute_forward_dup_f32(
                         id += rs * (ne01 - ir1);
                     }
                 }
-            } else if (ggml_get_type_traits(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
+            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
 
                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -4377,7 +4382,7 @@ static void ggml_compute_forward_add_q_f32(
     const enum ggml_type type = src0->type;
     const enum ggml_type dtype = dst->type;
     ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
-    ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dtype)->from_float;
+    ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dtype)->from_float;
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -4679,7 +4684,7 @@ static void ggml_compute_forward_add1_q_f32(
 
     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
-    ggml_from_float_t const quantize_row_q = ggml_get_type_traits(type)->from_float;
+    ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(type)->from_float;
 
     // we don't support permuted src0
     GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -7428,7 +7433,7 @@ static void ggml_compute_forward_mul_mat(
     const enum ggml_type type = src0->type;
 
     enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type;
-    ggml_from_float_t        const from_float           = ggml_get_type_traits(vec_dot_type)->from_float;
+    ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float;
     ggml_from_float_to_mat_t const from_float_to_mat    = type_traits_cpu[vec_dot_type].from_float_to_mat;
     int64_t                  const vec_dot_num_rows     = type_traits_cpu[type].nrows;
     int64_t                  const matmul_num_cols      = type_traits_cpu[type].ncols;
@@ -7649,7 +7654,7 @@ static void ggml_compute_forward_mul_mat_id(
 
     ggml_vec_dot_t    const vec_dot         = type_traits_cpu[type].vec_dot;
     enum ggml_type    const vec_dot_type    = type_traits_cpu[type].vec_dot_type;
-    ggml_from_float_t const from_float      = ggml_get_type_traits(vec_dot_type)->from_float;
+    ggml_from_float_t const from_float      = type_traits_cpu[vec_dot_type].from_float;
     int64_t           const matmul_num_cols = type_traits_cpu[type].ncols;
     ggml_gemv_t       const gemv            = type_traits_cpu[type].gemv;
 
@@ -9159,12 +9164,6 @@ static void rope_yarn(
     *sin_theta = sinf(theta) * mscale;
 }
 
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
-}
-
 static void ggml_rope_cache_init(
      float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
      float * cache, float sin_sign, float theta_scale) {
@@ -9181,16 +9180,6 @@ static void ggml_rope_cache_init(
     }
 }
 
-void ggml_rope_yarn_corr_dims(
-    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = MAX(0, start);
-    dims[1] = MIN(n_dims - 1, end);
-}
-
 static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst,
@@ -10668,7 +10657,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     enum ggml_type    const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
-    ggml_from_float_t const q_to_vec_dot   = ggml_get_type_traits(k_vec_dot_type)->from_float;
+    ggml_from_float_t const q_to_vec_dot   = type_traits_cpu[k_vec_dot_type].from_float;
     ggml_vec_dot_t    const kq_vec_dot     = type_traits_cpu[k->type].vec_dot;
     ggml_to_float_t   const v_to_float     = ggml_get_type_traits(v->type)->to_float;
 
@@ -13759,6 +13748,151 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+
+int ggml_cpu_has_avx(void) {
+#if defined(__AVX__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx_vnni(void) {
+#if defined(__AVXVNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx2(void) {
+#if defined(__AVX2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512(void) {
+#if defined(__AVX512F__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vbmi(void) {
+#if defined(__AVX512VBMI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vnni(void) {
+#if defined(__AVX512VNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_bf16(void) {
+#if defined(__AVX512BF16__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_amx_int8(void) {
+#if defined(__AMX_INT8__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_fma(void) {
+#if defined(__FMA__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_arm_fma(void) {
+#if defined(__ARM_FEATURE_FMA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_riscv_v(void) {
+#if defined(__riscv_v_intrinsic)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_f16c(void) {
+#if defined(__F16C__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_fp16_va(void) {
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_wasm_simd(void) {
+#if defined(__wasm_simd128__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_llamafile(void) {
+#if defined(GGML_USE_LLAMAFILE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_ssse3(void) {
+#if defined(__SSSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_vsx(void) {
+#if defined(__POWER9_VECTOR__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH)
     return ggml_arm_arch_features.has_neon;
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
new file mode 100644
index 000000000..c7216117b
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -0,0 +1,575 @@
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include <cctype>
+#include <string>
+#include <vector>
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+// ggml-backend interface
+
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context  = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+    static ggml_backend_buffer_type_t bufts[] = {
+#ifdef GGML_USE_CPU_HBM
+        ggml_backend_cpu_hbm_buffer_type(),
+#endif
+        NULL
+    };
+
+    return bufts;
+
+    GGML_UNUSED(device);
+}
+
+// CPU backend - backend (stream)
+
+struct ggml_backend_cpu_context {
+    int                 n_threads;
+    ggml_threadpool_t   threadpool;
+
+    uint8_t *           work_data;
+    size_t              work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
+};
+
+static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
+    return "CPU";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    delete[] cpu_ctx->work_data;
+    delete cpu_ctx;
+    delete backend;
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
+        if (cpu_plan->cplan.work_data == NULL) {
+            delete cpu_plan;
+            return NULL;
+        }
+    }
+
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    delete[] cpu_plan->cplan.work_data;
+    delete cpu_plan;
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        delete[] cpu_ctx->work_data;
+        cpu_ctx->work_data = new uint8_t[cplan.work_size];
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        cpu_ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return ggml_graph_compute(cgraph, &cplan);
+}
+
+static const struct ggml_backend_i ggml_backend_cpu_i = {
+    /* .get_name                = */ ggml_backend_cpu_get_name,
+    /* .free                    = */ ggml_backend_cpu_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    // initialize CPU backend now to avoid slowing the first graph computation
+    ggml_cpu_init();
+
+    struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
+    if (ctx == NULL) {
+        return NULL;
+    }
+
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
+
+    ggml_backend_t cpu_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_cpu_guid(),
+        /* .interface = */ ggml_backend_cpu_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context   = */ ctx,
+    };
+
+    if (cpu_backend == NULL) {
+        delete ctx;
+        return NULL;
+    }
+
+    return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
+// CPU backend - device
+
+struct ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                TEXT("ProcessorNameString"),
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    TEXT("ProcessorNameString"),
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
+static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
+    return "CPU";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
+    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
+
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_cpu_device_get_name(dev);
+    props->description = ggml_backend_cpu_device_get_description(dev);
+    props->type        = ggml_backend_cpu_device_get_type(dev);
+    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_cpu_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ1_S   &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_OUT_PROD:
+            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
+    /* .get_name             = */ ggml_backend_cpu_device_get_name,
+    /* .get_description      = */ ggml_backend_cpu_device_get_description,
+    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
+    /* .get_type             = */ ggml_backend_cpu_device_get_type,
+    /* .get_props            = */ ggml_backend_cpu_device_get_props,
+    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// CPU backend - backend (reg)
+
+static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
+    return "CPU";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_cpu_device_context ctx;
+    static ggml_backend_device ggml_backend_cpu_device = {
+        /* .iface   = */ ggml_backend_cpu_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ &ctx,
+    };
+
+    return &ggml_backend_cpu_device;
+}
+
+struct ggml_backend_feature {
+    const char * name;
+    const char * value;
+};
+
+// Not used yet
+// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
+static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        std::vector<ggml_backend_feature> features;
+        if (ggml_cpu_has_sse3()) {
+            features.push_back({ "SSE3", "1" });
+        }
+        if (ggml_cpu_has_ssse3()) {
+            features.push_back({ "SSSE3", "1" });
+        }
+        if (ggml_cpu_has_avx()) {
+            features.push_back({ "AVX", "1" });
+        }
+        if (ggml_cpu_has_avx2()) {
+            features.push_back({ "AVX2", "1" });
+        }
+        if (ggml_cpu_has_f16c()) {
+            features.push_back({ "F16C", "1" });
+        }
+        if (ggml_cpu_has_fma()) {
+            features.push_back({ "FMA", "1" });
+        }
+        if (ggml_cpu_has_avx_vnni()) {
+            features.push_back({ "AVX_VNNI", "1" });
+        }
+        if (ggml_cpu_has_avx512()) {
+            features.push_back({ "AVX512", "1" });
+        }
+        if (ggml_cpu_has_avx512_vbmi()) {
+            features.push_back({ "AVX512_VBMI", "1" });
+        }
+        if (ggml_cpu_has_avx512_vnni()) {
+            features.push_back({ "AVX512_VNNI", "1" });
+        }
+        if (ggml_cpu_has_avx512_bf16()) {
+            features.push_back({ "AVX512_BF16", "1" });
+        }
+        if (ggml_cpu_has_amx_int8()) {
+            features.push_back({ "AMX_INT8", "1" });
+        }
+        if (ggml_cpu_has_neon()) {
+            features.push_back({ "NEON", "1" });
+        }
+        if (ggml_cpu_has_arm_fma()) {
+            features.push_back({ "ARM_FMA", "1" });
+        }
+        if (ggml_cpu_has_fp16_va()) {
+            features.push_back({ "FP16_VA", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
+        if (ggml_cpu_has_sve()) {
+            features.push_back({ "SVE", "1" });
+        }
+        if (ggml_cpu_get_sve_cnt() > 0) {
+            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
+            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
+        }
+        if (ggml_cpu_has_riscv_v()) {
+            features.push_back({ "RISCV_V", "1" });
+        }
+        if (ggml_cpu_has_vsx()) {
+            features.push_back({ "VSX", "1" });
+        }
+        if (ggml_cpu_has_wasm_simd()) {
+            features.push_back({ "WASM_SIMD", "1" });
+        }
+        if (ggml_cpu_has_llamafile()) {
+            features.push_back({ "LLAMAFILE", "1" });
+        }
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_cpu_set_n_threads;
+    }
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+        return (void *)ggml_backend_cpu_get_extra_bufts;
+    }
+
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
+    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    static struct ggml_backend_reg ggml_backend_cpu_reg = {
+        /* .iface   = */ ggml_backend_cpu_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_reg;
+}
diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
similarity index 100%
rename from ggml/src/llamafile/sgemm.cpp
rename to ggml/src/ggml-cpu/llamafile/sgemm.cpp
diff --git a/ggml/src/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
similarity index 100%
rename from ggml/src/llamafile/sgemm.h
rename to ggml/src/ggml-cpu/llamafile/sgemm.h
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
new file mode 100644
index 000000000..40ed2bdf3
--- /dev/null
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -0,0 +1,165 @@
+cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
+
+find_package(CUDAToolkit)
+
+if (CUDAToolkit_FOUND)
+    message(STATUS "CUDA Toolkit found")
+
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # 52 == lowest CUDA 12 standard
+        # 60 == FP16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
+            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
+    enable_language(CUDA)
+
+    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
+    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
+
+    file(GLOB   GGML_SOURCES_CUDA "*.cu")
+    file(GLOB   SRCS "template-instances/fattn-wmma*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB   SRCS "template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
+    if (GGML_CUDA_FA_ALL_QUANTS)
+        file(GLOB   SRCS "template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    endif()
+
+    add_library(ggml-cuda
+                ${GGML_HEADERS_CUDA}
+                ${GGML_SOURCES_CUDA}
+                )
+
+    target_link_libraries(ggml-cuda PRIVATE ggml-base)
+    target_include_directories(ggml-cuda PRIVATE . ..)
+
+    # TODO: change the definitions to this target only
+
+    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
+    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+
+    if (GGML_CUDA_GRAPHS)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+    endif()
+
+    if (GGML_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    if (GGML_CUDA_FORCE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
+
+    if (DEFINED GGML_CUDA_DMMV_Y)
+        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
+    endif()
+
+    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
+    endif()
+
+    if (GGML_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
+    if (GGML_STATIC)
+        if (WIN32)
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+        else ()
+            target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        endif()
+    else()
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
+    endif()
+
+    set(CUDA_CXX_FLAGS "")
+
+    set(CUDA_FLAGS -use_fast_math)
+
+    if (GGML_FATAL_WARNINGS)
+        list(APPEND CUDA_FLAGS -Werror all-warnings)
+    endif()
+
+    if (GGML_ALL_WARNINGS AND NOT MSVC)
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+        endif()
+
+        execute_process(
+            COMMAND ${NVCC_CMD} -Xcompiler --version
+            OUTPUT_VARIABLE CUDA_CCFULLVER
+            ERROR_QUIET
+        )
+
+        if (NOT CUDA_CCFULLVER MATCHES clang)
+            set(CUDA_CCID "GNU")
+            execute_process(
+                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+                OUTPUT_VARIABLE CUDA_CCVER
+                ERROR_QUIET
+            )
+        else()
+            if (CUDA_CCFULLVER MATCHES Apple)
+                set(CUDA_CCID "AppleClang")
+            else()
+                set(CUDA_CCID "Clang")
+            endif()
+            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
+        endif()
+
+        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+
+        get_flags(${CUDA_CCID} ${CUDA_CCVER})
+        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
+    endif()
+
+    if (NOT MSVC)
+        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+    endif()
+
+    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
+
+    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+    endif()
+
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+else()
+    message(FATAL_ERROR "CUDA Toolkit not found")
+endif()
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index dd203fcde..e146c691c 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -6,7 +6,7 @@
 #include <cstdint>
 #include <memory>
 
-#if defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_HIP)
 #define GGML_COMMON_DECL_HIP
 #define GGML_COMMON_IMPL_HIP
 #else
@@ -26,13 +26,13 @@
 #include <string>
 #include <vector>
 
-#if defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_HIP)
 #include "vendors/hip.h"
 #elif defined(GGML_USE_MUSA)
 #include "vendors/musa.h"
 #else
 #include "vendors/cuda.h"
-#endif // defined(GGML_USE_HIPBLAS)
+#endif // defined(GGML_USE_HIP)
 
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
 
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIP)
 static const char * cu_get_error_str(CUresult err) {
     const char * err_str;
     cuGetErrorString(err, &err_str);
@@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16
 
-#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 
 #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
 #define FLASH_ATTN_AVAILABLE
@@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -176,7 +176,7 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__
 
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
 }
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
@@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
         a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
     }
     return a;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #else
     NO_DEVICE_CODE;
@@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 
 #else
    NO_DEVICE_CODE;
@@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 
 #if CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
@@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 }
 
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
@@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
@@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;
 
-#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
     return __dp4a(a, b, c);
@@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
 
 // TODO: move to ggml-common.h
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index 1fb5c09c3..ee9752da6 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
 }
 
 template<int D, int parallel_blocks> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
         const float  * __restrict__ VKQ_parts,
         const float2 * __restrict__ VKQ_meta,
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
index 5af02c7ec..4d314dacb 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64
 
 template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_tile_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
index f402195ce..bb3360447 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32
 
 template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_tile_ext_f32(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index 2ed6509ac..5ec3b91ae 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"
 
 template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
index bf5125902..3d93f4a8a 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"
 
 template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f32(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
index b10d19d93..860d0e6dc 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
@@ -7,9 +7,9 @@
 
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
similarity index 99%
rename from ggml/src/ggml-cuda.cu
rename to ggml/src/ggml-cuda/ggml-cuda.cu
index b5096b3ee..07f043328 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -91,7 +91,7 @@ int ggml_cuda_get_device() {
 
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
+#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
     auto res = hipMallocManaged(ptr, size);
     if (res == hipSuccess) {
         // if error we "need" to know why...
@@ -100,7 +100,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return res;
 #else
 
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
@@ -113,7 +113,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 #else
     return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIP)
 
 #endif
 }
@@ -151,7 +151,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -163,7 +163,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
@@ -175,13 +175,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.devices[id].nsm   = prop.multiProcessorCount;
         info.devices[id].smpb  = prop.sharedMemPerBlock;
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
         info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     }
 
     for (int id = 0; id < info.device_count; ++id) {
@@ -299,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -393,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 
@@ -1325,7 +1325,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
     void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
 
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
     cudaMemcpy3DPeerParms p = {};
     p.dstDevice = dstDevice;
@@ -1339,7 +1339,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
     GGML_UNUSED(dstDevice);
     GGML_UNUSED(srcDevice);
     return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }
 
 static void ggml_cuda_op_mul_mat(
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 021a25682..425acb20d 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -100,9 +100,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 128;
 #else // INT8_MMA_AVAILABLE
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     return 128;
-#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #if __CUDA_ARCH__ >= CC_VOLTA
 #ifdef GGML_CUDA_FORCE_MMQ
@@ -115,7 +115,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 64;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #endif // INT8_MMA_AVAILABLE
 }
 
@@ -124,7 +124,7 @@ static constexpr int get_mmq_y_host(const int cc) {
 }
 
 static constexpr __device__ int get_mmq_y_device() {
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA1)
     return 64;
 #else
@@ -136,7 +136,7 @@ static constexpr __device__ int get_mmq_y_device() {
 #else
     return 64;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
 
 #define MMQ_DP4A_TXS_Q4_0    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_0   + mmq_y/QI4_0,     0}
@@ -2569,7 +2569,7 @@ static __device__ void mul_mat_q_process_tile(
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
@@ -2579,7 +2579,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #else
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // __CUDA_ARCH__ >= CC_VOLTA
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 static __global__ void mul_mat_q(
     const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
     const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
@@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
     constexpr int mmq_y = get_mmq_y_device();
 
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
     {
         constexpr bool fixup = false;
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
@@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
                 blockIdx.x, blockIdx.y, 0, ne00/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
 
     const     int64_t blocks_per_ne00 = ne00 / qk;
     constexpr int     blocks_per_iter = MMQ_ITER_K / qk;
@@ -2765,14 +2765,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
     static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
     if (!shmem_limit_raised[id]) {
         CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
         CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>,  cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
         shmem_limit_raised[id] = true;
     }
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 
     const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
     const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 7dbbc9939..735975c16 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
 }
 
 template <ggml_type type, int ncols_y>
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 // tell the compiler to use as many registers as it wants, see nwarps definition below
 __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
 
     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
     constexpr int nwarps              = 1;
     constexpr int rows_per_cuda_block = 1;
 #else
     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
 
     const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
     const     int row0 = rows_per_cuda_block*blockIdx.x;
diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu
index 0583e4fe0..31cfe5394 100644
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -1,6 +1,6 @@
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
 #define USE_CUB
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
 
 #ifdef USE_CUB
 // On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
new file mode 100644
index 000000000..5ed186ded
--- /dev/null
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -0,0 +1,113 @@
+if (NOT EXISTS $ENV{ROCM_PATH})
+    if (NOT EXISTS /opt/rocm)
+        set(ROCM_PATH /usr)
+    else()
+        set(ROCM_PATH /opt/rocm)
+    endif()
+else()
+    set(ROCM_PATH $ENV{ROCM_PATH})
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
+
+# CMake on Windows doesn't support the HIP language yet
+if (WIN32)
+    set(CXX_IS_HIPCC TRUE)
+else()
+    string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
+endif()
+
+if (CXX_IS_HIPCC)
+    if (LINUX)
+        if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+            message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+        endif()
+
+        message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                " Prefer setting the HIP compiler directly. See README for details.")
+    endif()
+else()
+    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+    endif()
+    cmake_minimum_required(VERSION 3.21)
+    enable_language(HIP)
+endif()
+
+find_package(hip     REQUIRED)
+find_package(hipblas REQUIRED)
+find_package(rocblas REQUIRED)
+
+message(STATUS "HIP and hipBLAS found")
+
+file(GLOB   GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
+list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
+
+file(GLOB   GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
+file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
+file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+if (GGML_CUDA_FA_ALL_QUANTS)
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+else()
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+endif()
+
+add_library(ggml-hip
+            ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_ROCM})
+
+target_link_libraries(ggml-hip PRIVATE ggml-base)
+target_include_directories(ggml-hip PRIVATE . ..)
+
+# TODO: do not use CUDA definitions for HIP
+target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+
+add_compile_definitions(GGML_USE_HIP)
+add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
+add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
+add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
+
+if (GGML_HIP_UMA)
+    add_compile_definitions(GGML_HIP_UMA)
+endif()
+
+if (GGML_CUDA_FORCE_DMMV)
+    add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+endif()
+
+if (GGML_CUDA_FORCE_MMQ)
+    add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+endif()
+
+if (GGML_CUDA_FORCE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+endif()
+
+if (GGML_CUDA_NO_PEER_COPY)
+    add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+endif()
+
+if (CXX_IS_HIPCC)
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+    target_link_libraries(ggml-hip PRIVATE hip::device)
+else()
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+endif()
+
+if (GGML_STATIC)
+    message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+endif()
+
+target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index af29a26f0..aa4d2b85d 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -3,13 +3,29 @@
 // GGML internal header
 
 #include "ggml.h"
-
 #include <assert.h>
+#include <math.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 #include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+#if defined(__ARM_NEON)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+
+#if defined(__F16C__)
+#include <immintrin.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -28,13 +44,13 @@ extern "C" {
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
+    #ifndef static_assert
+        #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+            #define static_assert(cond, msg) _Static_assert(cond, msg)
+        #else
+            #define static_assert(cond, msg) struct global_scope_noop_trick
+        #endif
+    #endif
 #endif
 
 static inline int ggml_up32(int n) {
@@ -120,14 +136,12 @@ struct ggml_map_custom1_op_params {
     void             * userdata;
 };
 
-
 struct ggml_map_custom2_op_params {
     ggml_custom2_op_t   fun;
     int                 n_tasks;
     void              * userdata;
 };
 
-
 struct ggml_map_custom3_op_params {
     ggml_custom3_op_t fun;
     int n_tasks;
@@ -287,9 +301,249 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 void * ggml_aligned_malloc(size_t size);
 void ggml_aligned_free(void * ptr, size_t size);
 
-// TODO: move to threading file
-void ggml_critical_section_start(void);
-void ggml_critical_section_end(void);
+// FP16 to FP32 conversion
+
+#if defined(__ARM_NEON)
+    #ifdef _MSC_VER
+        typedef uint16_t ggml_fp16_internal_t;
+    #else
+        typedef __fp16 ggml_fp16_internal_t;
+    #endif
+#endif
+
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+    #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        ggml_fp16_internal_t tmp;
+        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+        return (float)tmp;
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        ggml_fp16_t res;
+        ggml_fp16_internal_t tmp = f;
+        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+        return res;
+    }
+
+#elif defined(__F16C__)
+
+    #ifdef _MSC_VER
+        #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+        #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+    #else
+        #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+        #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+    #endif
+
+#elif defined(__POWER9_VECTOR__)
+
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+    /* the inline asm below is about 12% faster than the lookup method */
+    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        register float f;
+        register double d;
+        __asm__(
+            "mtfprd %0,%2\n"
+            "xscvhpdp %0,%0\n"
+            "frsp %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=f"(f):
+            /* in */   "r"(h));
+        return f;
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        register double d;
+        register ggml_fp16_t r;
+        __asm__( /* xscvdphp can work on double or single precision */
+            "xscvdphp %0,%2\n"
+            "mffprd %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=r"(r):
+            /* in */   "f"(f));
+        return r;
+    }
+
+#else
+
+    // FP16 <-> FP32
+    // ref: https://github.com/Maratyszcza/FP16
+
+    static inline float fp32_from_bits(uint32_t w) {
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } fp32;
+        fp32.as_bits = w;
+        return fp32.as_value;
+    }
+
+    static inline uint32_t fp32_to_bits(float f) {
+        union {
+            float as_value;
+            uint32_t as_bits;
+        } fp32;
+        fp32.as_value = f;
+        return fp32.as_bits;
+    }
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        const uint32_t w = (uint32_t) h << 16;
+        const uint32_t sign = w & UINT32_C(0x80000000);
+        const uint32_t two_w = w + w;
+
+        const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+        const float exp_scale = 0x1.0p-112f;
+    #else
+        const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+    #endif
+        const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+        const uint32_t magic_mask = UINT32_C(126) << 23;
+        const float magic_bias = 0.5f;
+        const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+        const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+        const uint32_t result = sign |
+            (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+        return fp32_from_bits(result);
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+        const float scale_to_inf = 0x1.0p+112f;
+        const float scale_to_zero = 0x1.0p-110f;
+    #else
+        const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+        const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+    #endif
+        float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+        const uint32_t w = fp32_to_bits(f);
+        const uint32_t shl1_w = w + w;
+        const uint32_t sign = w & UINT32_C(0x80000000);
+        uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+        if (bias < UINT32_C(0x71000000)) {
+            bias = UINT32_C(0x71000000);
+        }
+
+        base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+        const uint32_t bits = fp32_to_bits(base);
+        const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+        const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+        const uint32_t nonsign = exp_bits + mantissa_bits;
+        return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+    }
+
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+GGML_API float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt
new file mode 100644
index 000000000..0bd027c7f
--- /dev/null
+++ b/ggml/src/ggml-kompute/CMakeLists.txt
@@ -0,0 +1,162 @@
+
+find_package(Vulkan COMPONENTS glslc REQUIRED)
+find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
+
+if (NOT glslc_executable)
+    message(FATAL_ERROR "glslc not found")
+endif()
+
+add_library(ggml-kompute
+            ggml-kompute.cpp
+            ../../include/ggml-kompute.h
+            )
+
+target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
+target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+
+add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
+
+function(compile_shader)
+    set(options)
+    set(oneValueArgs)
+    set(multiValueArgs SOURCES)
+    cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    foreach(source ${compile_shader_SOURCES})
+        get_filename_component(filename ${source} NAME)
+        set(spv_file ${filename}.spv)
+        add_custom_command(
+            OUTPUT ${spv_file}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
+            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMENT "Compiling ${source} to ${spv_file}"
+            )
+
+        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
+        set(FILE_NAME "shader${RAW_FILE_NAME}")
+        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
+        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
+        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
+            add_custom_command(
+                OUTPUT ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                DEPENDS ${spv_file} xxd
+                COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
+                )
+        else()
+            add_custom_command(
+                OUTPUT ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                DEPENDS ${spv_file} xxd
+                COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
+                )
+        endif()
+    endforeach()
+endfunction()
+
+if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+    message(STATUS "Kompute found")
+    set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
+    add_subdirectory(kompute)
+
+    # Compile our shaders
+    compile_shader(SOURCES
+        kompute-shaders/op_scale.comp
+        kompute-shaders/op_scale_8.comp
+        kompute-shaders/op_add.comp
+        kompute-shaders/op_addrow.comp
+        kompute-shaders/op_mul.comp
+        kompute-shaders/op_silu.comp
+        kompute-shaders/op_relu.comp
+        kompute-shaders/op_gelu.comp
+        kompute-shaders/op_softmax.comp
+        kompute-shaders/op_norm.comp
+        kompute-shaders/op_rmsnorm.comp
+        kompute-shaders/op_diagmask.comp
+        kompute-shaders/op_mul_mat_mat_f32.comp
+        kompute-shaders/op_mul_mat_f16.comp
+        kompute-shaders/op_mul_mat_q8_0.comp
+        kompute-shaders/op_mul_mat_q4_0.comp
+        kompute-shaders/op_mul_mat_q4_1.comp
+        kompute-shaders/op_mul_mat_q4_k.comp
+        kompute-shaders/op_mul_mat_q6_k.comp
+        kompute-shaders/op_getrows_f32.comp
+        kompute-shaders/op_getrows_f16.comp
+        kompute-shaders/op_getrows_q4_0.comp
+        kompute-shaders/op_getrows_q4_1.comp
+        kompute-shaders/op_getrows_q6_k.comp
+        kompute-shaders/op_rope_f16.comp
+        kompute-shaders/op_rope_f32.comp
+        kompute-shaders/op_cpy_f16_f16.comp
+        kompute-shaders/op_cpy_f16_f32.comp
+        kompute-shaders/op_cpy_f32_f16.comp
+        kompute-shaders/op_cpy_f32_f32.comp
+    )
+
+    # Create a custom target for our generated shaders
+    add_custom_target(generated_shaders DEPENDS
+        shaderop_scale.h
+        shaderop_scale_8.h
+        shaderop_add.h
+        shaderop_addrow.h
+        shaderop_mul.h
+        shaderop_silu.h
+        shaderop_relu.h
+        shaderop_gelu.h
+        shaderop_softmax.h
+        shaderop_norm.h
+        shaderop_rmsnorm.h
+        shaderop_diagmask.h
+        shaderop_mul_mat_mat_f32.h
+        shaderop_mul_mat_f16.h
+        shaderop_mul_mat_q8_0.h
+        shaderop_mul_mat_q4_0.h
+        shaderop_mul_mat_q4_1.h
+        shaderop_mul_mat_q4_k.h
+        shaderop_mul_mat_q6_k.h
+        shaderop_getrows_f32.h
+        shaderop_getrows_f16.h
+        shaderop_getrows_q4_0.h
+        shaderop_getrows_q4_1.h
+        shaderop_getrows_q6_k.h
+        shaderop_rope_f16.h
+        shaderop_rope_f32.h
+        shaderop_cpy_f16_f16.h
+        shaderop_cpy_f16_f32.h
+        shaderop_cpy_f32_f16.h
+        shaderop_cpy_f32_f32.h
+    )
+
+    # Create a custom command that depends on the generated_shaders
+    add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+        COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+        DEPENDS generated_shaders
+        COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
+    )
+
+    # Add the stamp to the main sources to ensure dependency tracking
+    target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+else()
+    message(WARNING "Kompute not found")
+endif()
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
similarity index 100%
rename from ggml/src/ggml-kompute.cpp
rename to ggml/src/ggml-kompute/ggml-kompute.cpp
diff --git a/ggml/src/kompute b/ggml/src/ggml-kompute/kompute
similarity index 100%
rename from ggml/src/kompute
rename to ggml/src/ggml-kompute/kompute
diff --git a/ggml/src/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp
similarity index 100%
rename from ggml/src/kompute-shaders/common.comp
rename to ggml/src/ggml-kompute/kompute-shaders/common.comp
diff --git a/ggml/src/kompute-shaders/op_add.comp b/ggml/src/ggml-kompute/kompute-shaders/op_add.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_add.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_add.comp
diff --git a/ggml/src/kompute-shaders/op_addrow.comp b/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_addrow.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp
diff --git a/ggml/src/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_cpy_f16_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp
diff --git a/ggml/src/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_cpy_f16_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp
diff --git a/ggml/src/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_cpy_f32_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp
diff --git a/ggml/src/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_cpy_f32_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp
diff --git a/ggml/src/kompute-shaders/op_diagmask.comp b/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_diagmask.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp
diff --git a/ggml/src/kompute-shaders/op_gelu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_gelu.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp
diff --git a/ggml/src/kompute-shaders/op_getrows.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_getrows.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp
diff --git a/ggml/src/kompute-shaders/op_getrows_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_getrows_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp
diff --git a/ggml/src/kompute-shaders/op_getrows_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_getrows_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp
diff --git a/ggml/src/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_getrows_q4_0.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp
diff --git a/ggml/src/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_getrows_q4_1.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp
diff --git a/ggml/src/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_getrows_q6_k.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp
diff --git a/ggml/src/kompute-shaders/op_mul.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_q4_0.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_q4_1.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_q4_k.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_q6_k.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mat_q8_0.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mv_q_n.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
diff --git a/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
diff --git a/ggml/src/kompute-shaders/op_norm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_norm.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_norm.comp
diff --git a/ggml/src/kompute-shaders/op_relu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_relu.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_relu.comp
diff --git a/ggml/src/kompute-shaders/op_rmsnorm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_rmsnorm.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp
diff --git a/ggml/src/kompute-shaders/op_rope_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_rope_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
diff --git a/ggml/src/kompute-shaders/op_rope_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_rope_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
diff --git a/ggml/src/kompute-shaders/op_scale.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_scale.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_scale.comp
diff --git a/ggml/src/kompute-shaders/op_scale_8.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_scale_8.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp
diff --git a/ggml/src/kompute-shaders/op_silu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_silu.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_silu.comp
diff --git a/ggml/src/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
similarity index 100%
rename from ggml/src/kompute-shaders/op_softmax.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
diff --git a/ggml/src/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
similarity index 100%
rename from ggml/src/kompute-shaders/rope_common.comp
rename to ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt
new file mode 100644
index 000000000..e0992c744
--- /dev/null
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@@ -0,0 +1,104 @@
+find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+
+message(STATUS "Metal framework found")
+
+add_library(ggml-metal
+            ggml-metal.m
+            )
+
+target_link_libraries(ggml-metal PRIVATE
+                      ggml-base
+                      ${FOUNDATION_LIBRARY}
+                      ${METAL_FRAMEWORK}
+                      ${METALKIT_FRAMEWORK}
+                      )
+
+target_include_directories(ggml-metal PRIVATE . ..)
+
+if (GGML_METAL_NDEBUG)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+endif()
+
+if (GGML_METAL_USE_BF16)
+    add_compile_definitions(GGML_METAL_USE_BF16)
+endif()
+
+# copy ggml-common.h and ggml-metal.metal to bin directory
+configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
+configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+
+if (GGML_METAL_EMBED_LIBRARY)
+    enable_language(ASM)
+
+    add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
+
+    set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
+    set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+
+    file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+
+    # merge ggml-common.h and ggml-metal.metal into a single file
+    set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+    set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
+
+    add_custom_command(
+        OUTPUT ${METALLIB_EMBED_ASM}
+        COMMAND echo "Embedding Metal library"
+        COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+        COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
+        DEPENDS ggml-metal.metal ../ggml-common.h
+        COMMENT "Generate assembly for embedded Metal library"
+    )
+
+    target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
+else()
+    if (GGML_METAL_SHADER_DEBUG)
+        # custom command to do the following:
+        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+        #
+        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+        #       disabling fast math is needed in order to pass tests/test-backend-ops
+        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+        set(XC_FLAGS -fno-fast-math -fno-inline -g)
+    else()
+        set(XC_FLAGS -O3)
+    endif()
+
+    # Append macOS metal versioning flags
+    if (GGML_METAL_MACOSX_VERSION_MIN)
+        message(STATUS "Adding  -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+        list   (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
+    endif()
+
+    if (GGML_METAL_STD)
+        message(STATUS "Adding  -std=${GGML_METAL_STD} flag to metal compilation")
+        list   (APPEND XC_FLAGS -std=${GGML_METAL_STD})
+    endif()
+
+    add_custom_command(
+        OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+        DEPENDS ggml-metal.metal ggml-common.h
+        COMMENT "Compiling Metal kernels"
+        )
+
+    # FIXME: only add to the ggml-metal target?
+    add_custom_target(
+        ggml-metal-lib ALL
+        DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        )
+endif() # GGML_METAL_EMBED_LIBRARY
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
similarity index 99%
rename from ggml/src/ggml-metal.m
rename to ggml/src/ggml-metal/ggml-metal.m
index 04ec5117f..b4b5cfd26 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -516,6 +516,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
                     [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
                 }
 
+#if GGML_METAL_EMBED_LIBRARY
+                [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
+#endif
+
                 MTLCompileOptions * options = [MTLCompileOptions new];
                 options.preprocessorMacros = prep;
 
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
similarity index 99%
rename from ggml/src/ggml-metal.metal
rename to ggml/src/ggml-metal/ggml-metal.metal
index e8b71a9f8..8c7fcb113 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1,6 +1,11 @@
 #define GGML_COMMON_DECL_METAL
 #define GGML_COMMON_IMPL_METAL
-#include "ggml-common.h"
+#if defined(GGML_METAL_EMBED_LIBRARY)
+__embed_ggml-common.h__
+#else
+// TODO: this should not be a relative path, but can't figure out how to set Metal include paths in Package.swift
+#include "../ggml-common.h"
+#endif
 
 #include <metal_stdlib>
 
@@ -15,8 +20,8 @@ using namespace metal;
 // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
 //
 // cmd:
-//   .../usr/bin/metal -dM -E -c                             ggml/src/ggml-metal.metal
-//   .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal
+//   .../usr/bin/metal -dM -E -c                             ggml/src/ggml-metal/ggml-metal.metal
+//   .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal/ggml-metal.metal
 //
 #if __METAL_VERSION__ < 310 && defined(GGML_METAL_USE_BF16)
 #undef GGML_METAL_USE_BF16
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
new file mode 100644
index 000000000..8edc75cc5
--- /dev/null
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -0,0 +1,111 @@
+if (NOT EXISTS $ENV{MUSA_PATH})
+    if (NOT EXISTS /opt/musa)
+        set(MUSA_PATH /usr/local/musa)
+    else()
+        set(MUSA_PATH /opt/musa)
+    endif()
+else()
+    set(MUSA_PATH $ENV{MUSA_PATH})
+endif()
+
+set(CMAKE_C_COMPILER "${MUSA_PATH}/bin/clang")
+set(CMAKE_C_EXTENSIONS OFF)
+set(CMAKE_CXX_COMPILER "${MUSA_PATH}/bin/clang++")
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")
+
+find_package(MUSAToolkit)
+
+if (MUSAToolkit_FOUND)
+    message(STATUS "MUSA Toolkit found")
+
+    file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
+    list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+
+    file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+
+    if (GGML_CUDA_FA_ALL_QUANTS)
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    endif()
+
+    set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
+    foreach(SOURCE ${GGML_SOURCES_MUSA})
+        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
+    endforeach()
+
+    add_library(ggml-musa
+                ${GGML_HEADERS_MUSA}
+                ${GGML_SOURCES_MUSA})
+
+    target_link_libraries(ggml-musa PRIVATE ggml-base)
+    target_include_directories(ggml-musa PRIVATE . ..)
+
+    # TODO: do not use CUDA definitions for MUSA
+    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+
+    add_compile_definitions(GGML_USE_MUSA)
+    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
+    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+
+    if (GGML_CUDA_GRAPHS)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+    endif()
+
+    if (GGML_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    if (GGML_CUDA_FORCE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
+
+    if (DEFINED GGML_CUDA_DMMV_Y)
+        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
+    endif()
+
+    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
+    endif()
+
+    if (GGML_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
+    if (GGML_STATIC)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
+    else()
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the musa driver lib (libmusa.so)
+    else()
+        target_link_libraries(ggml-musa PRIVATE MUSA::musa_driver)
+    endif()
+else()
+    message(FATAL_ERROR "MUSA Toolkit not found")
+endif()
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 82a463f27..7301a9c6c 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
-#include "ggml-cpu-impl.h"
+#include "ggml-cpu/ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 
 #include <math.h>
@@ -27,643 +27,6 @@
 
 #define UNUSED GGML_UNUSED
 
-// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
-#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = _mm_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = _mm_sign_epi8(y, x);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = _mm_maddubs_epi16(ax, sy);
-    const __m128i ones = _mm_set1_epi16(1);
-    return _mm_madd_epi16(ones, dot);
-}
-
-#if __AVX__ || __AVX2__ || __AVX512F__
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-    __m128 res = _mm256_extractf128_ps(x, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    return _mm_cvtss_f32(res);
-}
-
-// horizontally add 8 int32_t
-static inline int hsum_i32_8(const __m256i a) {
-    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-// horizontally add 4 int32_t
-static inline int hsum_i32_4(const __m128i a) {
-    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
-    const __m128i sum64 = _mm_add_epi32(hi64, a);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-#if defined(__AVX2__) || defined(__AVX512F__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m256i shuf_mask = _mm256_set_epi64x(
-            0x0303030303030303, 0x0202020202020202,
-            0x0101010101010101, 0x0000000000000000);
-    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
-    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytes = _mm256_or_si256(bytes, bit_mask);
-    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    return _mm256_and_si256(lowMask, bytes);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-    const __m256i ones = _mm256_set1_epi16(1);
-    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_float(dot);
-#endif
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-#if __AVXVNNIINT8__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return mul_sum_us8_pairs_float(ax, sy);
-#endif
-}
-
-static inline __m128i packNibbles( __m256i bytes )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-#if __AVX512F__
-    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
-    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
-    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
-#else
-    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
-    __m256i high = _mm256_andnot_si256( lowByte, bytes );
-    __m256i low = _mm256_and_si256( lowByte, bytes );
-    high = _mm256_srli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-
-    // Compress uint16_t lanes into bytes
-    __m128i r0 = _mm256_castsi256_si128( bytes );
-    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
-    return _mm_packus_epi16( r0, r1 );
-#endif
-}
-#elif defined(__AVX__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
-    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
-    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
-    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytesl = _mm_or_si128(bytesl, bit_mask);
-    bytesh = _mm_or_si128(bytesh, bit_mask);
-    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
-    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return MM256_SET_M128I(bytesh, bytesl);
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    // Load 16 bytes from memory
-    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
-    __m128i tmph = _mm_srli_epi16(tmpl, 4);
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    tmpl = _mm_and_si128(lowMask, tmpl);
-    tmph = _mm_and_si128(lowMask, tmph);
-    return MM256_SET_M128I(tmph, tmpl);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
-    const __m128i ones = _mm_set1_epi16(1);
-    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
-    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-    const __m128i axl = _mm256_castsi256_si128(ax);
-    const __m128i axh = _mm256_extractf128_si256(ax, 1);
-    const __m128i syl = _mm256_castsi256_si128(sy);
-    const __m128i syh = _mm256_extractf128_si256(sy, 1);
-    // Perform multiplication and create 16-bit values
-    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-    const __m128i doth = _mm_maddubs_epi16(axh, syh);
-    return sum_i16_pairs_float(doth, dotl);
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-    const __m128i xl = _mm256_castsi256_si128(x);
-    const __m128i xh = _mm256_extractf128_si256(x, 1);
-    const __m128i yl = _mm256_castsi256_si128(y);
-    const __m128i yh = _mm256_extractf128_si256(y, 1);
-    // Get absolute values of x vectors
-    const __m128i axl = _mm_sign_epi8(xl, xl);
-    const __m128i axh = _mm_sign_epi8(xh, xh);
-    // Sign the values of the y vectors
-    const __m128i syl = _mm_sign_epi8(yl, xl);
-    const __m128i syh = _mm_sign_epi8(yh, xh);
-    // Perform multiplication and create 16-bit values
-    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-    const __m128i doth = _mm_maddubs_epi16(axh, syh);
-    return sum_i16_pairs_float(doth, dotl);
-}
-
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
-
-    return _mm_packus_epi16( bytes1, bytes2);
-}
-
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
-}
-#endif
-#elif defined(__SSSE3__)
-// horizontally add 4x4 floats
-static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
-    __m128 res_0 =_mm_hadd_ps(a, b);
-    __m128 res_1 =_mm_hadd_ps(c, d);
-    __m128 res =_mm_hadd_ps(res_0, res_1);
-    res =_mm_hadd_ps(res, res);
-    res =_mm_hadd_ps(res, res);
-
-    return _mm_cvtss_f32(res);
-}
-#endif // __AVX__ || __AVX2__ || __AVX512F__
-#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
-
-#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
-#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
-#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
-#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
-#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
-#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
-#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
-#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
-#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
-
-// precomputed tables for expanding 8bits to 8 bytes:
-static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
-static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
-#endif
-
-#if defined(__loongarch_asx)
-
-#ifdef __clang__
-#define VREGS_PREFIX "$vr"
-#define XREGS_PREFIX "$xr"
-#else // GCC
-#define VREGS_PREFIX "$f"
-#define XREGS_PREFIX "$f"
-#endif
-#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
-// Convert __m128i to __m256i
-static inline __m256i ____m256i(__m128i in) {
-    __m256i out = __lasx_xvldi(0);
-    __asm__ volatile (
-        ".irp i," __ALL_REGS                "\n\t"
-        " .ifc %[out], " XREGS_PREFIX"\\i    \n\t"
-        "  .irp j," __ALL_REGS              "\n\t"
-        "   .ifc %[in], " VREGS_PREFIX "\\j  \n\t"
-        "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
-        "   .endif                           \n\t"
-        "  .endr                             \n\t"
-        " .endif                             \n\t"
-        ".endr                               \n\t"
-        : [out] "+f" (out) : [in] "f" (in)
-    );
-    return out;
-}
-// Convert two __m128i to __m256i
-static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
-    __m256i out;
-    __asm__ volatile (
-        ".irp i," __ALL_REGS                "\n\t"
-        " .ifc %[hi], " VREGS_PREFIX "\\i    \n\t"
-        "  .irp j," __ALL_REGS              "\n\t"
-        "   .ifc %[lo], " VREGS_PREFIX "\\j  \n\t"
-        "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
-        "   .endif                           \n\t"
-        "  .endr                             \n\t"
-        " .endif                             \n\t"
-        ".endr                               \n\t"
-        ".ifnc %[out], %[hi]                 \n\t"
-        ".irp i," __ALL_REGS                "\n\t"
-        " .ifc %[out], " XREGS_PREFIX "\\i   \n\t"
-        "  .irp j," __ALL_REGS              "\n\t"
-        "   .ifc %[hi], " VREGS_PREFIX "\\j  \n\t"
-        "    xvori.b $xr\\i, $xr\\j, 0       \n\t"
-        "   .endif                           \n\t"
-        "  .endr                             \n\t"
-        " .endif                             \n\t"
-        ".endr                               \n\t"
-        ".endif                              \n\t"
-        : [out] "=f" (out), [hi] "+f" (inhi)
-        : [lo] "f" (inlo)
-    );
-    return out;
-}
-// Convert __m256i low part to __m128i
-static inline __m128i lasx_extracti128_lo(__m256i in) {
-    __m128i out;
-    __asm__ volatile (
-        ".ifnc %[out], %[in]                 \n\t"
-        ".irp i," __ALL_REGS                "\n\t"
-        " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
-        "  .irp j," __ALL_REGS              "\n\t"
-        "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
-        "    vori.b $vr\\i, $vr\\j, 0        \n\t"
-        "   .endif                           \n\t"
-        "  .endr                             \n\t"
-        " .endif                             \n\t"
-        ".endr                               \n\t"
-        ".endif                              \n\t"
-        : [out] "=f" (out) : [in] "f" (in)
-    );
-    return out;
-}
-// Convert __m256i high part to __m128i
-static inline __m128i lasx_extracti128_hi(__m256i in) {
-    __m128i out;
-    __asm__ volatile (
-        ".irp i," __ALL_REGS                "\n\t"
-        " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
-        "  .irp j," __ALL_REGS              "\n\t"
-        "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
-        "    xvpermi.q $xr\\i, $xr\\j, 0x11  \n\t"
-        "   .endif                           \n\t"
-        "  .endr                             \n\t"
-        " .endif                             \n\t"
-        ".endr                               \n\t"
-        : [out] "=f" (out) : [in] "f" (in)
-    );
-    return out;
-}
-
-static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) {
-    v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7};
-    return (__m256i)__ret;
-}
-
-static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
-    v4i32 __ret = {d, c, b, a};
-    return (__m128i)__ret;
-}
-
-static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
-    v4i64 __ret = {d, c, b, a};
-    return (__m256i)__ret;
-}
-
-static __m256i lasx_insertf128( __m128i x, __m128i y) {
-    return lasx_set_q(x, y);
-}
-
-static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
-    __m128i mask_f, zero, tmp0, tmp2, mask;
-    int f = 0x8f;
-    mask_f = __lsx_vreplgr2vr_b(f);
-    zero = __lsx_vldi(0);
-    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
-    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or  with 0x10 prepare for positive
-    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
-    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
-    return __lsx_vshuf_b(a, zero, tmp2);
-}
-
-static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
-    __m256i mask_f, zero, tmp0, tmp2, mask;
-    int f = 0x8f;
-    mask_f = __lasx_xvreplgr2vr_b(f);
-    zero = __lasx_xvldi(0);
-    tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits
-    tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or  with 0x10 prepare for positive
-    mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask
-    tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones
-    return __lasx_xvshuf_b(a, zero, tmp2);
-}
-
-static __m256i lasx_extu8_16(__m128i a) {
-    __m128i zero = __lsx_vldi(0);
-    __m128i vlo = __lsx_vilvl_b(zero, a);
-    __m128i vhi = __lsx_vilvh_b(zero, a);
-    return lasx_set_q(vhi, vlo);
-}
-
-static __m256i lasx_ext8_16(__m128i a) {
-     __m128i sign = __lsx_vslti_b(a, 0);
-     __m128i vlo = __lsx_vilvl_b(sign, a);
-     __m128i vhi = __lsx_vilvh_b(sign, a);
-     return lasx_set_q(vhi, vlo);
-}
-
-static __m256i lasx_ext16_32(__m128i a) {
-    __m256i tmp1;
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
-    return tmp1;
-}
-
-static __m128i lasx_extracti128( __m256i a, int pos) {
-    __m128i ret;
-    if( pos == 0)
-    {
-       ret = lasx_extracti128_lo(a);
-    } else {
-       ret = lasx_extracti128_hi(a);
-    }
-    return ret;
-}
-
-static __m128 lasx_extractf128( __m256 a, int pos) {
-    __m128 ret;
-    if( pos == 0)
-    {
-       ret = (__m128)lasx_extracti128_lo((__m256i)a);
-    } else {
-       ret = (__m128)lasx_extracti128_hi((__m256i)a);
-    }
-    return ret;
-}
-
-static __m128i lsx_hadd_h(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_h(b, a);
-    __m128i tmp2 = __lsx_vpickod_h(b, a);
-    return __lsx_vadd_h(tmp1, tmp2);
-}
-
-static __m128i lsx_hadd_w(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_w(b, a);
-    __m128i tmp2 = __lsx_vpickod_w(b, a);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-
-static __m128 lsx_hadd_s(__m128 a, __m128 b) {
-    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
-    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
-
-    return __lsx_vfadd_s(tmp1, tmp2);
-}
-
-static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
-    __m256i tmp1, tmp2;
-    tmp1 = __lasx_xvmulwev_h_b(a, b);
-    tmp2 = __lasx_xvmulwod_h_b(a, b);
-    return __lasx_xvsadd_h(tmp1, tmp2);
-}
-
-static __m256i lasx_madd_h(__m256i a, __m256i b) {
-    __m256i tmp1, tmp2;
-    tmp1 = __lasx_xvmulwev_w_h(a, b);
-    tmp2 = __lasx_xvmulwod_w_h(a, b);
-    return __lasx_xvadd_w(tmp1, tmp2);
-}
-
-static __m256i lasx_packs_w(__m256i a, __m256i b) {
-    __m256i tmp, tmp1;
-    tmp = __lasx_xvsat_w(a, 15);
-    tmp1 = __lasx_xvsat_w(b, 15);
-    return __lasx_xvpickev_h(tmp1, tmp);
-}
-
-static __m256i lasx_packs_h(__m256i a, __m256i b) {
-    __m256i tmp, tmp1;
-    tmp = __lasx_xvsat_h(a, 7);
-    tmp1 = __lasx_xvsat_h(b, 7);
-    return __lasx_xvpickev_b(tmp1, tmp);
-}
-
-static __m128i lsx_packs_w(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_w(a, 15);
-    tmp1 = __lsx_vsat_w(b, 15);
-    return __lsx_vpickev_h(tmp1, tmp);
-}
-
-static __m128i lsx_packs_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_h(a, 7);
-    tmp1 = __lsx_vsat_h(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-
-static __m128i lsx_packus_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_hu(a, 7);
-    tmp1 = __lsx_vsat_hu(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-
-
-static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_h_b(a, b);
-    tmp2 = __lsx_vmulwod_h_b(a, b);
-    return __lsx_vsadd_h(tmp1, tmp2);
-}
-
-static __m128i lsx_madd_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_w_h(a, b);
-    tmp2 = __lsx_vmulwod_w_h(a, b);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = __lsx_vsigncov_b(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = __lsx_vsigncov_b(x, y);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = lsx_maddubs_h(ax, sy);
-    const __m128i ones = __lsx_vreplgr2vr_h(1);
-    return lsx_madd_h(ones, dot);
-}
-
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-    __m128 res = lasx_extractf128(x, 1);
-    ft_union tmp;
-    res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
-    res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
-    res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
-    tmp.i = __lsx_vpickve2gr_w(res, 0);
-    return tmp.f;
-}
-
-// horizontally add 8 int32_t
-static inline int hsum_i32_8(const __m256i a) {
-
-    __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11);
-    __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00);
-
-    __m128i  tmp1_128 = lasx_extracti128_lo(tmp1);
-    __m128i  tmp2_128 = lasx_extracti128_lo(tmp2);
-
-    __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128);
-
-    __m128i ev = __lsx_vpickev_w(sum128, sum128);
-    __m128i od = __lsx_vpickod_w(sum128, sum128);
-    __m128i sum64 = __lsx_vadd_w(ev, od);
-
-    int sum64_1, sum64_2;
-    sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
-    sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
-
-    return  sum64_1 + sum64_2;
-}
-
-// horizontally add 4 int32_t
-static inline int hsum_i32_4(const __m128i a) {
-    __m128i ev = __lsx_vpickev_w(a, a);
-    __m128i od = __lsx_vpickod_w(a, a);
-    __m128i sum64 = __lsx_vadd_w(ev, od);
-
-    int sum64_1, sum64_2;
-    sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
-    sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
-
-    return  sum64_1 + sum64_2;
-}
-
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m256i shuf_mask = lasx_set_d(
-            0x0303030303030303, 0x0202020202020202,
-            0x0101010101010101, 0x0000000000000000);
-
-    __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask);
-    const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe);
-    bytes = __lasx_xvor_v(bytes, bit_mask);
-    return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1));
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
-    const __m128i lo = __lsx_vld((const __m128i *)rsi, 0);
-    __m128i hi = __lsx_vsrli_h(lo, 4);
-    return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-    __m256i v = __lasx_xvpackod_h(x, x);
-    __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v);
-    return __lasx_xvffint_s_w(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = lasx_maddubs_h(ax, sy);
-    return sum_i16_pairs_float(dot);
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-
-    // Get absolute values of x vectors
-    const __m256i ax = __lasx_xvsigncov_b(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = __lasx_xvsigncov_b(x, y);
-
-    return mul_sum_us8_pairs_float(ax, sy);
-}
-
-static inline __m128i packNibbles( __m256i bytes ) {
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF);
-     __m256i high = __lasx_xvandn_v(lowByte, bytes);
-    __m256i low = __lasx_xvand_v(lowByte, bytes);
-    high = __lasx_xvsrli_h(high, 4);
-    bytes = __lasx_xvor_v(low, high);
-    // Compress uint16_t lanes into bytes
-    __m128i *r0 = (__m128i *)&bytes;
-    __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11);
-    __m128i *r1 = (__m128i *)&tmp_h128;
-
-    __m128i zero = __lsx_vldi(0);
-    __m128i tmp, tmp2, tmp3;
-
-    tmp = __lsx_vmax_h(zero, *r0);
-    tmp2 = __lsx_vsat_hu(tmp, 7);
-
-    tmp = __lsx_vmax_h(zero, *r1);
-    tmp3 = __lsx_vsat_hu(tmp, 7);
-    return  __lsx_vpickev_b(tmp3, tmp2);
-}
-#endif  //__loongarch_asx
-
 // reference implementation for deterministic creation of model files
 void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
     static const int qk = QK4_0;
@@ -702,11 +65,6 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
     }
 }
 
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q4_0_ref(x, y, k);
-}
-
-
 void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
     const int qk = QK4_1;
 
@@ -744,10 +102,6 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
     }
 }
 
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q4_1_ref(x, y, k);
-}
-
 void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
     static const int qk = QK5_0;
 
@@ -792,10 +146,6 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
     }
 }
 
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q5_0_ref(x, y, k);
-}
-
 void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
     const int qk = QK5_1;
 
@@ -840,10 +190,6 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
     }
 }
 
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q5_1_ref(x, y, k);
-}
-
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
     assert(k % QK8_0 == 0);
@@ -870,291 +216,6 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
     }
 }
 
-void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
-                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
-                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
-                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
-
-            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
-            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
-            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
-            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#elif defined(__riscv_v_intrinsic)
-
-    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
-
-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
-        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0f, vl);
-        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
-        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
-
-        // convert to integer
-        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
-
-        // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
-    }
-
-#elif defined(__POWER9_VECTOR__)
-    for (int i = 0; i < nb; i++) {
-        vector float srcv [8];
-        vector float asrcv[8];
-        vector float amaxv[8];
-        vector signed int vi[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
-                                   vec_extract(amaxv[0], 1)),
-                               MAX(vec_extract(amaxv[0], 2),
-                                   vec_extract(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-        const vector float vid = vec_splats(id);
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const vector float v  = vec_round(vec_mul(srcv[j], vid));
-            vi[j] = vec_cts(v, 0);
-        }
-        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
-        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
-    }
-
-#elif defined(__loongarch_asx)
-    for (int i = 0; i < nb; i++) {
-        ft_union fi;
-        __m256 v0 = (__m256)__lasx_xvld( x , 0);
-        __m256 v1 = (__m256)__lasx_xvld( x , 32);
-        __m256 v2 = (__m256)__lasx_xvld( x , 64);
-        __m256 v3 = (__m256)__lasx_xvld( x , 96);
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
-        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
-        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
-        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
-        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
-
-        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) );
-        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
-        __m128 tmp = max4;
-        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
-        fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = fi.f;
-
-        // Quantize these floats
-        const float d = max_scalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
-        const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
-
-        // Apply the multiplier
-        v0 = __lasx_xvfmul_s( v0, mul );
-        v1 = __lasx_xvfmul_s( v1, mul );
-        v2 = __lasx_xvfmul_s( v2, mul );
-        v3 = __lasx_xvfmul_s( v3, mul );
-
-        // Round to nearest integer
-        __m256i i0 = __lasx_xvftintrne_w_s( v0 );
-        __m256i i1 = __lasx_xvftintrne_w_s( v1 );
-        __m256i i2 = __lasx_xvftintrne_w_s( v2 );
-        __m256i i3 = __lasx_xvftintrne_w_s( v3 );
-
-        __m128i ni0 = lasx_extracti128( i0, 0 );
-        __m128i ni1 = lasx_extracti128( i0, 1);
-        __m128i ni2 = lasx_extracti128( i1, 0);
-        __m128i ni3 = lasx_extracti128( i1, 1);
-        __m128i ni4 = lasx_extracti128( i2, 0);
-        __m128i ni5 = lasx_extracti128( i2, 1);
-        __m128i ni6 = lasx_extracti128( i3, 0);
-        __m128i ni7 = lasx_extracti128( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = lsx_packs_w( ni0, ni1 );
-        ni2 = lsx_packs_w( ni2, ni3 );
-        ni4 = lsx_packs_w( ni4, ni5 );
-        ni6 = lsx_packs_w( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = lsx_packs_h( ni0, ni2 );
-        ni4 = lsx_packs_h( ni4, ni6 );
-
-        __lsx_vst(ni0, (__m128i *)(y[i].qs +  0), 0);
-        __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
-
-    }
-#else
-    GGML_UNUSED(nb);
-    // scalar
-    quantize_row_q8_0_ref(x, y, k);
-#endif
-}
-
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
     assert(QK8_1 == 32);
@@ -1191,334 +252,6 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
     }
 }
 
-void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        int32x4_t accv = vdupq_n_s32(0);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-
-            accv = vaddq_s32(accv, vi);
-        }
-
-        y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
-                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
-                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
-                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        v128_t accv = wasm_i32x4_splat(0);
-
-        for (int j = 0; j < 8; j++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
-
-            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
-            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
-            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
-            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
-
-            accv = wasm_i32x4_add(accv, vi);
-        }
-
-        y[i].s = GGML_FP32_TO_FP16(
-                d * (wasm_i32x4_extract_lane(accv, 0) +
-                     wasm_i32x4_extract_lane(accv, 1) +
-                     wasm_i32x4_extract_lane(accv, 2) +
-                     wasm_i32x4_extract_lane(accv, 3)));
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float max_scalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = max_scalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
-        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#elif defined(__riscv_v_intrinsic)
-
-    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
-
-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
-        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0, vl);
-        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
-        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
-
-        const float d  = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
-
-        // convert to integer
-        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
-
-        // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
-
-        // compute sum for y[i].s
-        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
-        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
-
-        // set y[i].s
-        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
-        y[i].s = GGML_FP32_TO_FP16(sum*d);
-    }
-
-#elif defined(__POWER9_VECTOR__)
-    for (int i = 0; i < nb; i++) {
-        vector float srcv [8];
-        vector float asrcv[8];
-        vector float amaxv[8];
-        vector signed int vi[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
-                                   vec_extract(amaxv[0], 1)),
-                               MAX(vec_extract(amaxv[0], 2),
-                                   vec_extract(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-        const vector float vid = vec_splats(id);
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        vector int accv = vec_splats(0);
-
-        for (int j = 0; j < 8; j++) {
-            const vector float v  = vec_round(vec_mul(srcv[j], vid));
-            vi[j] = vec_cts(v, 0);
-
-            accv = vec_add(accv, vi[j]);
-        }
-        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
-        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
-
-        accv = vec_add(accv, vec_sld(accv, accv, 4));
-        accv = vec_add(accv, vec_sld(accv, accv, 8));
-        y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
-    }
-
-#elif defined(__loongarch_asx)
-    for (int i = 0; i < nb; i++) {
-        ft_union ft;
-        __m256 v0 = (__m256)__lasx_xvld( x , 0 );
-        __m256 v1 = (__m256)__lasx_xvld( x , 32 );
-        __m256 v2 = (__m256)__lasx_xvld( x , 64 );
-        __m256 v3 = (__m256)__lasx_xvld( x , 96 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
-        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
-        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
-        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
-        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
-
-        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
-        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
-        __m128 tmp = max4;
-        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
-        ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = ft.f;
-
-        // Quantize these floats
-        const float d = max_scalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
-        const __m256 mul = __lasx_xvreplfr2vr_s( id );
-
-        // Apply the multiplier
-        v0 = __lasx_xvfmul_s( v0, mul );
-        v1 = __lasx_xvfmul_s( v1, mul );
-        v2 = __lasx_xvfmul_s( v2, mul );
-        v3 = __lasx_xvfmul_s( v3, mul );
-
-        // Round to nearest integer
-        __m256i i0 = __lasx_xvftintrne_w_s( v0 );
-        __m256i i1 = __lasx_xvftintrne_w_s( v1 );
-        __m256i i2 = __lasx_xvftintrne_w_s( v2 );
-        __m256i i3 = __lasx_xvftintrne_w_s( v3 );
-
-        __m128i ni0 = lasx_extracti128(i0, 0);
-        __m128i ni1 = lasx_extracti128( i0, 1);
-        __m128i ni2 = lasx_extracti128( i1, 0);
-        __m128i ni3 = lasx_extracti128( i1, 1);
-        __m128i ni4 = lasx_extracti128( i2, 0 );
-        __m128i ni5 = lasx_extracti128( i2, 1);
-        __m128i ni6 = lasx_extracti128( i3, 0);
-        __m128i ni7 = lasx_extracti128( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
-        const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
-        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
-
-        // Convert int32 to int16
-        ni0 = lsx_packs_w( ni0, ni1 );
-        ni2 = lsx_packs_w( ni2, ni3 );
-        ni4 = lsx_packs_w( ni4, ni5 );
-        ni6 = lsx_packs_w( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = lsx_packs_h( ni0, ni2 );
-        ni4 = lsx_packs_h( ni4, ni6 );
-
-        __lsx_vst(ni0, (__m128i *)(y[i].qs +  0), 0);
-        __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
-    }
-#else
-    GGML_UNUSED(nb);
-    // scalar
-    quantize_row_q8_1_ref(x, y, k);
-#endif
-}
-
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK4_0;
 
@@ -2008,10 +741,6 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
     }
 }
 
-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
-    quantize_row_q2_K_ref(x, vy, k);
-}
-
 static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
         uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
         float rmin, float rdelta, int nstep, bool use_mad) {
@@ -2374,10 +1103,6 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
     }
 }
 
-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
-    quantize_row_q3_K_ref(x, vy, k);
-}
-
 static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
     assert(n_per_row % QK_K == 0);
     const int nb = n_per_row / QK_K;
@@ -2576,12 +1301,6 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
     }
 }
 
-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_q4_K * restrict y = vy;
-    quantize_row_q4_K_ref(x, y, k);
-}
-
 static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
     assert(n_per_row % QK_K == 0);
     const int64_t nb = n_per_row / QK_K;
@@ -2787,12 +1506,6 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
     }
 }
 
-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_q5_K * restrict y = vy;
-    quantize_row_q5_K_ref(x, y, k);
-}
-
 static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
     assert(n_per_row % QK_K == 0);
     const int64_t nb = n_per_row / QK_K;
@@ -3005,12 +1718,6 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
     }
 }
 
-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_q6_K * restrict y = vy;
-    quantize_row_q6_K_ref(x, y, k);
-}
-
 static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
     assert(n_per_row % QK_K == 0);
     const int64_t nb = n_per_row / QK_K;
@@ -3413,33 +2120,20 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
     }
 }
 
-void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_tq1_0 * restrict y = vy;
-    quantize_row_tq1_0_ref(x, y, k);
-}
-
-void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_tq2_0 * restrict y = vy;
-    quantize_row_tq2_0_ref(x, y, k);
-}
-
 size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
-    quantize_row_tq1_0(src, dst, (int64_t)nrow*n_per_row);
+    quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
     return nrow * row_size;
 }
 
 size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
-    quantize_row_tq2_0(src, dst, (int64_t)nrow*n_per_row);
+    quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
     return nrow * row_size;
 }
 
-
 void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int64_t nb = k / QK_K;
@@ -3832,9166 +2526,6 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
     }
 }
 
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q8_K_ref(x, y, k);
-}
-
-//===================================== Dot products =================================
-
-//
-// Helper functions
-//
-#if __AVX__ || __AVX2__ || __AVX512F__
-
-// shuffles to pick the required scales in dot products
-static inline __m256i get_scale_shuffle_q3k(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
-    };
-    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
-}
-static inline __m256i get_scale_shuffle_k4(int i) {
-    static const uint8_t k_shuffle[256] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
-         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
-         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
-        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
-        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
-    };
-    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
-}
-static inline __m128i get_scale_shuffle(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
-        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
-        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
-    };
-    return _mm_loadu_si128((const __m128i*)k_shuffle + i);
-}
-#elif defined(__loongarch_asx)
-// shuffles to pick the required scales in dot products
-static inline __m256i get_scale_shuffle_q3k(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
-    };
-    return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
-}
-static inline __m256i get_scale_shuffle_k4(int i) {
-    static const uint8_t k_shuffle[256] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
-         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
-         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
-        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
-        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
-    };
-    return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
-}
-static inline __m128i get_scale_shuffle(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
-        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
-        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
-    };
-    return __lsx_vld((const __m128i*)k_shuffle + i, 0);
-}
-#endif
-
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
-        const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
-
-        float32x4_t sumv0 = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; i++) {
-            const block_q4_0 * restrict b_x0 = &vx0[i];
-            const block_q4_0 * restrict b_x1 = &vx1[i];
-            const block_q8_0 * restrict b_y0 = &vy0[i];
-            const block_q8_0 * restrict b_y1 = &vy1[i];
-
-            const uint8x16_t m4b = vdupq_n_u8(0x0F);
-            const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
-            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
-
-            // 4-bit -> 8-bit
-            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-            // sub 8
-            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
-            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
-            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
-            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
-
-            // load y
-            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
-            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
-            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
-            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
-
-            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
-
-            float32x4_t scale = vld1q_f32(_scale);
-
-            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-
-            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-
-            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-
-            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-
-            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
-        }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
-        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-
-        vst1_f32(s,      vget_low_f32(sumv2));
-        vst1_f32(s + bs, vget_high_f32(sumv2));
-        return;
-    }
-#endif
-
-    int ib = 0;
-    float sumf = 0;
-
-#if defined(__ARM_FEATURE_SVE)
-    svfloat32_t sumv0 = svdup_n_f32(0.0f);
-    svfloat32_t sumv1 = svdup_n_f32(0.0f);
-
-    const int vector_length = ggml_cpu_get_sve_cnt()*8;
-
-    // VLA Implementation using switch case
-    switch (vector_length) {
-        case 128:
-            {
-                // predicate for activating higher lanes for 4 float32 elements
-                const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
-
-                for (; ib + 1 < nb; ib += 2) {
-                    const block_q4_0 * restrict x0 = &x[ib + 0];
-                    const block_q4_0 * restrict x1 = &x[ib + 1];
-                    const block_q8_0 * restrict y0 = &y[ib + 0];
-                    const block_q8_0 * restrict y1 = &y[ib + 1];
-
-                    // load x
-                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
-                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
-
-                    // 4-bit -> 8-bit
-                    const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
-                    const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
-                    const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
-                    const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
-
-                    // sub 8
-                    const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
-                    const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
-                    const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
-                    const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
-
-                    // load y
-                    const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
-                    const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
-                    const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
-                    const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
-
-                    // dot product
-                    sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
-                                    svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
-                                    svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-                    sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
-                                    svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
-                                    svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-                }
-
-                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-            } break;
-        case 256:
-            {
-                // predicate for activating higher lanes for 16 int8 elements
-                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
-                // predicate for activating lower lanes for  16 int8 elements
-                const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
-
-                for (; ib + 1 < nb; ib += 2) {
-                    const block_q4_0 * restrict x0 = &x[ib + 0];
-                    const block_q4_0 * restrict x1 = &x[ib + 1];
-                    const block_q8_0 * restrict y0 = &y[ib + 0];
-                    const block_q8_0 * restrict y1 = &y[ib + 1];
-
-                    // load x
-                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
-                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
-
-                    // 4-bit -> 8-bit
-                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
-                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
-
-                    // sub 8
-                    const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
-                    const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
-
-                    // load y
-                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
-                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
-
-                    // dot product
-                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
-                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
-                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-                }
-
-                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-            } break;
-        case 512:
-            {
-                // predicate for activating higher lanes for 32 int8 elements
-                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
-
-                // predicate for activating higher lanes for 16 int8 elements
-                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
-                // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
-                const svbool_t pl16 = svnot_b_z(ph32, ph16);
-
-                for (; ib + 1 < nb; ib += 2) {
-                    const block_q4_0 * restrict x0 = &x[ib + 0];
-                    const block_q4_0 * restrict x1 = &x[ib + 1];
-                    const block_q8_0 * restrict y0 = &y[ib + 0];
-                    const block_q8_0 * restrict y1 = &y[ib + 1];
-
-                    // load x
-                    const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
-                    const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
-
-                    // 4-bit -> 8-bit
-                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
-                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
-
-                    // sub 8
-                    const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
-                    const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
-
-                    // load y
-                    const svint8_t qy0 = svld1_s8(ph32, y0->qs);
-                    const svint8_t qy1 = svld1_s8(ph32, y1->qs);
-
-                    // dot product
-                    sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
-                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-                    sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
-                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-                }
-
-                sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
-            } break;
-        default:
-            assert(false && "Unsupported vector length");
-            break;
-    }
-
-#elif defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (; ib + 1 < nb; ib += 2) {
-        const block_q4_0 * restrict x0 = &x[ib + 0];
-        const block_q4_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib + 0];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        // dot product into int32x4_t
-        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
-
-        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        qx = _mm256_sub_epi8( qx, off );
-
-        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    sumf = hsum_float_8(acc);
-#elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (; ib + 1 < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
-        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
-        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
-        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
-        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
-
-        const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
-        const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
-        const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
-        const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
-    }
-
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
-#elif defined(__SSSE3__)
-    // set constants
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    const __m128i off = _mm_set1_epi8(8);
-
-    // Initialize accumulator with zeros
-    __m128 acc_0 = _mm_setzero_ps();
-    __m128 acc_1 = _mm_setzero_ps();
-    __m128 acc_2 = _mm_setzero_ps();
-    __m128 acc_3 = _mm_setzero_ps();
-
-    for (; ib + 1 < nb; ib += 2) {
-        _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
-        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
-        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
-        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
-
-        // Acummulate
-        acc_0 = _mm_add_ps(p0_d, acc_0);
-        acc_1 = _mm_add_ps(p1_d, acc_1);
-        acc_2 = _mm_add_ps(p2_d, acc_2);
-        acc_3 = _mm_add_ps(p3_d, acc_3);
-    }
-
-    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    for (; ib < nb; ++ib) {
-        // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
-
-        // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        // subtract offset
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
-    }
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-    const vector signed char v8 = vec_splats((signed char)0x8);
-
-    vector float vsumf0 = vec_splats(0.0f);
-
-#pragma GCC unroll 8
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
-        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
-        vector signed char q8y1 = vec_xl(16, y[ib].qs);
-
-        vector signed char q4x0 = vec_and(qxs, lowMask);
-        vector signed char q4x1 = vec_sr(qxs, v4);
-
-        q4x0 = vec_sub(q4x0, v8);
-        q4x1 = vec_sub(q4x1, v8);
-
-        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
-        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
-
-        vector signed int vsumi0 = v0;
-
-        vsumi0 = vec_sum4s(qv0, vsumi0);
-        vsumi0 = vec_sum4s(qv1, vsumi0);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-    }
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    sumf = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-    // Initialize accumulator with zeros
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        /* Compute combined scale for the block */
-        const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
-
-        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = __lasx_xvreplgr2vr_b( 8 );
-        qx = __lasx_xvsub_b( qx, off );
-
-        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        /* Multiply q with scale and accumulate */
-        acc = __lasx_xvfmadd_s( d, q, acc );
-    }
-
-    sumf = hsum_float_8(acc);
-#elif defined(__loongarch_sx)
-    // set constants
-    const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
-    const __m128i off = __lsx_vreplgr2vr_b(8);
-
-    // Initialize accumulator with zeros
-    __m128 acc_0 = __lsx_vldi(0);
-    __m128 acc_1 = __lsx_vldi(0);
-    __m128 acc_2 = __lsx_vldi(0);
-    __m128 acc_3 = __lsx_vldi(0);
-
-    for (; ib + 1 < nb; ib += 2) {
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
-
-        const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
-
-        __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
-        __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
-        bx_0 = __lsx_vsub_b(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
-        __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0);
-        bx_1 = __lsx_vsub_b(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
-
-        const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
-
-        __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
-        __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0);
-        bx_2 = __lsx_vsub_b(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
-        __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0);
-        bx_3 = __lsx_vsub_b(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = __lsx_vffint_s_w(i32_0);
-        __m128 p1 = __lsx_vffint_s_w(i32_1);
-        __m128 p2 = __lsx_vffint_s_w(i32_2);
-        __m128 p3 = __lsx_vffint_s_w(i32_3);
-
-        // Apply the scale
-        __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 );
-        __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
-        __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
-        __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
-
-        // Acummulate
-        acc_0 = __lsx_vfadd_s(p0_d, acc_0);
-        acc_1 = __lsx_vfadd_s(p1_d, acc_1);
-        acc_2 = __lsx_vfadd_s(p2_d, acc_2);
-        acc_3 = __lsx_vfadd_s(p3_d, acc_3);
-    }
-
-    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
-            const int v1 = (x[ib].qs[j] >>   4) - 8;
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
-    }
-
-    *s = sumf;
-}
-
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
-        const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
-
-        float32x4_t sumv0 = vdupq_n_f32(0.0f);
-        float32x4_t summs0 = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; i++) {
-            const block_q4_1 * restrict b_x0 = &vx0[i];
-            const block_q4_1 * restrict b_x1 = &vx1[i];
-            const block_q8_1 * restrict b_y0 = &vy0[i];
-            const block_q8_1 * restrict b_y1 = &vy1[i];
-
-            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                    GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
-                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
-            summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
-
-            const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
-            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
-
-            // 4-bit -> 8-bit
-            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-            // load y
-            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
-            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
-            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
-            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
-
-            // mmla into int32x4_t
-            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
-                                   GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
-                                   GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
-                                   GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
-            float32x4_t scale = vld1q_f32(_scale);
-
-            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-
-            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-
-            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-
-            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
-        }
-
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
-        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-        sumv2 = vaddq_f32(sumv2, summs0);
-
-        vst1_f32(s,      vget_low_f32 (sumv2));
-        vst1_f32(s + bs, vget_high_f32(sumv2));
-        return;
-    }
-#endif
-
-    int ib = 0;
-    float sumf = 0;
-
-    // TODO: add WASM SIMD
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs = 0;
-
-    for (; ib + 1 < nb; ib += 2) {
-        const block_q4_1 * restrict x0 = &x[ib + 0];
-        const block_q4_1 * restrict x1 = &x[ib + 1];
-        const block_q8_1 * restrict y0 = &y[ib + 0];
-        const block_q8_1 * restrict y1 = &y[ib + 1];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        // dot product into int32x4_t
-        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
-        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
-
-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
-
-        const __m256 d0v = _mm256_set1_ps( d0 );
-        const __m256 d1v = _mm256_set1_ps( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs );
-
-        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
-
-        // Accumulate d0*d1*x*y
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d0d1, xy, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
-#endif
-    }
-
-    sumf = hsum_float_8(acc) + summs;
-#elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    for (; ib < nb; ++ib) {
-        // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
-
-        // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
-    }
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-
-#pragma GCC unroll 4
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
-        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
-        vsumf0 = vec_madd(vxmin, vys, vsumf0);
-
-        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
-        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
-        vector signed char q8y1 = vec_xl(16, y[ib].qs);
-
-        vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
-        vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
-
-        vector signed int vsumi0 = v0;
-
-        vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
-        vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-    }
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    sumf = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-    // Initialize accumulator with zeros
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    float summs = 0;
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
-        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
-
-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
-
-        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
-        const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-        const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0);
-
-        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
-
-        // Accumulate d0*d1*x*y
-        acc = __lasx_xvfmadd_s( d0d1, xy, acc );
-    }
-
-    sumf = hsum_float_8(acc) + summs;
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >>   4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
-    }
-
-    *s = sumf;
-}
-
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    int ib = 0;
-    float sumf = 0;
-
-    assert(n % qk == 0);
-    assert(qk == QK5_0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    for (; ib + 1 < nb; ib += 2) {
-        const block_q5_0 * restrict x0 = &x[ib];
-        const block_q5_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        // extract the 5th bit via lookup table ((!b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (; ib < nb; ++ib) {
-        const block_q5_0 * restrict x0 = &x[ib];
-        const block_q8_0 * restrict y0 = &y[ib];
-
-        const v128_t m4b  = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_1[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
-        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
-    }
-
-    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
-
-        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
-        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        qx = _mm256_or_si256(qx, bxhi);
-
-        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    sumf = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8((char)0xF0);
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
-
-        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_andnot_si128(bxhil, mask);
-        bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx_0 = MM256_SET_M128I(bxh, bxl);
-
-        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
-    }
-
-    sumf = hsum_float_8(acc);
-#elif defined(__riscv_v_intrinsic)
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // These temporary registers are for masking and shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
-
-    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
-    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
-
-    for (; ib < nb; ++ib) {
-        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
-
-        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-
-        // ((qh & (1u << (j + 16))) >> (j + 12));
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
-        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector unsigned char v4 = vec_splats((unsigned char)4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-
-#pragma GCC unroll 4
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
-        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])};
-
-        vector signed char qh0 = (vector signed char)aux64x2_0;
-        vector signed char qh1 = (vector signed char)aux64x2_1;
-
-        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
-
-        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
-        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
-
-        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
-        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
-
-        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
-        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
-
-        qv0 = vec_add(qv0, qv1);
-
-        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-    }
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    sumf = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-    // Initialize accumulator with zeros
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        /* Compute combined scale for the block */
-        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME
-
-        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
-        bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
-        qx = __lasx_xvor_v(qx, bxhi);
-
-        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        /* Multiply q with scale and accumulate */
-        acc = __lasx_xvfmadd_s(d, q, acc);
-    }
-
-    sumf = hsum_float_8(acc);
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
-    *s = sumf;
-}
-
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_1;
-    const int nb = n / qk;
-
-    int ib = 0;
-    float sumf = 0;
-
-    assert(n % qk == 0);
-    assert(qk == QK5_1);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    for (; ib + 1 < nb; ib += 2) {
-        const block_q5_1 * restrict x0 = &x[ib];
-        const block_q5_1 * restrict x1 = &x[ib + 1];
-        const block_q8_1 * restrict y0 = &y[ib];
-        const block_q8_1 * restrict y1 = &y[ib + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
-        summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
-
-        // extract the 5th bit via lookup table ((b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit
-        const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    float summs = 0.0f;
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (; ib < nb; ++ib) {
-        const block_q5_1 * restrict x0 = &x[ib];
-        const block_q8_1 * restrict y0 = &y[ib];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
-
-        const v128_t m4b = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit
-        const v128_t v0lf = wasm_v128_or(v0l, qhl);
-        const v128_t v0hf = wasm_v128_or(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv,
-                wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
-    }
-
-    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
-
-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
-
-        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
-        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        qx = _mm256_or_si256(qx, bxhi);
-
-        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
-        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    sumf = hsum_float_8(acc) + summs;
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8(0x10);
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
-
-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
-
-        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_and_si128(bxhil, mask);
-        bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx_0 = MM256_SET_M128I(bxh, bxl);
-
-        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
-        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
-
-        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
-    }
-
-    sumf = hsum_float_8(acc) + summs;
-#elif defined(__riscv_v_intrinsic)
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // temporary registers for shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
-
-    for (; ib < nb; ++ib) {
-        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
-
-        // load qh
-        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
-
-        // ((qh >> (j +  0)) << 4) & 0x10;
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
-
-        // ((qh >> (j + 12))     ) & 0x10;
-        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
-    }
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-
-#pragma GCC unroll 4
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
-        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
-        vsumf0 = vec_madd(vxmin, vys, vsumf0);
-
-        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
-        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])};
-
-        vector signed char qh0 = (vector signed char)aux64x2_0;
-        vector signed char qh1 = (vector signed char)aux64x2_1;
-
-        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
-
-        vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
-        vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
-
-        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
-        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
-
-        vector signed int vsumi0 = v0;
-
-        vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
-        vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-    }
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    sumf = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-    // Initialize accumulator with zeros
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d));
-
-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
-
-        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
-        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
-        bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
-        qx = __lasx_xvor_v(qx, bxhi);
-
-        const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d));
-        const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
-
-        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
-
-        acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc);
-    }
-
-    sumf = hsum_float_8(acc) + summs;
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
-    }
-
-    *s = sumf;
-}
-
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q8_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
-        const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
-
-        float32x4_t sumv0 = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; i++) {
-            const block_q8_0 * restrict b_x0 = &vx0[i];
-            const block_q8_0 * restrict b_y0 = &vy0[i];
-
-            const block_q8_0 * restrict b_x1 = &vx1[i];
-            const block_q8_0 * restrict b_y1 = &vy1[i];
-
-            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
-            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
-            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
-            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
-
-            // load y
-            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
-            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
-            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
-            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
-
-            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                   GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
-            float32x4_t scale = vld1q_f32(_scale);
-
-            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-
-            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-
-            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-
-            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-
-            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                       l1, r1)), l2, r2)), l3, r3))), scale);
-        }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
-        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-
-        vst1_f32(s, vget_low_f32(sumv2));
-        vst1_f32(s + bs, vget_high_f32(sumv2));
-        return;
-    }
-#endif
-
-    int ib = 0;
-    float sumf = 0;
-
-#if defined(__ARM_FEATURE_SVE)
-    svfloat32_t sumv0 = svdup_n_f32(0.0f);
-    svfloat32_t sumv1 = svdup_n_f32(0.0f);
-
-    const int vector_length = ggml_cpu_get_sve_cnt()*8;
-
-    //VLA Implemenation for SVE
-    switch (vector_length) {
-        case 128:
-            {
-                // predicate for activating lanes for 16 Int8 elements
-                const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
-                const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
-
-                for (; ib + 1 < nb; ib += 2) {
-                    const block_q8_0 * restrict x0 = &x[ib + 0];
-                    const block_q8_0 * restrict x1 = &x[ib + 1];
-                    const block_q8_0 * restrict y0 = &y[ib + 0];
-                    const block_q8_0 * restrict y1 = &y[ib + 1];
-
-                    // load x
-                    const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
-                    const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
-                    const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
-                    const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
-
-                    // load y
-                    const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
-                    const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
-                    const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
-                    const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
-
-                    sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
-                                    svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
-                                    svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-                    sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
-                                    svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
-                                    svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-                }
-
-                sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
-            } break;
-        case 256:
-            {
-                //printf("sve256");
-                for (; ib + 1 < nb; ib += 2) {
-                    const block_q8_0 * restrict x0 = &x[ib + 0];
-                    const block_q8_0 * restrict x1 = &x[ib + 1];
-                    const block_q8_0 * restrict y0 = &y[ib + 0];
-                    const block_q8_0 * restrict y1 = &y[ib + 1];
-
-                    // load x
-                    const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
-                    const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
-
-                    // load y
-                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
-                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
-
-                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
-                                svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
-                                svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-                }
-
-                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-            } break;
-        case 512:
-            {
-                // predicate for activating high 256 bit
-                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
-                // predicate for activating low 256 bit
-                const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
-
-                // predicate for activating high lanes for 8 float32 elements
-                const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
-                // predicate for activating low lanes for 8 float32 elements
-                const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
-
-                svfloat32_t sumv00 = svdup_n_f32(0.0f);
-
-                for (; ib + 1 < nb; ib += 2) {
-                    const block_q8_0 * restrict x0 = &x[ib + 0];
-                    const block_q8_0 * restrict x1 = &x[ib + 1];
-                    const block_q8_0 * restrict y0 = &y[ib + 0];
-                    const block_q8_0 * restrict y1 = &y[ib + 1];
-
-                    //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
-                    // and add them to make one 64 element vector
-                    // load x
-                    const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
-                          svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
-
-                    qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
-
-                    // load y
-                    const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
-                          svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
-
-                    qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
-
-                    // scale creation
-                    const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
-                    const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
-
-                    // duplicate deq1 in first half of vector and deq2 in second half of vector
-                    const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
-
-                    const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
-
-                    sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
-                }
-
-                sumf = svaddv_f32(svptrue_b32(), sumv00);
-                break;
-            }
-        default:
-            assert(false && "Unsupported vector length");
-            break;
-    }
-#elif defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (; ib + 1 < nb; ib += 2) {
-        const block_q8_0 * restrict x0 = &x[ib + 0];
-        const block_q8_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib + 0];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
-
-        const int8x16_t x0_0 = vld1q_s8(x0->qs);
-        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
-        const int8x16_t x1_0 = vld1q_s8(x1->qs);
-        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
-
-        // load y
-        const int8x16_t y0_0 = vld1q_s8(y0->qs);
-        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
-        const int8x16_t y1_0 = vld1q_s8(y1->qs);
-        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
-        __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
-        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        // Multiply q with scale and accumulate
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
-    }
-
-    sumf = hsum_float_8(acc);
-#elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e8m1(qk);
-
-    for (; ib < nb; ++ib) {
-        // load elements
-        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[ib].qs, vl);
-        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
-
-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
-
-        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
-        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
-
-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
-    }
-#elif defined(__POWER9_VECTOR__)
-    const vector signed int v0 = vec_splats((int32_t)0);
-    vector float vsumf0 = vec_splats(0.0f);
-
-#pragma GCC unroll 8
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed char q8x0 = vec_xl( 0, x[ib].qs);
-        vector signed char q8x1 = vec_xl(16, x[ib].qs);
-        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
-        vector signed char q8y1 = vec_xl(16, y[ib].qs);
-
-        vector signed short qv0 = vec_mule(q8x0, q8y0);
-        vector signed short qv1 = vec_mulo(q8x0, q8y0);
-        vector signed short qv2 = vec_mule(q8x1, q8y1);
-        vector signed short qv3 = vec_mulo(q8x1, q8y1);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-
-        vsumi0 = vec_sum4s(qv0, vsumi0);
-        vsumi1 = vec_sum4s(qv1, vsumi1);
-        vsumi0 = vec_sum4s(qv2, vsumi0);
-        vsumi1 = vec_sum4s(qv3, vsumi1);
-
-        vsumi0 = vec_add(vsumi0, vsumi1);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-    }
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    sumf = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-    // Initialize accumulator with zeros
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    // Main loop
-    for (; ib < nb; ++ib) {
-        // Compute combined scale for the block
-        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
-        __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
-        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        // Multiply q with scale and accumulate
-        acc = __lasx_xvfmadd_s( d, q, acc );
-    }
-
-    sumf = hsum_float_8(acc);
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
-
-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
-    }
-
-    *s = sumf;
-}
-
-void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_tq1_0 * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-    float sumf = 0.0f;
-
-    uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
-
-    const uint8x16_t shift = vld1q_u8(k_shift);
-
-    for (int i = 0; i < nb; ++i) {
-#if defined(__ARM_FEATURE_DOTPROD)
-        int32x4_t sumi0 = vdupq_n_s32(0);
-        int32x4_t sumi1 = vdupq_n_s32(0);
-#else
-        int16x8_t sumi0 = vdupq_n_s16(0);
-        int16x8_t sumi1 = vdupq_n_s16(0);
-#endif
-
-        // first 32 bytes of 5 elements
-        {
-            uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
-            uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
-            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
-            uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
-            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
-            uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9));
-            uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27));
-            uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27));
-            uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81));
-            uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81));
-
-            // multiply by 3 and keep the 2 bits above 8 bits
-            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
-            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
-            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
-            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
-            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
-            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
-            int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6));
-            int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6));
-            int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6));
-            int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6));
-
-            const int8x16_t qy0 = vld1q_s8(y[i].qs +   0);
-            const int8x16_t qy1 = vld1q_s8(y[i].qs +  16);
-            const int8x16_t qy2 = vld1q_s8(y[i].qs +  32);
-            const int8x16_t qy3 = vld1q_s8(y[i].qs +  48);
-            const int8x16_t qy4 = vld1q_s8(y[i].qs +  64);
-            const int8x16_t qy5 = vld1q_s8(y[i].qs +  80);
-            const int8x16_t qy6 = vld1q_s8(y[i].qs +  96);
-            const int8x16_t qy7 = vld1q_s8(y[i].qs + 112);
-            const int8x16_t qy8 = vld1q_s8(y[i].qs + 128);
-            const int8x16_t qy9 = vld1q_s8(y[i].qs + 144);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
-            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
-            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
-            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
-            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
-            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
-            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
-            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
-            sumi0 = vdotq_s32(sumi0, sqx8, qy8);
-            sumi1 = vdotq_s32(sumi1, sqx9, qy9);
-#else
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9));
-#endif
-        }
-
-        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
-        {
-            uint8x16_t qx0 = vld1q_u8(x[i].qs + 32);
-            uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3));
-            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9));
-            uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27));
-            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81));
-            uint32_t qh;
-            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
-            uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh));
-            qx5 = vmulq_u8(qx5, shift);
-
-            // multiply by 3 and keep the 2 bits above 8 bits
-            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
-            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
-            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
-            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
-            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
-            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
-
-            const int8x16_t qy0 = vld1q_s8(y[i].qs + 160);
-            const int8x16_t qy1 = vld1q_s8(y[i].qs + 176);
-            const int8x16_t qy2 = vld1q_s8(y[i].qs + 192);
-            const int8x16_t qy3 = vld1q_s8(y[i].qs + 208);
-            const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
-            const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
-            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
-            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
-            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
-            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
-            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
-#else
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
-#endif
-        }
-
-        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
-        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumi0 = vaddq_s32(sumi0, sumi1);
-        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
-
-        sumf += d * (float) vaddvq_s32(sumi0);
-#else
-        sumi0 = vaddq_s16(sumi0, sumi1);
-        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
-
-        sumf += d * (float) vaddlvq_s16(sumi0);
-#endif
-    }
-
-    *s = sumf;
-
-#elif defined(__AVX2__)
-    __m256 sumf = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-        // 16-bit sums
-        __m256i sumi0 = _mm256_setzero_si256();
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-
-        // first 32 bytes of 5 elements
-        {
-            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
-            // 8-bit multiplies with shifts, masks and adds
-            __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
-            __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
-            __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
-            __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
-
-            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
-
-            // Cancel the +1 from avg so that it behaves like a halving add
-            qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
-            qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
-            qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
-            qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
-            qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
-            // Multiply by 3 and get the top 2 bits
-            qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
-            qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
-            qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
-            qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
-            qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
-            qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
-            qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
-            qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
-            qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
-            qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
-
-            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs +   0));
-            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  32));
-            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  64));
-            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  96));
-            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
-
-            qx0 = _mm256_maddubs_epi16(qx0, qy0);
-            qx1 = _mm256_maddubs_epi16(qx1, qy1);
-            qx2 = _mm256_maddubs_epi16(qx2, qy2);
-            qx3 = _mm256_maddubs_epi16(qx3, qy3);
-            qx4 = _mm256_maddubs_epi16(qx4, qy4);
-
-            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
-            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
-            sumi2 = _mm256_add_epi16(sumi2, qx4);
-        }
-
-        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
-        {
-            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
-            uint32_t qh;
-            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
-            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
-            __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
-            __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
-            __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
-            __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
-            __m256i qx01 = MM256_SET_M128I(qx1, qx0);
-            __m256i qx23 = MM256_SET_M128I(qx3, qx2);
-
-            // avx2 does not have 8-bit multiplies, so 16-bit it is.
-            qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
-            qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
-            __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
-
-            __m256i qx45 = MM256_SET_M128I(qx5, qx4);
-
-            // Cancel the +1 from avg so that it behaves like a halving add
-            qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
-            qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
-            qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
-            // Multiply by 3 and get the top 2 bits
-            qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
-            qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
-            qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
-            qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
-            qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
-            qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
-
-            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
-            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
-            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
-
-            qx01 = _mm256_maddubs_epi16(qx01, qy01);
-            qx23 = _mm256_maddubs_epi16(qx23, qy23);
-            qx45 = _mm256_maddubs_epi16(qx45, qy45);
-
-            sumi0 = _mm256_add_epi16(sumi0, qx01);
-            sumi1 = _mm256_add_epi16(sumi1, qx23);
-            sumi2 = _mm256_add_epi16(sumi2, qx45);
-        }
-
-        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
-        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
-
-        sumi0 = _mm256_sub_epi16(sumi0, ysum);
-        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
-        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
-
-        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
-    }
-
-    *s = hsum_float_8(sumf);
-
-#else
-    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
-
-    float sumf = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        int sum = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 32; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
-                }
-            }
-        }
-        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 16; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
-                }
-            }
-        }
-
-        for (size_t l = 0; l < 4; ++l) {
-            for (size_t j = 0; j < sizeof(x->qh); ++j) {
-                uint8_t q = x[i].qh[j] * pow3[l];
-                uint16_t xi = ((uint16_t) q * 3) >> 8;
-                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
-            }
-        }
-
-        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_tq2_0 * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-    float sumf = 0.0f;
-
-    const uint8x16_t m3 = vdupq_n_u8(3);
-
-    for (int i = 0; i < nb; ++i) {
-#if defined(__ARM_FEATURE_DOTPROD)
-        int32x4_t sumi0 = vdupq_n_s32(0);
-        int32x4_t sumi1 = vdupq_n_s32(0);
-#else
-        int16x8_t sumi0 = vdupq_n_s16(0);
-        int16x8_t sumi1 = vdupq_n_s16(0);
-#endif
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
-            uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
-            uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
-            uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
-            uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
-            uint8x16_t qx5 = vshrq_n_u8(qx1, 4);
-            uint8x16_t qx6 = vshrq_n_u8(qx0, 6);
-            uint8x16_t qx7 = vshrq_n_u8(qx1, 6);
-
-            int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3));
-            int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3));
-            int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3));
-            int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3));
-            int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3));
-            int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3));
-            int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3));
-            int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3));
-
-            const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 +   0);
-            const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 +  16);
-            const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 +  32);
-            const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 +  48);
-            const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 +  64);
-            const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 +  80);
-            const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 +  96);
-            const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
-            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
-            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
-            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
-            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
-            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
-            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
-            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
-#else
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
-            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
-            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
-#endif
-        }
-
-        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
-        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumi0 = vaddq_s32(sumi0, sumi1);
-        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
-
-        sumf += d * (float) vaddvq_s32(sumi0);
-#else
-        sumi0 = vaddq_s16(sumi0, sumi1);
-        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
-
-        sumf += d * (float) vaddlvq_s16(sumi0);
-#endif
-    }
-
-    *s = sumf;
-
-#elif defined(__AVX2__)
-    __m256 sumf = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-        // 16-bit sums, because 256*127 still fits
-        __m256i sumi0 = _mm256_setzero_si256();
-        __m256i sumi1 = _mm256_setzero_si256();
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
-            __m256i qx1 = _mm256_srli_epi16(qx0, 2);
-            __m256i qx2 = _mm256_srli_epi16(qx0, 4);
-            __m256i qx3 = _mm256_srli_epi16(qx0, 6);
-
-            // 0, 1, 2 (should not be 3)
-            qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
-            qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
-            qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
-            qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
-
-            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 +  0));
-            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
-            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
-            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
-
-            qx0 = _mm256_maddubs_epi16(qx0, qy0);
-            qx1 = _mm256_maddubs_epi16(qx1, qy1);
-            qx2 = _mm256_maddubs_epi16(qx2, qy2);
-            qx3 = _mm256_maddubs_epi16(qx3, qy3);
-
-            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
-            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
-        }
-
-        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
-        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
-
-        sumi0 = _mm256_add_epi16(sumi0, sumi1);
-        sumi0 = _mm256_sub_epi16(sumi0, ysum);
-        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
-
-        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
-    }
-
-    *s = hsum_float_8(sumf);
-
-#else
-    float sumf = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        int32_t sumi = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            for (size_t l = 0; l < 4; ++l) {
-                for (size_t k = 0; k < 32; ++k) {
-                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
-                }
-            }
-        }
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        sumf += (float) sumi * d;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m3 = vdupq_n_u8(0x3);
-    const uint8x16_t m4 = vdupq_n_u8(0xF);
-
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    ggml_int8x16x2_t q2bytes;
-    uint8_t aux[16];
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint8_t * restrict sc = x[i].scales;
-
-        const uint8x16_t mins_and_scales = vld1q_u8(sc);
-        const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
-        vst1q_u8(aux, scales);
-
-        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
-        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
-                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
-        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
-                                       vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1])));
-        sum += dmin * vaddvq_s32(vaddq_s32(s0, s1));
-
-        int isum = 0;
-        int is = 0;
-
-// We use this macro instead of a function call because for some reason
-// the code runs 2-3% slower, even if the function is declared inline
-#define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
-
-#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
-        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
-        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
-        MULTIPLY_ACCUM_WITH_SCALE((index));
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
-
-            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
-            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
-
-            MULTIPLY_ACCUM_WITH_SCALE(0);
-
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
-
-            is += 8;
-        }
-
-        sum += d * isum;
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m128i m4 = _mm_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
-        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
-        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
-
-        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
-        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
-        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
-            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
-            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
-            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
-
-            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
-            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
-            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
-            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
-
-            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
-            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
-            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
-            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
-
-            p0 = _mm256_add_epi32(p0, p1);
-            p2 = _mm256_add_epi32(p2, p3);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
-        }
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(0x3);
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(0x2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // load mins and scales from block_q2_K.scales[QK_K/16]
-        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
-        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
-        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
-
-        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
-        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
-        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
-
-        // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
-
-        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
-        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
-        const __m128i scales[2] = { scales_0, scales_1 };
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
-            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
-            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
-            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
-            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
-            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-
-            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
-            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
-            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
-            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
-            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
-            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
-            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
-            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
-            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
-
-            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
-            __m128i shuffle = _mm_set1_epi16(0x0100);
-            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
-
-            p0 = _mm_add_epi32(p0, p1);
-            p2 = _mm_add_epi32(p2, p3);
-            p4 = _mm_add_epi32(p4, p5);
-            p6 = _mm_add_epi32(p6, p7);
-
-            // isum in 32bits*4*2
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
-        }
-
-        // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        size_t vl = 16;
-
-        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
-        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
-
-        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
-
-        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
-        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
-        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
-        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
-        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-
-        sumf  += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
-
-        vl = 32;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
-
-        uint8_t is=0;
-        int isum=0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load Q2
-            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
-
-            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
-            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
-            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
-            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
-
-            // duplicate scale elements for product
-            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
-            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
-            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
-            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
-
-            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
-            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
-            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
-            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
-
-            // load Q8
-            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
-            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
-            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
-
-            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
-            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
-            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
-            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
-
-            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
-
-            q2+=32;  q8+=128;  is=8;
-
-        }
-
-        sumf += dall * isum;
-
-    }
-
-    *s = sumf;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0x3);
-    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
-    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
-        vector float vdmin = vec_mul(vxmin, vyd);
-
-        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
-        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
-
-        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
-        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
-
-        q2xmins = vec_sr(q2xmins, v4);
-        vector signed short q2xmins0 = vec_unpackh(q2xmins);
-        vector signed short q2xmins1 = vec_unpackl(q2xmins);
-
-        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
-        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
-        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
-        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
-
-        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
-        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
-        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
-        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-        vector signed int vsumi4 = v0;
-        vector signed int vsumi5 = v0;
-        vector signed int vsumi6 = v0;
-        vector signed int vsumi7 = v0;
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            __builtin_prefetch(q2, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
-            q2 += 32;
-
-            vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
-            vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
-            vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
-            vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
-            vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
-            vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
-            vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
-            vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
-
-            vector signed char q8y00 = vec_xl(  0, q8);
-            vector signed char q8y10 = vec_xl( 16, q8);
-            vector signed char q8y01 = vec_xl( 32, q8);
-            vector signed char q8y11 = vec_xl( 48, q8);
-            vector signed char q8y02 = vec_xl( 64, q8);
-            vector signed char q8y12 = vec_xl( 80, q8);
-            vector signed char q8y03 = vec_xl( 96, q8);
-            vector signed char q8y13 = vec_xl(112, q8);
-            q8 += 128;
-
-            vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
-            vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
-            vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
-            vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
-            vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
-            vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
-            vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
-            vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
-
-            vector signed short vscales_07 = vec_unpackh(vscales);
-            vector signed int vscales_03 = vec_unpackh(vscales_07);
-            vector signed int vscales_47 = vec_unpackl(vscales_07);
-            vector signed int vs0 = vec_splat(vscales_03, 0);
-            vector signed int vs1 = vec_splat(vscales_03, 1);
-            vector signed int vs2 = vec_splat(vscales_03, 2);
-            vector signed int vs3 = vec_splat(vscales_03, 3);
-            vector signed int vs4 = vec_splat(vscales_47, 0);
-            vector signed int vs5 = vec_splat(vscales_47, 1);
-            vector signed int vs6 = vec_splat(vscales_47, 2);
-            vector signed int vs7 = vec_splat(vscales_47, 3);
-            vscales = vec_sld(vscales, vscales, 8);
-
-            vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
-            vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
-            vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
-            vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
-            vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
-            vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
-            vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
-            vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
-        }
-
-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-
-    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
-    const __m128i m4 = __lsx_vreplgr2vr_b(0xF);
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
-        const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
-        const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
-        const __m256i mins = lasx_ext8_16(mins8);
-        const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
-
-        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
-
-        const __m256i all_scales = lasx_ext8_16(scales8);
-        const __m128i l_scales = lasx_extracti128(all_scales, 0);
-        const __m128i h_scales = lasx_extracti128(all_scales, 1);
-        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
-
-        __m256i sumi = __lasx_xvldi(0);
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32;
-
-            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            const __m256i q2_0 = __lasx_xvand_v(q2bits, m3);
-            const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3);
-            const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3);
-            const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3);
-
-            __m256i p0 = lasx_maddubs_h(q2_0, q8_0);
-            __m256i p1 = lasx_maddubs_h(q2_1, q8_1);
-            __m256i p2 = lasx_maddubs_h(q2_2, q8_2);
-            __m256i p3 = lasx_maddubs_h(q2_3, q8_3);
-
-            p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0);
-            p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1);
-            p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2);
-            p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3);
-
-            p0 = __lasx_xvadd_w(p0, p1);
-            p2 = __lasx_xvadd_w(p2, p3);
-
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2));
-        }
-
-        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    const uint8x16_t m3b = vdupq_n_u8(0x3);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-
-    const uint8x16_t m0 = vdupq_n_u8(1);
-    const uint8x16_t m1 = vshlq_n_u8(m0, 1);
-    const uint8x16_t m2 = vshlq_n_u8(m0, 2);
-    const uint8x16_t m3 = vshlq_n_u8(m0, 3);
-    const int8_t m32 = 32;
-
-    ggml_int8x16x4_t q3bytes;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q3h;
-
-        int32_t isum = 0;
-
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= m32;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
-            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
-            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
-            q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
-            q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1);
-            q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1);
-
-            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0]));
-            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1]));
-            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
-            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
-
-            scale += 4;
-
-            q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
-            q3h.val[1] = vbicq_u8(m2, qhbits.val[1]);
-            q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1);
-            q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1);
-
-            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0]));
-            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1]));
-            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
-            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
-
-            scale += 4;
-
-            if (j == 0) {
-                qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4);
-                qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4);
-            }
-
-        }
-        sum += d * isum;
-
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m256i mone = _mm256_set1_epi8(1);
-    const __m128i m32 = _mm_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t aux[3];
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        __m128i scales128 = _mm_set_epi32(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = _mm_sub_epi8(scales128, m32);
-        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
-        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
-        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
-
-        // high bit
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
-
-        // integer accumulator
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-        int is  = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits
-            const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
-
-            // prepare low and high bits
-            const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
-            const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
-            const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
-            const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
-            const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            // load Q8 quants
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
-            __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
-            __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
-
-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-            // multiply with scales
-            p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
-            p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
-            p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
-            p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
-
-            // accumulate
-            p16_0 = _mm256_add_epi32(p16_0, p16_1);
-            p16_2 = _mm256_add_epi32(p16_2, p16_3);
-            sumi  = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
-
-        }
-
-        // multiply with block scale and accumulate
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i mone = _mm_set1_epi8(1);
-    const __m128i m32 = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    const uint32_t *aux;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // Set up scales
-        aux = (const uint32_t *)x[i].scales;
-        __m128i scales128 = _mm_set_epi32(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = _mm_sub_epi8(scales128, m32);
-        const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
-        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
-        const __m128i scales[2] = { scales_0, scales_1 };
-
-        // high bit *128*2 from block_q3_K.hmask[QK_K/8]
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
-
-        // integer accumulator
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
-            const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
-            const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
-
-            // prepare low and high bits
-            const int bit = j << 2;
-
-            const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
-            const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
-            const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
-            const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
-
-            const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
-            const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
-            const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
-            const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
-
-            const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
-            const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
-            const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
-            const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
-
-            const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
-            const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
-            const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
-            const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
-
-            // load Q8 quants from block_q8_K.qs[QK_K]
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
-            __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
-
-            __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
-            __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
-
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            // multiply with scales
-            __m128i shuffle = _mm_set1_epi16(0x0100);
-            p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
-
-            // accumulate
-            p16_0 = _mm_add_epi32(p16_0, p16_1);
-            p16_2 = _mm_add_epi32(p16_2, p16_3);
-            p16_4 = _mm_add_epi32(p16_4, p16_5);
-            p16_6 = _mm_add_epi32(p16_6, p16_7);
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
-
-        }
-
-        // multiply with block scale and accumulate
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        memcpy(aux, x[i].scales, 12);
-        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= 32;
-
-
-        size_t vl = 32;
-        uint8_t m =  1;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
-
-        int sum_t = 0;
-
-        for (int j = 0; j < QK_K; j += 128) {
-
-            vl = 32;
-
-            // load Q3
-            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
-
-            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
-            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
-            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
-            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
-
-            // compute mask for subtraction
-            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
-            m <<= 1;
-
-            // load Q8 and take product with Q3
-            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
-            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
-            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
-            vl = 16;
-
-            // retrieve lane to multiply with scale
-            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
-            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
-            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
-            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
-            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
-            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
-            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
-            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
-            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
-            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
-
-            sum_t +=  __riscv_vmv_x_s_i32m1_i32(isum3);
-
-            q3 += 32;    q8 += 128;   scale += 8;
-
-        }
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        sumf += d*sum_t;
-
-    }
-
-    *s = sumf;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0x3);
-    const vector signed char lowMask1 = vec_splats((int8_t)0xf);
-    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector signed char v1 = vec_splats((signed char)0x1);
-    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
-    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
-    const vector signed char off = vec_splats((signed char)0x20);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        UNUSED(kmask1);
-        UNUSED(kmask2);
-
-        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
-        vector signed char u1 = vec_and(u0, lowMask1);
-        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
-        vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
-        vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
-        vector signed char u31 = vec_and(u3, lowMask2);
-
-        u1 = vec_or(u1, u30);
-        u2 = vec_or(vec_sr(u0, v4), u31);
-
-        vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
-        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
-        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
-
-        vscales = vec_sub(vscales, off);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-        vector signed int vsumi4 = v0;
-        vector signed int vsumi5 = v0;
-        vector signed int vsumi6 = v0;
-        vector signed int vsumi7 = v0;
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            __builtin_prefetch(q3, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
-            q3 += 32;
-
-            //the low 2 bits
-            vector signed char qxs00 = vec_and(qxs0, lowMask);
-            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
-            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
-            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
-            vector signed char qxs10 = vec_and(qxs1, lowMask);
-            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
-            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
-            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
-
-            //the 3rd bit
-            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
-            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
-            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
-            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
-            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
-            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
-            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
-            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
-            qxhs0 = vec_sr(qxhs0, v4);
-            qxhs1 = vec_sr(qxhs1, v4);
-
-            vector signed char q3x00 = vec_sub(qxs00, qxh00);
-            vector signed char q3x01 = vec_sub(qxs01, qxh01);
-            vector signed char q3x02 = vec_sub(qxs02, qxh02);
-            vector signed char q3x03 = vec_sub(qxs03, qxh03);
-            vector signed char q3x10 = vec_sub(qxs10, qxh10);
-            vector signed char q3x11 = vec_sub(qxs11, qxh11);
-            vector signed char q3x12 = vec_sub(qxs12, qxh12);
-            vector signed char q3x13 = vec_sub(qxs13, qxh13);
-
-            vector signed char q8y00 = vec_xl(  0, q8);
-            vector signed char q8y10 = vec_xl( 16, q8);
-            vector signed char q8y01 = vec_xl( 32, q8);
-            vector signed char q8y11 = vec_xl( 48, q8);
-            vector signed char q8y02 = vec_xl( 64, q8);
-            vector signed char q8y12 = vec_xl( 80, q8);
-            vector signed char q8y03 = vec_xl( 96, q8);
-            vector signed char q8y13 = vec_xl(112, q8);
-            q8 += 128;
-
-            vector signed short vscales_h = vec_unpackh(vscales);
-            vector signed short vs0 = vec_splat(vscales_h, 0);
-            vector signed short vs1 = vec_splat(vscales_h, 1);
-            vector signed short vs2 = vec_splat(vscales_h, 2);
-            vector signed short vs3 = vec_splat(vscales_h, 3);
-            vector signed short vs4 = vec_splat(vscales_h, 4);
-            vector signed short vs5 = vec_splat(vscales_h, 5);
-            vector signed short vs6 = vec_splat(vscales_h, 6);
-            vector signed short vs7 = vec_splat(vscales_h, 7);
-            vscales = vec_sld(vscales, vscales, 8);
-
-            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
-            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
-            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
-            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
-            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
-            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
-            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
-            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
-
-            vsumi0 = vec_msum(qv00, vs0, vsumi0);
-            vsumi1 = vec_msum(qv01, vs2, vsumi1);
-            vsumi2 = vec_msum(qv02, vs4, vsumi2);
-            vsumi3 = vec_msum(qv03, vs6, vsumi3);
-            vsumi4 = vec_msum(qv10, vs1, vsumi4);
-            vsumi5 = vec_msum(qv11, vs3, vsumi5);
-            vsumi6 = vec_msum(qv12, vs5, vsumi6);
-            vsumi7 = vec_msum(qv13, vs7, vsumi7);
-        }
-
-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-
-    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
-    const __m256i mone = __lasx_xvreplgr2vr_b(1);
-    const __m128i m32 = __lsx_vreplgr2vr_b(32);
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    uint32_t aux[3];
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        __m128i scales128 = lsx_set_w(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = __lsx_vsub_b(scales128, m32);
-        const __m256i all_scales = lasx_ext8_16(scales128);
-        const __m128i l_scales = lasx_extracti128(all_scales, 0);
-        const __m128i h_scales = lasx_extracti128(all_scales, 1);
-        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
-
-        // high bit
-        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
-
-        // integer accumulator
-        __m256i sumi = __lasx_xvldi(0);
-
-        int bit = 0;
-        int is  = 0;
-        __m256i xvbit;
-
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits
-            const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
-
-            xvbit = __lasx_xvreplgr2vr_h(bit);
-            // prepare low and high bits
-            const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
-            const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
-            ++bit;
-
-            xvbit = __lasx_xvreplgr2vr_h(bit);
-            const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
-            const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
-            ++bit;
-
-            xvbit = __lasx_xvreplgr2vr_h(bit);
-            const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
-            const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
-            ++bit;
-
-            xvbit = __lasx_xvreplgr2vr_h(bit);
-            const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
-            const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
-            ++bit;
-
-            // load Q8 quants
-            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0);
-            __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
-            __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2);
-            __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3);
-
-            __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
-            __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
-            __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2);
-            __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3);
-
-            p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
-            p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
-            p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
-            p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
-
-            // multiply with scales
-            p16_0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
-            p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
-            p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
-            p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
-
-            // accumulate
-            p16_0 = __lasx_xvadd_w(p16_0, p16_1);
-            p16_2 = __lasx_xvadd_w(p16_2, p16_3);
-            sumi  = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
-        }
-        // multiply with block scale and accumulate
-        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME
-    }
-
-    *s = hsum_float_8(acc);
-
-#else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x2_t q4bytes;
-    ggml_int8x16x2_t q8bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-
-        uint32x2_t mins8 = { 0 };
-        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
-        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
-
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[0] &= kmask1;
-
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        sumf -= dmin * vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        int32_t sumi1 = 0;
-        int32_t sumi2 = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-
-            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
-        }
-
-        sumf += d * (sumi1 + sumi2);
-
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-    __m128 acc_m = _mm_setzero_ps();
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
-
-        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4l = _mm256_and_si256(q4bits, m4);
-            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
-
-            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
-            p16l = _mm256_madd_epi16(scale_l, p16l);
-
-            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
-            p16h = _mm256_madd_epi16(scale_h, p16h);
-            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
-
-            sumi = _mm256_add_epi32(sumi, sumj);
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(0x2);
-
-    __m256 acc = _mm256_setzero_ps();
-    __m128 acc_m = _mm_setzero_ps();
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
-            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
-            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
-            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
-
-            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
-            p16l = _mm_madd_epi16(scale_l, p16l);
-            sumi_0 = _mm_add_epi32(sumi_0, p16l);
-            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
-            p16l = _mm_madd_epi16(scale_l, p16l);
-            sumi_1 = _mm_add_epi32(sumi_1, p16l);
-
-            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
-            p16h = _mm_madd_epi16(scale_h, p16h);
-            sumi_0 = _mm_add_epi32(sumi_0, p16h);
-            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
-            p16h = _mm_madd_epi16(scale_h, p16h);
-            sumi_1 = _mm_add_epi32(sumi_1, p16h);
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
-
-#elif defined __riscv_v_intrinsic
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        size_t vl = 8;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-        vint16mf2_t q8sums   = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf4_t mins8  = __riscv_vle8_v_u8mf4(mins, vl);
-        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-        vint32m1_t  prod   = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        vl = 32;
-
-        int32_t sum_1 = 0;
-        int32_t sum_2 = 0;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q4
-            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
-
-            // load Q8 and multiply it with lower Q4 nibble
-            vint8m1_t  q8_0 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t  q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
-            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
-            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
-
-            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
-
-            // load Q8 and multiply it with upper Q4 nibble
-            vint8m1_t  q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
-            vint8m1_t  q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
-            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
-            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
-
-            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
-
-            q4 += 32;    q8 += 64;
-
-        }
-
-        sumf += d*(sum_1 + sum_2);
-
-    }
-
-    *s = sumf;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
-    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v2 = vec_splats((uint8_t)2);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
-        vector float vdmin = vec_mul(vxmin, vyd);
-
-        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
-        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
-
-        UNUSED(kmask1);
-        UNUSED(kmask2);
-        UNUSED(kmask3);
-        UNUSED(utmp);
-
-        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
-        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
-        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
-        vector signed char u3 = vec_sr(u2, v4);
-
-        vector signed char u30 = u1;
-        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
-
-        u1 = vec_and(u0, lowMask1);
-        u2 = vec_or(u30, u31);
-
-        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
-
-        vector signed short vscales = vec_unpackh(utmps);
-        vector signed short q4xmins = vec_unpackl(utmps);
-        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
-        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
-
-        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
-        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
-        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
-        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
-
-        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
-        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
-        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
-        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/64; j+=2) {
-            __builtin_prefetch(q4, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
-            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
-            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
-            q4 += 64;
-
-            vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
-            vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
-            vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
-            vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
-            vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
-            vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
-            vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
-            vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
-
-            vector signed char q8y00 = vec_xl(  0, q8);
-            vector signed char q8y10 = vec_xl( 16, q8);
-            vector signed char q8y01 = vec_xl( 32, q8);
-            vector signed char q8y11 = vec_xl( 48, q8);
-            vector signed char q8y20 = vec_xl( 64, q8);
-            vector signed char q8y30 = vec_xl( 80, q8);
-            vector signed char q8y21 = vec_xl( 96, q8);
-            vector signed char q8y31 = vec_xl(112, q8);
-            q8 += 128;
-
-            vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
-            vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
-            vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
-            vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
-            vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
-            vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
-            vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
-            vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
-
-            vector signed int vscales_h = vec_unpackh(vscales);
-            vector signed int vs0 = vec_splat(vscales_h, 0);
-            vector signed int vs1 = vec_splat(vscales_h, 1);
-            vector signed int vs2 = vec_splat(vscales_h, 2);
-            vector signed int vs3 = vec_splat(vscales_h, 3);
-            vscales = vec_sld(vscales, vscales, 8);
-
-            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
-            vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
-            vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
-            vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
-
-            vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
-            vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
-            vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
-            vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-    GGML_UNUSED(kmask1);
-    GGML_UNUSED(kmask2);
-    GGML_UNUSED(kmask3);
-
-    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-    __m128 acc_m = (__m128)__lsx_vldi(0);
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
-        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
-        const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
-        acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
-
-        const __m128i sc128  = lasx_extracti128(mins_and_scales, 0);
-        const __m256i scales = lasx_insertf128(sc128, sc128);
-
-        __m256i sumi = __lasx_xvldi(0);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
-            const __m256i q4l = __lasx_xvand_v(q4bits, m4);
-            const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4);
-
-            const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            __m256i p16l = lasx_maddubs_h(q4l, q8l);
-            p16l = lasx_madd_h(scale_l, p16l);
-
-            const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            __m256i p16h = lasx_maddubs_h(q4h, q8h);
-            p16h = lasx_madd_h(scale_h, p16h);
-            const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
-
-            sumi = __lasx_xvadd_w(sumi, sumj);
-        }
-
-        __m256 vd = __lasx_xvreplfr2vr_s(d);
-        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
-
-    }
-
-    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
-    __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
-    acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
-
-
-    ft_union fi;
-    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
-    *s = hsum_float_8(acc) + fi.f ;
-#else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy,  size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mone = vdupq_n_u8(1);
-    const uint8x16_t mtwo = vdupq_n_u8(2);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        int32_t sumi_mins = vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q5h;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
-            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
-            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
-            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
-            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
-
-            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
-            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
-            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
-            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
-
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * sumi_mins;
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m256i mone  = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
-        __m256i hmask = mone;
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
-
-            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
-
-            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m128i mone  = _mm_set1_epi8(1);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
-        __m128i hmask = mone;
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        int bit = 0;
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-
-            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
-            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
-            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            __m128i q5_0  = _mm_add_epi8(q5l_0, q5h_0);
-            __m128i q5_1  = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_0 = _mm_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm_madd_epi16(scale_0, p16_1);
-
-            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
-            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
-            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            q5_0  = _mm_add_epi8(q5l_0, q5h_0);
-            q5_1  = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_2 = _mm_madd_epi16(scale_1, p16_2);
-            p16_3 = _mm_madd_epi16(scale_1, p16_3);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __riscv_v_intrinsic
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-    float sums = 0.0;
-
-    size_t vl;
-
-    for (int i = 0; i < nb; ++i) {
-
-        vl = 8;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-
-        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
-        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        vl = 32;
-        int32_t aux32 = 0;
-        int is = 0;
-
-        uint8_t m = 1;
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q5 and Q8
-            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
-            vint8m1_t  q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t  q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
-
-            // compute mask for addition
-            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
-            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
-            m <<= 1;
-
-            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
-            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
-            m <<= 1;
-
-            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
-            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
-
-            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
-            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
-
-            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
-            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
-
-            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
-            q5 += 32;    q8 += 64;
-
-        }
-
-        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
-        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
-
-    }
-
-    *s = sumf+sums;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
-    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
-    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
-    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
-        vector float vdmin = vec_mul(vxmin, vyd);
-
-        UNUSED(kmask1);
-        UNUSED(kmask2);
-        UNUSED(kmask3);
-        UNUSED(utmp);
-
-        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
-        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
-        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
-        vector signed char u3 = vec_sr(u2, v4);
-
-        vector signed char u30 = u1;
-        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
-
-        u1 = vec_and(u0, lowMask1);
-        u2 = vec_or(u30, u31);
-
-        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
-
-        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
-        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
-
-        vector signed short vscales = vec_unpackh(utmps);
-
-        vector signed short q5xmins = vec_unpackl(utmps);
-        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
-        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
-
-        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
-        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
-        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
-        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
-
-        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
-        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
-        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
-        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
-
-        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
-        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            __builtin_prefetch(q5, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
-            q5 += 32;
-
-            vector signed char qxs00 = vec_and(qxs0, lowMask);
-            vector signed char qxs01 = vec_sr(qxs0, v4);
-            vector signed char qxs10 = vec_and(qxs1, lowMask);
-            vector signed char qxs11 = vec_sr(qxs1, v4);
-
-            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
-            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
-            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
-            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
-            qxhs0 = vec_sr(qxhs0, v2);
-            qxhs1 = vec_sr(qxhs1, v2);
-
-            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
-            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
-            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
-            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
-
-            vector signed char q8y00 = vec_xl( 0, q8);
-            vector signed char q8y10 = vec_xl(16, q8);
-            vector signed char q8y01 = vec_xl(32, q8);
-            vector signed char q8y11 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
-            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
-            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
-            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
-
-            vector signed int vscales_h = vec_unpackh(vscales);
-            vector signed int vs0 = vec_splat(vscales_h, 0);
-            vector signed int vs1 = vec_splat(vscales_h, 1);
-            vscales = vec_sld(vscales, vscales, 12);
-
-            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
-            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
-            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
-            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-    GGML_UNUSED(kmask1);
-    GGML_UNUSED(kmask2);
-    GGML_UNUSED(kmask3);
-
-    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
-    const __m128i mzero = __lsx_vldi(0);
-    const __m256i mone  = __lasx_xvreplgr2vr_b(1);
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    float summs = 0.f;
-
-   for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
-        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
-        const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
-        const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero);
-        summs += dmin * __lsx_vpickve2gr_w(hsum, 0);    //TODO check
-
-        const __m128i sc128  = lasx_extracti128(mins_and_scales, 0);
-        const __m256i scales = lasx_insertf128(sc128, sc128);
-
-        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
-        __m256i hmask = mone;
-
-        __m256i sumi = __lasx_xvldi(0);
-
-        int bit = 0;
-        __m256i xvbit;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
-
-            xvbit = __lasx_xvreplgr2vr_h(bit++);
-            const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
-            const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
-            const __m256i q5_0  = __lasx_xvadd_b(q5l_0, q5h_0);
-            hmask = __lasx_xvslli_h(hmask, 1);
-
-            xvbit = __lasx_xvreplgr2vr_h(bit++);
-            const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
-            const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
-            const __m256i q5_1  = __lasx_xvadd_b(q5l_1, q5h_1);
-            hmask = __lasx_xvslli_h(hmask, 1);
-
-            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0);
-            __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1);
-
-            p16_0 = lasx_madd_h(scale_0, p16_0);
-            p16_1 = lasx_madd_h(scale_1, p16_1);
-
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
-
-        }
-
-        __m256 vd = __lasx_xvreplfr2vr_s(d);
-        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-    //const int8x16_t  m32s = vdupq_n_s8(32);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
-
-        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
-                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
-        int32_t isum_mins = vaddvq_s32(prod);
-
-        int32_t isum = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
-            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
-            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 2);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-            scale += 4;
-
-            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            shifted = vshrq_n_u8(qhbits.val[0], 4);
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[0], 6);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 6);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-            scale += 4;
-        }
-        //sum += isum * d_all * y[i].d;
-        sum += d_all * y[i].d * (isum - 32 * isum_mins);
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
-
-            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
-            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
-
-            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
-            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
-            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
-            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
-
-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
-            p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
-            p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
-
-        }
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m15 = _mm_set1_epi8(15);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // handle the q6_k -32 offset separately using bsums
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
-        const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
-        const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
-        const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
-            const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
-
-            const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
-            const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
-            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
-            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
-            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
-            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
-
-            const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-
-            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
-            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
-            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
-            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
-            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
-            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
-            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
-            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
-
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
-            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
-            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
-            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
-            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
-            p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
-            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
-
-        }
-
-        sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
-        sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
-        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        size_t vl;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        int sum_t = 0;
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            vl = 32;
-
-            // load qh
-            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
-
-            // load Q6
-            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
-            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
-
-            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
-            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
-            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
-            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
-
-            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
-            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
-            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
-            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
-
-            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
-            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
-            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
-            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
-
-            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
-            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
-            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
-            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
-
-            // load Q8 and take product
-            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
-            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
-            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
-            vl = 16;
-
-            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
-            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
-            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
-            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
-            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
-            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
-            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
-            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
-            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
-            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
-
-            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
-
-            q6 += 64;   qh += 32;   q8 += 128;   is=8;
-
-        }
-
-        sumf += d * sum_t;
-
-    }
-
-    *s = sumf;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
-    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
-    const vector signed char off = vec_splats((signed char)0x20);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-        vector signed int vsumi4 = v0;
-        vector signed int vsumi5 = v0;
-        vector signed int vsumi6 = v0;
-        vector signed int vsumi7 = v0;
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict qs = x[i].scales;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            __builtin_prefetch(q6, 0, 0);
-            __builtin_prefetch(qh, 0, 0);
-            __builtin_prefetch(q8, 0, 0);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
-            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
-            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
-            q6 += 64;
-
-            vector signed char qxs00 = vec_and(qxs0, lowMask);
-            vector signed char qxs01 = vec_sr(qxs0, v4);
-            vector signed char qxs10 = vec_and(qxs1, lowMask);
-            vector signed char qxs11 = vec_sr(qxs1, v4);
-            vector signed char qxs20 = vec_and(qxs2, lowMask);
-            vector signed char qxs21 = vec_sr(qxs2, v4);
-            vector signed char qxs30 = vec_and(qxs3, lowMask);
-            vector signed char qxs31 = vec_sr(qxs3, v4);
-
-            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
-            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
-            qh += 32;
-
-            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
-            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
-            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
-            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
-            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
-            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
-            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
-            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
-
-            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
-            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
-            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
-            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
-            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
-            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
-            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
-            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
-
-            vector signed char q8y00 = vec_xl(  0, q8);
-            vector signed char q8y10 = vec_xl( 16, q8);
-            vector signed char q8y20 = vec_xl( 32, q8);
-            vector signed char q8y30 = vec_xl( 48, q8);
-            vector signed char q8y01 = vec_xl( 64, q8);
-            vector signed char q8y11 = vec_xl( 80, q8);
-            vector signed char q8y21 = vec_xl( 96, q8);
-            vector signed char q8y31 = vec_xl(112, q8);
-            q8 += 128;
-
-            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
-            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
-            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
-            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
-            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
-            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
-            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
-            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
-
-            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
-            qs += 8;
-
-            vector signed short vs0 = vec_splat(vscales, 0);
-            vector signed short vs1 = vec_splat(vscales, 1);
-            vector signed short vs2 = vec_splat(vscales, 2);
-            vector signed short vs3 = vec_splat(vscales, 3);
-            vector signed short vs4 = vec_splat(vscales, 4);
-            vector signed short vs5 = vec_splat(vscales, 5);
-            vector signed short vs6 = vec_splat(vscales, 6);
-            vector signed short vs7 = vec_splat(vscales, 7);
-
-            vsumi0 = vec_msum(qv00, vs0, vsumi0);
-            vsumi1 = vec_msum(qv01, vs4, vsumi1);
-            vsumi2 = vec_msum(qv10, vs1, vsumi2);
-            vsumi3 = vec_msum(qv11, vs5, vsumi3);
-            vsumi4 = vec_msum(qv20, vs2, vsumi4);
-            vsumi5 = vec_msum(qv21, vs6, vsumi5);
-            vsumi6 = vec_msum(qv30, vs3, vsumi6);
-            vsumi7 = vec_msum(qv31, vs7, vsumi7);
-        }
-
-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-
-    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
-    const __m256i m2 = __lasx_xvreplgr2vr_b(3);
-    const __m256i m32s = __lasx_xvreplgr2vr_b(32);
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0);
-
-        __m256i sumi = __lasx_xvldi(0);
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
-            const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
-            const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
-
-            const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4);
-            const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4);
-
-            const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1);
-            const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2);
-            const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3);
-
-            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0);
-            __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1);
-            __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2);
-            __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3);
-
-            __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
-            __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
-            __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2);
-            __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3);
-
-            p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
-            p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
-            p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
-            p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
-
-            p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
-            p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
-            p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2);
-            p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3);
-
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
-        }
-
-        acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
-static const int8_t keven_signs_q2xs[1024] = {
-     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
-     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
-     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
-     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
-     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
-     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
-     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
-     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
-     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
-     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
-     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
-     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
-     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
-     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
-     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
-     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
-     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
-     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
-     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
-     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
-     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
-     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
-     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
-     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
-     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
-     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
-     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
-     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
-     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
-     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
-     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
-     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-#endif
-
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq2_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >>  7) & 127))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
-        }
-        sumf += d*(sumf1 + sumf2);
-    }
-    *s = 0.25f * sumf;
-
-#elif defined(__AVX2__)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
-                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[1] >> 28;
-            const uint16_t ls2 = aux32[3] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#elif defined(__AVX__)
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
-            const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
-            const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
-            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
-            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
-            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
-            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
-            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
-            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
-            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
-            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
-            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
-            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
-            const uint16_t ls1 = aux32[1] >> 28;
-            const uint16_t ls2 = aux32[3] >> 28;
-            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
-            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
-            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
-            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
-            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
-            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
-            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
-            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
-        }
-
-        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#elif defined(__POWER9_VECTOR__)
-    const vector int v0 = vec_splats((int32_t)0);
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t  *  restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/32; j += 2) {
-            __builtin_prefetch(q2, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            uint32_t aux32[4];
-            const uint8_t * aux8 = (const uint8_t *)aux32;
-
-            memcpy(aux32, q2, 4*sizeof(uint32_t));
-            q2 += 8;
-
-            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
-            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
-            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
-            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
-
-            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127))};
-            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
-            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127))};
-            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
-
-            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
-            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
-            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
-            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
-
-            const uint16_t ls0 = aux32[1] >> 28;
-            const uint16_t ls1 = aux32[3] >> 28;
-
-            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
-            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
-
-            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = 0.125f * vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    __m256 accumf = (__m256)__lasx_xvldi(0);
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        __m256i sumi1 = __lasx_xvldi(0);
-        __m256i sumi2 = __lasx_xvldi(0);
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-
-            const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
-            const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
-                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
-            const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
-            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
-            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[1] >> 28;
-            const uint16_t ls2 = aux32[3] >> 28;
-            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
-            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
-            sumi1 = __lasx_xvadd_w(sumi1, p1);
-            sumi2 = __lasx_xvadd_w(sumi2, p2);
-        }
-
-        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq2_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    int32x4x4_t scales32;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        const uint8x8_t scales8 = vld1_u8(x[i].scales);
-        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
-        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
-        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
-        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
-        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
-        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
-        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
-        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
-        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
-        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
-        int32x4_t sumi = vdupq_n_s32(0);
-        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
-            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
-            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
-            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
-            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
-            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
-            q2 += 8;
-        }
-        sumf += d*vaddvq_s32(sumi);
-    }
-    *s = 0.125f * sumf;
-
-#elif defined(__AVX2__)
-
-    const __m256i mone = _mm256_set1_epi8(1);
-    static const char block_sign_shuffle_mask_1[32] = {
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-    };
-    static const char block_sign_shuffle_mask_2[32] = {
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
-    };
-    static const uint8_t bit_selector_mask_bytes[32] = {
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
-    const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
-    const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
-
-    static const uint8_t k_bit_helper[32] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
-    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
-    const __m256i m511 = _mm256_set1_epi16(511);
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    uint64_t aux64;
-
-    // somewhat hacky, but gives a significant boost in performance
-    __m256i aux_gindex;
-    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        __m128i stmp = _mm_set1_epi64x(aux64);
-        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
-        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
-
-            const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2);  q2 += 16;
-            aux_gindex = _mm256_and_si256(q2_data, m511);
-
-            const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
-            const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
-            const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
-
-            const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
-            const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
-
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
-                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
-                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
-            const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
-                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
-            const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
-                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
-
-            const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
-            const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
-            const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
-            const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
-
-            __m256i signs;
-            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
-
-            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
-
-            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
-
-            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
-
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const __m256i dot3  = _mm256_maddubs_epi16(q2_3, q8s_3);
-            const __m256i dot4  = _mm256_maddubs_epi16(q2_4, q8s_4);
-
-            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
-            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
-            const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
-            const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
-
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi8(1);
-    static const char block_sign_shuffle_mask_1[32] = {
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-    };
-    static const char block_sign_shuffle_mask_2[32] = {
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
-    };
-    static const uint8_t bit_selector_mask_bytes[32] = {
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
-    const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
-    const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
-    const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
-    const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
-    const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
-
-    static const uint8_t k_bit_helper[32] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
-    const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
-    const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
-    const __m128i m511 = _mm_set1_epi16(511);
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    uint64_t aux64;
-
-    // somewhat hacky, but gives a significant boost in performance
-    __m256i aux_gindex;
-    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        __m128i stmp = _mm_set1_epi64x(aux64);
-        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
-        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
-
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
-
-            const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
-            const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1);  q2 += 16;
-            aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
-
-            const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
-            const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
-            const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
-            const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
-            const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
-            const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
-
-            const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
-            const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
-            const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
-            const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
-
-            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-
-            const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
-            const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
-            const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
-            const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
-            const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
-            const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
-            const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
-            const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
-
-            // AVX2 full_signs_1 is full_sign_bits_0 here
-            // AVX2 full_signs_2 is full_sign_bits_1 here
-            __m128i signs_0, signs_1;
-            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
-            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
-            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
-            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
-            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
-            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
-
-            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
-            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
-            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
-            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
-            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
-            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
-
-            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
-            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
-            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
-            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
-            const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
-            const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
-
-            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
-            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
-            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
-            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
-            const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
-            const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
-
-            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
-            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
-            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
-            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
-            const __m128i dot3_0  = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
-            const __m128i dot3_1  = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
-            const __m128i dot4_0  = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
-            const __m128i dot4_1  = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
-
-            __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
-            const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
-            const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
-            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
-            const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
-            const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
-            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
-            const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
-            const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
-            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
-            const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
-            const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
-
-            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
-            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
-            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
-            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
-            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
-            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
-            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
-            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
-        }
-
-        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#elif defined(__loongarch_asx)
-
-    const __m256i mone = __lasx_xvreplgr2vr_b(1);
-    static const char block_sign_shuffle_mask_1[32] = {
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-    };
-    static const char block_sign_shuffle_mask_2[32] = {
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
-    };
-    static const uint8_t bit_selector_mask_bytes[32] = {
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0);
-    const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0);
-    const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0);
-
-    static const uint8_t k_bit_helper[32] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
-    const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0);
-    const __m256i m511 = __lasx_xvreplgr2vr_h(511);
-    const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
-    const __m128i m1 = __lsx_vreplgr2vr_b(1);
-
-    uint64_t aux64;
-
-    // somewhat hacky, but gives a significant boost in performance
-    __m256i aux_gindex;
-    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-
-    __m256 accumf = (__m256)__lasx_xvldi(0);
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        __m128i stmp = __lsx_vreplgr2vr_d(aux64);
-        stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4));
-        const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1);
-
-        __m256i sumi1 = __lasx_xvldi(0);
-        __m256i sumi2 = __lasx_xvldi(0);
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
-
-            const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0);  q2 += 16;
-            aux_gindex = __lasx_xvand_v(q2_data, m511);
-
-            const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9);
-            const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13);
-            const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper);
-
-            const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting);
-            const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits);
-
-            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-
-            const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
-                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
-            const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
-                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
-            const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
-                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
-            const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
-                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
-
-            const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0);
-            const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1);
-            const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l);
-            const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h);
-
-            __m256i signs;
-            signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1);
-            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1);
-
-            signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2);
-            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2);
-
-            signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1);
-            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3);
-
-            signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2);
-            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4);
-
-            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
-            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
-            const __m256i dot3  = lasx_maddubs_h(q2_3, q8s_3);
-            const __m256i dot4  = lasx_maddubs_h(q2_4, q8s_4);
-
-            const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0)));
-            const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1)));
-            const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2)));
-            const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3)));
-
-            sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1));
-            sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2));
-            sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3));
-            sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4));
-        }
-
-        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-#elif defined(__POWER9_VECTOR__)
-    const vector int v0 = vec_splats((int32_t)0);
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t  *  restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            __builtin_prefetch(q2, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
-            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
-            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
-            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
-
-            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
-            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
-            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
-            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
-            q2 += 8;
-
-            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
-            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
-            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
-            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
-
-            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
-            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
-            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
-            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
-            sc += 2;
-
-            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
-            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
-            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
-            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
-
-            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = 0.125f * vec_extract(vsumf0, 0);
-#else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq2_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
-
-    const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
-    const uint8x16_t        mask2 = vld1q_u8(k_mask2);
-    const uint8x16_t m1 = vdupq_n_u8(1);
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    uint8x16x2_t vs;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t  * restrict q8 = y[i].qs;
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
-            qs += 8;
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
-
-            q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
-            q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
-
-            signs += 4;
-
-            q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
-            q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
-
-            const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
-            const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
-            const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
-            const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
-
-            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
-            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >>  4));
-            sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
-            sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >>  4));
-        }
-        sumf += d*(sumi1 + sumi2);
-    }
-
-    *s = 0.125f * sumf;
-
-#elif defined(__AVX2__)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
-    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
-
-    uint64_t aux64;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
-        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
-                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
-                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
-                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
-                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
-                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
-                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
-            qs += 8;
-
-            __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
-
-            aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
-
-            signs += 4;
-
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
-
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#elif defined(__AVX__)
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
-    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
-    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
-    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
-
-    uint64_t aux64;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
-        const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
-        const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
-
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
-                                                  iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
-            const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
-                                                  iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
-            const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
-                                                  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
-            const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
-                                                  iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
-            qs += 8;
-
-            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
-            __m128i aux128_1 = aux128_0;
-            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
-            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
-            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
-            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
-            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
-            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
-
-            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
-            aux128_1 = aux128_0;
-            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
-            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
-            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
-            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
-            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
-            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
-
-            signs += 4;
-
-            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
-            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
-            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
-            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
-
-            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
-            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
-            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
-            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
-            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
-            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
-            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
-            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
-        }
-
-        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#elif defined(__POWER9_VECTOR__)
-    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-    };
-
-    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
-
-    const vector int v0 = vec_splats((int32_t)0);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
-    const vector unsigned char mask1 = vec_xl(16, k_mask1);
-    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint8_t *  restrict q2 = x[i].qs;
-        const uint8_t *  restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const uint8_t *  restrict sc = x[i].scales;
-        const int8_t  *  restrict q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/32; j += 2) {
-            __builtin_prefetch(q2, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
-            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
-            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
-            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
-            q2 += 8;
-            qh += 2;
-
-            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
-            vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
-            signs += 4;
-
-            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
-            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
-            vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
-            vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
-
-            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
-            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
-            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
-            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
-
-            vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
-            vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
-            vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
-            vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
-
-            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
-            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
-            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
-            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
-            sc += 2;
-
-            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
-            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
-            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
-            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
-
-            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = 0.125f * vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-
-    const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
-    const __m128i m1 = __lsx_vreplgr2vr_b(1);
-
-    const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
-    const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
-    uint64_t aux64;
-
-    __m256 accumf = (__m256)__lasx_xvldi(0);
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t  * restrict q8 = y[i].qs;
-
-        __m128i tmp1;
-        memcpy(&aux64, x[i].scales, 8);
-        tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0);
-        tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1);
-        const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1);
-        const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
-
-        __m256i sumi1 = __lasx_xvldi(0);
-        __m256i sumi2 = __lasx_xvldi(0);
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
-                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
-                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
-                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
-            const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
-                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
-                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
-                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
-            qs += 8;
-
-            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16));
-            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
-            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
-            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
-
-            aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16));
-            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
-            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
-            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
-
-            signs += 4;
-
-            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
-            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
-
-            const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0)));
-            const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1)));
-            sumi1 = __lasx_xvadd_w(sumi1, p1);
-            sumi2 = __lasx_xvadd_w(sumi2, p2);
-        }
-
-        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
-#endif
-
-}
-
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq3_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[2];
-
-    ggml_int8x16x4_t q3s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t   * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
-            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
-            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
-            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
-            q3 += 16;
-            q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >>  7) & 127))));
-            q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
-            q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
-            q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
-            q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
-            q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
-            q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
-        }
-        sumf += d*(sumf1 + sumf2);
-    }
-    *s = 0.5f * sumf;
-
-#elif defined(__AVX2__)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[2];
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            memcpy(aux32, gas, 8); gas += 8;
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
-                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[0] >> 28;
-            const uint16_t ls2 = aux32[1] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.25f * hsum_float_8(accumf);
-
-#elif defined(__AVX__)
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[2];
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
-            q3 += 8;
-            const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
-            q3 += 8;
-            memcpy(aux32, gas, 8); gas += 8;
-            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
-            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
-            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
-            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
-            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
-            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
-            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
-            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
-            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
-            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
-            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
-            const uint16_t ls1 = aux32[0] >> 28;
-            const uint16_t ls2 = aux32[1] >> 28;
-            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
-            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
-            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
-            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
-            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
-            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
-            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
-            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
-        }
-
-        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
-
-    }
-
-    *s = 0.25f * hsum_float_8(accumf);
-
-#elif defined(__POWER9_VECTOR__)
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    const vector int v0 = vec_splats((int32_t)0);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
-        const int8_t  * restrict q8 = y[i].qs;
-
-#pragma GCC unroll 1
-        for (int j = 0; j < QK_K/32; j += 2) {
-            __builtin_prefetch(q3, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
-            vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
-            vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
-            vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
-            q3 += 16;
-
-            vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >>  0) & 127]), (uint64_t)(signs64[(signs[0] >>  7) & 127])};
-            vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
-            vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >>  0) & 127]), (uint64_t)(signs64[(signs[1] >>  7) & 127])};
-            vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
-
-            vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
-            vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
-            vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
-            vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
-
-            const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
-            const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
-            signs += 2;
-
-            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
-            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
-
-            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = 0.25f * vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[2];
-
-    __m256 accumf = (__m256)__lasx_xvldi(0);
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m256i sumi1 = __lasx_xvldi(0);
-        __m256i sumi2 = __lasx_xvldi(0);
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            memcpy(aux32, gas, 8); gas += 8;
-
-            const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
-                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
-            const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
-            const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
-            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
-            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[0] >> 28;
-            const uint16_t ls2 = aux32[1] >> 28;
-
-            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
-            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
-            sumi1 = __lasx_xvadd_w(sumi1, p1);
-            sumi2 = __lasx_xvadd_w(sumi2, p2);
-        }
-
-        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-    }
-
-    *s = 0.25f * hsum_float_8(accumf);
-
-#else
-
-    uint32_t aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq3_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    typedef union {
-        uint16x8_t vec_index;
-        uint16_t   index[8];
-    } vec_index_t;
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
-
-    static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
-
-    const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
-    const uint8x16_t        mask2 = vld1q_u8(k_mask2);
-
-    const int16x8_t  hshift = vld1q_s16(k_shift);
-    const uint16x8_t m256   = vdupq_n_u16(256);
-    const uint8x16_t m1     = vdupq_n_u8(1);
-
-    uint8x16x2_t vs;
-    ggml_int8x16x4_t q3s;
-    ggml_int8x16x4_t q8b;
-    vec_index_t idx;
-
-    uint32_t scales32[2];
-    const uint8_t * scales8 = (const uint8_t *)scales32;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(scales32, x[i].scales, 4);
-        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
-        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
-            idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
-            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
-                                                        iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
-            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
-                                                        iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
-            idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
-            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
-                                                        iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
-            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
-                                                        iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
-
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
-            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
-
-            q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
-            q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
-            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
-
-            signs += 4;
-
-            q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
-            q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
-
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
-
-            sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
-            sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
-        }
-        sumf += d*(sumi1 + sumi2);
-    }
-    *s = sumf;
-
-#elif defined(__AVX2__)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
-    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
-
-    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
-    const __m256i idx_mask  = _mm256_set1_epi32(256);
-
-    typedef union {
-        __m256i  vec[2];
-        uint32_t index[16];
-    } index_t;
-
-    index_t idx;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
-            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
-            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
-            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
-            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
-            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
-            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
-
-            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
-            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
-            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
-            const __m256i q2_1 = _mm256_set_epi32(
-                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
-                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
-            );
-            const __m256i q2_2 = _mm256_set_epi32(
-                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
-                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
-            );
-
-            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
-
-            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
-
-            signs += 4;
-
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
-            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = hsum_float_8(accumf);
-
-#elif defined(__AVX__)
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
-    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
-    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
-    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
-
-    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
-    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
-    const __m128i idx_mask  = _mm_set1_epi32(256);
-
-    typedef union {
-        __m128i  vec[4];
-        uint32_t index[16];
-    } index_t;
-
-    index_t idx;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
-            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
-            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
-            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
-            idx.vec[1] = idx.vec[0];
-            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
-            idx.vec[3] = idx.vec[2];
-
-            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
-            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
-            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
-            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
-
-            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
-            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
-            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
-            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
-
-            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
-            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
-            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
-            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
-
-            __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
-            __m128i aux128_1 = aux128_0;
-            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
-            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
-            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
-            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
-            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
-            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
-
-            aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
-            aux128_1 = aux128_0;
-            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
-            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
-            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
-            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
-            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
-            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
-
-            signs += 4;
-
-            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
-            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
-            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
-            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
-            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
-            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
-            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
-            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
-            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
-            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
-            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
-            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
-            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
-            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
-        }
-
-        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
-
-    }
-
-    *s = hsum_float_8(accumf);
-
-#elif defined(__POWER9_VECTOR__)
-    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-    };
-
-    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
-
-    const vector int v0 = vec_splats((int32_t)0);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
-    const vector unsigned char mask1 = vec_xl(16, k_mask1);
-    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        const uint8_t *  restrict q3 = x[i].qs;
-        const uint8_t *  restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
-        const uint8_t *  restrict sc = x[i].scales;
-        const int8_t  *  restrict q8 = y[i].qs;
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        for (int j = 0; j < QK_K/32; j += 2) {
-            __builtin_prefetch(q3, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
-                                             iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
-            vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
-                                             iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
-            vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
-                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
-            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
-                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
-            q3 += 16;
-            qh += 2;
-
-            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
-            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
-            signs += 4;
-
-            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
-            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
-            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
-            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
-
-            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
-            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
-            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
-            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
-
-            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
-            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
-            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
-            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
-
-            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
-            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
-            sc ++;
-
-            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
-            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
-
-            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
-    const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
-
-    __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8);
-    const __m256i idx_mask  = __lasx_xvreplgr2vr_w(256);
-
-    typedef union {
-        __m256i  vec[2];
-        uint32_t index[16];
-    } index_t;
-
-    index_t idx;
-
-    __m256 accumf = (__m256)__lasx_xvldi(0);
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m256i sumi1 = __lasx_xvldi(0);
-        __m256i sumi2 = __lasx_xvldi(0);
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16;
-            idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]);
-            idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
-            idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
-            idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
-            idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
-            idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
-
-            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
-            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
-            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
-            const __m256i q2_1 = lasx_set_w(
-                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
-                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
-            );
-            const __m256i q2_2 = lasx_set_w(
-                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
-                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
-            );
-
-            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16));
-            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
-            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
-            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
-
-            aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16));
-            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
-            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
-            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
-
-            signs += 4;
-
-            const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
-            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
-            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
-            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
-            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
-            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
-            sumi1 = __lasx_xvadd_w(sumi1, p1);
-            sumi2 = __lasx_xvadd_w(sumi2, p2);
-        }
-
-        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-    }
-
-    *s = hsum_float_8(accumf);
-
-#else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint8_t * restrict signs = x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = sumf;
-#endif
-}
-
-#if defined(__AVX2__)
-static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return _mm256_maddubs_epi16(ax, sy);
-}
-#elif defined(__loongarch_asx)
-static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i ax = __lasx_xvsigncov_b(x, x);
-    const __m256i sy = __lasx_xvsigncov_b(x, y);
-    __m256i tmp1, tmp2, tmp3;
-    tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy);
-    tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy);
-    tmp3 = __lasx_xvadd_h(tmp1, tmp2);
-    return __lasx_xvsat_h(tmp3, 15);
-}
-#endif
-
-void ggml_vec_dot_iq1_s_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq1_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined __ARM_NEON
-
-    ggml_int8x16x4_t q1b;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi1 = 0, sumi2 = 0, sumi3 = 0;
-
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-
-            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
-            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
-            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
-            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
-            qs += 8;
-
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
-
-            const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
-            const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
-            sumi1 += vaddvq_s32(p1) * ls1;
-            sumi2 += vaddvq_s32(p2) * ls2;
-            sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
-                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
-
-        }
-
-        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    __m256 accum = _mm256_setzero_ps();
-    float accum1 = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        __m256i sumi = _mm256_setzero_si256();
-        int sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
-                                                    iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
-            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
-                                                    iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
-            qs += 8;
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
-            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
-            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
-            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
-                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
-        }
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
-        accum1 += d * sumi1;
-
-    }
-
-    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
-
-#elif defined __AVX__
-    __m256 accum = _mm256_setzero_ps();
-    float accum1 = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        int sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
-            const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
-            const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
-            const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
-            qs += 8;
-            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-
-            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
-            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
-            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
-            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
-            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
-            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
-            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
-            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
-            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
-            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
-
-            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
-            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
-            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
-                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
-        }
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
-        accum1 += d * sumi1;
-
-    }
-
-    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
-    const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = vec_splats((int32_t)0);
-        vector signed int vsumi1 = vec_splats((int32_t)0);
-        vector signed int vsumi2 = vec_splats((int32_t)0);
-        vector signed int vsumi3 = vec_splats((int32_t)0);
-        vector signed int vsumi8 = vec_splats((int32_t)0);
-
-        const uint8_t  * restrict q1 = x[i].qs;
-        const uint16_t * restrict qh = x[i].qh;
-        const int8_t   * restrict q8 = y[i].qs;
-        const int16_t  * restrict qs = y[i].bsums;
-
-        for (int j = 0; j < QK_K/32; j += 2) {
-            __builtin_prefetch(q1, 0, 1);
-            __builtin_prefetch(qh, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
-            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
-            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
-            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
-            q1 += 8;
-
-            vector signed char q1x0 = (vector signed char)aux64x2_0;
-            vector signed char q1x1 = (vector signed char)aux64x2_1;
-            vector signed char q1x2 = (vector signed char)aux64x2_2;
-            vector signed char q1x3 = (vector signed char)aux64x2_3;
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
-
-            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
-            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);
-
-            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
-            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
-            vector signed short vscales = vec_sld(vscales23, vscales01, 8);
-
-            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
-
-            vector signed short q8ysums = vec_xl_len(qs, 8);
-            qs += 4;
-            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);
-
-            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
-            qh += 2;
-            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);
-
-            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
-
-            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-
-    __m256 accum = (__m256)__lasx_xvldi(0);
-    float accum1 = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        __m256i sumi = __lasx_xvldi(0);
-        int sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0);
-            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1);
-            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2);
-            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3);
-
-            __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0);
-            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1);
-            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2);
-            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3);
-
-            qs += 8;
-            const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
-            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
-            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
-
-            __m256i tmp1, tmp5, tmp6;
-            tmp1 = __lasx_xvreplgr2vr_h(ls1);
-            tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1);
-            tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1);
-            const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6);
-
-            tmp1 = __lasx_xvreplgr2vr_h(ls2);
-            tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1);
-            tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1);
-            const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6);
-
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2));
-            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
-                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
-        }
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
-        accum1 += d * sumi1;
-    }
-
-    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
-
-#else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi = 0, sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
-            const int delta = qh[ib] & 0x8000 ? -1 : 1;
-            int lsum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
-                for (int j = 0; j < 8; ++j) {
-                    lsum += q8[j] * grid[j];
-                }
-                q8 += 8;
-            }
-            sumi  += ls * lsum;
-            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
-            qs += 4;
-        }
-
-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
-    }
-
-    *s = sumf;
-
-#endif
-}
-
-void ggml_vec_dot_iq1_m_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq1_m * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    iq1m_scale_t scale;
-
-#if defined __ARM_NEON
-    const int32x4_t mask  = vdupq_n_s32(0x7);
-    const int32x4_t mone  = vdupq_n_s32(1);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t deltas;
-    deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
-    deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
-    deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
-    deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
-
-    ggml_int8x16x4_t q1b;
-    ggml_int8x16x4_t q8b;
-
-    uint32_t aux32;
-    const uint8_t * aux8 = (const uint8_t *)&aux32;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint8_t  * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        int32x4_t sumi1 = mzero;
-        int32x4_t sumi2 = mzero;
-
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-
-            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
-            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
-            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
-            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
-                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
-
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
-            const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
-            const int32x4_t p12 = vpaddq_s32(p1, p2);
-
-            const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
-            aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
-
-            const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
-            const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
-            const int32x4_t p34 = vpaddq_s32(p3, p4);
-
-            int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
-
-            scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
-
-            sumi1 = vmlaq_s32(sumi1, scales_4, p12);
-            sumi2 = vmlaq_s32(sumi2, scales_4, p34);
-
-            qs += 8; qh += 4;
-
-        }
-
-        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i mask = _mm256_set1_epi16(0x7);
-    const __m256i mone = _mm256_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint8_t  * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m256i q1b_1 = _mm256_set_epi64x(
-                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
-                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
-            );
-            const __m256i q1b_2 = _mm256_set_epi64x(
-                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
-                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
-            );
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
-
-            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
-            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
-
-            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
-            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
-
-            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
-            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
-
-            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
-            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
-            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
-            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
-            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
-            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
-
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
-
-            qs += 8; qh += 4;
-        }
-
-        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
-
-        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
-        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
-    }
-
-    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
-
-#elif defined __AVX__
-    const __m128i mask = _mm_set1_epi16(0x7);
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint8_t  * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q1b_1_0 = _mm_set_epi64x(
-                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
-            const __m128i q1b_1_1 = _mm_set_epi64x(
-                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
-            const __m128i q1b_2_0 = _mm_set_epi64x(
-                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
-            const __m128i q1b_2_1 = _mm_set_epi64x(
-                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
-            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-
-            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
-            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
-            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
-            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
-
-            const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
-            const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
-            const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
-            const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
-                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
-
-            const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
-            const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
-            const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
-            const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
-
-            __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
-            __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
-            __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
-            __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
-
-            scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
-            scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
-            scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
-            scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
-            const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
-            const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
-            const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
-            const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
-            const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
-            const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
-            const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
-            const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
-
-            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
-            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
-            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
-            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
-
-            qs += 8; qh += 4;
-        }
-
-        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
-
-        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
-    }
-
-    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
-
-#else
-
-    int sum1[2], sum2[2], delta[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint8_t  * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            delta[0] = qh[0] & 0x08 ? -1 : 1;
-            delta[1] = qh[0] & 0x80 ? -1 : 1;
-            delta[2] = qh[1] & 0x08 ? -1 : 1;
-            delta[3] = qh[1] & 0x80 ? -1 : 1;
-            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
-                int lsum1 = 0, lsum2 = 0;
-                for (int j = 0; j < 8; ++j) {
-                    lsum1 += q8[j] * grid[j];
-                    lsum2 += q8[j];
-                }
-                q8 += 8;
-                sum1[l/2] += lsum1;
-                sum2[l/2] += lsum2*delta[l];
-            }
-
-            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
-            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
-
-            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
-            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
-            qs += 4;
-            qh += 2;
-        }
-
-        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
-    }
-
-    *s = sumf;
-
-#endif
-}
-
-void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK4_NL == 0);
-    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
-
-    const block_iq4_nl * restrict x = vx;
-    const block_q8_0   * restrict y = vy;
-
-    const int nb = n / QK4_NL;
-
-    int ib = 0;
-    float sumf = 0;
-
-#if defined __ARM_NEON
-    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
-    const uint8x16_t m4b = vdupq_n_u8(0x0f);
-    uint8x16x2_t q4bits;
-    int8x16x4_t q4b;
-    int8x16x4_t q8b;
-    int32x4_t prod_1, prod_2;
-
-    for (; ib + 1 < nb; ib += 2) {
-
-        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
-        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
-        q8b.val[0]    = vld1q_s8(y[ib + 0].qs);
-        q8b.val[1]    = vld1q_s8(y[ib + 0].qs + 16);
-        q8b.val[2]    = vld1q_s8(y[ib + 1].qs);
-        q8b.val[3]    = vld1q_s8(y[ib + 1].qs + 16);
-
-        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
-        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
-        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
-
-        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
-        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
-
-        sumf +=
-            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
-            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
-    }
-
-#elif defined __AVX2__
-
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m256i mone = _mm256_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (; ib + 1 < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
-        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
-        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
-        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
-        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
-        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
-        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(p_1), accum1);
-        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(p_2), accum2);
-    }
-
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
-
-#elif defined __AVX__
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (; ib + 1 < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
-        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
-        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
-        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
-        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
-
-        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
-        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
-        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
-        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
-    }
-
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-
-    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
-
-#pragma GCC unroll 4
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
-
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
-        vector signed char q4x0 = vec_and(qxs, lowMask);
-        vector signed char q4x1 = vec_sr(qxs, v4);
-
-        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
-        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
-
-        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
-        vector signed char q8y1 = vec_xl(16, y[ib].qs);
-
-        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
-        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-
-        vsumi0 = vec_sum4s(qv0, vsumi0);
-        vsumi1 = vec_sum4s(qv1, vsumi1);
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    sumf = vec_extract(vsumf0, 0);
-
-#elif defined (__loongarch_asx)
-
-    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
-    const __m128i m4b  = __lsx_vreplgr2vr_b(0x0f);
-    const __m256i mone = __lasx_xvreplgr2vr_h(1);
-
-    __m256 accum1 = (__m256)__lasx_xvldi(0);
-    __m256 accum2 = (__m256)__lasx_xvldi(0);
-    for (; ib + 1 < nb; ib += 2) {
-        const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0);
-        const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0);
-        const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0);
-        const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0);
-        const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)),
-                                              lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b)));
-        const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)),
-                                              lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b)));
-        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-        const __m256i p_1 = lasx_madd_h(p16_1, mone);
-        const __m256i p_2 = lasx_madd_h(p16_2, mone);
-        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                __lasx_xvffint_s_w(p_1), accum1);
-        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                __lasx_xvffint_s_w(p_2), accum2);
-    }
-
-    sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
-
-#endif
-    for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
-        int sumi1 = 0, sumi2 = 0;
-        for (int j = 0; j < QK4_NL/2; ++j) {
-            sumi1 += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
-    *s = sumf;
-}
-
-void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK_K == 0);
-
-    const block_iq4_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined __ARM_NEON
-    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
-    const uint8x16_t m4b = vdupq_n_u8(0x0f);
-    ggml_uint8x16x2_t q4bits;
-    ggml_int8x16x4_t q4b;
-    ggml_int8x16x4_t q8b;
-    int32x4_t prod_1, prod_2;
-
-    float sumf = 0;
-
-    for (int ibl = 0; ibl < nb; ++ibl) {
-
-        const int8_t  * q8 = y[ibl].qs;
-        const uint8_t * q4 = x[ibl].qs;
-        uint16_t h = x[ibl].scales_h;
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib = 0; ib < QK_K/64; ++ib) {
-
-            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
-            q8b    = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
-            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
-            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
-
-            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
-            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
-
-            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
-            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;
-            h >>= 4;
-            sumi1 += vaddvq_s32(prod_1) * ls1;
-            sumi2 += vaddvq_s32(prod_2) * ls2;
-
-        }
-
-        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-
-    __m256 accum = _mm256_setzero_ps();
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        uint16_t sh = x[ibl].scales_h;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
-            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
-            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
-            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
-            sh >>= 4;
-            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
-            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
-            sumi1 = _mm256_add_epi32(p_1, sumi1);
-            sumi2 = _mm256_add_epi32(p_2, sumi2);
-        }
-        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
-                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
-    }
-
-    *s = hsum_float_8(accum);
-
-#elif defined __AVX__
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-
-    __m256 accum = _mm256_setzero_ps();
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        uint16_t sh = x[ibl].scales_h;
-        __m128i sumi1_0 = _mm_setzero_si128();
-        __m128i sumi1_1 = _mm_setzero_si128();
-        __m128i sumi2_0 = _mm_setzero_si128();
-        __m128i sumi2_1 = _mm_setzero_si128();
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
-            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
-            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
-            const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
-            const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
-            const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
-            const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-            const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-            const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-            const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-            const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
-            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
-            sh >>= 4;
-            const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
-            const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
-            const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
-            const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
-            sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
-            sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
-            sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
-            sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
-        }
-        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
-        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
-        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
-    }
-
-    *s = hsum_float_8(accum);
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
-
-    for (int ibl = 0; ibl < nb; ++ibl) {
-
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
-        vector float vyd = vec_splats(y[ibl].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        uint16_t h = x[ibl].scales_h;
-
-        const uint8_t * restrict q4 = x[ibl].qs;
-        const uint8_t * restrict sc = x[ibl].scales_l;
-        const int8_t  * restrict q8 = y[ibl].qs;
-
-        for (int ib = 0; ib < QK_K/64; ib ++ ) {
-            __builtin_prefetch(q4, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
-            q4 += 32;
-
-            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
-            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
-            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
-            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
-
-            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
-            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
-            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
-            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
-
-            vector signed char q8y0 = vec_xl( 0, q8);
-            vector signed char q8y1 = vec_xl(16, q8);
-            vector signed char q8y2 = vec_xl(32, q8);
-            vector signed char q8y3 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
-            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
-            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
-            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
-
-            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
-            const uint16_t ls1 = (uint16_t)(((sc[0] >>  4) | ((h << 2) & 0x30)) - 32);
-            h >>= 4;
-            sc ++;
-
-            vector signed short vscales01 = vec_splats((int16_t)ls0);
-            vector signed short vscales23 = vec_splats((int16_t)ls1);
-
-            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
-            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
-            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
-            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined(__loongarch_asx)
-
-    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
-    const __m128i m4b  = __lsx_vreplgr2vr_b(0x0f);
-
-    __m256 accum = (__m256)__lasx_xvldi(0);
-    __m256i tmp1;
-    __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask;
-
-    mask_8f = __lsx_vreplgr2vr_b(0x8f);
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        uint16_t sh = x[ibl].scales_h;
-        __m256i sumi1 = __lasx_xvldi(0);
-        __m256i sumi2 = __lasx_xvldi(0);
-        __m128i zero = __lsx_vldi(0);
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0);  qs += 16;
-            const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0);  qs += 16;
-            const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp3 = __lsx_vand_v(tmp0, mask);
-            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
-
-            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp4 = __lsx_vand_v(tmp0, mask);
-            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
-
-            const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4);
-
-            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp3 = __lsx_vand_v(tmp0, mask);
-            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
-
-            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp4 = __lsx_vand_v(tmp0, mask);
-            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
-
-            const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4);
-
-            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
-            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
-            sh >>= 4;
-            __m256i tmp5, tmp6;
-            tmp1 = __lasx_xvreplgr2vr_h(ls1);
-            tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1);
-            tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1);
-            const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6);
-            tmp1 = __lasx_xvreplgr2vr_h(ls2);
-            tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1);
-            tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1);
-            const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6);
-            sumi1 = __lasx_xvadd_w(p_1, sumi1);
-            sumi2 = __lasx_xvadd_w(p_2, sumi2);
-        }
-        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
-                __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
-    }
-
-    *s = hsum_float_8(accum);
-
-#else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
-#endif
-}
-
 // ================================ IQ2 quantization =============================================
 
 typedef struct {
@@ -14236,12 +3770,6 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
     return nrow * nblock * sizeof(block_iq3_xxs);
 }
 
-void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_iq3_xxs * restrict y = vy;
-    quantize_row_iq3_xxs_ref(x, y, k);
-}
-
 void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
@@ -14452,12 +3980,6 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
     return nrow * nblock * sizeof(block_iq3_s);
 }
 
-void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_iq3_s * restrict y = vy;
-    quantize_row_iq3_s_ref(x, y, k);
-}
-
 void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq3_s(x, y, 1, k, NULL);
@@ -15181,7 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
     return nrow * nblock * sizeof(block_iq4_nl);
 }
 
-void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
+//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
     GGML_ASSERT(k%QK4_NL == 0);
     int64_t nblock = k/QK4_NL;
     uint8_t L[QK4_NL];
@@ -15189,18 +4712,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k
     uint16_t unused_h;
     uint8_t * unused_l = NULL;
     float scale;
-    block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+    block_iq4_nl * iq4 = y;
     for (int ibl = 0; ibl < nblock; ++ibl) {
         quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
                 &scale, weight, L, kvalues_iq4nl, NULL, -1);
     }
 }
 
-void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
-    assert(k % QK4_NL == 0);
-    quantize_row_iq4_nl(x, y, k);
-}
-
 size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
@@ -15221,12 +4739,6 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
     return nrow * nblock * sizeof(block_iq4_xs);
 }
 
-void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_iq4_xs * restrict y = vy;
-    quantize_row_iq4_xs_ref(x, y, k);
-}
-
 void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq4_xs(x, y, 1, k, NULL);
@@ -15419,11 +4931,7 @@ void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y,
     quantize_iq2_s(x, y, 1, k, NULL);
 }
 
-void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_iq2_s * restrict y = vy;
-    quantize_row_iq2_s_ref(x, y, k);
-}
+// =============================== data validation
 
 static bool validate_float(float f, size_t i) {
     if (isinf(f)) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index df9c4b24a..d09173e11 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -11,136 +11,89 @@
 extern "C" {
 #endif
 
+// NOTE: these functions are defined as GGML_API because they used by the CPU backend
+
 // Quantization
-void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
 
-void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
 
-void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
 
-void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
 
 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
-void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
-// Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
-size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
-void iq2xs_init_impl(enum ggml_type type);
-void iq2xs_free_impl(enum ggml_type type);
-void iq3xs_init_impl(int grid_size);
-void iq3xs_free_impl(int grid_size);
+GGML_API void iq2xs_init_impl(enum ggml_type type);
+GGML_API void iq2xs_free_impl(enum ggml_type type);
+GGML_API void iq3xs_init_impl(int grid_size);
+GGML_API void iq3xs_free_impl(int grid_size);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt
new file mode 100644
index 000000000..a2d6770eb
--- /dev/null
+++ b/ggml/src/ggml-rpc/CMakeLists.txt
@@ -0,0 +1,11 @@
+message(STATUS "Using RPC backend")
+
+add_library(ggml-rpc
+            ggml-rpc.cpp)
+
+target_link_libraries(ggml-rpc PRIVATE ggml-base)
+target_include_directories(ggml-rpc PRIVATE . ..)
+
+if (WIN32)
+    target_link_libraries(ggml-rpc PRIVATE ws2_32)
+endif()
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
similarity index 99%
rename from ggml/src/ggml-rpc.cpp
rename to ggml/src/ggml-rpc/ggml-rpc.cpp
index 8a772f224..47357daab 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -671,7 +671,7 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .event_wait              = */ NULL,
 };
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
+ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     // NOTE: buffer types are allocated and never freed; this is by design
@@ -718,7 +718,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     return backend;
 }
 
-GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
+bool ggml_backend_is_rpc(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
 }
 
@@ -730,7 +730,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
     *total = response.total_mem;
 }
 
-GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
+void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
     auto sock = get_socket(endpoint);
     if (sock == nullptr) {
         *free = 0;
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
new file mode 100644
index 000000000..03bf3cb92
--- /dev/null
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -0,0 +1,81 @@
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
+    message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
+endif()
+
+check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
+
+if (DEFINED ENV{ONEAPI_ROOT})
+    message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
+elseif(SUPPORTS_SYCL)
+    message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
+        If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
+        source /opt/intel/oneapi/setvars.sh")
+else()
+    message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
+endif()
+message(STATUS "SYCL found")
+#todo: AOT
+
+add_library(ggml-sycl
+            ggml-sycl.cpp
+            ../../include/ggml-sycl.h)
+
+target_link_libraries(ggml-sycl PRIVATE ggml-base)
+target_include_directories(ggml-sycl PRIVATE . ..)
+
+if (GGML_SYCL_F16)
+    if (GGML_SYCL_TARGET STREQUAL "AMD")
+        message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
+    endif()
+    add_compile_definitions(GGML_SYCL_F16)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+
+if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+    # INFO: Allowed Sub_group_sizes are not consistent through all
+    # hip targets. For example, 64 is used for certain models, but the backend
+    # does not support it.
+    # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+else()
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
+endif()
+
+file(GLOB   GGML_HEADERS_SYCL "*.hpp")
+file(GLOB   GGML_SOURCES_SYCL "*.cpp")
+target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
+
+find_package(DNNL)
+message("-- DNNL found:" ${DNNL_FOUND})
+
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
+    add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
+else()
+    add_compile_definitions(GGML_SYCL_DNNL=0)
+endif()
+
+if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+    target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
+endif()
+
+if (WIN32)
+    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
+    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+else()
+    if (GGML_SYCL_TARGET STREQUAL "INTEL")
+        target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+    elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
+    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+        if (GGML_SYCL_HIP_TARGET STREQUAL "")
+            message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
+        endif()
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
+        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
+    endif()
+endif()
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
similarity index 100%
rename from ggml/src/ggml-sycl.cpp
rename to ggml/src/ggml-sycl/ggml-sycl.cpp
diff --git a/ggml/src/ggml-threading.cpp b/ggml/src/ggml-threading.cpp
new file mode 100644
index 000000000..25a19eedb
--- /dev/null
+++ b/ggml/src/ggml-threading.cpp
@@ -0,0 +1,12 @@
+#include "ggml-threading.h"
+#include <mutex>
+
+std::mutex ggml_critical_section_mutex;
+
+void ggml_critical_section_start() {
+    ggml_critical_section_mutex.lock();
+}
+
+void ggml_critical_section_end(void) {
+    ggml_critical_section_mutex.unlock();
+}
diff --git a/ggml/src/ggml-threading.h b/ggml/src/ggml-threading.h
new file mode 100644
index 000000000..ce975d880
--- /dev/null
+++ b/ggml/src/ggml-threading.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_critical_section_start(void);
+void ggml_critical_section_end(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt
new file mode 100644
index 000000000..1e85dd15b
--- /dev/null
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -0,0 +1,78 @@
+find_package(Vulkan COMPONENTS glslc REQUIRED)
+
+if (Vulkan_FOUND)
+    message(STATUS "Vulkan found")
+
+    add_library(ggml-vulkan
+                ggml-vulkan.cpp
+                ../../include/ggml-vulkan.h
+                )
+
+    target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan)
+    target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+
+    # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+    # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+    if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+    endif()
+
+    if (GGML_VULKAN_CHECK_RESULTS)
+        add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
+    endif()
+
+    if (GGML_VULKAN_DEBUG)
+        add_compile_definitions(GGML_VULKAN_DEBUG)
+    endif()
+
+    if (GGML_VULKAN_MEMORY_DEBUG)
+        add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+    endif()
+
+    if (GGML_VULKAN_SHADER_DEBUG_INFO)
+        add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
+    endif()
+
+    if (GGML_VULKAN_PERF)
+        add_compile_definitions(GGML_VULKAN_PERF)
+    endif()
+
+    if (GGML_VULKAN_VALIDATE)
+        add_compile_definitions(GGML_VULKAN_VALIDATE)
+    endif()
+
+    if (GGML_VULKAN_RUN_TESTS)
+        add_compile_definitions(GGML_VULKAN_RUN_TESTS)
+    endif()
+
+    add_subdirectory(vulkan-shaders)
+
+    set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
+    set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
+    set (_ggml_vk_source     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
+    set (_ggml_vk_input_dir  ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
+    set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
+
+    file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
+
+    add_custom_command(
+        OUTPUT ${_ggml_vk_header}
+                ${_ggml_vk_source}
+
+        COMMAND ${_ggml_vk_genshaders_cmd}
+            --glslc      ${Vulkan_GLSLC_EXECUTABLE}
+            --input-dir  ${_ggml_vk_input_dir}
+            --output-dir ${_ggml_vk_output_dir}
+            --target-hpp ${_ggml_vk_header}
+            --target-cpp ${_ggml_vk_source}
+            --no-clean
+
+        DEPENDS ${_ggml_vk_shader_deps}
+        COMMENT "Generate vulkan shaders"
+    )
+
+    target_sources(ggml-vulkan PRIVATE ${_ggml_vk_source} ${_ggml_vk_header})
+
+else()
+    message(WARNING "Vulkan not found")
+endif()
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
similarity index 100%
rename from ggml/src/ggml-vulkan.cpp
rename to ggml/src/ggml-vulkan/ggml-vulkan.cpp
diff --git a/ggml/src/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
similarity index 100%
rename from ggml/src/vulkan-shaders/CMakeLists.txt
rename to ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
diff --git a/ggml/src/vulkan-shaders/acc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/acc.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
diff --git a/ggml/src/vulkan-shaders/add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/add.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/add.comp
diff --git a/ggml/src/vulkan-shaders/argsort.comp b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/argsort.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/clamp.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
diff --git a/ggml/src/vulkan-shaders/concat.comp b/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/concat.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
diff --git a/ggml/src/vulkan-shaders/contig_copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/contig_copy.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/copy.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
diff --git a/ggml/src/vulkan-shaders/cos.comp b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/cos.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
diff --git a/ggml/src/vulkan-shaders/dequant_f32.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_f32.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp
diff --git a/ggml/src/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_funcs.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
diff --git a/ggml/src/vulkan-shaders/dequant_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_head.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp
diff --git a/ggml/src/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_iq4_nl.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q2_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q3_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q4_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q4_0.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q4_1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q4_1.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q4_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q5_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q5_0.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q5_1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q5_1.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q5_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q6_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
diff --git a/ggml/src/vulkan-shaders/dequant_q8_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/dequant_q8_0.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp
diff --git a/ggml/src/vulkan-shaders/diag_mask_inf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/diag_mask_inf.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp
diff --git a/ggml/src/vulkan-shaders/div.comp b/ggml/src/ggml-vulkan/vulkan-shaders/div.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/div.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/div.comp
diff --git a/ggml/src/vulkan-shaders/gelu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/gelu.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
diff --git a/ggml/src/vulkan-shaders/gelu_quick.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/gelu_quick.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
diff --git a/ggml/src/vulkan-shaders/generic_binary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/generic_binary_head.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp
diff --git a/ggml/src/vulkan-shaders/generic_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/generic_head.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp
diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/generic_unary_head.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
diff --git a/ggml/src/vulkan-shaders/get_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/get_rows.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
diff --git a/ggml/src/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/get_rows_quant.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
diff --git a/ggml/src/vulkan-shaders/group_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/group_norm.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
diff --git a/ggml/src/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/im2col.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
diff --git a/ggml/src/vulkan-shaders/leaky_relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/leaky_relu.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp
diff --git a/ggml/src/vulkan-shaders/mul.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_base.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_nc.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_p021.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
diff --git a/ggml/src/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/mul_mm.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
diff --git a/ggml/src/vulkan-shaders/norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/norm.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/norm.comp
diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/pad.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
diff --git a/ggml/src/vulkan-shaders/pool2d.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/pool2d.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp
diff --git a/ggml/src/vulkan-shaders/relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/relu.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/relu.comp
diff --git a/ggml/src/vulkan-shaders/repeat.comp b/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/repeat.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp
diff --git a/ggml/src/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/rms_norm.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
diff --git a/ggml/src/vulkan-shaders/rope_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/rope_head.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp
diff --git a/ggml/src/vulkan-shaders/rope_neox.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/rope_neox.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
diff --git a/ggml/src/vulkan-shaders/rope_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/rope_norm.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/scale.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/scale.comp
diff --git a/ggml/src/vulkan-shaders/silu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/silu.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/silu.comp
diff --git a/ggml/src/vulkan-shaders/sin.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/sin.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/sin.comp
diff --git a/ggml/src/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/soft_max.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/ggml-vulkan/vulkan-shaders/square.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/square.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/square.comp
diff --git a/ggml/src/vulkan-shaders/sum_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/sum_rows.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
diff --git a/ggml/src/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/tanh.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp
diff --git a/ggml/src/vulkan-shaders/timestep_embedding.comp b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/timestep_embedding.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
diff --git a/ggml/src/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/types.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/types.comp
diff --git a/ggml/src/vulkan-shaders/upscale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp
similarity index 100%
rename from ggml/src/vulkan-shaders/upscale.comp
rename to ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
similarity index 100%
rename from ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
rename to ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index cd26a361b..4a97bfc32 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3,9 +3,11 @@
 
 #include "ggml-backend.h"
 #include "ggml-impl.h"
-#include "ggml-cpu-impl.h"
-#include "ggml-quants.h"
+#include "ggml-threading.h"
 #include "ggml.h"
+
+// FIXME: required here for quantization functions
+#include "ggml-quants.h"
 #include "ggml-aarch64.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -47,6 +49,9 @@
 
 #define UNUSED GGML_UNUSED
 
+// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
+float ggml_table_f32_f16[1 << 16];
+
 #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
     (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
 #include <unistd.h>
@@ -363,7 +368,7 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
 #if defined(__F16C__)
-    if (ggml_cpu_has_f16c()) {
+    //if (ggml_cpu_has_f16c()) {
         for (; i + 7 < n; i += 8) {
             __m256 x_vec = _mm256_loadu_ps(x + i);
             __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
@@ -374,7 +379,7 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
             __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
             _mm_storel_epi64((__m128i *)(y + i), y_vec);
         }
-    }
+    //}
 #endif
     for (; i < n; i++) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
@@ -384,7 +389,7 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX512F__)
-    if (ggml_cpu_has_avx512()) {
+    //if (ggml_cpu_has_avx512()) {
         for (; i + 16 <= n; i += 16) {
             _mm512_storeu_ps(y + i,
                             _mm512_castsi512_ps(
@@ -394,10 +399,10 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
                                             (const __m256i *)(x + i))),
                                     16)));
         }
-    }
+    //}
 #endif
 #if defined(__AVX2__)
-    if (ggml_cpu_has_avx2()) {
+    //if (ggml_cpu_has_avx2()) {
         for (; i + 8 <= n; i += 8) {
             _mm256_storeu_ps(y + i,
                             _mm256_castsi256_ps(
@@ -407,7 +412,7 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
                                             (const __m128i *)(x + i))),
                                     16)));
         }
-    }
+    //}
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
@@ -588,7 +593,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(ggml_fp16_t),
         .is_quantized             = false,
         .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
-        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
     },
     [GGML_TYPE_Q4_0] = {
@@ -597,7 +601,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q4_0),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
-        .from_float               = quantize_row_q4_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_0_ref,
     },
     [GGML_TYPE_Q4_1] = {
@@ -606,7 +609,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q4_1),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
-        .from_float               = quantize_row_q4_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
     },
     [4] = { // GGML_TYPE_Q4_2
@@ -614,18 +616,12 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .blck_size                = 0,
         .type_size                = 0,
         .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_ref           = NULL,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name                = "DEPRECATED",
         .blck_size                = 0,
         .type_size                = 0,
         .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_ref           = NULL,
     },
     [GGML_TYPE_Q5_0] = {
         .type_name                = "q5_0",
@@ -633,7 +629,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q5_0),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
-        .from_float               = quantize_row_q5_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q5_0_ref,
     },
     [GGML_TYPE_Q5_1] = {
@@ -642,7 +637,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q5_1),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
-        .from_float               = quantize_row_q5_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q5_1_ref,
     },
     [GGML_TYPE_Q8_0] = {
@@ -651,7 +645,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q8_0),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
-        .from_float               = quantize_row_q8_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q8_0_ref,
     },
     [GGML_TYPE_Q8_1] = {
@@ -659,7 +652,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .blck_size                = QK8_1,
         .type_size                = sizeof(block_q8_1),
         .is_quantized             = true,
-        .from_float               = quantize_row_q8_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q8_1_ref,
     },
     [GGML_TYPE_Q2_K] = {
@@ -668,7 +660,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q2_K),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
-        .from_float               = quantize_row_q2_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q2_K_ref,
     },
     [GGML_TYPE_Q3_K] = {
@@ -677,7 +668,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q3_K),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
-        .from_float               = quantize_row_q3_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q3_K_ref,
     },
     [GGML_TYPE_Q4_K] = {
@@ -686,7 +676,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q4_K),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
-        .from_float               = quantize_row_q4_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_K_ref,
     },
     [GGML_TYPE_Q5_K] = {
@@ -695,7 +684,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q5_K),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
-        .from_float               = quantize_row_q5_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q5_K_ref,
     },
     [GGML_TYPE_Q6_K] = {
@@ -704,7 +692,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q6_K),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
-        .from_float               = quantize_row_q6_K,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q6_K_ref,
     },
     [GGML_TYPE_IQ2_XXS] = {
@@ -713,7 +700,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xxs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_IQ2_XS] = {
@@ -722,7 +708,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_IQ3_XXS] = {
@@ -731,7 +716,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq3_xxs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
-        .from_float               = quantize_row_iq3_xxs,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
     },
     [GGML_TYPE_IQ3_S] = {
@@ -740,7 +724,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq3_s),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq3_s,
-        .from_float               = quantize_row_iq3_s,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_s_ref,
     },
     [GGML_TYPE_IQ2_S] = {
@@ -749,7 +732,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_s),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_s,
-        .from_float               = quantize_row_iq2_s,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_s_ref,
     },
     [GGML_TYPE_IQ1_S] = {
@@ -758,7 +740,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq1_s),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_IQ1_M] = {
@@ -767,7 +748,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq1_m),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_IQ4_NL] = {
@@ -776,7 +756,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq4_nl),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
-        .from_float               = quantize_row_iq4_nl,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_nl_ref,
     },
     [GGML_TYPE_IQ4_XS] = {
@@ -785,7 +764,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq4_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
-        .from_float               = quantize_row_iq4_xs,
         .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_xs_ref,
     },
     [GGML_TYPE_Q8_K] = {
@@ -793,7 +771,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q8_K),
         .is_quantized             = true,
-        .from_float               = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
         .type_name                = "bf16",
@@ -801,7 +778,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(ggml_bf16_t),
         .is_quantized             = false,
         .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
-        .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
         .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
     },
     [GGML_TYPE_Q4_0_4_4] = {
@@ -811,7 +787,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q4_0),
         .is_quantized             = true,
         .to_float                 = NULL,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_Q4_0_4_8] = {
@@ -821,7 +796,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q4_0),
         .is_quantized             = true,
         .to_float                 = NULL,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_Q4_0_8_8] = {
@@ -831,7 +805,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q4_0),
         .is_quantized             = true,
         .to_float                 = NULL,
-        .from_float               = NULL,
         .from_float_ref           = NULL,
     },
     [GGML_TYPE_TQ1_0] = {
@@ -840,7 +813,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_tq1_0),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_tq1_0,
-        .from_float               = quantize_row_tq1_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq1_0_ref,
     },
     [GGML_TYPE_TQ2_0] = {
@@ -849,7 +821,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_tq2_0),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
-        .from_float               = quantize_row_tq2_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
 };
@@ -3646,6 +3617,22 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+}
+
+void ggml_rope_yarn_corr_dims(
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
+    dims[0] = MAX(0, start);
+    dims[1] = MIN(n_dims - 1, end);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -8166,222 +8153,7 @@ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
     gguf_buf_free(buf);
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
-int ggml_cpu_has_avx(void) {
-#if defined(__AVX__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx_vnni(void) {
-#if defined(__AVXVNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx2(void) {
-#if defined(__AVX2__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512(void) {
-#if defined(__AVX512F__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vbmi(void) {
-#if defined(__AVX512VBMI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vnni(void) {
-#if defined(__AVX512VNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_bf16(void) {
-#if defined(__AVX512BF16__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_amx_int8(void) {
-#if defined(__AMX_INT8__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fma(void) {
-#if defined(__FMA__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_arm_fma(void) {
-#if defined(__ARM_FEATURE_FMA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_riscv_v(void) {
-#if defined(__riscv_v_intrinsic)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_metal(void) {
-#if defined(GGML_USE_METAL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_f16c(void) {
-#if defined(__F16C__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fp16_va(void) {
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_wasm_simd(void) {
-#if defined(__wasm_simd128__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_cuda(void) {
-#if defined(GGML_USE_CUDA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_vulkan(void) {
-#if defined(GGML_USE_VULKAN)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_kompute(void) {
-#if defined(GGML_USE_KOMPUTE)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_sycl(void) {
-#if defined(GGML_USE_SYCL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_rpc(void) {
-#if defined(GGML_USE_RPC)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_cann(void) {
-#if defined(GGML_USE_CANN)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_llamafile(void) {
-#if defined(GGML_USE_LLAMAFILE)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
-}
-
-int ggml_cpu_has_sse3(void) {
-#if defined(__SSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_ssse3(void) {
-#if defined(__SSSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_vsx(void) {
-#if defined(__POWER9_VECTOR__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
-////////////////////////////////////////////////////////////////////////////////
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index e9af8a363..2dca62848 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -237,7 +237,6 @@ int main(int argc, char** argv) {
     int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
     int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
 
-    const auto * funcs = ggml_get_type_traits(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
     const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
 
     std::vector<block_q4_0> q40;
@@ -263,9 +262,9 @@ int main(int argc, char** argv) {
         // Note, we do not include this in the timing as in practical application
         // we already have the quantized model weights.
         if (useQ4_1) {
-            funcs->from_float(x1.data(), q41.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
         } else {
-            funcs->from_float(x1.data(), q40.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
         }
 
         // Now measure time the dot product needs using the "scalar" version above
@@ -284,7 +283,7 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type);
+            const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
             vdot->from_float(y1.data(), q8.data(), kVecSize);
             if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
             else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 46a6ad562..a86624750 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -29,5 +29,6 @@ target_link_libraries(llama PUBLIC ggml)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    target_compile_definitions(llama PRIVATE LLAMA_BUILD)
+    target_compile_definitions(llama PUBLIC  LLAMA_SHARED)
 endif()
diff --git a/src/llama.cpp b/src/llama.cpp
index 97eee26a5..6ec419e9b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7521,7 +7521,7 @@ static bool llm_load_tensors(
 
             // avoid using a host buffer when using mmap
             auto * buft_dev = ggml_backend_buft_get_device(buft);
-            if (ml.use_mmap && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                 buft = ggml_backend_dev_buffer_type(cpu_dev);
             }
@@ -9128,6 +9128,10 @@ static bool llm_load_tensors(
 
         // check if it is possible to use buffer_from_host_ptr with this buffer type
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (!dev) {
+            // FIXME: workaround for CPU backend buft having a NULL device
+            dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
+        }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
@@ -22021,7 +22025,6 @@ const char * llama_print_system_info(void) {
     s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
     s += "RISCV_VECT = "  + std::to_string(ggml_cpu_has_riscv_v())     + " | ";
     s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
-    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
     s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
     s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 000e60adf..8d0bf0470 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -45,22 +45,23 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
     qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns->from_float(test_data, tmp_q.data(), test_size);
+    // FIXME: why is done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
     qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
     qfns->from_float_ref(test_data, tmp_q.data(), test_size);
@@ -84,9 +85,9 @@ static float dot_product_error(
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
-    qfns->from_float(test_data1, tmp_q1.data(), test_size);
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
     vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
@@ -145,8 +146,8 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns->from_float && qfns->to_float) {
-            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                 type == GGML_TYPE_TQ2_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
@@ -161,7 +162,7 @@ int main(int argc, char * argv[]) {
                 printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
             }
 
-            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
             num_failed += failed;
             if (failed || verbose) {
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 221424de8..ac0d12714 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -123,9 +123,10 @@ static void usage(char * argv[]) {
     printf("  --type TYPE           set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns->from_float && qfns->to_float) {
+            if (qfns_cpu->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -277,7 +278,7 @@ int main(int argc, char * argv[]) {
             continue;
         }
 
-        if (qfns->from_float && qfns->to_float) {
+        if (qfns_cpu->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -301,7 +302,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns->from_float(test_data1, test_q1, size);
+                        qfns_cpu->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -312,7 +313,7 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf("  dequantize_row_q\n");
-                qfns->from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
@@ -330,7 +331,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
+                        const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
                         vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
@@ -342,8 +343,8 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf("  vec_dot_q\n");
-                qfns->from_float(test_data1, test_q1, largest);
-                qfns->from_float(test_data2, test_q2, largest);
+                qfns_cpu->from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {

From 1607a5e5b08f4e55f118af3d7de325949d8f1835 Mon Sep 17 00:00:00 2001
From: Charles Xu <charles.xu@arm.com>
Date: Fri, 15 Nov 2024 01:28:50 +0100
Subject: [PATCH 212/396] backend cpu: add online flow for aarch64 Q4_0
 GEMV/GEMM kernels (#9921)

* backend-cpu: add online flow for aarch64 Q4_0 GEMV/GEMM kernels

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
---
 Makefile                             |   4 +
 ggml/CMakeLists.txt                  |   1 +
 ggml/include/ggml-cpu.h              |   3 +
 ggml/src/ggml-cpu/CMakeLists.txt     |   5 +
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 144 +++++++++++++++++++++++++++
 ggml/src/ggml-cpu/ggml-cpu-aarch64.h |   3 +
 ggml/src/ggml-cpu/ggml-cpu.c         |  23 +++--
 ggml/src/ggml-cpu/ggml-cpu.cpp       | 110 ++++++++++++++++++--
 src/llama.cpp                        |   2 +-
 9 files changed, 273 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index de06cb8b0..87fe795aa 100644
--- a/Makefile
+++ b/Makefile
@@ -940,6 +940,10 @@ ggml/src/ggml-cuda/%.o: \
 	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
 endif # GGML_MUSA
 
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 3e5b16f86..4fb78e59f 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -92,6 +92,7 @@ else()
 endif()
 
 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 
 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4da62cb2b..7571ef979 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -169,6 +169,9 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 
+    GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+    GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 4d96f425e..8b0d60d4e 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -236,6 +236,11 @@ else()
     message(STATUS "Unknown architecture")
 endif()
 
+if (GGML_CPU_AARCH64)
+    message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
+    add_compile_definitions(GGML_USE_CPU_AARCH64)
+endif()
+
 target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 0ad9fe40a..b753ba767 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -3385,3 +3385,147 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
+// FIXME: this code is duplicated from ggml-aarch64.c
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+    block_q4_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < QK4_0 * 2; i++) {
+        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
+        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
+        src_offset += (i % blck_size_interleave);
+
+        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    }
+
+    return out;
+}
+
+// interleave 8 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x8
+// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+    block_q4_0x8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < QK4_0 * 4; i++) {
+        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
+        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
+        src_offset += (i % blck_size_interleave);
+
+        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    }
+
+    return out;
+}
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
+    block_q4_0 dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 8);
+
+    block_q4_0x8 * dst = (block_q4_0x8*)t->data;
+    const block_q4_0 * src = (const block_q4_0*) data;
+    block_q4_0 dst_tmp[8];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i  = 0; i < nrows_interleaved; i++ ) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+// Prepare for optimized kernels if applicable
+void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
+    if (cur->type == repack_type) {
+        memcpy(cur->data, data, data_size);
+        return;
+    }
+
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+
+    switch (repack_type) {
+        case GGML_TYPE_Q4_0_8_8:
+            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+            break;
+        case GGML_TYPE_Q4_0_4_8:
+            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+            break;
+        case GGML_TYPE_Q4_0_4_4:
+            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+            break;
+        default:
+            GGML_ABORT("Unsupported type");
+    }
+}
+
+enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
+    if (cur->type == GGML_TYPE_Q4_0) {
+        // TODO: enable for AVX2 - currently disabled due to bad gemv performance
+        if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+            return GGML_TYPE_Q4_0_8_8;
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            return GGML_TYPE_Q4_0_4_8;
+        }
+        if (ggml_cpu_has_neon()) {
+            return GGML_TYPE_Q4_0_4_4;
+        }
+    }
+
+    return cur->type;
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
index 203802f07..53b30c1dd 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -21,6 +21,9 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
+void           ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
+enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 4c45146a1..30b1bf895 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -7330,6 +7330,7 @@ static void ggml_compute_forward_group_norm(
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
+    const enum ggml_type type,
     const int64_t num_rows_per_vec_dot,
     const int64_t ir0_start,
     const int64_t ir0_end,
@@ -7341,8 +7342,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    const enum ggml_type type = src0->type;
-
     const bool src1_cont = ggml_is_contiguous(src1);
 
     ggml_vec_dot_t const vec_dot      = type_traits_cpu[type].vec_dot;
@@ -7430,7 +7429,11 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const enum ggml_type type = src0->type;
+    enum ggml_type type = src0->type;
+
+    if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
+        type = (enum ggml_type)(intptr_t)src0->extra;
+    }
 
     enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type;
     ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float;
@@ -7469,15 +7472,15 @@ static void ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(src0->type),
+                                     nb01/ggml_type_size(type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
                                      ith, nth,
-                                     src0->type,
+                                     type,
                                      src1->type,
                                      dst->type))
                     goto UseGgmlGemm1;
@@ -7530,15 +7533,15 @@ UseGgmlGemm1:;
 
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(src0->type),
+                                     nb01/ggml_type_size(type),
                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
                                      ith, nth,
-                                     src0->type,
+                                     type,
                                      vec_dot_type,
                                      dst->type))
                     goto UseGgmlGemm2;
@@ -7623,7 +7626,7 @@ UseGgmlGemm2:;
         const int64_t ir1_start = dr1 * ith1;
         const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
 
-        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
 
         if (nth >= nchunk0 * nchunk1) {
             break;
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index c7216117b..573b7c5b9 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -1,6 +1,7 @@
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cpu.h"
+#include "ggml-cpu-aarch64.h"
 #include "ggml-impl.h"
 #include <cctype>
 #include <string>
@@ -69,15 +70,84 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
-static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
-    static ggml_backend_buffer_type_t bufts[] = {
-#ifdef GGML_USE_CPU_HBM
-        ggml_backend_cpu_hbm_buffer_type(),
-#endif
-        NULL
+// buffer type AARCH64
+
+static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
+
+    ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_AARCH64";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+
+    if (buffer == NULL) {
+        return NULL;
+    }
+
+    buffer->buft = buft;
+    buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
+    buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ NULL,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
     };
 
-    return bufts;
+    return &ggml_backend_cpu_buffer_type_aarch64;
+}
+
+bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
+    return buft == ggml_backend_cpu_aarch64_buffer_type();
+}
+
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
+        std::vector<ggml_backend_buffer_type_t> bufts;
+
+#ifdef GGML_USE_CPU_HBM
+        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
+#endif
+
+#ifdef GGML_USE_CPU_AARCH64
+        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+#endif
+
+        bufts.push_back(NULL);
+
+        return bufts;
+    }();
+
+    return bufts.data();
 
     GGML_UNUSED(device);
 }
@@ -383,6 +453,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
 }
 
 static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+            return false;
+        }
+    }
+
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_CPY:
             return
@@ -391,13 +476,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
                 op->type != GGML_TYPE_IQ1_S   &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         case GGML_OP_ROPE_BACK:
             return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
         case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
         default:
             return true;
     }
@@ -406,7 +491,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 }
 
 static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
+    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
 
     GGML_UNUSED(dev);
 }
@@ -566,6 +651,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
 };
 
 ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    // init CPU feature detection
+    ggml_cpu_init();
+
     static struct ggml_backend_reg ggml_backend_cpu_reg = {
         /* .iface   = */ ggml_backend_cpu_reg_i,
         /* .context = */ NULL,
diff --git a/src/llama.cpp b/src/llama.cpp
index 6ec419e9b..7a9a0e3ad 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7254,7 +7254,7 @@ static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_cpu_get_extra_bufts");
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
     if (ggml_backend_dev_get_extra_bufts_fn) {
         ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
         while (extra_bufts && *extra_bufts) {

From 5a54af4d4f588f109f31e456483fdf77096399d9 Mon Sep 17 00:00:00 2001
From: Romain Biessy <romain.biessy@codeplay.com>
Date: Fri, 15 Nov 2024 04:09:12 +0100
Subject: [PATCH 213/396] sycl: Use syclcompat::dp4a (#10267)

* sycl: Use syclcompat::dp4a

* Using the syclcompat version allow the compiler to optimize the
  operation with native function

* Update news section

* Update CI Windows oneAPI version to 2025.0

* Reword doc

* Call syclcompat::dp4a inside dpct::dp4a

This reverts commit 90cb61d692d61360b46954a1c7f780bd2e569b73.
---
 .github/workflows/build.yml        |  2 +-
 docs/backend/SYCL.md               |  2 ++
 ggml/src/ggml-sycl/dpct/helper.hpp | 24 ++----------------------
 ggml/src/ggml-sycl/vecdotq.hpp     |  8 ++++----
 4 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d6a7b66a5..c770bbd15 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -930,7 +930,7 @@ jobs:
         shell: bash
 
     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index bc8c0f886..38185f738 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -41,6 +41,8 @@ The following release is verified with good quality:
 
 ## News
 
+- 2024.11
+  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
 
 - 2024.8
   - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index fe4a8f744..c2f28bb49 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -15,6 +15,7 @@
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
+#include <syclcompat/math.hpp>
 #include <oneapi/mkl.hpp>
 #include <map>
 
@@ -1830,31 +1831,10 @@ namespace dpct
                                            : id);
     }
 
-    template <typename T>
-    sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
-    {
-        return sycl::vec<T, 1>(val)
-            .template as<sycl::vec<
-                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-            .template convert<T>();
-    }
-
-    template <typename T1, typename T2>
-    using dot_product_acc_t =
-        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                           uint32_t, int32_t>;
-
     template <typename T1, typename T2, typename T3>
     inline auto dp4a(T1 a, T2 b, T3 c)
     {
-        dot_product_acc_t<T1, T2> res = c;
-        auto va = extract_and_sign_or_zero_extend4(a);
-        auto vb = extract_and_sign_or_zero_extend4(b);
-        res += va[0] * vb[0];
-        res += va[1] * vb[1];
-        res += va[2] * vb[2];
-        res += va[3] * vb[3];
-        return res;
+        return syclcompat::dp4a(a, b, c);
     }
 
     struct sub_sat
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
index d2dccade2..c5942008a 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -968,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
             grid1[0] ^ signs[0], signs[0], std::minus<>());
         const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
             grid2[0] ^ signs[1], signs[1], std::minus<>());
-        sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
-        sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
+        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
         q8 += 8;
         aux32 >>= 7;
     }
@@ -1009,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
             grid1[0] ^ signs0, signs0, std::minus<>());
         const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
             grid2[0] ^ signs1, signs1, std::minus<>());
-        sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
-        sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
+        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
         q8 += 8;
     }
     const float d =

From 4802ad350b8e19cbc7a77269b4494c896f6e0896 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 15 Nov 2024 08:38:43 +0200
Subject: [PATCH 214/396] scripts : fix regex in sync [no ci]

---
 scripts/sync-ggml-am.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 06a04745b..74d6c6c8b 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -144,17 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
         -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
         -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\2.cu/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\2.m/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
-        -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
         -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
         -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \

From 231f9360d94446cd083b6b116f63991b1328c484 Mon Sep 17 00:00:00 2001
From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Date: Fri, 15 Nov 2024 15:09:35 +0800
Subject: [PATCH 215/396] cann: dockerfile and doc adjustment (#10302)

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
---
 .devops/llama-cli-cann.Dockerfile | 4 ++--
 docs/build.md                     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile
index db5ba2f25..45c0585b0 100644
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,6 +1,6 @@
 ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
 
-FROM cosdt/cann:$ASCEND_VERSION AS build
+FROM ascendai/cann:$ASCEND_VERSION AS build
 
 WORKDIR /app
 
@@ -26,7 +26,7 @@ RUN echo "Building with static libs" && \
     cmake --build build --config Release --target llama-cli
 
 # TODO: use image with NNRT
-FROM cosdt/cann:$ASCEND_VERSION AS runtime
+FROM ascendai/cann:$ASCEND_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli
 
 ENV LC_ALL=C.utf8
diff --git a/docs/build.md b/docs/build.md
index 95512415a..52de2b4e2 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -375,7 +375,7 @@ cmake --build build --config release
 
 You can test with:
 
-`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
 
 If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
 ```bash

From 9901068ac78838745e604fffb4601d315a610456 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 15 Nov 2024 05:48:49 -0400
Subject: [PATCH 216/396] server : (web UI) add copy button for code block, fix
 api key (#10242)

* server : (web ui) add copy btn for code blocks

* fix problem with api key

* use settings-modal-short-input component

* always show copy btn for code snippet
---
 examples/server/public/index.html | 62 +++++++++++++++++++++----------
 examples/server/server.cpp        | 42 +++++++++++++--------
 2 files changed, 68 insertions(+), 36 deletions(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 55639a944..65a915d59 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -12,7 +12,7 @@
     .markdown {
       h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
       pre {
-        @apply whitespace-pre-wrap my-4 rounded-lg p-2;
+        @apply whitespace-pre-wrap rounded-lg p-2;
         border: 1px solid currentColor;
       }
       /* TODO: fix markdown table */
@@ -25,8 +25,11 @@
     .bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
     .bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
     .text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
+    .show-on-hover {
+      @apply opacity-0 group-hover:opacity-100;
+    }
     .btn-mini {
-      @apply cursor-pointer opacity-0 group-hover:opacity-100 hover:shadow-md;
+      @apply cursor-pointer hover:shadow-md;
     }
     .chat-screen { max-width: 900px; }
     /* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
@@ -152,14 +155,14 @@
           <!-- actions for each message -->
           <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
             <!-- user message -->
-            <button v-if="msg.role === 'user'" class="badge btn-mini" @click="editingMsg = msg" :disabled="isGenerating">
+            <button v-if="msg.role === 'user'" class="badge btn-minishow-on-hover " @click="editingMsg = msg" :disabled="isGenerating">
               ✍️ Edit
             </button>
             <!-- assistant message -->
-            <button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+            <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
               🔄 Regenerate
             </button>
-            <button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
+            <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
               📋 Copy
             </button>
           </div>
@@ -196,12 +199,13 @@
         <h3 class="text-lg font-bold mb-6">Settings</h3>
         <div class="h-[calc(90vh-12rem)] overflow-y-auto">
           Settings below are saved in browser's localStorage
+          <settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
           <label class="form-control mb-2">
             <div class="label">System Message</div>
             <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
           </label>
           <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
-            <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
           </template>
           <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
           <!-- Section: Other sampler settings -->
@@ -209,7 +213,7 @@
             <summary class="collapse-title font-bold">Other sampler settings</summary>
             <div class="collapse-content">
               <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
-                <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
               </template>
             </div>
           </details>
@@ -218,7 +222,7 @@
             <summary class="collapse-title font-bold">Penalties settings</summary>
             <div class="collapse-content">
               <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
-                <settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
               </template>
             </div>
           </details>
@@ -245,7 +249,7 @@
   </div>
 
   <!-- Template to be used by settings modal -->
-  <template id="settings-modal-numeric-input">
+  <template id="settings-modal-short-input">
     <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
       <!-- Show help message on hovering on the input label -->
       <div class="dropdown dropdown-hover">
@@ -264,9 +268,13 @@
     import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
     import { llama } from './completion.js';
 
+    // utility functions
     const isString = (x) => !!x.toLowerCase;
     const isNumeric = (n) => !isString(n) && !isNaN(n);
+    const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
+    const copyStr = (str) => navigator.clipboard.writeText(str);
 
+    // constants
     const BASE_URL = localStorage.getItem('base') // for debugging
       || (new URL('.', document.baseURI).href).toString(); // for production
     const CONFIG_DEFAULT = {
@@ -295,7 +303,7 @@
       custom: '', // custom json-stringified object
     };
     const CONFIG_INFO = {
-      apiKey: '',
+      apiKey: 'Set the API Key if you are using --api-key option for the server.',
       systemMessage: 'The starting message that defines how model should behave.',
       temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
       dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
@@ -325,19 +333,28 @@
     // markdown support
     const VueMarkdown = defineComponent(
       (props) => {
-        const md = shallowRef(new markdownit(props.options ?? { breaks: true }));
-        for (const plugin of props.plugins ?? []) {
-          md.value.use(plugin);
-        }
+        const md = shallowRef(new markdownit({ breaks: true }));
+        const origFenchRenderer = md.value.renderer.rules.fence;
+        md.value.renderer.rules.fence = (tokens, idx, ...args) => {
+          const content = tokens[idx].content;
+          const origRendered = origFenchRenderer(tokens, idx, ...args);
+          return `<div class="relative my-4">
+            <div class="text-right sticky top-4 mb-2 mr-2 h-0">
+              <button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
+            </div>
+            ${origRendered}
+          </div>`;
+        };
+        window.copyStr = copyStr;
         const content = computed(() => md.value.render(props.source));
         return () => h("div", { innerHTML: content.value });
       },
-      { props: ["source", "options", "plugins"] }
+      { props: ["source"] }
     );
 
     // inout field to be used by settings modal
-    const SettingsModalNumericInput = defineComponent({
-      template: document.getElementById('settings-modal-numeric-input').innerHTML,
+    const SettingsModalShortInput = defineComponent({
+      template: document.getElementById('settings-modal-short-input').innerHTML,
       props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
     });
 
@@ -390,7 +407,11 @@
         if (!conv) return;
         const msg = conv.messages.pop();
         conv.lastModified = Date.now();
-        localStorage.setItem(convId, JSON.stringify(conv));
+        if (conv.messages.length === 0) {
+          StorageUtils.remove(convId);
+        } else {
+          localStorage.setItem(convId, JSON.stringify(conv));
+        }
         return msg;
       },
 
@@ -431,7 +452,7 @@
     const mainApp = createApp({
       components: {
         VueMarkdown,
-        SettingsModalNumericInput,
+        SettingsModalShortInput,
       },
       data() {
         return {
@@ -587,6 +608,7 @@
           this.isGenerating = false;
           this.stopGeneration = () => {};
           this.fetchMessages();
+          chatScrollToBottom();
         },
 
         // message actions
@@ -600,7 +622,7 @@
           this.generateMessage(currConvId);
         },
         copyMsg(msg) {
-          navigator.clipboard.writeText(msg.content);
+          copyStr(msg.content);
         },
         editUserMsgAndRegenerate(msg) {
           if (this.isGenerating) return;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index cac55007a..00f9031dc 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -102,6 +102,12 @@ struct server_task_result {
     bool error;
 };
 
+struct server_static_file {
+    const unsigned char * data;
+    unsigned int size;
+    const char * mime_type;
+};
+
 struct slot_params {
     bool stream       = true;
     bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
@@ -2259,6 +2265,16 @@ int main(int argc, char ** argv) {
     LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     LOG_INF("\n");
 
+    // static files
+    std::map<std::string, server_static_file> static_files = {
+        { "/",                        { index_html,              index_html_len,              "text/html; charset=utf-8" }},
+        { "/completion.js",           { completion_js,           completion_js_len,           "text/javascript; charset=utf-8" }},
+        { "/deps_daisyui.min.css",    { deps_daisyui_min_css,    deps_daisyui_min_css_len,    "text/css; charset=utf-8" }},
+        { "/deps_markdown-it.js",     { deps_markdown_it_js,     deps_markdown_it_js_len,     "text/javascript; charset=utf-8" }},
+        { "/deps_tailwindcss.js",     { deps_tailwindcss_js,     deps_tailwindcss_js_len,     "text/javascript; charset=utf-8" }},
+        { "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }},
+    };
+
     std::unique_ptr<httplib::Server> svr;
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
@@ -2339,7 +2355,7 @@ int main(int argc, char ** argv) {
     // Middlewares
     //
 
-    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
+    auto middleware_validate_api_key = [&params, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) {
         static const std::unordered_set<std::string> public_endpoints = {
             "/health",
             "/models",
@@ -2351,8 +2367,8 @@ int main(int argc, char ** argv) {
             return true;
         }
 
-        // If path is public, skip validation
-        if (public_endpoints.find(req.path) != public_endpoints.end()) {
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) {
             return true;
         }
 
@@ -3096,13 +3112,6 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };
 
-    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
-        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
-            return false;
-        };
-    };
-
     //
     // Router
     //
@@ -3117,12 +3126,13 @@ int main(int argc, char ** argv) {
         }
     } else {
         // using embedded static files
-        svr->Get("/",                        handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-        svr->Get("/completion.js",           handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/deps_daisyui.min.css",    handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
-        svr->Get("/deps_markdown-it.js",     handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/deps_tailwindcss.js",     handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
+        for (const auto & it : static_files) {
+            const server_static_file & static_file = it.second;
+            svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) {
+                res.set_content(reinterpret_cast<const char*>(static_file.data), static_file.size, static_file.mime_type);
+                return false;
+            });
+        }
     }
 
     // register API routes

From 57f8355b29a8c7dfcd1fb6094758ad85644f8535 Mon Sep 17 00:00:00 2001
From: Romain Biessy <romain.biessy@codeplay.com>
Date: Fri, 15 Nov 2024 12:10:45 +0100
Subject: [PATCH 217/396] sycl: Update Intel docker images to use DPC++ 2025.0
 (#10305)

---
 .devops/llama-cli-intel.Dockerfile    | 2 +-
 .devops/llama-server-intel.Dockerfile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 79dba06a7..1f4ce0730 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index 9c355b664..773f030a7 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 

From f0204a0ec70d50ca60e07bc0096ec1d6508ab0c7 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Fri, 15 Nov 2024 19:47:25 +0800
Subject: [PATCH 218/396] ci: build test musa with cmake (#10298)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 .github/workflows/build.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c770bbd15..6ef0770f3 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -414,6 +414,27 @@ jobs:
           cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)
 
+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 

From 18429220bdb344da1bc7df9bc580c7b41b3cd57b Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Fri, 15 Nov 2024 11:47:58 +0000
Subject: [PATCH 219/396] AVX BF16 and single scale quant optimizations
 (#10212)

* use 128 bit loads (i've tried 256->128 to death and its slower)

* double accumulator

* avx bf16 vec dot

* +3% q4_0 inference

* +7% tg +5% pp compared to master

* slower f16c version, kep for reference

* 256b version, also slow. i tried :)

* revert f16

* faster with madd

* split to functions

* Q8_0 and IQ4_NL, 5-7% faster

* fix potential overflow (performance reduced)

* 16 bit add for q4_0 only

* merge
---
 ggml/src/ggml-cpu/ggml-cpu-quants.c | 128 +++++++++++++++++-----------
 ggml/src/ggml-cpu/ggml-cpu.c        |   6 +-
 2 files changed, 82 insertions(+), 52 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index 7fa2897c2..f0e276b69 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
 #endif
 }
 #elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return sum_i16_pairs_float(doth, dotl);
 }
 
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
 
-    return _mm_packus_epi16( bytes1, bytes2);
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
 }
 
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
 }
 #endif
 #elif defined(__SSSE3__)
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     sumf = hsum_float_8(acc);
 #elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
         const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
         const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
         const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
         const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
         const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
         const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p =  sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__SSSE3__)
     // set constants
     const __m128i lowMask = _mm_set1_epi8(0xF);
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
-#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
     size_t vl = __riscv_vsetvl_e8m1(qk);
 
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
     const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
 
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
         const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
         const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 30b1bf895..61f53cd01 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1469,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
     sumf += (ggml_float)_mm512_reduce_add_ps(c2);
 
 #undef LOAD
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
 #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
     __m256 c1 = _mm256_setzero_ps();
     __m256 c2 = _mm256_setzero_ps();
     __m256 c3 = _mm256_setzero_ps();

From cbf5541a82952bcd7c4fceb55f5e332cafbf1720 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 15 Nov 2024 15:31:16 +0200
Subject: [PATCH 220/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 199237a21..6ddb71ab1 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-8a3d799484d861748f86eb87c8314fa2dbccc254
+9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc

From 3225008973579cc6a784890c237e1bfc9de41819 Mon Sep 17 00:00:00 2001
From: thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
Date: Fri, 15 Nov 2024 15:33:53 +0200
Subject: [PATCH 221/396] ggml : vulkan logs (whisper/2547)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c02c35665..04d671aeb 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1763,7 +1763,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n",
+              idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size);
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1821,8 +1822,7 @@ void ggml_vk_instance_init() {
         };
         validation_features.setPNext(nullptr);
         instance_create_info.setPNext(&validation_features);
-
-        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+        GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
     }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
@@ -1936,8 +1936,8 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(0);
         }
     }
+    GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size());
 
-    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
 
     for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         ggml_vk_print_gpu_info(i);

From 09ecbcb596ed8fa97d503d7440f0b3eff872e8f1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 15 Nov 2024 15:35:22 +0200
Subject: [PATCH 222/396] cmake : fix ppc64 check (whisper/0)

ggml-ci
---
 ggml/src/ggml-cpu/CMakeLists.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 8b0d60d4e..30de6c99a 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -211,10 +211,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
-                   OUTPUT_VARIABLE POWER10_M)
-    string(FIND ${POWER10_M} "POWER10" substring_index)
-    if(${substring_index} GREATER_EQUAL 0)
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
        list(APPEND ARCH_FLAGS -mcpu=power10)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)

From 883d206fbd2c5b2b9b589a9328503b9005e146c9 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 15 Nov 2024 20:20:54 +0100
Subject: [PATCH 223/396] ggml : fix some build issues

---
 Makefile        | 8 ++++----
 ggml/src/ggml.c | 8 ++++++++
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 87fe795aa..804c1bbec 100644
--- a/Makefile
+++ b/Makefile
@@ -359,6 +359,10 @@ ifdef LLAMA_SERVER_SSL
 	MK_LDFLAGS += -lssl -lcrypto
 endif
 
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -940,10 +944,6 @@ ggml/src/ggml-cuda/%.o: \
 	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
 endif # GGML_MUSA
 
-ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-endif
-
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4a97bfc32..5cdf59f25 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -49,6 +49,14 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(_MSC_VER)
+#define m512bh(p) p
+#define m512i(p) p
+#else
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+#endif
+
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 

From 4047be74da398acb8717a4d21b77b929ad7ed4f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 15 Nov 2024 21:19:03 +0100
Subject: [PATCH 224/396] scripts: update compare-llama-bench.py (#10319)

---
 scripts/compare-llama-bench.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 4ac6b5fc0..6125a080a 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")
 
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
-    "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
-    "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
+    "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
+    "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
 
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
-    "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
-    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
-    "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
-    "use_mmap": "Use mmap", "embeddings": "Embeddings",
+    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
+    "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
+    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
+    "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
 }
 
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.

From 74d73dc85cc2057446bf63cc37ff649ae7cebd80 Mon Sep 17 00:00:00 2001
From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Date: Sat, 16 Nov 2024 02:57:00 +0530
Subject: [PATCH 225/396] Make updates to fix issues with clang-cl builds while
 using AVX512 flags (#10314)

---
 ggml/src/ggml-cpu/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 30de6c99a..cef41a074 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -143,14 +143,23 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             if (GGML_AVX512_VBMI)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                endif()
             endif()
             if (GGML_AVX512_VNNI)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                endif()
             endif()
             if (GGML_AVX512_BF16)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                endif()
             endif()
             if (GGML_AMX_TILE)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)

From 89e4caaaf081f4712af61a3e08cb67b406c02b80 Mon Sep 17 00:00:00 2001
From: FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Date: Sat, 16 Nov 2024 13:42:13 +1300
Subject: [PATCH 226/396] llama : save number of parameters and the size in
 llama_model (#10286)

fixes #10285
---
 src/llama.cpp | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7a9a0e3ad..dc5dfba0c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2907,9 +2907,15 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    int64_t t_load_us = 0;
+    int64_t t_load_us  = 0;
     int64_t t_start_us = 0;
 
+    // total number of parameters in the model
+    uint64_t n_elements = 0;
+
+    // total size of all the tensors in the model in bytes
+    size_t  n_bytes     = 0;
+
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;
 
@@ -4275,8 +4281,8 @@ struct llama_model_loader {
     int n_tensors = 0;
     int n_created = 0;
 
-    int64_t n_elements = 0;
-    size_t  n_bytes    = 0;
+    uint64_t n_elements = 0;
+    size_t  n_bytes     = 0;
 
     bool use_mmap = false;
     bool check_tensors;
@@ -5344,6 +5350,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     }
 }
 
+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+    model.n_elements = ml.n_elements;
+    model.n_bytes = ml.n_bytes;
+}
+
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -9256,6 +9267,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
         }
 
+        llm_load_stats(ml, model);
         llm_load_print_meta(ml, model);
 
         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -18601,6 +18613,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llama_model model;
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
+    llm_load_stats(ml, model);
 
     struct quantize_state_internal qs(model, params);
 
@@ -19953,19 +19966,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
 }
 
 uint64_t llama_model_size(const struct llama_model * model) {
-    uint64_t size = 0;
-    for (const auto & it : model->tensors_by_name) {
-        size += ggml_nbytes(it.second);
-    }
-    return size;
+    return model->n_bytes;
 }
 
 uint64_t llama_model_n_params(const struct llama_model * model) {
-    uint64_t nparams = 0;
-    for (const auto & it : model->tensors_by_name) {
-        nparams += ggml_nelements(it.second);
-    }
-    return nparams;
+    return model->n_elements;
 }
 
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {

From 1e58ee1318429a3e97aa66f3034cdfd65ffc6c34 Mon Sep 17 00:00:00 2001
From: Dan Johansson <dan.johansson@arm.com>
Date: Sat, 16 Nov 2024 01:53:37 +0100
Subject: [PATCH 227/396] ggml : optimize Q4_0 into Q4_0_X_Y repack (#10324)

---
 ggml/src/ggml-aarch64.c              | 57 +++++++++++++++++++++-------
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 57 +++++++++++++++++++++-------
 2 files changed, 86 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 688b17b45..013912051 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -8,19 +8,42 @@
 
 #define UNUSED GGML_UNUSED
 
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
     for (int i = 0; i < 4; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 2; i++) {
-        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 2 / blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
     }
 
     return out;
@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
 // returns an interleaved block_q4_0x8
 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
 // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x8 out;
 
     for (int i = 0; i < 8; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 4; i++) {
-        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
     }
 
     return out;
@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
             }
 
             if (nrows_interleaved == 8) {
-                *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
+                *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
                 out_ptr = (block_q4_0x8 *) out_ptr + 1;
             }
             else if (nrows_interleaved == 4) {
-                *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
+                *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
                 out_ptr = (block_q4_0x4 *) out_ptr + 1;
             }
         }
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index b753ba767..96a16dfba 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 }
 
 // FIXME: this code is duplicated from ggml-aarch64.c
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
     for (int i = 0; i < 4; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 2; i++) {
-        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 2 / blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
     }
 
     return out;
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
 // returns an interleaved block_q4_0x8
 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
 // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x8 out;
 
     for (int i = 0; i < 8; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 4; i++) {
-        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
     }
 
     return out;
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
         }
         src += nrows_interleaved * nblocks;
     }
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
             for (int i  = 0; i < nrows_interleaved; i++ ) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
         }
         src += nrows_interleaved * nblocks;
     }

From dd3a6ce9f84d21ba05fe98af9f983bdea0398e6c Mon Sep 17 00:00:00 2001
From: FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Date: Sat, 16 Nov 2024 14:59:33 +1300
Subject: [PATCH 228/396] vulkan : add cmake preset debug/release (#10306)

---
 CMakePresets.json | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index ae45d60af..436448967 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -24,11 +24,12 @@
             "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
         }
     },
-    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
-    { "name": "sycl_f16",  "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+    { "name": "debug",    "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
+    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
 
     {
         "name": "arm64-windows-msvc", "hidden": true,
@@ -57,25 +58,28 @@
         }
     },
 
-    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
+    { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
 
-    { "name": "arm64-apple-clang-debug"  , "inherits": [ "base", "arm64-apple-clang",  "debug"   ] },
-    { "name": "arm64-apple-clang-release"  , "inherits": [ "base", "arm64-apple-clang",  "reldbg"   ] },
-    { "name": "arm64-apple-clang+static-release"  , "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
+    { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
 
-    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
     { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
     { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
 
-    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
+    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
     { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
 
-    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
+    { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
     { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
     { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
+
+    { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
+    { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
   ]
 }

From 772703c8fffdd83d2e28f60119e83525f1189412 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Sat, 16 Nov 2024 00:26:57 -0600
Subject: [PATCH 229/396] vulkan: Optimize some mat-vec mul quant shaders
 (#10296)

Compute two result elements per workgroup (for Q{4,5}_{0,1}). This reuses
the B loads across the rows and also reuses some addressing calculations.
This required manually partially unrolling the loop, since the compiler
is less willing to unroll outer loops.

Add bounds-checking on the last iteration of the loop. I think this was at
least partly broken before.

Optimize the Q4_K shader to vectorize most loads and reduce the number of
bit twiddling instructions.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          |  79 +++++-----
 .../vulkan-shaders/mul_mat_vec.comp           | 107 +++++++++----
 .../vulkan-shaders/mul_mat_vec_q4_k.comp      | 149 ++++++++++--------
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |   6 +-
 4 files changed, 210 insertions(+), 131 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 04d671aeb..19951d9b6 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1365,47 +1365,48 @@ static void ggml_vk_load_shaders(vk_device& device) {
     }
 
     // mul mat vec
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32",  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32",  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32",  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32",  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
 
     // dequant shaders
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index d3ccba7fc..ddbac5d2c 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -3,54 +3,107 @@
 #ifdef FLOAT16
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #endif
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#extension GL_EXT_null_initializer : enable
 
 #include "mul_mat_vec_base.comp"
 
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
 
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
+uint a_offset, b_offset, d_offset, y_offset;
 
-void main() {
-    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
-    const uint tid = gl_LocalInvocationID.x;
+shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
-    // There are not enough cols to use all threads
-    if (tid >= p.ncols) {
-        return;
+void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
+{
+    const uint col = i*BLOCK_SIZE + 2*tid;
+    const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
+    const uint iybs = col - col%QUANT_K; // y block start index
+
+    // Check if the second of the pair of elements is OOB, and don't fetch B or
+    // accumulate it. We still fetch a pair of elements for A, which is fine for
+    // quantized formats since they'll be within the same block. We should
+    // probably skip fetching the second element for F16/F32, but as of now we
+    // still do.
+    const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
+
+    FLOAT_TYPE b0 = 0, b1 = 0;
+    b0 = FLOAT_TYPE(data_b[b_offset + iybs + iqs]);
+    if (!OOB) {
+        b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
     }
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index
 
-    const uint block_size = min(p.ncols, BLOCK_SIZE);
-
-    uint a_offset, b_offset, d_offset;
-    get_offsets(a_offset, b_offset, d_offset);
-
-    const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
-
-    tmp[tid] = FLOAT_TYPE(0.0f);
-
-    [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
-        const uint col = i*block_size + 2*tid;
-        const uint ib = (row*p.ncols + col)/QUANT_K; // block index
-        const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
-        const uint iybs = col - col%QUANT_K; // y block start index
-
-        vec2 v = dequantize(ib, iqs, a_offset / QUANT_K);
+        const vec2 v = dequantize(ib, iqs, a_offset);
 
         // matrix multiplication
-        tmp[tid] = fma(FLOAT_TYPE(v.x), FLOAT_TYPE(data_b[b_offset + iybs + iqs]), fma(FLOAT_TYPE(v.y), FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]), tmp[tid]));
+        temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
+        if (!OOB) {
+            temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
+        }
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    const uint tid = gl_LocalInvocationID.x;
+
+    get_offsets(a_offset, b_offset, d_offset);
+    a_offset /= QUANT_K;
+
+    y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
+
+    FLOAT_TYPE temp[NUM_ROWS] = {};
+
+    const int unroll_count = 8;
+
+    const uint num_iters = (p.ncols >= 2*tid) ? ((p.ncols - 2*tid + BLOCK_SIZE - 1) / BLOCK_SIZE) : 0;
+    const uint unrolled_iters = num_iters & ~(2*unroll_count - 1);
+
+    uint i = 0;
+    while (i < unrolled_iters) {
+        // Manually partially unroll the loop
+        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+            iter(temp, first_row, num_rows, tid, i, false);
+            i += 2;
+        }
+    }
+    while (i < num_iters) {
+        iter(temp, first_row, num_rows, tid, i, true);
+        i += 2;
     }
 
     // sum up partial sums and write back result
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        tmpsh[n][tid] = temp[n];
+    }
     barrier();
-    [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            tmp[tid] += tmp[tid + s];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                tmpsh[n][tid] += tmpsh[n][tid + s];
+            }
         }
         barrier();
     }
     if (tid == 0) {
-        data_d[d_offset + row] = D_TYPE(tmp[0]);
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
+        }
+    }
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
index d91e00e10..b7c9b722d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
@@ -1,11 +1,34 @@
 #version 450
 
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
 #include "mul_mat_vec_base.comp"
 
 layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 
 shared FLOAT_TYPE tmp[32];
 
+// Declare aliased versions of A and B bindings that can use 16b/32b loads for
+// the quantized values, and vec4 loads for B.
+struct block_q4_K_u32
+{
+    f16vec2 d;
+    uint32_t scales[3*QUANT_K/64/4];
+    uint32_t qs[QUANT_K/2/4];
+};
+
+struct block_q4_K_u16
+{
+    f16vec2 d;
+    uint16_t scales[3*QUANT_K/64/2];
+    uint16_t qs[QUANT_K/2/2];
+};
+
+layout (binding = 0) readonly buffer A_u32 {block_q4_K_u32 data_a_u32[];};
+layout (binding = 0) readonly buffer A_u16 {block_q4_K_u16 data_a_u16[];};
+layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
+
+// This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
@@ -31,79 +54,81 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 64*v_im + l0;
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         const uint y1_idx = i * QUANT_K + y_offset;
         const uint y2_idx = y1_idx + 128;
 
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
+        f16vec2 d = data_a[ib0 + i].d;
+        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
 
-        const uint8_t sc0 = uint8_t(  data_a[ib0 + i].scales[v_im * 2    ]       & 0x3f);
-        const uint8_t sc1 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 1]       & 0x3f);
-        const uint8_t sc2 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 4]       & 0x3f);
-        const uint8_t sc3 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 5]       & 0x3f);
-        const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2    ] & 0xc0) >> 2));
-        const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2));
-        const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2));
-        const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2));
+        uint32_t scale0_u32 = data_a_u16[ib0 + i].scales[v_im    ];
+        uint32_t scale4_u32 = data_a_u16[ib0 + i].scales[v_im + 2];
+        uint32_t scale8_u32 = data_a_u16[ib0 + i].scales[v_im + 4];
+        uvec4 scale0 = uvec4(unpack8(scale0_u32));
+        uvec4 scale4 = uvec4(unpack8(scale4_u32));
+        uvec4 scale8 = uvec4(unpack8(scale8_u32));
 
-#if K_QUANTS_PER_ITERATION == 2
-        const uint8_t q4_0  = uint8_t(data_a[ib0 + i].qs[q_offset     ] & 0xf);
-        const uint8_t q4_1  = uint8_t(data_a[ib0 + i].qs[q_offset +  1] & 0xf);
-        const uint8_t q4_2  = uint8_t(data_a[ib0 + i].qs[q_offset +  2] & 0xf);
-        const uint8_t q4_3  = uint8_t(data_a[ib0 + i].qs[q_offset +  3] & 0xf);
-        const uint8_t q4_4  = uint8_t(data_a[ib0 + i].qs[q_offset     ]  >> 4);
-        const uint8_t q4_5  = uint8_t(data_a[ib0 + i].qs[q_offset +  1]  >> 4);
-        const uint8_t q4_6  = uint8_t(data_a[ib0 + i].qs[q_offset +  2]  >> 4);
-        const uint8_t q4_7  = uint8_t(data_a[ib0 + i].qs[q_offset +  3]  >> 4);
-        const uint8_t q4_8  = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
-        const uint8_t q4_9  = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
-        const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] & 0xf);
-        const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] & 0xf);
-        const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64]  >> 4);
-        const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65]  >> 4);
-        const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66]  >> 4);
-        const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67]  >> 4);
+        const uint32_t sc0 = (  scale0.x       & 0x3f);
+        const uint32_t sc1 = (  scale0.y       & 0x3f);
+        const uint32_t sc2 = (  scale4.x       & 0x3f);
+        const uint32_t sc3 = (  scale4.y       & 0x3f);
+        const uint32_t sc4 = (( scale8.x       & 0x0f) | ((scale0.x & 0xc0) >> 2));
+        const uint32_t sc5 = (( scale8.y       & 0x0f) | ((scale0.y & 0xc0) >> 2));
+        const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
+        const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
 
-        const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx]),      q4_0,  fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]),  q4_1,  fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]),  q4_2,  FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) *  q4_3)));
-        const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_4,  fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), q4_5,  fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), q4_6,  FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7)));
-        const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx]),      q4_8,  fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]),  q4_9,  fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]),  q4_10, FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) *  q4_11)));
-        const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_12, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), q4_13, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), q4_14, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15)));
+        uint32_t qs0_u32 = data_a_u32[ib0 + i].qs[q_offset / 4];
+        uint32_t qs64_u32 = data_a_u32[ib0 + i].qs[q_offset / 4 + 16];
+
+        uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
+        uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
+        uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
+        uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
+
+        uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
+        uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
+        uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
+        uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
+
+        const uint32_t q4_0  = qs0_lo4.x;
+        const uint32_t q4_1  = qs0_lo4.y;
+        const uint32_t q4_2  = qs0_lo4.z;
+        const uint32_t q4_3  = qs0_lo4.w;
+        const uint32_t q4_4  = qs0_hi4.x;
+        const uint32_t q4_5  = qs0_hi4.y;
+        const uint32_t q4_6  = qs0_hi4.z;
+        const uint32_t q4_7  = qs0_hi4.w;
+        const uint32_t q4_8  = qs64_lo4.x;
+        const uint32_t q4_9  = qs64_lo4.y;
+        const uint32_t q4_10 = qs64_lo4.z;
+        const uint32_t q4_11 = qs64_lo4.w;
+        const uint32_t q4_12 = qs64_hi4.x;
+        const uint32_t q4_13 = qs64_hi4.y;
+        const uint32_t q4_14 = qs64_hi4.z;
+        const uint32_t q4_15 = qs64_hi4.w;
+
+        B_TYPE_VEC4 by10 =  data_b_v4[(b_offset + y1_idx) / 4];
+        B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
+        B_TYPE_VEC4 by20 =  data_b_v4[(b_offset + y2_idx) / 4];
+        B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
+
+        const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x),      q4_0,  fma(FLOAT_TYPE(by10.y),  q4_1,  fma(FLOAT_TYPE(by10.z),  q4_2,  FLOAT_TYPE(by10.w) *  q4_3)));
+        const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x),     q4_4,  fma(FLOAT_TYPE(by132.y), q4_5,  fma(FLOAT_TYPE(by132.z), q4_6,  FLOAT_TYPE(by132.w) * q4_7)));
+        const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x),      q4_8,  fma(FLOAT_TYPE(by20.y),  q4_9,  fma(FLOAT_TYPE(by20.z),  q4_10, FLOAT_TYPE(by20.w) *  q4_11)));
+        const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x),     q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
         const FLOAT_TYPE smin =
-            fma(FLOAT_TYPE(data_b[b_offset + y1_idx    ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx    ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
-            fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), sc7,
-            fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), sc7,
-            fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 3]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 35]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 3]), sc6,     FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7)))))))))))))));
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
-#else
-        const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset     ] & 0xf);
-        const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset +  1] & 0xf);
-        const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset     ]  >> 4);
-        const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset +  1]  >> 4);
-        const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
-        const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
-        const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64]  >> 4);
-        const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65]  >> 4);
-
-        const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx     ]), q4_0, FLOAT_TYPE(data_b[b_offset + y1_idx +  1]) * q4_1);
-        const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
-        const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx     ]), q4_4, FLOAT_TYPE(data_b[b_offset + y2_idx +  1]) * q4_5);
-        const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
-        const FLOAT_TYPE smin =
-            fma(FLOAT_TYPE(data_b[b_offset + y1_idx    ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx    ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
-          + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7)))))));
-
-        tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) +
-                        sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(dall, (fma(sx, FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f), fma(sy, FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f),
-                       fma(sz, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)), fma(sw, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))))))), fma(-dmin, smin, tmp[tmp_idx]));
-#endif
+            fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
+            fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
+            fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
+            fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6,     FLOAT_TYPE(by232.w) * sc7)))))))))))))));
+        temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index 5c84f473f..fe3e4cb39 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -317,10 +317,10 @@ void process_shaders() {
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
 
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
 
         // Dequant shaders
         if (tname != "f16") {

From f245cc28d4eb900efad0bc740145f58d713c6e4f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 10:32:50 +0200
Subject: [PATCH 230/396] scripts : fix missing key in compare-llama-bench.py
 (#10332)

---
 scripts/compare-llama-bench.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 6125a080a..5069ae638 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -303,14 +303,11 @@ else:
 
     show = []
     # Show CPU and/or GPU by default even if the hardware for all results is the same:
-    if "gpu_blas" not in properties_different and "n_gpu_layers" not in properties_different:
-        gpu_blas = bool(rows_full[0][KEY_PROPERTIES.index("gpu_blas")])
+    if "n_gpu_layers" not in properties_different:
         ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
 
-        if not gpu_blas or ngl != 99 and "cpu_info" not in properties_different:
+        if ngl != 99 and "cpu_info" not in properties_different:
             show.append("cpu_info")
-        if gpu_blas and "gpu_info" not in properties_different:
-            show.append("gpu_info")
 
     show += properties_different
 

From bcdb7a23862b61aa307fc462fadfe1e2e653d010 Mon Sep 17 00:00:00 2001
From: MaggotHATE <clay1326@gmail.com>
Date: Sat, 16 Nov 2024 18:26:54 +0500
Subject: [PATCH 231/396] server: (web UI) Add samplers sequence customization
 (#10255)

* Samplers sequence: simplified and input field.

* Removed unused function

* Modify and use `settings-modal-short-input`

* rename "name" --> "label"

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 examples/server/public/index.html | 19 ++++++++++++++++---
 examples/server/server.cpp        | 20 ++++++++++++++------
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 65a915d59..6216c0841 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -212,6 +212,9 @@
           <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
             <summary class="collapse-title font-bold">Other sampler settings</summary>
             <div class="collapse-content">
+              <!-- Samplers queue -->
+              <settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
+              <!-- Samplers -->
               <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
                 <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
               </template>
@@ -231,6 +234,7 @@
             <summary class="collapse-title font-bold">Advanced config</summary>
             <div class="collapse-content">
               <label class="form-control mb-2">
+                <!-- Custom parameters input -->
                 <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
                 <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
               </label>
@@ -253,7 +257,7 @@
     <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
       <!-- Show help message on hovering on the input label -->
       <div class="dropdown dropdown-hover">
-        <div tabindex="0" role="button" class="font-bold">{{ configKey }}</div>
+        <div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
         <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
           {{ configInfo[configKey] || '(no help message available)' }}
         </div>
@@ -282,6 +286,7 @@
       apiKey: '',
       systemMessage: 'You are a helpful assistant.',
       // make sure these default values are in sync with `common.h`
+      samplers: 'dkypmxt',
       temperature: 0.8,
       dynatemp_range: 0.0,
       dynatemp_exponent: 1.0,
@@ -305,6 +310,7 @@
     const CONFIG_INFO = {
       apiKey: 'Set the API Key if you are using --api-key option for the server.',
       systemMessage: 'The starting message that defines how model should behave.',
+      samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
       temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
       dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
       dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
@@ -352,10 +358,16 @@
       { props: ["source"] }
     );
 
-    // inout field to be used by settings modal
+    // input field to be used by settings modal
     const SettingsModalShortInput = defineComponent({
       template: document.getElementById('settings-modal-short-input').innerHTML,
-      props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
+      props: {
+        label: { type: String, required: false },
+        configKey: String,
+        configDefault: Object,
+        configInfo: Object,
+        modelValue: [Object, String, Number],
+      },
     });
 
     // coversations is stored in localStorage
@@ -546,6 +558,7 @@
               ],
               stream: true,
               cache_prompt: true,
+              samplers: this.config.samplers,
               temperature: this.config.temperature,
               dynatemp_range: this.config.dynatemp_range,
               dynatemp_exponent: this.config.dynatemp_exponent,
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 00f9031dc..b8e003be9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -927,14 +927,22 @@ struct server_context {
 
         {
             const auto & samplers = data.find("samplers");
-            if (samplers != data.end() && samplers->is_array()) {
-                std::vector<std::string> sampler_names;
-                for (const auto & name : *samplers) {
-                    if (name.is_string()) {
-                        sampler_names.emplace_back(name);
+            if (samplers != data.end()) {
+                if (samplers->is_array()) {
+                    std::vector<std::string> sampler_names;
+                    for (const auto & name : *samplers) {
+                        if (name.is_string()) {
+                            sampler_names.emplace_back(name);
+                        }
                     }
+                    slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+                } else if (samplers->is_string()){
+                    std::string sampler_string;
+                    for (const auto & name : *samplers) {
+                        sampler_string += name;
+                    }
+                    slot.sparams.samplers = common_sampler_types_from_chars(sampler_string);
                 }
-                slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
             } else {
                 slot.sparams.samplers = default_sparams.samplers;
             }

From 8ee0d09ae6928d0501765cfc4e430b9236730caf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 17:58:32 +0200
Subject: [PATCH 232/396] make : auto-determine dependencies (#0)

---
 .gitignore |   1 +
 Makefile   | 411 ++++++++++++++++-------------------------------------
 2 files changed, 125 insertions(+), 287 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1092d097a..0b56bcc7a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *.a
 *.bat
 *.bin
+*.d
 *.dll
 *.dot
 *.etag
diff --git a/Makefile b/Makefile
index 804c1bbec..539370e06 100644
--- a/Makefile
+++ b/Makefile
@@ -527,11 +527,11 @@ ifndef GGML_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
-		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
-		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
-		MK_LDFLAGS  += -framework Accelerate
-		OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+		MK_CPPFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+		MK_CPPFLAGS  += -DACCELERATE_NEW_LAPACK
+		MK_CPPFLAGS  += -DACCELERATE_LAPACK_ILP64
+		MK_LDFLAGS   += -framework Accelerate
+		OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 	endif
 endif # GGML_NO_ACCELERATE
 
@@ -542,44 +542,44 @@ ifndef GGML_NO_OPENMP
 endif # GGML_NO_OPENMP
 
 ifdef GGML_OPENBLAS
-	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
-	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
+	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas)
+	MK_LDFLAGS   += $(shell pkg-config --libs openblas)
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS
 
 ifdef GGML_OPENBLAS64
-	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
-	MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
-	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas64)
+	MK_LDFLAGS   += $(shell pkg-config --libs openblas64)
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS64
 
 ifdef GGML_BLIS
-	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
-	MK_LDFLAGS  += -lblis -L/usr/local/lib
-	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
+	MK_LDFLAGS   += -lblis -L/usr/local/lib
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_BLIS
 
 ifdef GGML_NVPL
-	MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
-	MK_LDFLAGS  += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
-	OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+	MK_LDFLAGS   += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_NVPL
 
 ifndef GGML_NO_LLAMAFILE
-	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJ_GGML    += ggml/src/ggml-cpu/llamafile/sgemm.o
+	MK_CPPFLAGS  += -DGGML_USE_LLAMAFILE
+	OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
 endif
 
 ifndef GGML_NO_AMX
 	MK_CPPFLAGS += -DGGML_USE_AMX
-	OBJ_GGML    += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
+	OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
 endif
 
 ifdef GGML_RPC
-	MK_CPPFLAGS += -DGGML_USE_RPC
-	OBJ_GGML    += ggml/src/ggml-rpc.o
+	MK_CPPFLAGS  += -DGGML_USE_RPC
+	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
 endif # GGML_RPC
 
 OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
@@ -604,9 +604,9 @@ ifdef GGML_CUDA
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	MK_NVCCFLAGS += -use_fast_math
 
-	OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
-	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML += $(OBJ_CUDA_TMPL)
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
 
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -723,9 +723,9 @@ ggml/src/ggml-cuda/ggml-cuda.o: \
 endif # GGML_CUDA
 
 ifdef GGML_VULKAN
-	MK_CPPFLAGS += -DGGML_USE_VULKAN
-	MK_LDFLAGS  += $(shell pkg-config --libs vulkan)
-	OBJ_GGML    += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
+	MK_CPPFLAGS  += -DGGML_USE_VULKAN
+	MK_LDFLAGS   += $(shell pkg-config --libs vulkan)
+	OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
 
 ifdef GGML_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
@@ -821,9 +821,9 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 
-	OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
-	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML += $(OBJ_CUDA_TMPL)
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
 
 ggml/src/ggml-cuda/ggml-cuda.o: \
 	ggml/src/ggml-cuda/ggml-cuda.cu \
@@ -922,9 +922,9 @@ ifdef GGML_CUDA_FA_ALL_QUANTS
 	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
 
-	OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
-	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML += $(OBJ_CUDA_TMPL)
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
 
 ggml/src/ggml-cuda/ggml-cuda.o: \
 	ggml/src/ggml-cuda/ggml-cuda.cu \
@@ -945,9 +945,9 @@ ggml/src/ggml-cuda/%.o: \
 endif # GGML_MUSA
 
 ifdef GGML_METAL
-	MK_CPPFLAGS += -DGGML_USE_METAL
-	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJ_GGML	+= ggml/src/ggml-metal/ggml-metal.o
+	MK_CPPFLAGS  += -DGGML_USE_METAL
+	MK_LDFLAGS   += -framework Foundation -framework Metal -framework MetalKit
+	OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
 
 ifdef GGML_METAL_USE_BF16
 	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
@@ -956,8 +956,8 @@ ifdef GGML_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
 ifdef GGML_METAL_EMBED_LIBRARY
-	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-	OBJ_GGML    += ggml/src/ggml-metal-embed.o
+	MK_CPPFLAGS  += -DGGML_METAL_EMBED_LIBRARY
+	OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
 endif
 endif # GGML_METAL
 
@@ -987,36 +987,41 @@ ggml/src/ggml-metal-embed.o: \
 endif
 endif # GGML_METAL
 
-OBJ_GGML += \
-	ggml/src/ggml.o \
-	ggml/src/ggml-aarch64.o \
-	ggml/src/ggml-alloc.o \
-	ggml/src/ggml-backend.o \
-	ggml/src/ggml-backend-reg.o \
-	ggml/src/ggml-quants.o \
-	ggml/src/ggml-threading.o \
-	ggml/src/ggml-cpu/ggml-cpu.o \
-	ggml/src/ggml-cpu/ggml-cpu-cpp.o \
-	ggml/src/ggml-cpu/ggml-cpu-aarch64.o \
-	ggml/src/ggml-cpu/ggml-cpu-quants.o
+DIR_GGML = ggml
+DIR_LLAMA = src
+DIR_COMMON = common
+
+OBJ_GGML = \
+	$(DIR_GGML)/src/ggml.o \
+	$(DIR_GGML)/src/ggml-aarch64.o \
+	$(DIR_GGML)/src/ggml-alloc.o \
+	$(DIR_GGML)/src/ggml-backend.o \
+	$(DIR_GGML)/src/ggml-backend-reg.o \
+	$(DIR_GGML)/src/ggml-quants.o \
+	$(DIR_GGML)/src/ggml-threading.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+	$(OBJ_GGML_EXT)
 
 OBJ_LLAMA = \
-	src/llama.o \
-	src/llama-vocab.o \
-	src/llama-grammar.o \
-	src/llama-sampling.o \
-	src/unicode.o \
-	src/unicode-data.o
+	$(DIR_LLAMA)/llama.o \
+	$(DIR_LLAMA)/llama-vocab.o \
+	$(DIR_LLAMA)/llama-grammar.o \
+	$(DIR_LLAMA)/llama-sampling.o \
+	$(DIR_LLAMA)/unicode.o \
+	$(DIR_LLAMA)/unicode-data.o
 
 OBJ_COMMON = \
-	common/common.o \
-	common/arg.o \
-	common/log.o \
-	common/console.o \
-	common/ngram-cache.o \
-	common/sampling.o \
-	common/build-info.o \
-	common/json-schema-to-grammar.o
+	$(DIR_COMMON)/common.o \
+	$(DIR_COMMON)/arg.o \
+	$(DIR_COMMON)/log.o \
+	$(DIR_COMMON)/console.o \
+	$(DIR_COMMON)/ngram-cache.o \
+	$(DIR_COMMON)/sampling.o \
+	$(DIR_COMMON)/build-info.o \
+	$(DIR_COMMON)/json-schema-to-grammar.o
 
 OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
 
@@ -1117,246 +1122,78 @@ endif
 # Build libraries
 #
 
-# ggml
+# Libraries
+LIB_GGML   = libggml.so
+LIB_GGML_S = libggml.a
 
-ggml/src/ggml.o: \
-	ggml/src/ggml.c \
-	ggml/include/ggml.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+LIB_LLAMA   = libllama.so
+LIB_LLAMA_S = libllama.a
 
-ggml/src/ggml-threading.o: \
-	ggml/src/ggml-threading.cpp \
-	ggml/include/ggml.h
-	$(CXX) $(XXCFLAGS)   -c $< -o $@
+LIB_COMMON   = libcommon.so
+LIB_COMMON_S = libcommon.a
 
-ggml/src/ggml-cpu/ggml-cpu.o: \
-	ggml/src/ggml-cpu/ggml-cpu.c \
-	ggml/include/ggml.h \
-	ggml/src/ggml-common.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+# Targets
+BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
 
-ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
+# Dependency files
+DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
+
+# Default target
+all: $(BUILD_TARGETS)
+
+# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
+#       g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
 	ggml/src/ggml-cpu/ggml-cpu.cpp \
-	ggml/include/ggml.h \
-	ggml/src/ggml-common.h
-	$(CXX) $(CXXFLAGS)   -c $< -o $@
-
-ggml/src/ggml-alloc.o: \
-	ggml/src/ggml-alloc.c \
-	ggml/include/ggml.h \
-	ggml/include/ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
-
-ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.cpp \
-	ggml/src/ggml-backend-impl.h \
-	ggml/include/ggml.h \
-	ggml/include/ggml-backend.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ggml/src/ggml-quants.o: \
-	ggml/src/ggml-quants.c \
-	ggml/include/ggml.h \
-	ggml/src/ggml-quants.h \
-	ggml/src/ggml-common.h
-	$(CC) $(CFLAGS)    -c $< -o $@
-
-ggml/src/ggml-aarch64.o: \
-	ggml/src/ggml-aarch64.c \
-	ggml/include/ggml.h \
-	ggml/src/ggml-aarch64.h \
-	ggml/src/ggml-common.h
-	$(CC) $(CFLAGS)    -c $< -o $@
-
-ggml/src/ggml-blas/ggml-blas.o: \
-	ggml/src/ggml-blas/ggml-blas.cpp \
-	ggml/include/ggml-blas.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ifndef GGML_NO_LLAMAFILE
-ggml/src/ggml-cpu/llamafile/sgemm.o: \
-	ggml/src/ggml-cpu/llamafile/sgemm.cpp \
-	ggml/src/ggml-cpu/llamafile/sgemm.h \
-	ggml/include/ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@ -I ggml/src -I ggml/src/ggml-cpu
-endif # GGML_NO_LLAMAFILE
-
-ifndef GGML_NO_AMX
-ggml/src/ggml-amx/ggml-amx.o: \
-	ggml/src/ggml-amx/ggml-amx.cpp \
-	ggml/include/ggml-amx.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ggml/src/ggml-amx/mmq.o: \
-	ggml/src/ggml-amx/mmq.cpp \
-	ggml/src/ggml-amx/mmq.h \
-	ggml/include/ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
-
-ifdef GGML_RPC
-ggml/src/ggml-rpc.o: \
-	ggml/src/ggml-rpc.cpp \
-	ggml/include/ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_RPC
-
-$(LIB_GGML): \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_GGML_S): \
-	$(OBJ_GGML)
-	ar rcs $(LIB_GGML_S) $^
-
-# llama
-
-src/unicode.o: \
-	src/unicode.cpp \
-	src/unicode.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/unicode-data.o: \
-	src/unicode-data.cpp \
-	src/unicode-data.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama.o: \
-	src/llama.cpp \
-	src/llama-impl.h \
-	src/llama-vocab.h \
-	src/llama-grammar.h \
-	src/llama-sampling.h \
-	src/unicode.h \
-	include/llama.h \
-	ggml/include/ggml-cuda.h \
-	ggml/include/ggml-metal.h \
+	ggml/include/ggml-backend.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-alloc.h \
-	ggml/include/ggml-backend.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	ggml/src/ggml-backend-impl.h \
+	ggml/include/ggml-cpu.h \
+	ggml/src/ggml-impl.h
+	$(CXX) $(CXXFLAGS)   -c $< -o $@
 
-src/llama-vocab.o: \
-	src/llama-vocab.cpp \
-	src/llama-vocab.h \
-	src/llama-impl.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+# Rules for building object files
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
+	$(CC) $(CFLAGS) -MMD -c $< -o $@
 
-src/llama-grammar.o: \
-	src/llama-grammar.cpp \
-	src/llama-grammar.h \
-	src/llama-impl.h \
-	src/llama-vocab.h \
-	src/llama-sampling.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
-src/llama-sampling.o: \
-	src/llama-sampling.cpp \
-	src/llama-sampling.h \
-	src/llama-impl.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
-$(LIB_LLAMA): \
-	$(OBJ_LLAMA) \
-	$(LIB_GGML)
+$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+# Rules for building libraries
+$(LIB_GGML): $(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-$(LIB_LLAMA_S): \
-	$(OBJ_LLAMA)
+$(LIB_GGML_S): $(OBJ_GGML)
+	ar rcs $(LIB_GGML_S) $^
+
+$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): $(OBJ_LLAMA)
 	ar rcs $(LIB_LLAMA_S) $^
 
-# common
-
-common/common.o: \
-	common/common.cpp \
-	common/common.h \
-	common/console.h \
-	common/sampling.h \
-	common/json.hpp \
-	common/json-schema-to-grammar.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/arg.o: \
-	common/arg.cpp \
-	common/arg.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/log.o: \
-	common/log.cpp \
-	common/log.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/sampling.o: \
-	common/sampling.cpp \
-	common/sampling.h \
-	include/llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/console.o: \
-	common/console.cpp \
-	common/console.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/json-schema-to-grammar.o: \
-	common/json-schema-to-grammar.cpp \
-	common/json-schema-to-grammar.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/ngram-cache.o: \
-	common/ngram-cache.cpp \
-	common/ngram-cache.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(LIB_COMMON): \
-	$(OBJ_COMMON) \
-	$(LIB_LLAMA) \
-	$(LIB_GGML)
+$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-$(LIB_COMMON_S): \
-	$(OBJ_COMMON)
+$(LIB_COMMON_S): $(OBJ_COMMON)
 	ar rcs $(LIB_COMMON_S) $^
 
+# Include dependency files
+-include $(DEP_FILES)
+
+# Clean rule
 clean:
-	rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
-	rm -rvf src/*.o
-	rm -rvf tests/*.o
-	rm -rvf examples/*.o
-	rm -rvf common/*.o
-	rm -rvf *.a
-	rm -rvf *.dll
-	rm -rvf *.so
-	rm -rvf *.dot
-	rm -rvf ggml/*.a
-	rm -rvf ggml/*.dll
-	rm -rvf ggml/*.so
-	rm -rvf ggml/src/*.o
-	rm -rvf common/build-info.cpp
-	rm -rvf ggml/src/ggml-cpu/*.o
-	rm -rvf ggml/src/ggml-cpu/llamafile/*.o
-	rm -vrf ggml/src/ggml-amx/*.o
-	rm -vrf ggml/src/ggml-blas/*.o
-	rm -vrf ggml/src/ggml-cann/*.o
-	rm -vrf ggml/src/ggml-cpu/*.o
-	rm -vrf ggml/src/ggml-cuda/*.o
-	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
-	rm -vrf ggml/src/ggml-hip/*.o
-	rm -vrf ggml/src/ggml-kompute/*.o
-	rm -vrf ggml/src/ggml-metal/*.o
-	rm -vrf ggml/src/ggml-metal/ggml-metal-embed.metal
-	rm -vrf ggml/src/ggml-rpc/*.o
-	rm -vrf ggml/src/ggml-sycl/*.o
-	rm -vrf ggml/src/ggml-vulkan/*.o
-	rm -vrf ggml/src/ggml-musa/*.o
-	rm -rvf $(BUILD_TARGETS)
-	rm -rvf $(TEST_TARGETS)
-	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
-	rm -rvf $(LEGACY_TARGETS_CLEAN)
-	find examples pocs -type f -name "*.o" -delete
+	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -rvf *.a *.dll *.so *.dot
+	find ggml src common tests examples pocs -type f -name "*.o" -delete
+	find ggml src common tests examples pocs -type f -name "*.d" -delete
 
 #
 # Examples

From db4cfd5dbc31c90f0d5c413a2e182d068b8ee308 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 17:58:56 +0200
Subject: [PATCH 233/396] llamafile : fix include path (#0)

ggml-ci
---
 ggml/src/ggml-cpu/llamafile/sgemm.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index da4146ec4..b2ce2e664 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -50,7 +50,8 @@
 
 #include "sgemm.h"
 #include "ggml-impl.h"
-#include "ggml-cpu-impl.h"
+// hack until moved into the CPU backend
+#include "../ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
 #ifdef _MSC_VER

From 4e54be0ec6cb5cd6ed56e52e927b80b2796ec844 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 16 Nov 2024 23:00:41 +0100
Subject: [PATCH 234/396] llama/ex: remove --logdir argument (#10339)

---
 common/arg.cpp                     |  11 --
 common/common.cpp                  | 210 -----------------------------
 common/common.h                    |  13 --
 examples/infill/infill.cpp         |  46 -------
 examples/main/main.cpp             |  45 -------
 examples/perplexity/perplexity.cpp |  51 -------
 examples/server/README.md          |   1 -
 include/llama.h                    |   2 -
 scripts/run-with-preset.py         | 146 --------------------
 src/llama.cpp                      |  22 ---
 10 files changed, 547 deletions(-)
 delete mode 100755 scripts/run-with-preset.py

diff --git a/common/arg.cpp b/common/arg.cpp
index 7c5c5e5cd..4115b2f75 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1939,17 +1939,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.simple_io = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(common_arg(
-        {"-ld", "--logdir"}, "LOGDIR",
-        "path under which to save YAML logs (no logging if unset)",
-        [](common_params & params, const std::string & value) {
-            params.logdir = value;
-
-            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-                params.logdir += DIRECTORY_SEPARATOR;
-            }
-        }
-    ));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
diff --git a/common/common.cpp b/common/common.cpp
index ebd16b600..930374621 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1890,213 +1890,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
 
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
-                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
-    const auto & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n",    ggml_cpu_has_avx_vnni()    ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
-    fprintf(stream, "cpu_has_sve: %s\n",         ggml_cpu_has_sve()         ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
-    fprintf(stream, "cpu_has_riscv_v: %s\n",     ggml_cpu_has_riscv_v()     ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d  # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
-    fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
-    fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
-    fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
-    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-    fprintf(stream, "logit_bias:\n");
-    for (const auto & logit_bias : sparams.logit_bias) {
-        fprintf(stream, "  %d: %f", logit_bias.token, logit_bias.bias);
-    }
-
-    fprintf(stream, "lora:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale == 1.0f) {
-            fprintf(stream, "  - %s\n", la.path.c_str());
-        }
-    }
-    fprintf(stream, "lora_scaled:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale != 1.0f) {
-            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
-        }
-    }
-    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
-    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
-    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
-    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
-    fprintf(stream, "reverse_prompt:\n");
-    for (std::string ap : params.antiprompt) {
-        size_t pos = 0;
-        while ((pos = ap.find('\n', pos)) != std::string::npos) {
-            ap.replace(pos, 1, "\\n");
-            pos += 1;
-        }
-
-        fprintf(stream, "  - %s\n", ap.c_str());
-    }
-
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
-    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
-    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
-    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
-}
diff --git a/common/common.h b/common/common.h
index 6289feaeb..7977cc7a9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -209,7 +209,6 @@ struct common_params {
     std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
     std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
     std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
-    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
@@ -584,15 +583,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const common_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index f18362c91..15b358dc4 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -43,50 +43,6 @@ static std::vector<llama_token> * g_output_tokens;
 
 static bool is_interacting = false;
 
-static void write_logfile(
-    const llama_context * ctx, const common_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -96,7 +52,6 @@ static void sigint_handler(int signo) {
             console::cleanup();
             LOG("\n");
             common_perf_print(*g_ctx, *g_smpl);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
 
             // make sure all logs are flushed
             LOG("Interrupted by user\n");
@@ -625,7 +580,6 @@ int main(int argc, char ** argv) {
 
     LOG("\n");
     common_perf_print(ctx, smpl);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 374ed47ad..7c4ce4be2 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) {
     return f.tellg() == 0;
 }
 
-static void write_logfile(
-    const llama_context * ctx, const common_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: main\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -115,7 +72,6 @@ static void sigint_handler(int signo) {
             console::cleanup();
             LOG("\n");
             common_perf_print(*g_ctx, *g_smpl);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
 
             // make sure all logs are flushed
             LOG("Interrupted by user\n");
@@ -926,7 +882,6 @@ int main(int argc, char ** argv) {
 
     LOG("\n\n");
     common_perf_print(ctx, smpl);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     common_sampler_free(smpl);
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index e803ff143..64a84607c 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -34,55 +34,6 @@ struct results_log_softmax {
     float  prob;
 };
 
-static void write_logfile(
-    const llama_context * ctx, const common_params & params, const llama_model * model,
-    const struct results_perplexity & results
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    if (params.hellaswag) {
-        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: main\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Perplexity Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_vector_float(logfile, "logits", results.logits);
-    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-    yaml_dump_vector_float(logfile, "probs", results.probs);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 static std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
@@ -2072,8 +2023,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    write_logfile(ctx, params, model, results);
-
     llama_free(ctx);
     llama_free_model(model);
 
diff --git a/examples/server/README.md b/examples/server/README.md
index 6f72c6bb8..0936e0b7b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -85,7 +85,6 @@ The project is under active development, and we are [looking for feedback and co
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)(env: LLAMA_ARG_HF_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)(env: LLAMA_ARG_HF_FILE) |
 | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)(env: HF_TOKEN) |
-| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
 | `--log-disable` | Log disable |
 | `--log-file FNAME` | Log to file |
 | `--log-colors` | Enable colored logging(env: LLAMA_LOG_COLORS) |
diff --git a/include/llama.h b/include/llama.h
index 5e742642e..bc268e799 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1244,8 +1244,6 @@ extern "C" {
     LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
-    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py
deleted file mode 100755
index 8f0bf8ca8..000000000
--- a/scripts/run-with-preset.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/usr/bin/env python3
-
-import logging
-import argparse
-import os
-import subprocess
-import sys
-
-import yaml
-
-logger = logging.getLogger("run-with-preset")
-
-CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
-    "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
-    "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
-    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
-    "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
-    "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
-    "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
-    "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
-    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
-    "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
-    "simple-io", "tensor-split", "threads", "temp", "top-k", "top-p", "typical",
-    "verbose-prompt"
-]
-
-CLI_ARGS_LLAMA_BENCH = [
-    "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
-    "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
-]
-
-CLI_ARGS_LLAMA_SERVER = [
-    "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base",
-    "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
-    "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
-    "threads", "verbose"
-]
-
-description = """Run llama.cpp binaries with presets from YAML file(s).
-To specify which binary should be run, specify the "binary" property (llama-cli, llama-perplexity, llama-bench, and llama-server are supported).
-To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
-
-Formatting considerations:
-- The YAML property names are the same as the CLI argument names of the corresponding binary.
-- Properties must use the long name of their corresponding llama.cpp CLI arguments.
-- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores.
-- Flags must be defined as "<PROPERTY_NAME>: true" to be effective.
-- To define the logit_bias property, the expected format is "<TOKEN_ID>: <BIAS>" in the "logit_bias" namespace.
-- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
-- To define a tensor split, pass a list of floats.
-"""
-usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
-epilog = ("  --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
-          "Unknown args will be ignored.")
-
-parser = argparse.ArgumentParser(
-    description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument("-bin", "--binary", help="The binary to run.")
-parser.add_argument("yaml_files", nargs="*",
-                    help="Arbitrary number of YAML files from which to read preset values. "
-                    "If two files specify the same values the later one will be used.")
-parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-
-known_args, unknown_args = parser.parse_known_args()
-
-if not known_args.yaml_files and not unknown_args:
-    parser.print_help()
-    sys.exit(0)
-
-logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
-
-props = dict()
-
-for yaml_file in known_args.yaml_files:
-    with open(yaml_file, "r") as f:
-        props.update(yaml.load(f, yaml.SafeLoader))
-
-props = {prop.replace("_", "-"): val for prop, val in props.items()}
-
-binary = props.pop("binary", "llama-cli")
-if known_args.binary:
-    binary = known_args.binary
-
-if os.path.exists(f"./{binary}"):
-    binary = f"./{binary}"
-
-if binary.lower().endswith("llama-cli") or binary.lower().endswith("llama-perplexity"):
-    cli_args = CLI_ARGS_LLAMA_CLI_PERPLEXITY
-elif binary.lower().endswith("llama-bench"):
-    cli_args = CLI_ARGS_LLAMA_BENCH
-elif binary.lower().endswith("llama-server"):
-    cli_args = CLI_ARGS_LLAMA_SERVER
-else:
-    logger.error(f"Unknown binary: {binary}")
-    sys.exit(1)
-
-command_list = [binary]
-
-for cli_arg in cli_args:
-    value = props.pop(cli_arg, None)
-
-    if not value or value == -1:
-        continue
-
-    if cli_arg == "logit-bias":
-        for token, bias in value.items():
-            command_list.append("--logit-bias")
-            command_list.append(f"{token}{bias:+}")
-        continue
-
-    if cli_arg == "reverse-prompt" and not isinstance(value, str):
-        for rp in value:
-            command_list.append("--reverse-prompt")
-            command_list.append(str(rp))
-        continue
-
-    command_list.append(f"--{cli_arg}")
-
-    if cli_arg == "tensor-split":
-        command_list.append(",".join([str(v) for v in value]))
-        continue
-
-    value = str(value)
-
-    if value != "True":
-        command_list.append(str(value))
-
-num_unused = len(props)
-if num_unused > 10:
-    logger.info(f"The preset file contained a total of {num_unused} unused properties.")
-elif num_unused > 0:
-    logger.info("The preset file contained the following unused properties:")
-    for prop, value in props.items():
-        logger.info(f"  {prop}: {value}")
-
-command_list += unknown_args
-
-sp = subprocess.Popen(command_list)
-
-while sp.returncode is None:
-    try:
-        sp.wait()
-    except KeyboardInterrupt:
-        pass
-
-sys.exit(sp.returncode)
diff --git a/src/llama.cpp b/src/llama.cpp
index dc5dfba0c..1703104fb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22075,28 +22075,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
-void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
-    fprintf(stream, "\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "# Timings #\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
-            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
-    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
-            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
-    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
-    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
-    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
-            1.0e6 * ctx->n_eval / ctx->t_eval_us);
-    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
-            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-}
-
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx

From 0fff7fd79818980763a601660f25b01a0cf4b87a Mon Sep 17 00:00:00 2001
From: FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Date: Sun, 17 Nov 2024 12:29:18 +1300
Subject: [PATCH 235/396] docs : vulkan build instructions to use git bash
 mingw64 (#10303)

---
 docs/build.md | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index 52de2b4e2..811bbb409 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -282,9 +282,9 @@ The following compilation options are also available to tweak performance (yes,
 
 #### w64devkit
 
-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
 
-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
 
 Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
 ```sh
@@ -302,6 +302,29 @@ EOF
 ```
 Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
 
+#### Git Bash MINGW64
+
+Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+```
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+Now you can load the model in conversation mode using `Vulkan`
+
+```
+build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```
+
 #### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
   ```sh

From 5c9a8b22b10132db620529435e3cfa49304b65cc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 22:16:04 +0200
Subject: [PATCH 236/396] scripts : update sync

---
 scripts/sync-ggml-am.sh | 54 +++++++++++++++++++++++------------------
 scripts/sync-ggml.sh    | 26 +++++++++++---------
 2 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 74d6c6c8b..d0815cf89 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -73,17 +73,20 @@ while read c; do
         src/ggml*.h \
         src/ggml*.c \
         src/ggml*.cpp \
-        src/ggml*.m \
-        src/ggml*.metal \
-        src/ggml*.cu \
         src/ggml-amx/* \
+        src/ggml-blas/* \
         src/ggml-cann/* \
+        src/ggml-cpu/* \
         src/ggml-cuda/* \
+        src/ggml-hip/* \
+        src/ggml-kompute/* \
+        src/ggml-metal/* \
+        src/ggml-musa/* \
+        src/ggml-rpc/* \
         src/ggml-sycl/* \
-        src/vulkan-shaders/* \
+        src/ggml-vulkan/* \
         include/ggml*.h \
         tests/test-opt.cpp \
-        tests/test-grad0.cpp \
         tests/test-quantize-fns.cpp \
         tests/test-quantize-perf.cpp \
         tests/test-backend-ops.cpp \
@@ -121,21 +124,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml*.c          -> ggml/src/ggml*.c
     # src/ggml*.cpp        -> ggml/src/ggml*.cpp
     # src/ggml*.h          -> ggml/src/ggml*.h
-    # src/ggml*.cu         -> ggml/src/ggml*.cu
-    # src/ggml*.m          -> ggml/src/ggml*.m
-    # src/ggml-amx/*       -> ggml/src/ggml-amx/
-    # src/ggml-cann/*      -> ggml/src/ggml-cann/
-    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/
-    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/
-    # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
+    # src/ggml-amx/*       -> ggml/src/ggml-amx/*
+    # src/ggml-blas/*      -> ggml/src/ggml-blas/*
+    # src/ggml-cann/*      -> ggml/src/ggml-cann/*
+    # src/ggml-cpu/*       -> ggml/src/ggml-cpu/*
+    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/*
+    # src/ggml-hip/*       -> ggml/src/ggml-hip/*
+    # src/ggml-kompute/*   -> ggml/src/ggml-kompute/*
+    # src/ggml-metal/*     -> ggml/src/ggml-metal/*
+    # src/ggml-musa/*      -> ggml/src/ggml-musa/*
+    # src/ggml-rpc/*       -> ggml/src/ggml-rpc/*
+    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/*
+    # src/ggml-vulkan/*    -> ggml/src/ggml-vulkan/*
     #
     # include/ggml*.h -> ggml/include/ggml*.h
     #
-    # tests/test-opt.cpp           -> tests/test-opt.cpp
-    # tests/test-grad0.cpp         -> tests/test-grad0.cpp
-    # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
-    # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
-    # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
+    # tests/test*.cpp -> tests/
     #
     # LICENSE                -> LICENSE
     # scripts/gen-authors.sh -> scripts/gen-authors.sh
@@ -147,18 +151,20 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\2.cu/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\2.m/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\//\1ggml\/src\/ggml-kompute\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-musa\//\1ggml\/src\/ggml-musa\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
-        -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
-        -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
-        -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
-        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
-        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/\1examples\/common-ggml.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)LICENSE/\1LICENSE/g' \
         -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \
         > ggml-src.patch.tmp
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index f29554c82..000270afb 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -4,21 +4,25 @@ cp -rpv ../ggml/CMakeLists.txt       ./ggml/CMakeLists.txt
 cp -rpv ../ggml/src/CMakeLists.txt   ./ggml/src/CMakeLists.txt
 cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 
-cp -rpv ../ggml/src/ggml*.c          ./ggml/src/
-cp -rpv ../ggml/src/ggml*.cpp        ./ggml/src/
-cp -rpv ../ggml/src/ggml*.h          ./ggml/src/
-cp -rpv ../ggml/src/ggml*.cu         ./ggml/src/
-cp -rpv ../ggml/src/ggml*.m          ./ggml/src/
-cp -rpv ../ggml/src/ggml-amx/*       ./ggml/src/ggml-amx/
-cp -rpv ../ggml/src/ggml-cann/*      ./ggml/src/ggml-cann/
-cp -rpv ../ggml/src/ggml-cuda/*      ./ggml/src/ggml-cuda/
-cp -rpv ../ggml/src/ggml-sycl/*      ./ggml/src/ggml-sycl/
-cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
+cp -rpv ../ggml/src/ggml*.c        ./ggml/src/
+cp -rpv ../ggml/src/ggml*.cpp      ./ggml/src/
+cp -rpv ../ggml/src/ggml*.h        ./ggml/src/
+cp -rpv ../ggml/src/ggml-amx/*     ./ggml/src/ggml-amx/
+cp -rpv ../ggml/src/ggml-blas/*    ./ggml/src/ggml-blas/
+cp -rpv ../ggml/src/ggml-cann/*    ./ggml/src/ggml-cann/
+cp -rpv ../ggml/src/ggml-cpu/*     ./ggml/src/ggml-cpu/
+cp -rpv ../ggml/src/ggml-cuda/*    ./ggml/src/ggml-cuda/
+cp -rpv ../ggml/src/ggml-hip/*     ./ggml/src/ggml-hip/
+cp -rpv ../ggml/src/ggml-kompute/* ./ggml/src/ggml-kompute/
+cp -rpv ../ggml/src/ggml-metal/*   ./ggml/src/ggml-metal/
+cp -rpv ../ggml/src/ggml-musa/*    ./ggml/src/ggml-musa/
+cp -rpv ../ggml/src/ggml-rpc/*     ./ggml/src/ggml-rpc/
+cp -rpv ../ggml/src/ggml-sycl/*    ./ggml/src/ggml-sycl/
+cp -rpv ../ggml/src/ggml-vulkan/*  ./ggml/src/ggml-vulkan/
 
 cp -rpv ../ggml/include/ggml*.h ./ggml/include/
 
 cp -rpv ../ggml/tests/test-opt.cpp           ./tests/test-opt.cpp
-cp -rpv ../ggml/tests/test-grad0.cpp         ./tests/test-grad0.cpp
 cp -rpv ../ggml/tests/test-quantize-fns.cpp  ./tests/test-quantize-fns.cpp
 cp -rpv ../ggml/tests/test-quantize-perf.cpp ./tests/test-quantize-perf.cpp
 cp -rpv ../ggml/tests/test-backend-ops.cpp   ./tests/test-backend-ops.cpp

From 8a43e940ab0daaff198809bf9277289994ec62f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 16 Nov 2024 22:17:59 +0200
Subject: [PATCH 237/396] ggml: new optimization interface (ggml/988)

---
 ggml/CMakeLists.txt                  |    1 +
 ggml/include/ggml-backend.h          |   26 +-
 ggml/include/ggml-opt.h              |  216 ++++
 ggml/include/ggml.h                  |  218 +---
 ggml/src/CMakeLists.txt              |    2 +
 ggml/src/ggml-alloc.c                |   14 +-
 ggml/src/ggml-backend.cpp            |    2 +-
 ggml/src/ggml-cpu/ggml-cpu.c         |   41 +-
 ggml/src/ggml-cuda/opt-step-adamw.cu |   66 +-
 ggml/src/ggml-impl.h                 |   17 +-
 ggml/src/ggml-metal/ggml-metal.m     |    8 +-
 ggml/src/ggml-opt.cpp                |  867 +++++++++++++
 ggml/src/ggml.c                      | 1750 +++++++++-----------------
 tests/test-backend-ops.cpp           |   79 +-
 tests/test-opt.cpp                   |  989 +++++++++++++--
 15 files changed, 2663 insertions(+), 1633 deletions(-)
 create mode 100644 ggml/include/ggml-opt.h
 create mode 100644 ggml/src/ggml-opt.cpp

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4fb78e59f..fd9499826 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -228,6 +228,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cuda.h
     include/ggml-kompute.h
+    include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 0a65dbfca..cef164764 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -86,7 +86,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
-    // "offset" refers to the offset of the tensor data for setting/getting data
+    // "offset" refers to the offset in tensor->data for setting/getting data
     GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
@@ -242,14 +242,20 @@ extern "C" {
         ggml_backend_sched_reserve(sched, reserve_graph);
 
         // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
 
         // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
     }
     */
 
@@ -264,7 +270,7 @@ extern "C" {
     //
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
 
@@ -289,7 +295,9 @@ extern "C" {
     GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
 
-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h
new file mode 100644
index 000000000..eb5eab9de
--- /dev/null
+++ b/ggml/include/ggml-opt.h
@@ -0,0 +1,216 @@
+// This file contains functionality for training models using GGML.
+// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
+// At the bottom of this file especially there are relatively high-level functions that are suitable use or adaptation in user code.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdint.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    struct ggml_opt_dataset;
+    struct ggml_opt_context;
+    struct ggml_opt_result;
+
+    typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
+    typedef struct ggml_opt_context * ggml_opt_context_t;
+    typedef struct ggml_opt_result  * ggml_opt_result_t;
+
+    // ====== Loss ======
+
+    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
+    enum ggml_opt_loss_type {
+        GGML_OPT_LOSS_TYPE_MEAN,
+        GGML_OPT_LOSS_TYPE_SUM,
+        GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
+        GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+    };
+
+    // ====== Dataset ======
+
+    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
+            int64_t ne_datapoint, // number of elements per datapoint
+            int64_t ne_label,     // number of elements per label
+            int64_t ndata,        // total number of datapoints/labels
+            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
+
+    // get underlying tensors that store the data
+    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
+    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label,     ndata]
+
+    // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
+    GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
+
+    // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
+    GGML_API void ggml_opt_dataset_get_batch(
+            ggml_opt_dataset_t   dataset,
+            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
+            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
+            int64_t              ibatch);
+
+    // ====== Model / Context ======
+
+    enum ggml_opt_build_type {
+        GGML_OPT_BUILD_TYPE_FORWARD,
+        GGML_OPT_BUILD_TYPE_GRAD,
+        GGML_OPT_BUILD_TYPE_OPT,
+    };
+
+    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
+    struct ggml_opt_optimizer_params {
+        // AdamW optimizer parameters
+        struct {
+            float alpha; // learning rate
+            float beta1;
+            float beta2;
+            float eps;   // epsilon for numerical stability
+            float wd;    // weight decay for AdamW, use 0.0f to disable
+        } adamw;
+    };
+
+    // callback to calculate optimizer parameters prior to a backward pass
+    // userdata can be used to pass arbitrary data
+    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
+
+    // returns the default optimizer params (constant)
+    // userdata is not used
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
+
+    // parameters for initializing a new optimization context
+    struct ggml_opt_params {
+        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
+
+        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
+
+        // the forward graph is defined by inputs and outputs
+        // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
+        struct ggml_tensor * inputs;
+        struct ggml_tensor * outputs;
+
+        enum ggml_opt_loss_type  loss_type;
+        enum ggml_opt_build_type build_type;
+
+        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+    };
+
+    // get parameters for an optimization context with defaults set where possible
+    // parameters for which no sensible defaults exist are supplied as arguments to this function
+    GGML_API ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t      backend_sched,
+            struct ggml_context     * ctx_compute,
+            struct ggml_tensor      * inputs,
+            struct ggml_tensor      * outputs,
+            enum ggml_opt_loss_type   loss_type);
+
+    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
+    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
+
+    // set gradients to zero, initilize loss, and optionally reset the optimizer
+    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
+
+    // get underlying tensors that store data
+    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
+    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
+    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
+    GGML_API struct ggml_tensor * ggml_opt_loss(    ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
+    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
+    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
+
+    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
+
+    // ====== Optimization Result ======
+
+    GGML_API ggml_opt_result_t ggml_opt_result_init();
+    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
+    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
+
+    // get data from result, uncertainties are optional and can be ignored by passing NULL
+    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                  // writes 1 value, number of datapoints
+    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double  * loss,     double * unc); // writes 1 value
+    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                   // writes ndata values
+    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double  * accuracy, double * unc); // writes 1 value
+
+    // ====== Computation ======
+
+    // do forward pass, increment result if not NULL
+    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // do forward pass, increment result if not NULL, do backward pass
+    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // ############################################################################
+    // ## The high-level functions start here. They do not depend on any private ##
+    // ## functions or structs and can be copied to and adapted for user code.   ##
+    // ############################################################################
+
+    // ====== Intended Usage ======
+    //
+    // 1. Select the appropriate loss for your problem.
+    // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
+    //    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
+    // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
+    //    The first context should contain the model parameters and inputs and be allocated statically in user code.
+    //    The second context should contain all other tensors and will be (re)allocated automatically.
+    //    Due to this automated allocation the data of the second context is not defined when accessed in user code.
+    //    Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
+    // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
+
+    // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
+    typedef void (*ggml_opt_epoch_callback)(
+            bool               train,       // true after training evaluation, false after validation evaluation
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,      // result associated with the dataset subsection
+            int64_t            ibatch,      // number of batches that have been evaluated so far
+            int64_t            ibatch_max,  // total number of batches in this dataset subsection
+            int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started
+
+    // do training on front of dataset, do evaluation only on back of dataset
+    GGML_API void ggml_opt_epoch(
+            ggml_opt_context_t      opt_ctx,
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,   // result to increment during training, ignored if NULL
+            ggml_opt_result_t       result_eval,    // result to increment during evaluation, ignored if NULL
+            int64_t                 idata_split,    // data index at which to split training and evaluation
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    // callback that prints a progress bar on stderr
+    GGML_API void ggml_opt_epoch_callback_progress_bar(
+            bool               train,
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,
+            int64_t            ibatch,
+            int64_t            ibatch_max,
+            int64_t            t_start_us);
+
+    // fit model defined by inputs and outputs to dataset
+    GGML_API void ggml_opt_fit(
+            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
+            ggml_context                  * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
+            ggml_tensor                   * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
+            ggml_tensor                   * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
+            enum ggml_opt_loss_type         loss_type,      // loss to minimize
+            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
+            int64_t                         nepoch,         // how many times the dataset should be iterated over
+            int64_t                         nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
+            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
+            bool                            silent);        // whether or not info prints to stderr should be suppressed
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3b3f6798a..69e6a2434 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -602,7 +602,6 @@ extern "C" {
 
         int32_t flags;
 
-        struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
 
         // source tensor and offset for views
@@ -615,7 +614,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        // char padding[4];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1985,28 +1984,20 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * grad,
-            float                 alpha,
-            float                 beta1,
-            float                 beta2,
-            float                 eps,
-            float                 wd); // weight decay
+            struct ggml_tensor  * m,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * adamw_params); // parameters such a the learning rate
 
     //
     // automatic differentiation
     //
 
-    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
-
-    GGML_API void ggml_build_opt_adamw(
-            struct ggml_context * ctx,
-            struct ggml_cgraph  * gf,
-            struct ggml_cgraph  * gb,
-            float                 alpha,
-            float                 beta1,
-            float                 beta2,
-            float                 eps,
-            float                 wd); // weight decay
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_backward_expand(
+        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
+        struct ggml_context * ctx_compute, // context for gradient computation
+        struct ggml_cgraph  * cgraph,
+        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
 
     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
@@ -2026,7 +2017,9 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor  (const struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
 
     GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
     GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@@ -2037,198 +2030,15 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
-    // gb_tmp will contain original backward graph with rewritten backward process nodes,
-    // but without the second forward pass nodes.
-    GGML_API void ggml_build_backward_gradient_checkpointing(
-            struct ggml_context   * ctx,
-            struct ggml_cgraph    * gf,
-            struct ggml_cgraph    * gb,
-            struct ggml_cgraph    * gb_tmp,
-            struct ggml_tensor  * * checkpoints,
-            int                     n_checkpoints);
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_TYPE_ADAM,
-        GGML_OPT_TYPE_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_RESULT_OK = 0,
-        GGML_OPT_RESULT_DID_NOT_CONVERGE,
-        GGML_OPT_RESULT_NO_CONTEXT,
-        GGML_OPT_RESULT_INVALID_WOLFE,
-        GGML_OPT_RESULT_FAIL,
-        GGML_OPT_RESULT_CANCEL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
     GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
 
-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
-
-        size_t graph_size;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        int n_gradient_accumulation;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float sched; // schedule multiplier (fixed, decay or warmup)
-            float decay; // weight decay for AdamW, use 0.0f to disable
-            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float eps_f; // epsilon for convergence test
-            float eps_g; // epsilon for convergence test
-            float gclip; // gradient clipping
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    struct ggml_opt_context {
-        struct ggml_context * ctx;
-        struct ggml_opt_params params;
-
-        int iter;
-        int64_t nx; // number of parameter elements
-
-        bool just_initialized;
-
-        float loss_before;
-        float loss_after;
-
-        struct {
-            struct ggml_tensor * g;  // current gradient
-            struct ggml_tensor * m;  // first moment
-            struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * pf; // past function values
-            float fx_best;
-            float fx_prev;
-            int n_no_improvement;
-        } adam;
-
-        struct {
-            struct ggml_tensor * x;    // current parameters
-            struct ggml_tensor * xp;   // previous parameters
-            struct ggml_tensor * g;    // current gradient
-            struct ggml_tensor * gp;   // previous gradient
-            struct ggml_tensor * d;    // search direction
-            struct ggml_tensor * pf;   // past function values
-            struct ggml_tensor * lmal; // the L-BFGS memory alpha
-            struct ggml_tensor * lmys; // the L-BFGS memory ys
-            struct ggml_tensor * lms;  // the L-BFGS memory s
-            struct ggml_tensor * lmy;  // the L-BFGS memory y
-            float fx_best;
-            float step;
-            int j;
-            int k;
-            int end;
-            int n_no_improvement;
-        } lbfgs;
-    };
-
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    // initialize optimizer context
-    GGML_API void ggml_opt_init(
-            struct ggml_context     * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_opt_params    params,
-            int64_t                   nx);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume_g(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f,
-            struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb,
-            ggml_opt_callback callback,
-            void * callback_data);
-
     //
     // quantization
     //
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 71934c679..ae7d3abc8 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -207,9 +207,11 @@ add_library(ggml-base
             ../include/ggml-alloc.h
             ../include/ggml-backend.h
             ../include/ggml-cpp.h
+            ../include/ggml-opt.h
             ggml.c
             ggml-alloc.c
             ggml-backend.cpp
+            ggml-opt.cpp
             ggml-threading.cpp
             ggml-threading.h
             ggml-quants.c
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 041de9e3e..2b2240be8 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -466,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     return ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
-static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
-    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    hn->buffer_id = buffer_id;
-    hn->offset = offset;
-    hn->allocated = true;
-}
-
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0);
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 
     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -816,7 +810,11 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    size_t node_size = 0;
+    if (!node->data && !node->view_src) {
+        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    }
     return talloc->size_max >= node_size;
 }
 
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index e48877ba8..634fe38ee 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -279,7 +279,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
-GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 61f53cd01..df6487929 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -12216,11 +12216,16 @@ static void ggml_compute_forward_opt_step_adamw_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
 
-    const struct ggml_tensor * src0        = dst->src[0];
-    const struct ggml_tensor * src0_grad   = dst->src[1];
-    const struct ggml_tensor * src0_grad_m = dst->src[2];
-    const struct ggml_tensor * src0_grad_v = dst->src[3];
+    const struct ggml_tensor * src0         = dst->src[0];
+    const struct ggml_tensor * src0_grad    = dst->src[1];
+    const struct ggml_tensor * src0_grad_m  = dst->src[2];
+    const struct ggml_tensor * src0_grad_v  = dst->src[3];
+    const struct ggml_tensor * adamw_params = dst->src[4];
+
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -12237,16 +12242,14 @@ static void ggml_compute_forward_opt_step_adamw_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    /* const float   gnorm = 1.0f; */
-    int64_t       iter;   memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
-    const float   alpha = ggml_get_op_params_f32(dst, 2);
-    const float   beta1 = ggml_get_op_params_f32(dst, 3);
-    const float   beta2 = ggml_get_op_params_f32(dst, 4);
-    const float   eps   = ggml_get_op_params_f32(dst, 5);
-    const float   wd    = ggml_get_op_params_f32(dst, 6);
-
-    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
-    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
+    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
+    const float alpha  = adamw_params_ptr[0];
+    const float beta1  = adamw_params_ptr[1];
+    const float beta2  = adamw_params_ptr[2];
+    const float eps    = adamw_params_ptr[3];
+    const float wd     = adamw_params_ptr[4];
+    const float beta1h = adamw_params_ptr[5];
+    const float beta2h = adamw_params_ptr[6];
 
     for (int ir = ir0; ir < ir1; ++ir) {
         const int64_t i03 = ir/(ne02*ne01);
@@ -12270,17 +12273,9 @@ static void ggml_compute_forward_opt_step_adamw_f32(
             // The weight decay is applied independently of the Adam momenta m and v.
             // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
             // See: https://arxiv.org/pdf/1711.05101v3.pdf
-            w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
+            w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
         }
     }
-
-    ggml_barrier(params->threadpool);
-    if (ith != 0) {
-        return;
-    }
-
-    iter++;
-    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
 }
 
 static void ggml_compute_forward_opt_step_adamw(
diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu
index d6f13a9c6..35154f299 100644
--- a/ggml/src/ggml-cuda/opt-step-adamw.cu
+++ b/ggml/src/ggml-cuda/opt-step-adamw.cu
@@ -1,11 +1,11 @@
+#include "ggml-impl.h"
 #include "opt-step-adamw.cuh"
 
 #include <cstdint>
 
 static __global__ void opt_step_adamw_f32(
-    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
-    const float beta1h, const float beta2h) {
+    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v,
+    const float * __restrict__ pars, const int64_t k) {
 
     const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -13,6 +13,14 @@ static __global__ void opt_step_adamw_f32(
         return;
     }
 
+    const float alpha  = pars[0];
+    const float beta1  = pars[1];
+    const float beta2  = pars[2];
+    const float eps    = pars[3];
+    const float wd     = pars[4];
+    const float beta1h = pars[5];
+    const float beta2h = pars[6];
+
     const float gi = g[i];
     const float gmi = g_m[i]*beta1 +    gi*(1.0f - beta1);
     const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
@@ -23,58 +31,48 @@ static __global__ void opt_step_adamw_f32(
     const float mh =       gmi*beta1h;
     const float vh = sqrtf(gvi*beta2h) + eps;
 
-    x[i] = x[i]*(1.0f - alpha*wd) - mh/vh;
+    x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh;
 }
 
 static void opt_step_adamw_f32_cuda(
-    float * x, const float * g, float * g_m, float * g_v, const int64_t k,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
-    const float beta1h, const float beta2h, cudaStream_t stream) {
+    float * x, const float * g, float * g_m, float * g_v, const float * pars, const int64_t k, cudaStream_t stream) {
 
     const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
     const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
-    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h);
+    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, pars, k);
 }
 
 void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0        = dst->src[0];
-    const ggml_tensor * src0_grad   = dst->src[1];
-    const ggml_tensor * src0_grad_m = dst->src[2];
-    const ggml_tensor * src0_grad_v = dst->src[3];
+    const ggml_tensor * src0         = dst->src[0];
+    const ggml_tensor * src0_grad    = dst->src[1];
+    const ggml_tensor * src0_grad_m  = dst->src[2];
+    const ggml_tensor * src0_grad_v  = dst->src[3];
+    const ggml_tensor * adamw_params = dst->src[4];
 
-    GGML_ASSERT(src0->type        == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad->type   == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type         == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad->type    == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_m->type  == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_v->type  == GGML_TYPE_F32);
+    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(src0_grad));
     GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
     GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
+    GGML_ASSERT(ggml_is_contiguous(adamw_params));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
 
-    float       * src0_d        = (float       *) src0->data;
-    const float * src0_grad_d   = (const float *) src0_grad->data;
-    float       * src0_grad_m_d = (float       *) src0_grad_m->data;
-    float       * src0_grad_v_d = (float       *) src0_grad_v->data;
+    float       * src0_d         = (float       *) src0->data;
+    const float * src0_grad_d    = (const float *) src0_grad->data;
+    float       * src0_grad_m_d  = (float       *) src0_grad_m->data;
+    float       * src0_grad_v_d  = (float       *) src0_grad_v->data;
+    const float * adamw_params_d = (const float *) adamw_params->data;
 
     cudaStream_t stream = ctx.stream();
 
     const int64_t ne = ggml_nelements(src0);
 
-    int64_t iter;  memcpy(&iter,  &dst->op_params[0], sizeof(int64_t));
-    float   alpha; memcpy(&alpha, &dst->op_params[2], sizeof(float));
-    float   beta1; memcpy(&beta1, &dst->op_params[3], sizeof(float));
-    float   beta2; memcpy(&beta2, &dst->op_params[4], sizeof(float));
-    float   eps;   memcpy(&eps,   &dst->op_params[5], sizeof(float));
-    float   wd;    memcpy(&wd,    &dst->op_params[6], sizeof(float));
-
-    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
-    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
-
-    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream);
-
-    iter++;
-    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
+    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, adamw_params_d, ne, stream);
 }
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index aa4d2b85d..92a64fe5a 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -196,7 +196,7 @@ void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
 static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
 
 // returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
-static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key);
 
 // returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
@@ -210,7 +210,7 @@ static inline size_t ggml_hash(const struct ggml_tensor * p) {
     return (size_t)(uintptr_t)p >> 4;
 }
 
-static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key) {
     size_t h = ggml_hash(key) % hash_set->size;
 
     // linear probing
@@ -281,13 +281,14 @@ enum ggml_cgraph_eval_order {
 };
 
 struct ggml_cgraph {
-    int size;
-    int n_nodes;
-    int n_leafs;
+    int size;    // maximum number of nodes/leafs/grads/grad_accs
+    int n_nodes; // number of nodes currently in use
+    int n_leafs; // number of leafs currently in use
 
-    struct ggml_tensor ** nodes;
-    struct ggml_tensor ** grads;
-    struct ggml_tensor ** leafs;
+    struct ggml_tensor ** nodes;     // tensors with data that can change if the graph is evaluated
+    struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
+    struct ggml_tensor ** grad_accs; // accumulators for node gradients
+    struct ggml_tensor ** leafs;     // tensors with constant data
 
     struct ggml_hash_set visited_hash_set;
 
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index b4b5cfd26..95b21fbf9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -3639,6 +3639,12 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     return ctx->all_data;
 }
 
+static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    UNUSED(buffer);
+}
+
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
@@ -3671,7 +3677,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
-    /* .memset_tensor   = */ NULL,
+    /* .memset_tensor   = */ ggml_backend_metal_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_metal_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_metal_buffer_cpy_tensor,
diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
new file mode 100644
index 000000000..808aa0d02
--- /dev/null
+++ b/ggml/src/ggml-opt.cpp
@@ -0,0 +1,867 @@
+#include "ggml-opt.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <inttypes.h>
+#include <map>
+#include <random>
+#include <vector>
+
+struct ggml_opt_dataset {
+    struct ggml_context   * ctx;
+    ggml_backend_buffer_t   buf;
+    struct ggml_tensor    * data;
+    struct ggml_tensor    * labels;
+
+    int64_t ndata;
+    int64_t ndata_shard;
+    size_t  nbs_data;
+    size_t  nbs_labels;
+
+    std::vector<int64_t> permutation;
+};
+
+struct ggml_opt_context {
+    ggml_backend_sched_t    backend_sched;
+    ggml_cgraph           * allocated_graph;
+    ggml_cgraph           * allocated_graph_copy;
+    struct ggml_context   * ctx_static;
+    struct ggml_context   * ctx_static_cpu;
+    struct ggml_context   * ctx_compute;
+    struct ggml_context   * ctx_copy;
+    ggml_backend_buffer_t   buf_static;
+    ggml_backend_buffer_t   buf_static_cpu;
+    std::mt19937            rng;
+
+    struct ggml_tensor * inputs;
+    struct ggml_tensor * outputs;
+    struct ggml_tensor * labels;
+
+    struct ggml_tensor * loss;
+    struct ggml_tensor * pred;
+    struct ggml_tensor * ncorrect;
+
+    struct ggml_cgraph * gf;
+    struct ggml_cgraph * gb_grad;
+    struct ggml_cgraph * gb_opt;
+
+    int64_t iter;
+    int32_t opt_period;
+    int32_t opt_i;
+    bool    loss_per_datapoint;
+
+    ggml_opt_get_optimizer_params get_opt_pars;
+    void * get_opt_pars_ud;
+    struct ggml_tensor * adamw_params;
+};
+
+struct ggml_opt_result {
+    int64_t              ndata    = 0;
+    std::vector<float>   loss;
+    std::vector<int32_t> pred;
+    int64_t              ncorrect = 0;
+
+    bool loss_per_datapoint = false;
+    int64_t opt_period = -1;
+};
+
+// ====== Dataset ======
+
+ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
+    GGML_ASSERT(ne_datapoint >  0);
+    GGML_ASSERT(ne_label     >= 0);
+    GGML_ASSERT(ndata        >  0);
+    GGML_ASSERT(ndata_shard  >  0);
+
+    ggml_opt_dataset_t result = new ggml_opt_dataset;
+    result->ndata       = ndata;
+    result->ndata_shard = ndata_shard;
+
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ 2*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx = ggml_init(params);
+    }
+
+    result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
+    result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
+
+    if (ne_label > 0) {
+        result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
+        result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
+    } else {
+        result->labels = nullptr;
+        result->nbs_labels = 0;
+    }
+
+    result->buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_cpu_buffer_type());
+
+    const int64_t nshards = ndata/ndata_shard;
+    result->permutation.resize(nshards);
+    for (int64_t i = 0; i < nshards; ++i) {
+        result->permutation[i] = i;
+    }
+    return result;
+}
+
+void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
+    ggml_backend_buffer_free(dataset->buf);
+    ggml_free(dataset->ctx);
+    delete dataset;
+}
+
+struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
+    return dataset->data;
+}
+
+struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset) {
+    return dataset->labels;
+}
+
+void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata) {
+    GGML_ASSERT(idata <= dataset->ndata);
+
+    if (idata < 0) {
+        std::shuffle(dataset->permutation.begin(), dataset->permutation.end(), opt_ctx->rng);
+        return;
+    }
+
+    GGML_ASSERT(idata % dataset->ndata_shard == 0);
+    const int64_t ishard_max = idata / dataset->ndata_shard;
+    std::shuffle(dataset->permutation.begin(), dataset->permutation.begin() + ishard_max, opt_ctx->rng);
+}
+
+void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * data_batch, struct ggml_tensor * labels_batch, int64_t ibatch) {
+    GGML_ASSERT(   data_batch && ggml_is_contiguous(data_batch));
+    GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
+    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+
+    const size_t nb_data_batch = ggml_nbytes(data_batch);
+    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
+    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
+
+    if (labels_batch) {
+        const size_t nb_labels_batch = ggml_nbytes(labels_batch);
+        GGML_ASSERT(nb_labels_batch == shards_per_batch*dataset->nbs_labels);
+    }
+
+    GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
+
+    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
+        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
+
+        const char * ptr_data = (const char *) dataset->data->data + ishard*dataset->nbs_data;
+        ggml_backend_tensor_set(data_batch, ptr_data, ishard_batch*dataset->nbs_data, dataset->nbs_data);
+
+        if (!labels_batch) {
+            continue;
+        }
+
+        const char * ptr_labels = (const char *) dataset->labels->data + ishard*dataset->nbs_labels;
+        ggml_backend_tensor_set(labels_batch, ptr_labels, ishard_batch*dataset->nbs_labels, dataset->nbs_labels);
+    }
+}
+
+// ====== Model / Context ======
+
+struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
+    GGML_UNUSED(userdata);
+
+    ggml_opt_optimizer_params result;
+
+    result.adamw.alpha = 0.001f;
+    result.adamw.beta1 = 0.9f;
+    result.adamw.beta2 = 0.999f;
+    result.adamw.eps   = 1e-8f;
+    result.adamw.wd    = 0.0f;
+
+    return result;
+}
+
+struct ggml_opt_params ggml_opt_default_params(
+        ggml_backend_sched_t backend_sched,
+        struct ggml_context * ctx_compute,
+        struct ggml_tensor * inputs,
+        struct ggml_tensor * outputs,
+        enum ggml_opt_loss_type loss_type) {
+    return {
+        /*backend_sched   =*/ backend_sched,
+        /*ctx_compute     =*/ ctx_compute,
+        /*inputs          =*/ inputs,
+        /*logits          =*/ outputs,
+        /*loss_type       =*/ loss_type,
+        /*build_type      =*/ GGML_OPT_BUILD_TYPE_OPT,
+        /*opt_period      =*/ 1,
+        /*get_opt_pars    =*/ ggml_opt_get_default_optimizer_params,
+        /*get_opt_pars_ud =*/ nullptr,
+    };
+}
+
+static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
+    if (!tensor) {
+        return nullptr;
+    }
+
+    if (tensor_map.find(tensor) != tensor_map.end()) {
+        return tensor_map[tensor];
+    }
+
+    ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+    tensor_map[tensor] = new_tensor;
+
+    new_tensor->op = tensor->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        new_tensor->nb[i] = tensor->nb[i];
+    }
+    new_tensor->flags = tensor->flags;
+    memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
+    strcpy(new_tensor->name, tensor->name);
+    new_tensor->data = tensor->data;
+    new_tensor->buffer = tensor->buffer;
+    new_tensor->extra = tensor->extra;
+    new_tensor->view_offs = tensor->view_offs;
+    new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
+    }
+
+    return new_tensor;
+}
+
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) {
+    std::map<ggml_tensor *, ggml_tensor *> tensor_map;
+
+    ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
+
+    for (int i = 0; i < graph->n_leafs; i++) {
+        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i]));
+    }
+    for (int i = 0; i < graph->n_nodes; i++) {
+        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i]));
+    }
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]);
+        graph->grads[igrad_dst]     = new_graph->grads[igrad_src];
+        graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src];
+    }
+
+    return new_graph;
+}
+
+static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
+    GGML_ASSERT(graph);
+    if (opt_ctx->allocated_graph == graph) {
+        return;
+    }
+
+    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+
+    {
+        ggml_init_params params = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_free(opt_ctx->ctx_copy);
+        opt_ctx->ctx_copy = ggml_init(params);
+    }
+
+    opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+
+    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->allocated_graph = graph;
+}
+
+ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
+    ggml_opt_context_t result = new struct ggml_opt_context;
+    result->backend_sched        = params.backend_sched;
+    result->allocated_graph      = nullptr;
+    result->allocated_graph_copy = nullptr;
+    result->ctx_compute          = params.ctx_compute;
+    result->ctx_copy             = nullptr;
+    result->inputs               = params.inputs;
+    result->outputs              = params.outputs;
+    result->iter                 = 1;
+    result->opt_period           = params.opt_period;
+    result->opt_i                = 0;
+    result->get_opt_pars         = params.get_opt_pars;
+    result->get_opt_pars_ud      = params.get_opt_pars_ud;
+
+    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
+    GGML_ASSERT(result->opt_period >= 1);
+
+    const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
+        (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
+
+    ggml_set_input(result->inputs);
+    ggml_set_output(result->outputs);
+
+    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
+    ggml_build_forward_expand(result->gf, result->outputs);
+
+    int n_param = 0;
+    for (int i = 0; i < result->gf->n_nodes; ++i) {
+        if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
+            n_param++;
+        }
+    }
+
+    {
+        // The static context is used for:
+        //   - gradients (1 tensor per param if using gradient accumulation)
+        //   - optimizer momenta (2 tensors per param)
+        //   - labels
+        //   - loss + its gradient (up to 5 tensors)
+        //   - pred
+        //   - ncorrect (2 tensors).
+        const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+        const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ size_meta,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx_static = ggml_init(params);
+    }
+    {
+        // The static cpu context is used for:
+        //   - optimizer parameters (1 for the entire context)
+        const size_t size_meta = 1 * ggml_tensor_overhead();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ size_meta,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx_static_cpu = ggml_init(params);
+    }
+
+
+    switch (params.loss_type) {
+        case GGML_OPT_LOSS_TYPE_MEAN: {
+            result->labels = nullptr;
+            result->loss = ggml_sum(result->ctx_static, result->outputs);
+            ggml_set_name(result->loss, "loss_sum");
+            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
+            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
+            ggml_set_name(result->loss, "loss_mean");
+            result->loss_per_datapoint = true;
+            break;
+        }
+        case GGML_OPT_LOSS_TYPE_SUM: {
+            result->labels = nullptr;
+            result->loss = ggml_sum(result->ctx_static, result->outputs);
+            ggml_set_name(result->loss, "loss_sum");
+            result->loss_per_datapoint = false;
+            break;
+        }
+        case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
+            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
+            ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
+            result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
+            ggml_set_name(result->loss, "loss_cross_entropy");
+            if (result->opt_period > 1) {
+                result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
+                ggml_set_name(result->loss, "loss_cross_entropy_scaled");
+            }
+            result->loss_per_datapoint = true;
+            break;
+        }
+        case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
+            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
+            ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
+            result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
+            ggml_set_name(result->loss, "loss_error");
+            result->loss = ggml_sqr(result->ctx_static, result->loss);
+            ggml_set_name(result->loss, "loss_squared_error");
+            result->loss = ggml_sum(result->ctx_static, result->loss);
+            ggml_set_name(result->loss, "loss_sum_squared_error");
+            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
+            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
+            ggml_set_name(result->loss, "loss_mean_squared_error");
+            result->loss_per_datapoint = true;
+            break;
+        }
+    }
+    ggml_set_output(result->loss);
+    ggml_set_loss(result->loss);
+    ggml_build_forward_expand(result->gf, result->loss);
+
+    result->pred = ggml_argmax(result->ctx_static, result->outputs);
+    ggml_set_name(result->pred, "pred");
+    ggml_set_output(result->pred);
+    ggml_build_forward_expand(result->gf, result->pred);
+
+    if (result->labels) {
+        result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
+        ggml_set_name(result->ncorrect, "ncorrect");
+        ggml_set_output(result->ncorrect);
+        ggml_build_forward_expand(result->gf, result->ncorrect);
+    } else {
+        result->ncorrect = nullptr;
+    }
+
+    if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
+        result->gb_grad = nullptr;
+        result->gb_opt  = nullptr;
+
+        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+        result->buf_static_cpu = nullptr;
+
+        ggml_opt_alloc_graph(result, result->gf);
+
+        return result;
+    }
+
+    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
+    result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
+    ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
+
+    if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
+        result->gb_opt  = nullptr;
+
+        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+        result->buf_static_cpu = nullptr;
+
+        ggml_opt_alloc_graph(result, result->gb_grad);
+        ggml_graph_reset(result->gb_grad);
+
+        return result;
+    }
+
+    GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
+
+    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
+    result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
+
+    result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
+    ggml_set_input(result->adamw_params);
+    ggml_set_name(result->adamw_params, "adamw_params");
+
+    for (int i = result->gf->n_nodes-1; i >= 0; --i) {
+        struct ggml_tensor * node = result->gb_opt->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
+
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
+            struct ggml_tensor * m        = ggml_dup_tensor(result->ctx_static, node);
+            struct ggml_tensor * v        = ggml_dup_tensor(result->ctx_static, node);
+            struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
+            ggml_build_forward_expand(result->gb_opt, opt_step);
+        }
+    }
+
+    result->buf_static = ggml_backend_alloc_ctx_tensors(
+        result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+
+    result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
+
+    ggml_opt_alloc_graph(result, result->gb_opt);
+    ggml_graph_reset(result->gb_opt);
+
+    return result;
+}
+
+void ggml_opt_free(ggml_opt_context_t opt_ctx) {
+    if (opt_ctx == nullptr) {
+        return;
+    }
+    ggml_backend_buffer_free(opt_ctx->buf_static);
+    ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
+    ggml_free(opt_ctx->ctx_static);
+    ggml_free(opt_ctx->ctx_static_cpu);
+    delete opt_ctx;
+}
+
+void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) {
+    if (optimizer) {
+        ggml_graph_reset(opt_ctx->gb_opt);
+        opt_ctx->iter = 1;
+    } else {
+        ggml_graph_reset(opt_ctx->gb_grad);
+    }
+}
+
+struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->inputs;
+}
+
+struct ggml_tensor * ggml_opt_outputs(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->outputs;
+}
+
+struct ggml_tensor * ggml_opt_labels(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->labels;
+}
+
+struct ggml_tensor * ggml_opt_loss(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->loss;
+}
+
+struct ggml_tensor * ggml_opt_pred(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->pred;
+}
+
+struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->ncorrect;
+}
+
+struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node) {
+    return ggml_graph_get_grad_acc(opt_ctx->gb_opt, node);
+}
+
+// ====== Optimization Result ======
+
+ggml_opt_result_t ggml_opt_result_init() {
+    return new ggml_opt_result;
+}
+
+void ggml_opt_result_free(ggml_opt_result_t result) {
+    delete result;
+}
+
+void ggml_opt_result_reset(ggml_opt_result_t result) {
+    result->ndata = 0;
+    result->loss.clear();
+    result->pred.clear();
+    result->ncorrect = 0;
+}
+
+void ggml_opt_result_ndata(ggml_opt_result_t result, int64_t * ndata) {
+    *ndata = result->ndata;
+}
+
+void ggml_opt_result_loss(ggml_opt_result_t result, double * loss, double * unc) {
+    const int64_t nbatches = result->loss.size(); // Number of physical batches.
+
+    if (nbatches == 0) {
+        *loss = 0.0;
+        *unc  = NAN;
+        return;
+    }
+
+    double sum         = 0.0;
+    double sum_squared = 0.0;
+
+    for (const float & loss : result->loss) {
+        // If the loss is per datapoint it was scaled by 1.0f/opt_period for each physical batch.
+        const float loss_scaled = result->loss_per_datapoint ? loss*result->opt_period : loss;
+        sum         += loss_scaled;
+        sum_squared += loss_scaled*loss_scaled;
+    }
+
+    const double mean = sum/nbatches;
+    *loss = result->loss_per_datapoint ? mean : sum;
+
+    if (!unc) {
+        return;
+    }
+
+    if (nbatches < 2) {
+        *unc = NAN;
+        return;
+    }
+
+    const double var_sum = sum_squared/nbatches - mean*mean; // variance without Bessel's correction, i.e. nbatches/(nbatches-1)
+    *unc = result->loss_per_datapoint ? sqrt(var_sum / (nbatches - 1)) : sqrt(var_sum * nbatches/(nbatches - 1));
+}
+
+void ggml_opt_result_pred(ggml_opt_result_t result, int32_t * pred) {
+    for (size_t i = 0; i < result->pred.size(); ++i) {
+        pred[i] = result->pred[i];
+    }
+}
+
+void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc) {
+    *accuracy = result->ncorrect >= 0 ? double(result->ncorrect) / double(result->ndata) : NAN;
+
+    if (!unc) {
+        return;
+    }
+
+    *unc = result->ncorrect >= 0 && result->ndata >= 2 ?
+        sqrt((*accuracy) * (1.0 - (*accuracy)) / double(result->ndata - 1)) : NAN;
+}
+
+// ====== Computation ======
+
+static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
+    if (graph != opt_ctx->gf) {
+        struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
+
+        GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
+        GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
+        GGML_ASSERT(opt_pars.adamw.eps   >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.wd    >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.wd    <= 1.0f);
+
+        // beta1, beta2 after applying warmup
+        const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
+        const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
+
+        float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
+        adamw_par_data[0] = opt_pars.adamw.alpha;
+        adamw_par_data[1] = opt_pars.adamw.beta1;
+        adamw_par_data[2] = opt_pars.adamw.beta2;
+        adamw_par_data[3] = opt_pars.adamw.eps;
+        adamw_par_data[4] = opt_pars.adamw.wd;
+        adamw_par_data[5] = beta1h;
+        adamw_par_data[6] = beta2h;
+    }
+
+    ggml_opt_alloc_graph(opt_ctx, graph);
+    ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
+
+    if (!result) {
+        return;
+    }
+
+    if (result->ndata == 0) {
+        result->loss_per_datapoint = opt_ctx->loss_per_datapoint;
+        result->opt_period         = opt_ctx->opt_period;
+    } else {
+        GGML_ASSERT(result->loss_per_datapoint == opt_ctx->loss_per_datapoint);
+        GGML_ASSERT(result->opt_period         == opt_ctx->opt_period);
+    }
+
+    const int64_t ndata = opt_ctx->outputs->ne[1];
+    GGML_ASSERT(result->ndata == ndata*int64_t(result->loss.size()) && "varying batch size not supported");
+    result->ndata += ndata;
+
+    GGML_ASSERT(ggml_is_scalar(opt_ctx->loss));
+    GGML_ASSERT(opt_ctx->loss->type == GGML_TYPE_F32);
+    float loss;
+    ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
+    result->loss.push_back(loss);
+
+    GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
+    std::vector<int32_t> pred(ndata);
+    ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
+    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+
+    if (!opt_ctx->labels || result->ncorrect < 0) {
+        result->ncorrect = -1;
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_scalar(opt_ctx->ncorrect));
+    GGML_ASSERT(opt_ctx->ncorrect->type == GGML_TYPE_I64);
+    int64_t ncorrect;
+    ggml_backend_tensor_get(opt_ctx->ncorrect, &ncorrect, 0, ggml_nbytes(opt_ctx->ncorrect));
+    result->ncorrect += ncorrect;
+}
+
+void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
+    ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
+}
+
+void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
+    if (opt_ctx->opt_period == 1) {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
+        return;
+    }
+
+    const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+    if (opt_i_next == 0) {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
+        ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
+    } else {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
+    }
+    opt_ctx->opt_i = opt_i_next;
+}
+
+// ====== High-Level Functions ======
+
+void ggml_opt_epoch(
+        ggml_opt_context_t      opt_ctx,
+        ggml_opt_dataset_t      dataset,
+        ggml_opt_result_t       result_train,
+        ggml_opt_result_t       result_eval,
+        int64_t                 idata_split,
+        ggml_opt_epoch_callback callback_train,
+        ggml_opt_epoch_callback callback_eval) {
+    struct ggml_tensor * inputs = ggml_opt_inputs(opt_ctx);
+    struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
+    struct ggml_tensor * data   = ggml_opt_dataset_data(dataset);
+    GGML_ASSERT(data->ne[0] == inputs->ne[0]);
+
+    const int64_t ndata       =   data->ne[1];
+    const int64_t ndata_batch = inputs->ne[1];
+
+    GGML_ASSERT(data->ne[1] % inputs->ne[1] == 0);
+    const int64_t nbatches = ndata/ndata_batch;
+
+    idata_split = idata_split < 0 ? ndata : idata_split;
+    GGML_ASSERT(idata_split % ndata_batch == 0);
+    const int64_t ibatch_split = idata_split / ndata_batch;
+
+    int64_t ibatch = 0;
+    int64_t t_loop_start = ggml_time_us();
+    for (; ibatch < ibatch_split; ++ibatch) {
+        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
+        ggml_opt_forward_backward(opt_ctx, result_train);
+        if (callback_train) {
+            callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
+        }
+    }
+    t_loop_start = ggml_time_us();
+    for (; ibatch < nbatches; ++ibatch) {
+        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
+        ggml_opt_forward(opt_ctx, result_eval);
+        if (callback_eval) {
+            callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
+        }
+    }
+}
+
+void ggml_opt_epoch_callback_progress_bar(
+        bool               train,
+        ggml_opt_context_t opt_ctx,
+        ggml_opt_dataset_t dataset,
+        ggml_opt_result_t  result,
+        int64_t            ibatch,
+        int64_t            ibatch_max,
+        int64_t            t_start_us) {
+    fprintf(stderr, "%s[", train ? "train: " : "val:   ");
+
+    constexpr int64_t bar_length = 25;
+    for (int64_t j = 0; j < bar_length; ++j) {
+        const int64_t ibatch_j = ibatch_max * j/bar_length;
+        if (ibatch_j < ibatch) {
+            fprintf(stderr, "=");
+        } else if (ibatch_max * (j - 1)/bar_length < ibatch) {
+            fprintf(stderr, ">");
+        } else {
+            fprintf(stderr, " ");
+        }
+    }
+
+    const int64_t batch_size = ggml_opt_inputs(opt_ctx)->ne[1];
+    const int64_t idata      = ibatch*batch_size;
+    const int64_t idata_max  = ibatch_max*batch_size;
+
+    double loss;
+    double loss_unc;
+    ggml_opt_result_loss(result, &loss, &loss_unc);
+
+    double accuracy;
+    double accuracy_unc;
+    ggml_opt_result_accuracy(result, &accuracy, &accuracy_unc);
+
+    const int64_t t_ibatch_us = ggml_time_us() - t_start_us;
+    int64_t t_ibatch_s = t_ibatch_us / 1000000;
+    const int64_t t_ibatch_h = t_ibatch_s / 3600;
+    t_ibatch_s -= t_ibatch_h * 3600;
+    const int64_t t_ibatch_m = t_ibatch_s / 60;
+    t_ibatch_s -= t_ibatch_m * 60;
+
+    const int64_t t_eta_us = t_ibatch_us * (ibatch_max - ibatch)/ibatch;
+    int64_t t_eta_s = t_eta_us / 1000000;
+    const int64_t t_eta_h = t_eta_s / 3600;
+    t_eta_s -= t_eta_h * 3600;
+    const int64_t t_eta_m = t_eta_s / 60;
+    t_eta_s -= t_eta_m * 60;
+
+    fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
+            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
+            idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
+            t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
+    if (ibatch == ibatch_max) {
+        fprintf(stderr, "\n");
+    }
+    fflush(stderr);
+
+    GGML_UNUSED(dataset);
+}
+
+void ggml_opt_fit(
+        ggml_backend_sched_t            backend_sched,
+        ggml_context                  * ctx_compute,
+        ggml_tensor                   * inputs,
+        ggml_tensor                   * outputs,
+        ggml_opt_dataset_t              dataset,
+        enum ggml_opt_loss_type         loss_type,
+        ggml_opt_get_optimizer_params   get_opt_pars,
+        int64_t                         nepoch,
+        int64_t                         nbatch_logical,
+        float                           val_split,
+        bool                            silent) {
+    ggml_time_init();
+    const int64_t t_start_us = ggml_time_us();
+
+    const int64_t ndata           = ggml_opt_dataset_data(dataset)->ne[1];
+    const int64_t nbatch_physical = inputs->ne[1];
+    GGML_ASSERT(ndata          % nbatch_logical  == 0);
+    GGML_ASSERT(nbatch_logical % nbatch_physical == 0);
+
+    const int64_t opt_period       = nbatch_logical / nbatch_physical;
+    const int64_t nbatches_logical = ndata / nbatch_logical;
+
+    GGML_ASSERT(val_split >= 0.0f);
+    GGML_ASSERT(val_split <  1.0f);
+    const int64_t ibatch_split = int64_t(((1.0f - val_split) * nbatches_logical)) * opt_period; // train <-> val split index (physical)
+    const int64_t idata_split  = ibatch_split * nbatch_physical;
+
+    int64_t epoch = 1;
+
+    ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    params.opt_period      = opt_period;
+    params.get_opt_pars    = get_opt_pars;
+    params.get_opt_pars_ud = &epoch;
+    ggml_opt_context_t opt_ctx = ggml_opt_init(params);
+
+    // Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
+    if (nbatch_logical < ndata) {
+        ggml_opt_dataset_shuffle(opt_ctx, dataset, -1); // Shuffle all data (train + validation).
+    }
+
+    ggml_opt_result_t result_train = ggml_opt_result_init();
+    ggml_opt_result_t result_val   = ggml_opt_result_init();
+
+    ggml_opt_epoch_callback epoch_callback = silent ? nullptr : ggml_opt_epoch_callback_progress_bar;
+
+    for (; epoch <= nepoch; ++epoch) {
+        if (nbatch_logical < idata_split) {
+            ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split);
+        }
+
+        ggml_opt_result_reset(result_train);
+        ggml_opt_result_reset(result_val);
+
+        if (!silent) {
+            fprintf(stderr, "%s: epoch %04" PRId64 "/%04" PRId64 ":\n", __func__, epoch, nepoch);
+        }
+        ggml_opt_epoch(opt_ctx, dataset, result_train, result_val, idata_split, epoch_callback, epoch_callback);
+        if (!silent) {
+            fprintf(stderr, "\n");
+        }
+    }
+
+    if (!silent) {
+        int64_t t_total_s = (ggml_time_us() - t_start_us) / 1000000;
+        const int64_t t_total_h = t_total_s / 3600;
+        t_total_s -= t_total_h * 3600;
+        const int64_t t_total_m = t_total_s / 60;
+        t_total_s -= t_total_m * 60;
+        fprintf(stderr, "%s: training took %02" PRId64 ":%02" PRId64 ":%02" PRId64 "\n", __func__, t_total_h, t_total_m, t_total_s);
+    }
+
+    ggml_opt_free(opt_ctx);
+    ggml_opt_result_free(result_train);
+    ggml_opt_result_free(result_val);
+}
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 5cdf59f25..4a478fcaa 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1592,14 +1592,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.op           =*/ GGML_OP_NONE,
         /*.op_params    =*/ { 0 },
         /*.flags        =*/ 0,
-        /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
         /*.view_src     =*/ view_src,
         /*.view_offs    =*/ view_offs,
         /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        ///*.padding      =*/ { 0 },
+        /*.padding      =*/ { 0 },
     };
 
 #ifdef __clang__
@@ -4194,8 +4193,6 @@ struct ggml_tensor * ggml_flash_attn_ext(
         GGML_ASSERT(mask);
     }
 
-    bool is_node = false;
-
     // permute(0, 2, 1, 3)
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
@@ -4203,8 +4200,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     float params[] = { scale, max_bias, logit_softcap };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op   = GGML_OP_FLASH_ATTN_EXT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_FLASH_ATTN_EXT;
     result->src[0] = q;
     result->src[1] = k;
     result->src[2] = v;
@@ -4272,14 +4268,6 @@ struct ggml_tensor * ggml_flash_attn_back(
 
     GGML_ASSERT(ne2 % kvne2 == 0);
 
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        // when using this operation (in backwards pass) these grads are set.
-        // we don't want to create (big) grad of our result, so is_node is false.
-        is_node = false;
-    }
-
     // store gradients of q, k and v as continuous tensors concatenated in result.
     // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
     const int64_t elem_q = ggml_nelements(q);
@@ -4302,8 +4290,7 @@ struct ggml_tensor * ggml_flash_attn_back(
     int32_t masked_i = masked ? 1 : 0;
     ggml_set_op_params(result, &masked_i, sizeof(masked_i));
 
-    result->op   = GGML_OP_FLASH_ATTN_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->op     = GGML_OP_FLASH_ATTN_BACK;
     result->src[0] = q;
     result->src[1] = k;
     result->src[2] = v;
@@ -4945,34 +4932,24 @@ struct ggml_tensor * ggml_opt_step_adamw(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * grad,
-        float                 alpha,
-        float                 beta1,
-        float                 beta2,
-        float                 eps,
-        float                 wd) {
+        struct ggml_tensor  * m,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * adamw_params) {
     GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
     GGML_ASSERT(ggml_are_same_shape(a, grad));
-    GGML_ASSERT(alpha >  0.0f);
-    GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f);
-    GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f);
-    GGML_ASSERT(eps   >= 0.0f);
-    GGML_ASSERT(wd    >= 0.0f && wd    <= 1.0f);
+    GGML_ASSERT(ggml_are_same_shape(a, m));
+    GGML_ASSERT(ggml_are_same_shape(a, v));
+    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
-    const int64_t iter = 1;
-    memcpy(&result->op_params[0], &iter, sizeof(int64_t));
-    ggml_set_op_params_f32(result, 2, alpha);
-    ggml_set_op_params_f32(result, 3, beta1);
-    ggml_set_op_params_f32(result, 4, beta2);
-    ggml_set_op_params_f32(result, 5, eps);
-    ggml_set_op_params_f32(result, 6, wd);
-
     result->op     = GGML_OP_OPT_STEP_ADAMW;
     result->src[0] = a;
     result->src[1] = grad;
-    result->src[2] = ggml_dup_tensor(ctx, grad);
-    result->src[3] = ggml_dup_tensor(ctx, grad);
+    result->src[2] = m;
+    result->src[3] = v;
+    result->src[4] = adamw_params;
 
     return result;
 }
@@ -5041,1112 +5018,514 @@ static void ggml_hash_map_free(struct hash_map * map) {
     GGML_FREE(map);
 }
 
-// gradient checkpointing
-
-static struct ggml_tensor * ggml_recompute_graph_node(
-        struct ggml_context * ctx,
-        struct ggml_cgraph  * graph,
-        struct hash_map     * replacements,
-        struct ggml_tensor  * node) {
-
-    if (node == NULL) {
-        return NULL;
-    }
-
-    if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-        return node;
-    }
-
-    if (!ggml_hash_contains(&graph->visited_hash_set, node)) {
-        return node;
-    }
-
-    int count_children = 0;
-    for (int k = 0; k < GGML_MAX_SRC; ++k) {
-        if (node->src[k]) {
-            ++count_children;
-        }
-    }
-
-    if (count_children == 0) {
-        return node;
-    }
-
-    size_t i = ggml_hash_find(&replacements->set, node);
-    GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full
-    if (replacements->set.keys[i] == node) {
-        return replacements->vals[i];
-    }
-
-    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
-
-    // insert clone into replacements
-    GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
-    replacements->set.keys[i] = node;
-    replacements->vals[i] = clone;
-
-    clone->op       = node->op;
-    clone->grad     = node->grad;
-    clone->flags    = node->flags;
-    clone->extra    = node->extra;
-    for (int k = 0; k < GGML_MAX_DIMS; ++k) {
-        clone->nb[k] = node->nb[k];
-    }
-    for (int k = 0; k < GGML_MAX_SRC; ++k) {
-        clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]);
-    }
-    if (node->view_src != NULL) {
-        clone->data = (node->view_src->data == NULL)
-                        ? NULL // view_src not yet allocated
-                        : (char *) node->view_src->data // view_src already allocated
-                                 + node->view_offs;
-        clone->view_src  = node->view_src;
-        clone->view_offs = node->view_offs;
-    }
-
-    GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t)));
-    GGML_ASSERT(sizeof(node->name)      == GGML_MAX_NAME);
-    memcpy(clone->op_params, node->op_params, sizeof(node->op_params));
-    ggml_format_name(clone, "%s (clone)", ggml_get_name(node));
-
-    return clone;
-}
-
-void ggml_build_backward_gradient_checkpointing(
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * checkpoints,
-        int                     n_checkpoints) {
-    ggml_graph_cpy(gf, gb_tmp);
-    ggml_build_backward_expand(ctx, gf, gb_tmp, false);
-
-    if (n_checkpoints <= 0) {
-        ggml_graph_cpy(gb_tmp, gb);
-        return;
-    }
-
-    struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
-
-    // insert checkpoints in replacements
-    for (int i = 0; i < n_checkpoints; ++i) {
-        size_t k = ggml_hash_find(&replacements->set, checkpoints[i]);
-        GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full
-        GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
-        replacements->set.keys[k] = checkpoints[i];
-        replacements->vals[k]     = checkpoints[i];
-    }
-
-    ggml_graph_cpy(gf, gb);
-    // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
-    // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
-    // by recomputing them from checkpoints
-    for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) {
-        struct ggml_tensor * node = gb_tmp->nodes[i];
-        for (int k = 0; k < GGML_MAX_SRC; ++k) {
-            // insert new tensors recomputing src, reusing already made replacements,
-            // remember replacements: remember new tensors with mapping from corresponding gf nodes
-            // recurse for input tensors,
-            // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
-            node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
-        }
-        // insert rewritten backward node with replacements made into resulting backward graph gb
-        ggml_build_forward_expand(gb, node);
-    }
-
-    ggml_hash_map_free(replacements);
-}
-
 // utility functions to change gradients
 // if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
 // else if a is in zero_table, replace a
 // else, just add/subtract/etc. the gradients
 
-static struct ggml_tensor * ggml_add_or_set(
-        struct ggml_context  * ctx,
-        struct ggml_tensor   * a,
-        struct ggml_tensor   * b,
-        struct ggml_hash_set * zero_table,
-        struct ggml_hash_set * acc_table) {
-    if (ggml_hash_contains(acc_table, a)) {
-        struct ggml_tensor * ret = ggml_add_impl(ctx, a, b, true);
-        const size_t insert_result = ggml_hash_insert(acc_table, ret);
-        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
-        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
-        return ret;
+static void ggml_add_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * tensor) {
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+    } else {
+        cgraph->grads[isrc] = tensor;
     }
-    if (ggml_hash_contains(zero_table, a)) {
-        return b;
-    }
-    return ggml_add_impl(ctx, a, b, false);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
-static struct ggml_tensor * ggml_acc_or_set(
-        struct ggml_context  * ctx,
-        struct ggml_tensor   * a,
-        struct ggml_tensor   * b,
-        const  size_t          nb1,
-        const  size_t          nb2,
-        const  size_t          nb3,
-        const  size_t          offset,
-        struct ggml_hash_set * zero_table,
-        struct ggml_hash_set * acc_table) {
-    if (ggml_hash_contains(acc_table, a)) {
-        struct ggml_tensor * ret = ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
-        const size_t insert_result = ggml_hash_insert(acc_table, ret);
-        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
-        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
-        return ret;
+static void ggml_acc_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * src,
+        struct ggml_tensor  * tensor,
+        const  size_t         nb1,
+        const  size_t         nb2,
+        const  size_t         nb3,
+        const  size_t         offset) {
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
+    } else {
+        struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
+        cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
     }
-    if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
-        return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
-    }
-    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
-static struct ggml_tensor * ggml_add1_or_set(
-        struct ggml_context  * ctx,
-        struct ggml_tensor   * a,
-        struct ggml_tensor   * b,
-        struct ggml_hash_set * zero_table,
-        struct ggml_hash_set * acc_table) {
-    if (ggml_hash_contains(acc_table, a)) {
-        struct ggml_tensor * ret = ggml_add1_impl(ctx, a, b, true);
-        const size_t insert_result = ggml_hash_insert(acc_table, ret);
-        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
-        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
-        return ret;
+static void ggml_add1_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * src,
+        struct ggml_tensor  * tensor) {
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+    } else {
+        cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
     }
-    if (ggml_hash_contains(zero_table, a)) {
-        return ggml_repeat(ctx, b, a);
-    }
-    return ggml_add1_impl(ctx, a, b, false);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
-static struct ggml_tensor * ggml_sub_or_set(
-        struct ggml_context  * ctx,
-        struct ggml_tensor   * a,
-        struct ggml_tensor   * b,
-        struct ggml_hash_set * zero_table,
-        struct ggml_hash_set * acc_table) {
-    if (ggml_hash_contains(acc_table, a)) {
-        struct ggml_tensor * ret = ggml_sub_impl(ctx, a, b, true);
-        const size_t insert_result = ggml_hash_insert(acc_table, ret);
-        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
-        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
-        return ret;
+static void ggml_sub_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * tensor) {
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+    } else {
+        cgraph->grads[isrc] = ggml_neg(ctx, tensor);
     }
-    if (ggml_hash_contains(zero_table, a)) {
-        return ggml_neg(ctx, b);
-    }
-    return ggml_sub_impl(ctx, a, b, false);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table, struct ggml_hash_set * acc_table) {
+static void ggml_compute_backward(
+        struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) {
+    struct ggml_tensor * tensor = cgraph->nodes[i];
+    struct ggml_tensor * grad   = ggml_graph_get_grad(cgraph, tensor);
+
+    if (!grad) {
+        return;
+    }
+
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
+    struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
+    const size_t isrc0 = ggml_hash_find(hash_set, src0);
+    const size_t isrc1 = ggml_hash_find(hash_set, src1);
+    const size_t isrc2 = ggml_hash_find(hash_set, src2);
+    const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
 
     switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_ADD:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    if (ggml_are_same_shape(src0, src1)) {
-                        src1->grad = ggml_add_or_set(ctx, src1->grad,                       tensor->grad,        zero_table, acc_table);
-                    } else {
-                        src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_repeat_back(ctx, tensor->grad, src1), zero_table, acc_table);
-                    }
-                }
-            } break;
-        case GGML_OP_ADD1:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_add_or_set(ctx,
-                        src1->grad,
-                        ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean
-                        zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_ACC:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    const size_t nb1     = ((int32_t *) tensor->op_params)[0];
-                    const size_t nb2     = ((int32_t *) tensor->op_params)[1];
-                    const size_t nb3     = ((int32_t *) tensor->op_params)[2];
-                    const size_t offset  = ((int32_t *) tensor->op_params)[3];
-
-                    struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad,
-                        src1->grad->ne[0],
-                        src1->grad->ne[1],
-                        src1->grad->ne[2],
-                        src1->grad->ne[3],
-                        nb1, nb2, nb3, offset);
-
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_reshape(ctx,
-                                ggml_cont(ctx, tensor_grad_view),
-                                src1->grad),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SUB:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_MUL:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_mul(ctx, src1, tensor->grad),
-                                zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                                src1->grad,
-                                ggml_mul(ctx, src0, tensor->grad),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_DIV:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_div(ctx, tensor->grad, src1),
-                                zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_sub_or_set(ctx,
-                                src1->grad,
-                                ggml_mul(ctx,
-                                    tensor->grad,
-                                    ggml_div(ctx, tensor, src1)),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SQR:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_scale(ctx,
-                                    ggml_mul(ctx, src0, tensor->grad),
-                                    2.0f),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SQRT:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_scale(ctx,
-                                    ggml_div(ctx,
-                                        tensor->grad,
-                                        tensor),
-                                    0.5f),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_LOG:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_div(ctx,
-                                    tensor->grad,
-                                    src0),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SIN:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_mul(ctx,
-                                    tensor->grad,
-                                    ggml_cos(ctx, src0)),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_COS:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_sub_or_set(ctx,
-                                src0->grad,
-                                ggml_mul(ctx,
-                                    tensor->grad,
-                                    ggml_sin(ctx, src0)),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SUM:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add1_or_set(ctx,
-                                src0->grad,
-                                tensor->grad,
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SUM_ROWS:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_repeat(ctx,
-                                    tensor->grad,
-                                    src0->grad),
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-        case GGML_OP_COUNT_EQUAL:
-            {
-                GGML_ABORT("fatal error"); // TODO: implement
+        case GGML_OP_DUP: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_REPEAT:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_repeat_back(ctx, tensor->grad, src0->grad),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_REPEAT_BACK:
-            {
-                if (src0->grad) {
-                    // TODO: test this
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_repeat(ctx, tensor->grad, src0->grad),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                GGML_ABORT("fatal error"); // TODO: implement
+        } break;
+        case GGML_OP_ADD: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_SILU_BACK:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+            if (src1_needs_grads) {
+                struct ggml_tensor * tmp = grad;
+                if (!ggml_are_same_shape(src0, src1)) {
+                    tmp = ggml_repeat_back(ctx, tmp, src1);
+                }
+                ggml_add_or_set(ctx, cgraph, isrc1, tmp);
             }
-        case GGML_OP_NORM:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_ADD1: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_RMS_NORM:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    float eps;
-                    memcpy(&eps, tensor->op_params, sizeof(float));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_RMS_NORM_BACK:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+            if (src1_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_mean(ctx, grad)); // TODO: should probably be sum instead of mean
             }
-        case GGML_OP_GROUP_NORM:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_ACC: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_MUL_MAT:
-            {
-                // https://cs231n.github.io/optimization-2/#staged
-                // # forward pass
-                // s0 = np.random.randn(5, 10)
-                // s1 = np.random.randn(10, 3)
-                // t = s0.dot(s1)
+            if (src1_needs_grads) {
+                const size_t nb1    = ((int32_t *) tensor->op_params)[0];
+                const size_t nb2    = ((int32_t *) tensor->op_params)[1];
+                const size_t nb3    = ((int32_t *) tensor->op_params)[2];
+                const size_t offset = ((int32_t *) tensor->op_params)[3];
 
-                // # now suppose we had the gradient on t from above in the circuit
-                // dt = np.random.randn(*t.shape) # same shape as t
-                // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
-                // ds1 = t.T.dot(dt)
+                struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
+                    grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                    nb1, nb2, nb3, offset);
 
-                // tensor.shape [m,p,qq,rr]
-                // src0.shape   [n,m,q1,r1]
-                // src1.shape   [n,p,qq,rr]
-
-                // necessary for llama
-                if (src0->grad) {
-                    struct ggml_tensor * s1_tg =
-                        ggml_out_prod(ctx, // [n,m,qq,rr]
-                            src1,          // [n,p,qq,rr]
-                            tensor->grad); // [m,p,qq,rr]
-                    const int64_t qq = s1_tg->ne[2];
-                    const int64_t rr = s1_tg->ne[3];
-                    const int64_t q1 = src0->ne[2];
-                    const int64_t r1 = src0->ne[3];
-                    const bool ne2_broadcasted = qq > q1;
-                    const bool ne3_broadcasted = rr > r1;
-                    if (ne2_broadcasted || ne3_broadcasted) {
-                        // sum broadcast repetitions of s1_tg into shape of src0
-                        s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
-                    }
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad, // [n,m,q1,r1]
-                                s1_tg,      // [n,m,q1,r1]
-                                zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                                src1->grad,                            // [n,p,qq,rr]
-                                // ggml_mul_mat(ctx,                   // [n,p,qq,rr]
-                                //     ggml_cont(ctx,                  // [m,n,q1,r1]
-                                //         ggml_transpose(ctx, src0)), // [m,n,q1,r1]
-                                //     tensor->grad),                  // [m,p,qq,rr]
-
-                                // // when src0 is bigger than tensor->grad (this is mostly the case in llama),
-                                // // avoid transpose of src0, rather transpose smaller tensor->grad
-                                // // and then use ggml_out_prod
-                                ggml_out_prod(ctx,                  // [n,p,qq,rr]
-                                    src0,                           // [n,m,q1,r1]
-                                    ggml_transpose(ctx,             // [p,m,qq,rr]
-                                        tensor->grad)),             // [m,p,qq,rr]
-                                zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
             }
-        case GGML_OP_OUT_PROD:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_SUB: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_SCALE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    float s;
-                    memcpy(&s, tensor->op_params, sizeof(float));
-
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, s, false),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SET:
-            {
-                const size_t nb1     = ((int32_t *) tensor->op_params)[0];
-                const size_t nb2     = ((int32_t *) tensor->op_params)[1];
-                const size_t nb3     = ((int32_t *) tensor->op_params)[2];
-                const size_t offset  = ((int32_t *) tensor->op_params)[3];
-
-                struct ggml_tensor * tensor_grad_view = NULL;
-
-                if (src0->grad || src1->grad) {
-                    GGML_ASSERT(src0->type == tensor->type);
-                    GGML_ASSERT(tensor->grad->type == tensor->type);
-                    GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
-
-                    tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
-                        nb1, nb2, nb3, offset);
-                }
-
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                        src0->grad,
-                        ggml_acc_impl(ctx,
-                            tensor->grad,
-                            ggml_neg(ctx, tensor_grad_view),
-                            nb1, nb2, nb3, offset, false),
-                        zero_table, acc_table);
-                }
-
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_reshape(ctx,
-                                ggml_cont(ctx, tensor_grad_view),
-                                src1->grad),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                // necessary for llama
-                // cpy overwrites value of src1 by src0 and returns view(src1)
-                // the overwriting is mathematically equivalent to:
-                // tensor = src0 * 1 + src1 * 0
-                if (src0->grad) {
-                    // dsrc0 = dtensor * 1
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    // dsrc1 = dtensor * 0 -> noop
-                }
-            } break;
-        case GGML_OP_CONT:
-            {
-                // same as cpy
-                if (src0->grad) {
-                    GGML_ASSERT(ggml_is_contiguous(src0->grad));
-                    GGML_ASSERT(ggml_is_contiguous(tensor->grad));
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_reshape(ctx,
-                                ggml_is_contiguous(tensor->grad)
-                                    ? tensor->grad
-                                    : ggml_cont(ctx, tensor->grad),
-                                src0->grad),
-                        zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_VIEW:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    size_t offset;
-
-                    memcpy(&offset, tensor->op_params, sizeof(offset));
-
-                    size_t nb1 = tensor->nb[1];
-                    size_t nb2 = tensor->nb[2];
-                    size_t nb3 = tensor->nb[3];
-
-                    if (src0->type != src0->grad->type) {
-                        // gradient is typically F32, but src0 could be other type
-                        size_t ng = ggml_element_size(src0->grad);
-                        size_t n0 = ggml_element_size(src0);
-                        GGML_ASSERT(offset % n0 == 0);
-                        GGML_ASSERT(nb1 % n0 == 0);
-                        GGML_ASSERT(nb2 % n0 == 0);
-                        GGML_ASSERT(nb3 % n0 == 0);
-                        offset = (offset / n0) * ng;
-                        nb1 = (nb1 / n0) * ng;
-                        nb2 = (nb2 / n0) * ng;
-                        nb3 = (nb3 / n0) * ng;
-                    }
-
-                    src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    int32_t * axes = (int32_t *) tensor->op_params;
-                    int axis0 = axes[0] & 0x3;
-                    int axis1 = axes[1] & 0x3;
-                    int axis2 = axes[2] & 0x3;
-                    int axis3 = axes[3] & 0x3;
-                    int axes_backward[4] = {0,0,0,0};
-                    axes_backward[axis0] = 0;
-                    axes_backward[axis1] = 1;
-                    axes_backward[axis2] = 2;
-                    axes_backward[axis3] = 3;
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_permute(ctx,
-                                tensor->grad,
-                                axes_backward[0],
-                                axes_backward[1],
-                                axes_backward[2],
-                                axes_backward[3]),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_transpose(ctx, tensor->grad),
-                        zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                // necessary for llama (only for tokenizer)
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            // last ggml_get_rows_back argument src0->grad is only
-                            // necessary to setup correct output shape
-                            ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad),
-                        zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_GET_ROWS_BACK:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+            if (src1_needs_grads) {
+                ggml_sub_or_set(ctx, cgraph, isrc1, grad);
             }
-        case GGML_OP_DIAG:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_MUL: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
             }
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    const int n_past = ((int32_t *) tensor->op_params)[0];
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            /* ggml_diag_mask_inf_impl() shouldn't be here */
-                            /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
-                            ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table, acc_table);
+            if (src1_needs_grads) {
+                struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
+                if (!ggml_are_same_shape(src0, src1)) {
+                    tmp = ggml_repeat_back(ctx, tmp, src1);
                 }
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    const int n_past = ((int32_t *) tensor->op_params)[0];
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_soft_max_back(ctx, tensor->grad, tensor),
-                        zero_table, acc_table);
-                }
-                GGML_ASSERT((!src1 || !src1->grad) && "backward pass for softmax mask not implemented");
-            } break;
-        case GGML_OP_SOFT_MAX_BACK:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+                ggml_add_or_set(ctx, cgraph, isrc1, tmp);
             }
-        case GGML_OP_ROPE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+        } break;
+        case GGML_OP_DIV: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src1));
+            }
+            if (src1_needs_grads) {
+                ggml_sub_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, grad, ggml_div(ctx, tensor, src1)));
+            }
+        } break;
+        case GGML_OP_SQR: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_mul(ctx, src0, grad), 2.0f));
+            }
+        } break;
+        case GGML_OP_SQRT: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_div(ctx, grad, tensor), 0.5f));
+            }
+        } break;
+        case GGML_OP_LOG: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_SIN: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_cos(ctx, src0)));
+            }
+        } break;
+        case GGML_OP_COS: {
+            if (src0_needs_grads) {
+                ggml_sub_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sin(ctx, src0)));
+            }
+        } break;
+        case GGML_OP_SUM: {
+            if (src0_needs_grads) {
+                ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
+            }
+        } break;
+        case GGML_OP_SUM_ROWS: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_MEAN: {
+            if (src0_needs_grads) {
+                ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+            }
+        } break;
+        case GGML_OP_REPEAT: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat_back(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_REPEAT_BACK: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_RMS_NORM: {
+            if (src0_needs_grads) {
+                float eps;
+                memcpy(&eps, tensor->op_params, sizeof(float));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps));
+            }
+        } break;
+        case GGML_OP_MUL_MAT: {
+            // https://cs231n.github.io/optimization-2/#staged
+            // # forward pass
+            // s0 = np.random.randn(5, 10)
+            // s1 = np.random.randn(10, 3)
+            // t = s0.dot(s1)
 
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+            // # now suppose we had the gradient on t from above in the circuit
+            // dt = np.random.randn(*t.shape) # same shape as t
+            // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
+            // ds1 = t.T.dot(dt)
 
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rope_back(ctx,
-                                tensor->grad,
-                                src1,
-                                src2,
-                                n_dims,
-                                mode,
-                                n_ctx_orig,
-                                freq_base,
-                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow),
-                            zero_table, acc_table);
+            // tensor.shape [m,p,qq,rr]
+            // src0.shape   [n,m,q1,r1]
+            // src1.shape   [n,p,qq,rr]
+
+            if (src0_needs_grads) {
+                struct ggml_tensor * s1_tg =
+                    ggml_out_prod(ctx, // [n,m,qq,rr]
+                        src1,          // [n,p,qq,rr]
+                        grad);         // [m,p,qq,rr]
+                const int64_t qq = s1_tg->ne[2];
+                const int64_t rr = s1_tg->ne[3];
+                const int64_t q1 = src0->ne[2];
+                const int64_t r1 = src0->ne[3];
+                const bool ne2_broadcasted = qq > q1;
+                const bool ne3_broadcasted = rr > r1;
+                if (ne2_broadcasted || ne3_broadcasted) {
+                    // sum broadcast repetitions of s1_tg into shape of src0
+                    s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
                 }
-                GGML_ASSERT((!src2 || !src2->grad) && "gradients for freq factors not implemented");
-            } break;
-        case GGML_OP_ROPE_BACK:
-            {
-                if (src0->grad) {
-                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
+            }
+            if (src1_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc1,
+                        // ggml_mul_mat(ctx,                   // [n,p,qq,rr]
+                        //     ggml_cont(ctx,                  // [m,n,q1,r1]
+                        //         ggml_transpose(ctx, src0)), // [m,n,q1,r1]
+                        //     grad),                          // [m,p,qq,rr]
 
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                        // when src0 is bigger than tensor->grad (this is mostly the case in llama),
+                        // avoid transpose of src0, rather transpose smaller tensor->grad
+                        // and then use ggml_out_prod
+                        ggml_out_prod(ctx,      // [n,p,qq,rr]
+                            src0,               // [n,m,q1,r1]
+                            ggml_transpose(ctx, // [p,m,qq,rr]
+                                grad)));        // [m,p,qq,rr]
+            }
+        } break;
+        case GGML_OP_SCALE: {
+            if (src0_needs_grads) {
+                float s;
+                memcpy(&s, tensor->op_params, sizeof(float));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
+            }
+        } break;
+        case GGML_OP_SET: {
+            const size_t nb1    = ((const int32_t *) tensor->op_params)[0];
+            const size_t nb2    = ((const int32_t *) tensor->op_params)[1];
+            const size_t nb3    = ((const int32_t *) tensor->op_params)[2];
+            const size_t offset = ((const int32_t *) tensor->op_params)[3];
 
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rope_impl(ctx,
-                                tensor->grad,
-                                src1,
-                                src2,
-                                n_dims,
-                                mode,
-                                n_ctx_orig,
-                                freq_base,
-                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
-                                false),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
-            }
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
-            }
-        case GGML_OP_IM2COL:
-            {
-                if (src1->grad) {
-                    const int32_t s0    = ggml_get_op_params_i32(tensor, 0);
-                    const int32_t s1    = ggml_get_op_params_i32(tensor, 1);
-                    const int32_t p0    = ggml_get_op_params_i32(tensor, 2);
-                    const int32_t p1    = ggml_get_op_params_i32(tensor, 3);
-                    const int32_t d0    = ggml_get_op_params_i32(tensor, 4);
-                    const int32_t d1    = ggml_get_op_params_i32(tensor, 5);
-                    const bool    is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
+            struct ggml_tensor * tensor_grad_view = NULL;
 
-                    src1->grad = ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_im2col_back(ctx, src0, tensor->grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_IM2COL_BACK:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
-            }
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
-            }
-        case GGML_OP_POOL_1D:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
-            }
-        case GGML_OP_POOL_2D:
-            {
-                if (src0->grad) {
-                    const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, 0);
-                    const      int32_t      k0 = ggml_get_op_params_i32(tensor, 1);
-                    const      int32_t      k1 = ggml_get_op_params_i32(tensor, 2);
-                    const      int32_t      s0 = ggml_get_op_params_i32(tensor, 3);
-                    const      int32_t      s1 = ggml_get_op_params_i32(tensor, 4);
-                    const      int32_t      p0 = ggml_get_op_params_i32(tensor, 5);
-                    const      int32_t      p1 = ggml_get_op_params_i32(tensor, 6);
+            if (src0_needs_grads || src1_needs_grads) {
+                GGML_ASSERT(src0->type == tensor->type);
+                GGML_ASSERT(!cgraph->grads[isrc0] ||                      cgraph->grads[isrc0]->type == grad->type);
+                GGML_ASSERT(!cgraph->grads[isrc1] || !src1_needs_grads || cgraph->grads[isrc1]->type == grad->type);
 
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_pool_2d_back(ctx, tensor->grad, src0, op, k0, k1, s0, s1, p0, p1),
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_POOL_2D_BACK:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+                tensor_grad_view = ggml_view_4d(ctx,
+                    grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                    nb1, nb2, nb3, offset);
             }
-        case GGML_OP_UPSCALE:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+
+            if (src0_needs_grads) {
+                struct ggml_tensor * tmp = ggml_neg(ctx, tensor_grad_view);
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_acc_impl(ctx, grad, tmp, nb1, nb2, nb3, offset, false));
             }
-        case GGML_OP_PAD:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+
+            if (src1_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
             }
-        case GGML_OP_ARANGE:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_CPY: {
+            // cpy overwrites value of src1 by src0 and returns view(src1)
+            // the overwriting is mathematically equivalent to:
+            // tensor = src0 * 1 + src1 * 0
+            if (src0_needs_grads) {
+                // dsrc0 = dtensor * 1
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+            if (src1_needs_grads) {
+                // dsrc1 = dtensor * 0 -> noop
             }
-        case GGML_OP_ARGSORT:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_CONT: {
+            // same as cpy
+            if (src0_needs_grads) {
+                GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
+                GGML_ASSERT(ggml_is_contiguous(grad));
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
             }
-        case GGML_OP_LEAKY_RELU:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_RESHAPE: {
+            if (src0_needs_grads) {
+                struct ggml_tensor * grad_cont = ggml_is_contiguous(grad) ? grad : ggml_cont(ctx, grad);
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad_cont, src0));
             }
-        case GGML_OP_FLASH_ATTN_EXT:
-            {
-                GGML_ABORT("FA backward pass not adapted after rework");
-                struct ggml_tensor * flash_grad = NULL;
-                if (src0->grad || src1->grad || tensor->src[2]->grad) {
-                    int32_t t = ggml_get_op_params_i32(tensor, 0);
-                    GGML_ASSERT(t == 0 || t == 1);
-                    bool masked = t != 0;
-                    flash_grad =
-                        ggml_flash_attn_back(ctx,
-                            src0,
-                            src1,
-                            tensor->src[2],
-                            tensor->grad,
-                            masked);
+        } break;
+        case GGML_OP_VIEW: {
+            if (src0_needs_grads) {
+                size_t offset;
+
+                memcpy(&offset, tensor->op_params, sizeof(offset));
+
+                size_t nb1 = tensor->nb[1];
+                size_t nb2 = tensor->nb[2];
+                size_t nb3 = tensor->nb[3];
+
+                if (cgraph->grads[isrc0] && src0->type != cgraph->grads[isrc0]->type) {
+                    // gradient is typically F32, but src0 could be other type
+                    size_t ng = ggml_element_size(cgraph->grads[isrc0]);
+                    size_t n0 = ggml_element_size(src0);
+                    GGML_ASSERT(offset % n0 == 0);
+                    GGML_ASSERT(nb1 % n0 == 0);
+                    GGML_ASSERT(nb2 % n0 == 0);
+                    GGML_ASSERT(nb3 % n0 == 0);
+                    offset = (offset / n0) * ng;
+                    nb1 = (nb1 / n0) * ng;
+                    nb2 = (nb2 / n0) * ng;
+                    nb3 = (nb3 / n0) * ng;
                 }
 
-                const int64_t elem_q = ggml_nelements(src0);
-                const int64_t elem_k = ggml_nelements(src1);
-                const int64_t elem_v = ggml_nelements(src2);
-
-                enum ggml_type result_type = flash_grad->type;
-                GGML_ASSERT(ggml_blck_size(result_type) == 1);
-                const size_t tsize = ggml_type_size(result_type);
-
-                const size_t offs_q = 0;
-                const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-                const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-
-                if (src0->grad) {
-                    struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q);
-                    struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0);
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            grad_q,
-                            zero_table, acc_table);
-                }
-                if (src1->grad) {
-                    struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k);
-                    struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1);
-                    src1->grad = ggml_add_or_set(ctx,
-                            src1->grad,
-                            grad_k,
-                            zero_table, acc_table);
-                }
-                if (src2->grad) {
-                    struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v);
-                    struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2);
-                    src2->grad = ggml_add_or_set(ctx,
-                            src2->grad,
-                            grad_v,
-                            zero_table, acc_table);
-                }
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                GGML_ABORT("fatal error"); // not supported
+                ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
             }
-        case GGML_OP_SSM_CONV:
-        case GGML_OP_SSM_SCAN:
-            {
-                GGML_ABORT("fatal error"); // TODO: not implemented
+        } break;
+        case GGML_OP_PERMUTE: {
+            if (src0_needs_grads) {
+                const int32_t * axes = (const int32_t *) tensor->op_params;
+                const int axis0 = axes[0] & 0x3;
+                const int axis1 = axes[1] & 0x3;
+                const int axis2 = axes[2] & 0x3;
+                const int axis3 = axes[3] & 0x3;
+                int axb[4] = {0,0,0,0}; // axes backward
+                axb[axis0] = 0;
+                axb[axis1] = 1;
+                axb[axis2] = 2;
+                axb[axis3] = 3;
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_permute(ctx, grad, axb[0], axb[1], axb[2], axb[3]));
             }
+        } break;
+        case GGML_OP_TRANSPOSE: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_transpose(ctx, grad));
+            }
+        } break;
+        case GGML_OP_GET_ROWS: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_get_rows_back(ctx, grad, src1, src0));
+            }
+            if (src1_needs_grads) {
+                // noop
+            }
+        } break;
+        case GGML_OP_DIAG_MASK_INF: {
+            if (src0_needs_grads) {
+                /* ggml_diag_mask_inf_impl() shouldn't be here */
+                /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
+                const int n_past = ((const int32_t *) tensor->op_params)[0];
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
+            }
+        } break;
+        case GGML_OP_DIAG_MASK_ZERO: {
+            if (src0_needs_grads) {
+                const int n_past = ((const int32_t *) tensor->op_params)[0];
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
+            }
+        } break;
+        case GGML_OP_SOFT_MAX: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor));
+            }
+            GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
+        } break;
+        case GGML_OP_ROPE: {
+            if (src0_needs_grads) {
+                //const int n_past = ((int32_t *) tensor->op_params)[0];
+                const int n_dims     = ((const int32_t *) tensor->op_params)[1];
+                const int mode       = ((const int32_t *) tensor->op_params)[2];
+                //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
+                float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+
+                memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
+                memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
+                memcpy(&ext_factor,  (const float *) tensor->op_params +  7, sizeof(float));
+                memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
+                memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
+                memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));
+
+                ggml_add_or_set(ctx, cgraph, isrc0,
+                    ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
+                        freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
+            }
+            GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
+        } break;
+        case GGML_OP_IM2COL: {
+            if (src1_needs_grads) {
+                const int32_t s0    = ggml_get_op_params_i32(tensor, 0);
+                const int32_t s1    = ggml_get_op_params_i32(tensor, 1);
+                const int32_t p0    = ggml_get_op_params_i32(tensor, 2);
+                const int32_t p1    = ggml_get_op_params_i32(tensor, 3);
+                const int32_t d0    = ggml_get_op_params_i32(tensor, 4);
+                const int32_t d1    = ggml_get_op_params_i32(tensor, 5);
+                const bool    is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
+
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
+            }
+        } break;
+        case GGML_OP_POOL_2D: {
+            if (src0_needs_grads) {
+                const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, 0);
+                const      int32_t      k0 = ggml_get_op_params_i32(tensor, 1);
+                const      int32_t      k1 = ggml_get_op_params_i32(tensor, 2);
+                const      int32_t      s0 = ggml_get_op_params_i32(tensor, 3);
+                const      int32_t      s1 = ggml_get_op_params_i32(tensor, 4);
+                const      int32_t      p0 = ggml_get_op_params_i32(tensor, 5);
+                const      int32_t      p1 = ggml_get_op_params_i32(tensor, 6);
+
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_pool_2d_back(ctx, grad, src0, op, k0, k1, s0, s1, p0, p1));
+            }
+        } break;
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
-        case GGML_OP_UNARY:
-            {
-                switch (ggml_get_unary_op(tensor)) {
-                    case GGML_UNARY_OP_ABS:
-                        {
-                            if (src0->grad) {
-                                src0->grad =
-                                    ggml_add_or_set(ctx,
-                                            src0->grad,
-                                            ggml_mul(ctx,
-                                                ggml_sgn(ctx, src0),
-                                                tensor->grad),
-                                            zero_table, acc_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_SGN:
-                        {
-                            if (src0->grad) {
-                                // noop
-                            }
-                        } break;
-                    case GGML_UNARY_OP_NEG:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_STEP:
-                        {
-                            if (src0->grad) {
-                                // noop
-                            }
-                        } break;
-                    case GGML_UNARY_OP_TANH:
-                        {
-                            GGML_ABORT("fatal error"); // TODO: not implemented
-                        }
-                    case GGML_UNARY_OP_ELU:
-                        {
-                            GGML_ABORT("fatal error"); // TODO: not implemented
-                        }
-                    case GGML_UNARY_OP_RELU:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_mul(ctx,
-                                            ggml_step(ctx, src0),
-                                            tensor->grad),
-                                        zero_table, acc_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_SIGMOID:
-                        {
-                            GGML_ABORT("fatal error"); // TODO: not implemented
-                        }
-                    case GGML_UNARY_OP_GELU:
-                        {
-                            GGML_ABORT("fatal error"); // TODO: not implemented
-                        }
-                    case GGML_UNARY_OP_GELU_QUICK:
-                        {
-                            GGML_ABORT("fatal error"); // TODO: not implemented
-                        }
-                    case GGML_UNARY_OP_SILU:
-                        {
-                            // necessary for llama
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_silu_back(ctx, src0, tensor->grad),
-                                        zero_table, acc_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_EXP:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_mul(ctx, tensor, tensor->grad),
-                                        zero_table, acc_table);
-                            }
-                        } break;
-                    default:
-                        GGML_ABORT("fatal error");
-                }
-            } break;
-        case GGML_OP_GET_REL_POS:
-        case GGML_OP_ADD_REL_POS:
-        case GGML_OP_RWKV_WKV6:
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-        case GGML_OP_MAP_CUSTOM1_F32:
-        case GGML_OP_MAP_CUSTOM2_F32:
-        case GGML_OP_MAP_CUSTOM3_F32:
-        case GGML_OP_MAP_CUSTOM1:
-        case GGML_OP_MAP_CUSTOM2:
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                GGML_ABORT("fatal error"); // not supported
+        case GGML_OP_UNARY: {
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_ABS: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_sgn(ctx, src0), grad));
+                    }
+                } break;
+                case GGML_UNARY_OP_SGN: {
+                    // noop
+                } break;
+                case GGML_UNARY_OP_NEG: {
+                    if (src0_needs_grads) {
+                        ggml_sub_or_set(ctx, cgraph, isrc0, grad);
+                    }
+                } break;
+                case GGML_UNARY_OP_STEP: {
+                    // noop
+                } break;
+                case GGML_UNARY_OP_RELU: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_step(ctx, src0), grad));
+                    }
+                } break;
+                case GGML_UNARY_OP_SILU: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad));
+                    }
+                } break;
+                case GGML_UNARY_OP_EXP: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad));
+                    }
+                } break;
+                default: {
+                    fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
+                        __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));
+                    GGML_ABORT("fatal error");
+                } break;
             }
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_cross_entropy_loss_back(ctx,
-                                    src0,
-                                    src1,
-                                    tensor->grad),
-                                zero_table, acc_table);
-                }
-                GGML_ASSERT(!src1->grad && "backward pass for labels not implemented");
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                GGML_ABORT("fatal error"); // not supported
+        } break;
+        case GGML_OP_CROSS_ENTROPY_LOSS: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
             }
-        case GGML_OP_OPT_STEP_ADAMW:
-            {
-                GGML_ABORT("fatal error"); // not supported
-            }
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
+            GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
+        } break;
+        case GGML_OP_NONE: {
+            // noop
+        } break;
         case GGML_OP_COUNT:
-            {
-                GGML_ABORT("fatal error");
-            }
+        default: {
+            fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
+            GGML_ABORT("fatal error");
+        } break;
     }
 
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        if (tensor->src[i] && tensor->src[i]->grad) {
-            GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad));
-        }
-    }
+    GGML_ASSERT(!src0_needs_grads || ggml_are_same_shape(src0, cgraph->grads[isrc0]));
+    GGML_ASSERT(!src1_needs_grads || ggml_are_same_shape(src1, cgraph->grads[isrc1]));
+    GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
 }
 
 static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
-    if (node->grad == NULL) {
-        // this usually happens when we generate intermediate nodes from constants in the backward pass
-        // it can also happen during forward pass, if the user performs computations with constants
-        if (node->op != GGML_OP_NONE) {
-            //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op);
-        }
-    }
-
     // check if already visited
     if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
         return;
@@ -6207,18 +5586,42 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate) {
-    GGML_ASSERT(gf->n_nodes > 0);
-    GGML_ASSERT(gf->grads);
+void ggml_build_backward_expand(
+        struct ggml_context * ctx_static,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * cgraph,
+        bool                  accumulate) {
+    GGML_ASSERT(cgraph->n_nodes > 0);
+    GGML_ASSERT(cgraph->grads);
+    GGML_ASSERT(cgraph->grad_accs);
 
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        struct ggml_tensor * node = gf->nodes[i];
+    const int n_nodes_f = cgraph->n_nodes;
+
+    const size_t hash_size = ggml_hash_size(2*cgraph->size);
+    memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
+    memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
+    bool * grads_needed = calloc(hash_size, sizeof(bool));
+
+    {
+        bool any_params = false;
+        bool any_loss   = false;
+        for (int i = 0; i < n_nodes_f; ++i) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+            any_params = any_params || (node->flags & GGML_TENSOR_FLAG_PARAM);
+            any_loss   = any_loss   || (node->flags & GGML_TENSOR_FLAG_LOSS);
+        }
+        GGML_ASSERT(any_params && "no trainable parameters found, did you forget to call ggml_set_param?");
+        GGML_ASSERT(any_loss && "no training loss found, did you forget to call ggml_set_loss?");
+    }
+
+    for (int i = 0; i < n_nodes_f; ++i) {
+        struct ggml_tensor * node = cgraph->nodes[i];
 
         if (node->type == GGML_TYPE_I32) {
             continue;
         }
 
-        bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
+        bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
         bool ignore_src[GGML_MAX_SRC] = {false};
         switch (node->op) {
             // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -6246,14 +5649,14 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
                 break;
         }
         for (int j = 0; j < GGML_MAX_SRC; ++j) {
-            if (!node->src[j] || !node->src[j]->grad || ignore_src[j]) {
+            if (!node->src[j] || ignore_src[j] || !grads_needed[ggml_hash_find(&cgraph->visited_hash_set, node->src[j])]) {
                 continue;
             }
             GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16);
-            needs_grad = true;
+            node_needs_grad = true;
             break;
         }
-        if (!needs_grad) {
+        if (!node_needs_grad) {
             continue;
         }
 
@@ -6261,73 +5664,21 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
         GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
             node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
 
-        // create a new tensor with the same type and shape as the node and set it as grad
-        node->grad = ggml_dup_tensor(ctx, node);
-    }
-
-    // keep tables of original gradients for replacement/accumulation logic
-    struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
-    struct ggml_hash_set acc_table  = ggml_hash_set_new(gf->size);
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (node->grad) {
-            {
-                const size_t insert_result = ggml_hash_insert(&zero_table, node->grad);
-                GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
-                GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
-            }
-
-            // only gradients of trainable parameters should be accumulated
-            if (accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
-                const size_t insert_result = ggml_hash_insert(&acc_table, node->grad);
-                GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
-                GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
-            }
+        const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+        if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
+            cgraph->grads[igrad]     = ggml_dup_tensor(ctx_static, node);
+            cgraph->grad_accs[igrad] = cgraph->grads[igrad];
         }
+        grads_needed[igrad] = true;
     }
 
-    for (int i = gf->n_nodes - 1; i >= 0; i--) {
-        struct ggml_tensor * node = gf->nodes[i];
-
+    for (int i = n_nodes_f - 1; i >= 0; --i) {
         // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
         // use allocator to automatically make inplace operations
-        if (node->grad) {
-            ggml_compute_backward(ctx, node, &zero_table, &acc_table);
-        }
+        ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
     }
 
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_expand(gb, node->grad);
-        }
-    }
-
-    ggml_hash_set_free(&zero_table);
-    ggml_hash_set_free(&acc_table);
-}
-
-void ggml_build_opt_adamw(
-        struct ggml_context * ctx,
-        struct ggml_cgraph  * gf,
-        struct ggml_cgraph  * gb,
-        float                 alpha,
-        float                 beta1,
-        float                 beta2,
-        float                 eps,
-        float                 wd) {
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, node->grad, alpha, beta1, beta2, eps, wd);
-            ggml_build_forward_expand(gb, opt_step);
-        }
-    }
+    free(grads_needed);
 }
 
 static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
@@ -6345,7 +5696,8 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) {
     incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
     incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
     if (grads) {
-        incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
+        incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
+        incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grad_accs
     }
     incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
 
@@ -6371,10 +5723,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
 
     void * p = cgraph + 1;
 
-    struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
-    struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
-    struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
-    struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+    struct ggml_tensor ** nodes_ptr     =         incr_ptr_aligned(&p, size      * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** leafs_ptr     =         incr_ptr_aligned(&p, size      * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** hash_keys_ptr =         incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** grads_ptr     = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+    struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+
     ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
 
     // check that we allocated the correct amount of memory
@@ -6386,12 +5740,17 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.n_leafs      =*/ 0,
         /*.nodes        =*/ nodes_ptr,
         /*.grads        =*/ grads_ptr,
+        /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);
+    if (grads) {
+        memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
+        memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
+    }
 
     return cgraph;
 }
@@ -6407,6 +5766,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.n_leafs      =*/ 0,
         /*.nodes        =*/ cgraph0->nodes + i0,
         /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
+        /*.grad_accs    =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
         /*.leafs        =*/ NULL,
         /*.hash_table   =*/ { 0, NULL, NULL },
         /*.order        =*/ cgraph0->order,
@@ -6432,19 +5792,23 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
         dst->nodes[i] = src->nodes[i];
     }
 
-    if (src->grads) {
-        GGML_ASSERT(dst->grads != NULL);
-        for (int i = 0; i < src->n_nodes; ++i) {
-            dst->grads[i] = src->grads[i];
-        }
-    }
-
     for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
         // copy all hashset keys (tensors) that are in use
         if (ggml_bitset_get(src->visited_hash_set.used, i)) {
             ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
         }
     }
+
+    if (src->grads) {
+        GGML_ASSERT(dst->grads     != NULL);
+        GGML_ASSERT(dst->grad_accs != NULL);
+        for (int i = 0; i < src->n_nodes; ++i) {
+            const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+            const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+            dst->grads[igrad_dst]     = src->grads[igrad_src];
+            dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
+        }
+    }
 }
 
 struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
@@ -6470,28 +5834,35 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     GGML_ASSERT(cgraph->grads != NULL);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
+        struct ggml_tensor * node     = cgraph->nodes[i];
+        struct ggml_tensor * grad_acc = ggml_graph_get_grad_acc(cgraph, node);
 
-        // initial gradients of loss should be 1, 0 otherwise
-        if (node->grad) {
-            if (node->flags & GGML_TENSOR_FLAG_LOSS) {
-                GGML_ASSERT(node->grad->buffer);
-                GGML_ASSERT(node->type == GGML_TYPE_F32);
-                GGML_ASSERT(ggml_is_scalar(node));
-
-                const float onef = 1.0f;
-                ggml_backend_tensor_set(node->grad, &onef, 0, ggml_nbytes(node->grad));
-            } else {
-                ggml_set_zero(node->grad);
+        if (node->op == GGML_OP_OPT_STEP_ADAMW) {
+            // clear momenta
+            if (node->src[2]->data) {
+                ggml_set_zero(node->src[2]);
+            }
+            if (node->src[3]->data) {
+                ggml_set_zero(node->src[3]);
             }
         }
 
-        GGML_ASSERT(node);
-        if (node->op == GGML_OP_OPT_STEP_ADAMW) {
-            // set iteration to 1 and clear momenta
-            ggml_set_op_params_i32(node, 0, 1);
-            ggml_set_zero(node->src[2]);
-            ggml_set_zero(node->src[3]);
+        // initial gradients of loss should be 1, 0 otherwise
+        if (grad_acc) {
+            if (node->flags & GGML_TENSOR_FLAG_LOSS) {
+                GGML_ASSERT(grad_acc->type == GGML_TYPE_F32);
+                GGML_ASSERT(ggml_is_scalar(grad_acc));
+
+                const float onef = 1.0f;
+                if (grad_acc->buffer) {
+                    ggml_backend_tensor_set(grad_acc, &onef, 0, sizeof(float));
+                } else {
+                    GGML_ASSERT(grad_acc->data);
+                    *((float *) grad_acc->data) = onef;
+                }
+            } else {
+                ggml_set_zero(grad_acc);
+            }
         }
     }
 }
@@ -6530,7 +5901,7 @@ void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tenso
     cgraph->n_nodes++;
 }
 
-struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, const char * name) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * leaf = cgraph->leafs[i];
 
@@ -6550,6 +5921,16 @@ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const ch
     return NULL;
 }
 
+struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+    const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
+}
+
+struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+    const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     GGML_LOG_INFO("=== GRAPH ===\n");
 
@@ -6560,7 +5941,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
+                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" :
+                      ggml_graph_get_grad(cgraph, node) ? "g" : " ");
     }
 
     GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
@@ -6595,8 +5977,9 @@ static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml
 static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * parent = cgraph->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, parent);
 
-        if (parent->grad == node) {
+        if (grad == node) {
             return parent;
         }
     }
@@ -6636,6 +6019,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(gb, node);
 
         if (ggml_graph_get_parent(gb, node) != NULL) {
             continue;
@@ -6643,7 +6027,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
         if (node->flags & GGML_TENSOR_FLAG_PARAM) {
             snprintf(color, sizeof(color), "yellow");
-        } else if (node->grad) {
+        } else if (grad) {
             if (ggml_graph_find(gf, node)) {
                 snprintf(color, sizeof(color), "green");
             } else {
@@ -6670,8 +6054,8 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
         }
 
-        if (node->grad) {
-            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
+        if (grad) {
+            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(grad->op));
         } else {
             fprintf(fp, "\"; ]\n");
         }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6618d03d1..f8a59b6df 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -811,11 +811,11 @@ struct test_case {
 
         ggml_build_forward_expand(gf, out);
         ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, false);
+        ggml_build_backward_expand(ctx, ctx, gb, false);
         if (expect.size() != 1 || expect[0] != 0.0f) {
             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
+                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
             }
         }
 
@@ -862,7 +862,13 @@ struct test_case {
             const char * bn = ggml_backend_name(backend);
             const int64_t ne = ggml_nelements(t);
 
-            std::vector<float> ga = tensor_to_float(t->grad);
+            std::vector<float> ga;
+            struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
+            if (grad) {
+                ga = tensor_to_float(grad);
+            } else {
+                ga.resize(ne); // default value is 0.0f
+            }
 
             for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
                 // check for nans
@@ -2500,6 +2506,35 @@ struct test_sum_rows : public test_case {
     }
 };
 
+// GGML_OP_MEAN
+struct test_mean : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_mean(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_mean(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    float grad_eps() override {
+        return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
+    }
+};
+
 // GGML_OP_UPSCALE
 struct test_upscale : public test_case {
     const ggml_type type;
@@ -2834,24 +2869,14 @@ struct test_cross_entropy_loss : public test_case {
 struct test_opt_step_adamw : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    const float alpha;
-    const float beta1;
-    const float beta2;
-    const float eps;
-    const float wd;
 
     std::string vars() override {
-        return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
+        return VARS_TO_STR2(type, ne);
     }
 
     test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 5, 4, 3},
-            float alpha = 1e-3f,
-            float beta1 = 0.9f,
-            float beta2 = 0.999f,
-            float eps = 1e-8f,
-            float wd = 0.0f)
-        : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
@@ -2861,7 +2886,16 @@ struct test_opt_step_adamw : public test_case {
         ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
         ggml_set_name(grad, "grad");
 
-        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
+        ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad_m, "grad_m");
+
+        ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad_v, "grad_v");
+
+        ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
+        ggml_set_name(adamw_params, "adamw_params");
+
+        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
         ggml_set_name(out, "out");
 
         return out;
@@ -2869,7 +2903,7 @@ struct test_opt_step_adamw : public test_case {
 
     void initialize_tensors(ggml_context * ctx) override {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
+            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
         }
     }
 
@@ -3735,6 +3769,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
+    test_cases.emplace_back(new test_mean());
     test_cases.emplace_back(new test_upscale());
     test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
     test_cases.emplace_back(new test_upscale_ext());
@@ -3766,9 +3801,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     }
 
     test_cases.emplace_back(new test_cross_entropy_loss());
-    for (float wd : {0.0f, 1e-2f}) {
-        test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
-    }
+    test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
 
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
@@ -3938,6 +3971,8 @@ int main(int argc, char ** argv) {
         ggml_backend_free(backend);
     }
 
+    ggml_quantize_free();
+
     printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
 
     if (n_ok != ggml_backend_dev_count()) {
@@ -3945,8 +3980,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    ggml_quantize_free();
-
     printf("\033[1;32mOK\033[0m\n");
     return 0;
 }
diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp
index 546ca230b..4abe85c74 100644
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@@ -1,181 +1,892 @@
 #include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "ggml-opt.h"
 
 #include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
+#include <inttypes.h>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
 
-#define MAX_NARGS 2
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-//
-// logging
-//
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-
-static float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
+static bool almost_equal(const double a, const double b, const double atol) {
+    return fabs(a - b) < atol;
 }
 
-static struct ggml_tensor * get_random_tensor(
-    struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
-) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+constexpr int64_t ne_datapoint = 2;
+constexpr int64_t ne_label     = 1;
+constexpr int64_t ndata        = 6;
 
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+struct helper_ctx_data {
+    std::vector<ggml_opt_dataset_t>   datasets_supervised;
+    std::vector<struct ggml_tensor *> data_batch;
+    std::vector<struct ggml_tensor *> labels_batch;
+
+    ggml_opt_dataset_t       dataset_unsupervised;
+    struct ggml_context    * ctx_static;
+    struct ggml_context    * ctx_compute;
+    struct ggml_opt_params   opt_params;
+    ggml_opt_context_t       opt_ctx;
+    struct ggml_tensor     * inputs;
+    struct ggml_tensor     * weights;
+    struct ggml_tensor     * outputs;
+    ggml_backend_buffer_t    buf;
+    ggml_opt_result_t        result;
+    ggml_opt_result_t        result2;
+};
+
+// These default values make it easier to check optimization results vs. expected values.
+static ggml_opt_optimizer_params helper_get_test_opt_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+    result.adamw.alpha = 1.0f;
+    result.adamw.beta1 = 0.0f;
+    result.adamw.beta2 = 0.0f;
+    result.adamw.eps   = 0.0f;
+    return result;
+}
+
+static helper_ctx_data helper_get_ctx_data(
+        ggml_backend_sched_t    backend_sched,
+        ggml_backend_t          backend,
+        const bool              init_opt_ctx       = true,
+        const bool              optimizer_defaults = true,
+        int64_t                 nbatch_logical     = 1,
+        int64_t                 nbatch_physical    = 1,
+        enum ggml_opt_loss_type loss_type          = GGML_OPT_LOSS_TYPE_SUM) {
+    std::vector<ggml_opt_dataset_t> datasets(ndata);
+    for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
+        ggml_opt_dataset_t dataset = ggml_opt_dataset_init(ne_datapoint, ne_label, ndata, ndata_shard);
+
+        float * data   = ggml_get_data_f32(ggml_opt_dataset_data(  dataset));
+        float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset));
+
+        for (int64_t idata = 0; idata < ndata; ++idata) {
+            for (int64_t id = 0; id < ne_datapoint; ++id) {
+                data[  idata*ne_datapoint + id] =     16*idata + id;
             }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
+            for (int64_t il = 0; il < ne_label;     ++il) {
+                labels[idata*ne_label     + il] = 16*(16*idata + il);
             }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+        }
+
+        datasets[ndata_shard-1] = dataset;
+    }
+
+    ggml_opt_dataset_t dataset_unsupervised = ggml_opt_dataset_init(1, 0, ndata, /*ndata_shard =*/ 1);
+
+    float * data = ggml_get_data_f32(ggml_opt_dataset_data(dataset_unsupervised));
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        data[idata] = idata;
+    }
+
+    struct ggml_context * ctx_static;
+    struct ggml_context * ctx_compute;
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ (2*ndata + 2)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_static = ggml_init(params);
+    }
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_compute = ggml_init(params);
+    }
+
+    std::vector<struct ggml_tensor *>   data_batch(ndata);
+    std::vector<struct ggml_tensor *> labels_batch(ndata);
+    for (int64_t ndata_batch = 1; ndata_batch <= ndata; ++ndata_batch) {
+        data_batch[ndata_batch-1]   = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, ndata_batch*ne_datapoint);
+        labels_batch[ndata_batch-1] = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, ndata_batch*ne_label);
+    }
+
+    struct ggml_tensor * inputs = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, nbatch_physical);
+    ggml_set_name(inputs, "inputs");
+
+    struct ggml_tensor * weights = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(weights, "weights");
+    ggml_set_param(ctx_static, weights);
+
+    struct ggml_tensor * intermediary = ggml_add(ctx_compute, inputs, weights);
+
+    struct ggml_tensor * outputs = ggml_scale(ctx_compute, intermediary, 1.0f);
+    ggml_set_name(outputs, "outputs");
+
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_static, backend);
+    const float w0 = float(ndata)/2;
+    ggml_backend_tensor_set(weights, &w0, 0, sizeof(float));
+
+    GGML_ASSERT(nbatch_logical % nbatch_physical == 0);
+    const int32_t opt_period = nbatch_logical / nbatch_physical;
+
+    struct ggml_opt_params opt_params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    opt_params.opt_period = opt_period;
+    if (!optimizer_defaults) {
+        opt_params.get_opt_pars = helper_get_test_opt_pars;
+    }
+    ggml_opt_context_t opt_ctx = init_opt_ctx ? ggml_opt_init(opt_params) : nullptr;
+
+    ggml_opt_result_t result  = ggml_opt_result_init();
+    ggml_opt_result_t result2 = ggml_opt_result_init();
+
+    return {datasets, data_batch, labels_batch, dataset_unsupervised, ctx_static, ctx_compute, opt_params, opt_ctx, inputs, weights, outputs, buf, result, result2};
+}
+
+static void helper_free_ctx_data(struct helper_ctx_data ctx_data) {
+    ggml_opt_result_free(ctx_data.result);
+    ggml_opt_result_free(ctx_data.result2);
+    ggml_opt_free(ctx_data.opt_ctx);
+    ggml_backend_buffer_free(ctx_data.buf);
+    ggml_free(ctx_data.ctx_static);
+    ggml_free(ctx_data.ctx_compute);
+    for (ggml_opt_dataset_t dataset : ctx_data.datasets_supervised) {
+        ggml_opt_dataset_free(dataset);
+    }
+    ggml_opt_dataset_free(ctx_data.dataset_unsupervised);
+}
+
+static void helper_after_test(
+        const char * func, const bool high_level, const std::string options,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    printf("  %s(high_level=%s%s, subtest=%s): ",
+           func, high_level ? "yes" : "no", options.c_str(), subtest.c_str());
+    if (subtest_ok) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+}
+
+static std::pair<int, int> test_dataset(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend);
+
+    for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
+        ggml_opt_dataset_t dataset = cd.datasets_supervised[ndata_shard-1];
+
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+
+        for (int64_t ndata_batch = 1; ndata_batch <= ndata; ++ndata_batch) {
+            if (ndata_batch % ndata_shard != 0) {
+                continue;
+            }
+            bool subtest_ok = true;
+
+            struct ggml_tensor *   data_batch =   cd.data_batch[ndata_batch-1];
+            struct ggml_tensor * labels_batch = cd.labels_batch[ndata_batch-1];
+
+            std::vector<float>   data(ggml_nelements(  data_batch));
+            std::vector<float> labels(ggml_nelements(labels_batch));
+
+            std::vector<int64_t> idata_shuffled;
+            const int64_t nbatches = ndata / ndata_batch;
+            for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
+                ggml_opt_dataset_get_batch(dataset, data_batch, labels_batch, ibatch);
+
+                ggml_backend_tensor_get(  data_batch,   data.data(), 0, ggml_nbytes(  data_batch));
+                ggml_backend_tensor_get(labels_batch, labels.data(), 0, ggml_nbytes(labels_batch));
+
+                for (int64_t idata_batch = 0; idata_batch < ndata_batch; ++idata_batch) {
+                    const int64_t idata = ibatch*ndata_batch + idata_batch;
+                    const int64_t idata_found = data[idata_batch*ne_datapoint] / 16;
+                    subtest_ok = subtest_ok && (shuffle || idata_found == idata);
+                    idata_shuffled.push_back(idata_found);
+
+                    for (int64_t id = 0; id < ne_datapoint; ++id) {
+                        if (data[  idata_batch*ne_datapoint + id] != 16*idata_found + id) {
+                            subtest_ok = false;
+                        }
                     }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    for (int64_t il = 0; il < ne_label;     ++il) {
+                        if (labels[idata_batch*ne_label     + il] != 16*(16*idata_found + il)) {
+                            subtest_ok = false;
                         }
                     }
                 }
             }
-            break;
-        default:
-            assert(false);
+
+            if (!shuffle || ndata % ndata_batch == 0) {
+                const int ndata_max = (ndata / ndata_batch) * ndata_batch;
+
+                for (int64_t idata = 0; subtest_ok && idata < ndata_max; ++idata) {
+                    int ninstances = 0;
+                    for (int64_t id : idata_shuffled) {
+                        ninstances += id == idata;
+                    }
+                    if (ninstances != 1) {
+                        subtest_ok = false;
+                    }
+                }
+            }
+
+            printf("  %s(shuffle=%s, ndata_shard=%" PRId64 ", ndata_batch=%" PRId64 "): ",
+                   __func__, shuffle ? "yes" : "no", ndata_shard, ndata_batch);
+            if (subtest_ok) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
     }
 
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_grad(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
+    /*nbatch_logical =*/ 999999, /*nbatch_physical =*/ 1);
+
+    std::vector<float> grad_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        grad_history[idata] = NAN;
+    }
+
+    for (int idata = 0; idata < ndata; ++idata) {
+        const float idataf = idata;
+        ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+        ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+        ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, sizeof(float));
+    }
+
+    {
+        bool subtest_ok = true;
+        for (int idata = 0; idata < ndata; ++idata) {
+            if (grad_history[idata] != idata + 1) {
+                subtest_ok = false;
+            }
+        }
+        printf("  %s(): ", __func__);
+        if (subtest_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_forward_backward(
+        const char * func, const bool high_level, const bool shuffle,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", shuffle=";
+    options += shuffle ? "yes" : "no";
+    helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_forward_backward(
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level, const bool shuffle) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+
+    std::vector<float> loss_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
+    }
+
+    if (high_level) {
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+        ggml_opt_epoch(cd.opt_ctx, dataset, nullptr, cd.result, 0, nullptr, nullptr);
+    } else {
+        for (int idata = 0; idata < ndata; ++idata) {
+            const float idataf = idata;
+            ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+            ggml_opt_forward(cd.opt_ctx, cd.result);
+            ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+        }
+    }
+
+    {
+        float weights;
+        ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+        const bool subtest_ok = weights == ndata/2;
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
+    }
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        bool subtest_ok = ndata == 6;
+
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        subtest_ok = subtest_ok && loss == 33.0 && almost_equal(loss_unc, sqrt(3.5), 1e-10);
+
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass);
+    }
+
+    float w0;
+    ggml_backend_tensor_get(cd.weights, &w0, 0, sizeof(float));
+    for (int i = 0; i < 10; ++i) {
+        ggml_opt_forward_backward(cd.opt_ctx, nullptr);
+    }
+    ggml_backend_tensor_set(cd.weights, &w0, 0, sizeof(float));
+
+    ggml_opt_reset(cd.opt_ctx, /*optimizer =*/ false);
+    ggml_opt_result_reset(cd.result);
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    if (high_level) {
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+        ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
+    } else {
+        for (int idata = 0; idata < ndata; ++idata) {
+            const float idataf = idata;
+            ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+            ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+            ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+        }
+    }
+
+    {
+        float weights;
+        ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+        const bool subtest_ok = weights == -ndata/2;
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
+    }
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        bool subtest_ok = ndata == 6;
+
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        subtest_ok = subtest_ok && loss == 18.0 && (shuffle || loss_unc == 0.0);
+
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_epoch_vs_fit(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    float weights_epoch;
+    float weights_fit;
+
+    {
+        struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true);
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+
+        ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
+
+        ggml_backend_tensor_get(cd.weights, &weights_epoch, 0, ggml_nbytes(cd.weights));
+        helper_free_ctx_data(cd);
+    }
+    {
+        struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ false);
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+
+        ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset,
+            GGML_OPT_LOSS_TYPE_SUM, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
+
+        ggml_backend_tensor_get(cd.weights, &weights_fit, 0, ggml_nbytes(cd.weights));
+        helper_free_ctx_data(cd);
+    }
+
+    const bool subtest_ok = weights_epoch == weights_fit;
+
+    printf("  %s(): ", __func__);
+    if (subtest_ok) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_idata_split(
+        const char * func, const bool high_level, const int epoch,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", epoch=";
+    options += std::to_string(epoch);
+    helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+    const int idata_split = ndata * 2/3;
+
+    std::vector<float> loss_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    for (int epoch = 1; epoch <= 4; ++epoch) {
+        if (high_level) {
+            ggml_opt_epoch(cd.opt_ctx, cd.dataset_unsupervised, cd.result, cd.result2, idata_split, nullptr, nullptr);
+        } else {
+            int idata = 0;
+            for (; idata < idata_split; ++idata) {
+                const float idataf = idata;
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+                ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+                ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+            }
+            for (; idata < ndata; ++idata) {
+                const float idataf = idata;
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+                ggml_opt_forward(cd.opt_ctx, cd.result2);
+                ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+            }
+        }
+
+        {
+            float weights;
+            ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+            const bool subtest_ok = weights == ndata/2 - epoch*idata_split;
+            helper_after_test_idata_split(__func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
+        }
+        {
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result, &ndata_result);
+            bool subtest_ok = ndata_result == idata_split;
+
+            double loss;
+            double loss_unc;
+            ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+            subtest_ok = subtest_ok && loss == 28.0 - epoch*16.0 && loss_unc == 0.0;
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_idata_split(__func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
+        }
+        {
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result2, &ndata_result);
+            bool subtest_ok = ndata_result == ndata - idata_split;
+
+            double loss;
+            double loss_unc;
+            ggml_opt_result_loss(cd.result2, &loss, &loss_unc);
+            subtest_ok = subtest_ok && loss == 15.0 - epoch*8 && almost_equal(loss_unc, sqrt(0.5), 1e-10);
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result2, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_idata_split(__func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
+        }
+
+        ggml_opt_result_reset(cd.result);
+        ggml_opt_result_reset(cd.result2);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_gradient_accumulation(
+        const char * func, const int nbatch_physical, const enum ggml_opt_loss_type loss_type, const int epoch,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", nbatch_physical=";
+    options += std::to_string(nbatch_physical);
+    options += ", loss_type=";
+    options += loss_type == GGML_OPT_LOSS_TYPE_MEAN ? "mean" : "sum";
+    options += ", epoch=";
+    options += std::to_string(epoch);
+    helper_after_test(func, false, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_gradient_accumulation(
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const int32_t nbatch_physical, const enum ggml_opt_loss_type loss_type) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(
+        backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 6, nbatch_physical, loss_type);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+
+    std::vector<float> grad_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        grad_history[idata] = NAN;
+    }
+
+    for (int epoch = 1; epoch <= 4; ++epoch) {
+        if (nbatch_physical == 1) {
+            for (int idata = 0; idata < ndata; ++idata) {
+                const float idataf = idata;
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, 1*sizeof(float));
+                ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+                ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, 1*sizeof(float));
+            }
+        } else if (nbatch_physical == 2) {
+            for (int idata = 0; idata < ndata; idata += 2) {
+                const float idataf[2] = {float(idata + 0), float(idata + 1)};
+                ggml_backend_tensor_set(cd.inputs, idataf, 0, 2*sizeof(float));
+                ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+
+                grad_history[idata + 0] = 0.0f;
+                ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata + 1, 0, 1*sizeof(float));
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+
+        {
+            GGML_ASSERT(ndata == 6);
+            constexpr double atol = 1e-6;
+            bool subtest_ok = true;
+            if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
+                if (nbatch_physical == 1) {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 1.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 3.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 5.0, atol);
+                } else {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 0.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 0.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 0.0, atol);
+                }
+                subtest_ok = subtest_ok && almost_equal(grad_history[1], 2.0, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[3], 4.0, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[5], 0.0, atol);
+            } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
+                if (nbatch_physical == 1) {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 1.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 3.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 5.0/ndata, atol);
+                } else {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 0.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 0.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 0.0/ndata, atol);
+                }
+                subtest_ok = subtest_ok && almost_equal(grad_history[1], 2.0/ndata, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[3], 4.0/ndata, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[5], 0.0/ndata, atol);
+            } else {
+                GGML_ASSERT(false);
+            }
+            helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
+        }
+        {
+            float weights;
+            ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+            const bool subtest_ok = weights == (ndata/2) - epoch;
+            helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
+        }
+        {
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result, &ndata_result);
+            bool subtest_ok = ndata_result == ndata/nbatch_physical;
+
+            double loss;
+            ggml_opt_result_loss(cd.result, &loss, /*loss_unc =*/ nullptr);
+            if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
+                subtest_ok = subtest_ok && loss == (39.0 - epoch*6.0);
+            } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
+                subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, 1e-6);
+            } else {
+                GGML_ASSERT(false);
+            }
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
+        }
+
+        ggml_opt_result_reset(cd.result);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static ggml_opt_optimizer_params helper_get_regression_opt_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+    result.adamw.alpha = 0.1f;
     return result;
 }
 
-int main(void) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 1024*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
+static std::pair<int, int> test_regression(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
 
-    struct ggml_context * ctx = ggml_init(params);
+    // Test for simple regression with f(x) = a*x + b
 
-    int64_t ne1[4] = {4, 128, 1, 1};
-    int64_t ne2[4] = {4, 256, 1, 1};
-    int64_t ne3[4] = {128, 256, 1, 1};
+    constexpr int64_t ndata_regression = 201;
+    constexpr float a_true = 1.2f;
+    constexpr float b_true = 3.4f;
 
-    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
-    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
-    ggml_set_param(ctx, a);
-    ggml_set_param(ctx, b);
+    std::mt19937 gen(12345);
+    std::normal_distribution<float> nd{0.0f, 0.1f};
 
-    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
+    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(1, 1, ndata_regression, ndata_regression);
 
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
-    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
-    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
+    float * data   = ggml_get_data_f32(ggml_opt_dataset_data(  dataset));
+    float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset));
 
-    struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(ge, e);
-    ggml_graph_reset(ge);
+    constexpr float x_min = -100.0f;
+    constexpr float x_max =  100.0f;
 
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    for (int64_t idata = 0; idata < ndata_regression; ++idata) {
+        const float x = x_min + (x_max - x_min) * idata/(ndata_regression-1);
+        const float y = a_true*x + b_true + nd(gen);
 
-    const float fe = ggml_get_f32_1d(e, 0);
-    printf("%s: e = %.4f\n", __func__, fe);
+        data[idata]   = x;
+        labels[idata] = y;
+    }
 
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
+    struct ggml_context * ctx_static;
+    struct ggml_context * ctx_compute;
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ 3*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_static = ggml_init(params);
+    }
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_compute = ggml_init(params);
+    }
 
-    ggml_opt(ctx, opt_params, e);
+    // The first dimension is the dimension of the datapoints, the second dimension is the number of datapoints.
+    struct ggml_tensor * x = ggml_new_tensor_2d(ctx_static, GGML_TYPE_F32, 1, ndata_regression);
+    ggml_set_name(x, "x");
 
-    ggml_graph_reset(ge);
+    struct ggml_tensor * a = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(a, "a");
+    ggml_set_param(ctx_static, a);
 
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(b, "b");
+    ggml_set_param(ctx_static, b);
 
-    const float fe_opt = ggml_get_f32_1d(e, 0);
-    printf("%s: original  e = %.4f\n", __func__, fe);
-    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
+    struct ggml_tensor * f = ggml_add(ctx_compute, ggml_mul(ctx_compute, x, a), b);
+    ggml_set_name(f, "f");
+    ggml_set_param(ctx_static, f);
 
-    const bool success = (fe_opt <= fe);
-    assert(success);
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_static, backend);
+    const float a0 = 1.0f;
+    const float b0 = 3.0f;
+    ggml_backend_tensor_set(a, &a0, 0, sizeof(float));
+    ggml_backend_tensor_set(b, &b0, 0, sizeof(float));
 
-    ggml_free(ctx);
-    return success ? 0 : -1;
+    ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+        helper_get_regression_opt_pars, 100, ndata_regression, 0.0f, true);
+
+    {
+        float a_fit;
+        ggml_backend_tensor_get(a, &a_fit, 0, sizeof(float));
+        float b_fit;
+        ggml_backend_tensor_get(b, &b_fit, 0, sizeof(float));
+        const bool subtest_ok = almost_equal(a_fit, a_true, 1e-2) && almost_equal(b_fit, b_true, 1e-2);
+        printf("  %s(subtest=weights): ", __func__);
+        if (subtest_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    ggml_backend_buffer_free(buf);
+    ggml_free(ctx_static);
+    ggml_opt_dataset_free(dataset);
+
+    return std::make_pair(npass, ntest);
 }
-// int64_t ne1[4] = {4, 128, 1, 1};
-// int64_t ne2[4] = {4, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 25890.9375
-// main: optimized e = 10094.7031
 
-// int64_t ne1[4] = {8, 128, 1, 1};
-// int64_t ne2[4] = {8, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 39429.5078
-// main: optimized e = 9275.8936
+static std::pair<int, int> test_backend(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int npass = 0;
+    int ntest = 0;
 
-// int64_t ne1[4] = {16, 128, 1, 1};
-// int64_t ne2[4] = {16, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 68371.1328
-// main: optimized e = 7854.4502
+    for (bool shuffle : {false, true}) {
+        std::pair<int, int> partial = test_dataset(backend_sched, backend, shuffle);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    {
+        std::pair<int, int> partial = test_grad(backend_sched, backend);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    for (bool high_level : {false, true}){
+        for (bool shuffle : {false, true}) {
+            if (!high_level && shuffle) {
+                continue;
+            }
 
+            std::pair<int, int> partial = test_forward_backward(backend_sched, backend, high_level, shuffle);
+            npass += partial.first;
+            ntest += partial.second;
+        }
+    }
+    {
+        std::pair<int, int> partial = test_epoch_vs_fit(backend_sched, backend);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    for (bool high_level : {false, true}){
+        std::pair<int, int> partial = test_idata_split(backend_sched, backend, high_level);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    for (int32_t nbatch_physical : {2, 1}) {
+        for (enum ggml_opt_loss_type loss_type : {GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN}) {
+            std::pair<int, int> partial = test_gradient_accumulation(backend_sched, backend, nbatch_physical, loss_type);
+            npass += partial.first;
+            ntest += partial.second;
+        }
+    }
+    {
+        std::pair<int, int> partial = test_regression(backend_sched, backend);
+        npass += partial.first;
+        ntest += partial.second;
+    }
 
-// int64_t ne1[4] = {32, 128, 1, 1};
-// int64_t ne2[4] = {32, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 126061.1953
-// main: optimized e = 5451.0166
+    return std::make_pair(npass, ntest);
+}
 
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1620817.8750
-// main: optimized e = 698387.6875
+int main(void) {
+    const size_t dev_count = ggml_backend_dev_count();
+    printf("Testing %zu devices\n\n", dev_count);
+    size_t n_ok = 0;
 
-// another run on M1
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1629595.6250
-// main: optimized e = 698169.1250
+    std::vector<ggml_backend_dev_t> devs;
+    std::vector<ggml_backend_t>     backends;
 
-// int64_t ne1[4] = {32, 1024, 1, 1};
-// int64_t ne2[4] = {32, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 8146770.5000
-// main: optimized e = 651119.1250
+    for (size_t i = 0; i < dev_count; ++i) {
+        devs.push_back(ggml_backend_dev_get(i));
+
+        ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL);
+        GGML_ASSERT(backend != NULL);
+
+        if (ggml_backend_is_cpu(backend)) {
+            ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
+        }
+
+        backends.push_back(backend);
+    }
+
+    for (size_t i = 0; i < dev_count; ++i) {
+        // Put the backend to be tested in front so that it's prioritized:
+        std::vector<ggml_backend_t> backends_modded = {backends[i]};
+        backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
+
+        ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
+            backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false);
+
+        printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
+        printf("  Device description: %s\n", ggml_backend_dev_description(devs[i]));
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(devs[i], &free, &total);
+        printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+        printf("\n");
+
+        std::pair<int, int> result = test_backend(backend_sched, backends[i]);
+
+        printf("  %d/%d tests passed\n", result.first, result.second);
+        printf("  Backend %s: ", ggml_backend_name(backends[i]));
+        if (result.first == result.second) {
+            printf("\033[1;32mOK\033[0m\n");
+            n_ok++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+
+        printf("\n");
+
+        ggml_backend_sched_free(backend_sched);
+    }
+
+    for (ggml_backend_t backend : backends) {
+        ggml_backend_free(backend);
+    }
+
+    printf("%zu/%zu backends passed\n", n_ok, dev_count);
+    if (n_ok != dev_count) {
+        printf("\033[1;31mFAIL\033[0m\n");
+        return 1;
+    }
+    printf("\033[1;32mOK\033[0m\n");
+    return 0;
+}

From 68fcb4759c0ca2874b59d9c1e6a35ceca1cc04ce Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 21:32:41 +0200
Subject: [PATCH 238/396] ggml : fix compile warnings (#0)

ggml-ci
---
 ggml/src/ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4a478fcaa..ee72a173e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5501,7 +5501,7 @@ static void ggml_compute_backward(
                     fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
                         __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));
                     GGML_ABORT("fatal error");
-                } break;
+                } //break;
             }
         } break;
         case GGML_OP_CROSS_ENTROPY_LOSS: {
@@ -5517,7 +5517,7 @@ static void ggml_compute_backward(
         default: {
             fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
             GGML_ABORT("fatal error");
-        } break;
+        } //break;
     }
 
     GGML_ASSERT(!src0_needs_grads || ggml_are_same_shape(src0, cgraph->grads[isrc0]));

From 84274a10c399d843cff39feb2b86dc8224f613d8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 21:34:03 +0200
Subject: [PATCH 239/396] tests : remove test-grad0

---
 Makefile             |    6 -
 tests/CMakeLists.txt |    3 +-
 tests/test-grad0.cpp | 1684 ------------------------------------------
 3 files changed, 1 insertion(+), 1692 deletions(-)
 delete mode 100644 tests/test-grad0.cpp

diff --git a/Makefile b/Makefile
index 539370e06..ae9a9c632 100644
--- a/Makefile
+++ b/Makefile
@@ -48,7 +48,6 @@ TEST_TARGETS = \
 	tests/test-backend-ops \
 	tests/test-chat-template \
 	tests/test-double-float \
-	tests/test-grad0 \
 	tests/test-grammar-integration \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
@@ -1499,11 +1498,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-opt: tests/test-opt.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 08ad66b49..b06f122e8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -116,9 +116,8 @@ llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 
 llama_target_and_test(test-grammar-parser.cpp)
-llama_target_and_test(test-llama-grammar.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
-llama_target_and_test(test-grad0.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
 llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
deleted file mode 100644
index c712dba7f..000000000
--- a/tests/test-grad0.cpp
+++ /dev/null
@@ -1,1684 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
-#include "ggml.h"
-#include "ggml-cpu.h"
-
-#include <cfloat>
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <initializer_list>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-#define MAX_NARGS 3
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define GGML_SILU_FP16
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-static float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-static int irand(int n) {
-    if (n == 0) return 0;
-    return rand()%n;
-}
-
-static void get_random_dims(int64_t * dims, int ndims) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = 1 + irand(4);
-    }
-}
-
-static struct ggml_tensor * get_random_tensor_f32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static struct ggml_tensor * get_random_tensor_f16(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static struct ggml_tensor * get_random_tensor_i32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        int32_t imin,
-        int32_t imax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static bool check_gradient(
-        const char * op_name,
-        struct ggml_context * ctx0,
-        struct ggml_tensor * x[],
-        struct ggml_tensor * f,
-        int ndims,
-        int nargs,
-        float eps,
-        float max_error_abs,
-        float max_error_rel,
-        std::vector<double> expected_vals) {
-
-    static int n_threads = -1;
-    if (n_threads < 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
-
-        const char *env = getenv("GGML_N_THREADS");
-        if (env) {
-            n_threads = atoi(env);
-        }
-
-        printf("GGML_N_THREADS = %d\n", n_threads);
-    }
-
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(gf, f);
-    ggml_graph_cpy(gf, gb);
-    ggml_build_backward_expand(ctx0, gf, gb, false);
-
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    ggml_graph_reset(gb);
-    if (f->grad) {
-        ggml_set_f32(f->grad, 1.0f);
-    }
-
-    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");
-
-    for (int i = 0; i < nargs; ++i) {
-        bool all_g0_bad = true;
-        const int nelements = ggml_nelements(x[i]);
-        for (int k = 0; k < nelements; ++k) {
-            // Calculate gradient numerically:
-            const float x0 = ggml_get_f32_1d(x[i], k);
-            const float xm = x0 - eps;
-            const float xp = x0 + eps;
-            ggml_set_f32_1d(x[i], k, xp);
-
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const double f0 = ggml_get_f32_1d(f, 0);
-
-            ggml_set_f32_1d(x[i], k, xm);
-
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const double f1 = ggml_get_f32_1d(f, 0);
-            const double g0 = (f0 - f1)/(2.0*(double) eps);
-
-            // The numerical calculation of the gradient fails around noncontinuities (e.g. 0 for ReLU).
-            // In such cases, provide a vector of expected values and skip the comparison for failed calculations.
-            if (!expected_vals.empty()) {
-                bool matches_any = false;
-                for (const double & ev : expected_vals) {
-                    const double error_abs = std::fabs(g0 - ev);
-                    if (error_abs > max_error_abs) {
-                        continue;
-                    }
-                    const double error_rel = g0 != 0.0 ? fabs(g0 - ev)/fabs(g0) : 0.0;
-                    if (error_rel > max_error_rel) {
-                        continue;
-                    }
-                    matches_any = true;
-                    break;
-                }
-                if (!matches_any) {
-                    continue;
-                }
-            }
-            all_g0_bad = false;
-
-            ggml_set_f32_1d(x[i], k, x0);
-
-            // compute gradient using backward graph
-            ggml_graph_reset(gb);
-            if (f->grad) {
-                ggml_set_f32(f->grad, 1.0f);
-            }
-
-            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-            const double g1 = ggml_get_f32_1d(x[i]->grad, k);
-
-            const double error_abs = fabs(g0 - g1);
-            const double error_rel = g0 != 0.0 ? fabs(g0 - g1)/fabs(g0) : 0.0;
-
-            if (error_abs > max_error_abs || error_rel > max_error_rel) {
-                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
-                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
-                //assert(false);
-                return false;
-            }
-        }
-        if (all_g0_bad) {
-            printf("%s: numerical calculation of the gradient failed for all values\n", op_name);
-            return false;
-        }
-    }
-
-    return true;
-}
-
-// TODO: clean-up this ..
-static bool check_mat_mul(
-        const struct ggml_tensor * y,
-        const struct ggml_tensor * x0,
-        const struct ggml_tensor * x1) {
-    float * dst  = (float *) y->data;
-    float * src0 = (float *) x0->data;
-    float * src1 = (float *) x1->data;
-
-    const int nc = x0->ne[1];
-    const int nr = x1->ne[1];
-    const int nk = x0->ne[0];
-
-    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
-
-    GGML_PRINT_DEBUG("x0:\n");
-    for (int j = 0; j < x0->ne[1]; ++j) {
-        for (int i = 0; i < x0->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("x1:\n");
-    for (int j = 0; j < x1->ne[1]; ++j) {
-        for (int i = 0; i < x1->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
-    for (int j = 0; j < y->ne[1]; ++j) {
-        for (int i = 0; i < y->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    for (int i = 0; i < nr; ++i) {
-        for (int j = 0; j < nc; ++j) {
-            float sum = 0.0f;
-
-            for (int k = 0; k < nk; ++k) {
-                sum += src0[j*nk + k]*src1[i*nk + k];
-            }
-
-            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
-                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
-                assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-#define NUM_PERMUTATIONS (4*3*2*1)
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 256*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    int64_t ne[4];
-
-    int all_permutations[4 * NUM_PERMUTATIONS];
-    {
-        int count = 0;
-        for (int ax0=0; ax0<4; ++ax0) {
-            for (int ax1=0; ax1<4; ++ax1) {
-                if (ax1 == ax0) continue;
-                for (int ax2=0; ax2<4; ++ax2) {
-                    if (ax2 == ax0) continue;
-                    if (ax2 == ax1) continue;
-                    for (int ax3=0; ax3<4; ++ax3) {
-                        if (ax3 == ax0) continue;
-                        if (ax3 == ax1) continue;
-                        if (ax3 == ax2) continue;
-                        assert(count < NUM_PERMUTATIONS);
-                        all_permutations[count*4+0] = ax0;
-                        all_permutations[count*4+1] = ax1;
-                        all_permutations[count*4+2] = ax2;
-                        all_permutations[count*4+3] = ax3;
-                        ++count;
-                    }
-                }
-            }
-        }
-    }
-
-    unsigned seed_iter = 1;
-
-    // original loop: 1000
-    int niter = 4;
-    const char *env = getenv("GGML_NLOOP");
-    if (env != NULL) {
-        niter = atoi(env);
-    }
-    if (argc > 1) {
-        niter = atoi(argv[1]);
-    }
-    for (int iter = 0; iter < niter; ++iter) {
-        srand(seed_iter);
-        seed_iter = rand();
-        unsigned seed = rand();
-
-        printf("test-grad0: iter:%d/%d\n", (iter+1), niter);
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        get_random_dims(ne, 4);
-
-        struct ggml_tensor * x[MAX_NARGS];
-
-        // add f32
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f, {});
-            }
-        }
-
-        // add f16
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f, {});
-            }
-        }
-
-        // sub
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
-
-                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // mul
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
-
-                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // div
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
-
-                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f, {});
-            }
-        }
-
-        // sqr
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
-
-                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // sqrt
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
-
-                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f, {});
-            }
-        }
-
-        // log
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
-
-                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f, {});
-            }
-        }
-
-        // sum
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
-
-                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-
-        // sum_rows
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
-
-                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
-            }
-        }
-
-        // mean, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
-
-                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // argmax
-        if (0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
-
-                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // repeat
-        {
-            srand(seed);
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
-
-                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
-            }
-        }
-
-        // repeat back
-        {
-            srand(seed);
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
-
-                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
-            }
-        }
-
-        // abs
-        {
-           const int nargs = 1;
-
-           for (int ndims = 1; ndims <= 4; ++ndims) {
-               for (int i = 0; i < nargs; ++i) {
-                   x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                   ggml_set_param(ctx0, x[i]);
-               }
-
-               struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
-
-               check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f, {-1.0, 1.0});
-           }
-        }
-
-        // sgn
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
-
-                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
-            }
-        }
-
-        // neg
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
-
-                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // step
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
-
-                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
-            }
-        }
-
-        // tanh, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
-
-                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // mul_mat
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-                int max_nrep = (ndims >= 3) ? 2 : 1;
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
-                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
-                        {
-                            int64_t ne2[4];
-                            get_random_dims(ne2, 4);
-                            ne2[0] = ne[0];
-                            ne2[2] = nrep2 * ne[2];
-                            ne2[3] = nrep3 * ne[3];
-                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-
-                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                        struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-                        if (ndims == 2) {
-                            // check_mat_mul does not support ndims > 2
-                            check_mat_mul(m, x[1], x[0]);
-                        }
-                    }
-                }
-            }
-        }
-
-        // elu, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
-
-                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // relu
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
-
-                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {0.0, 1.0});
-            }
-        }
-
-        // gelu, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
-
-                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
-            }
-        }
-
-        // silu
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
-
-#ifdef GGML_SILU_FP16
-                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY, {});
-#else
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-#endif
-            }
-        }
-
-        // rms_norm
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
-
-                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY, {});
-            }
-        }
-
-        // scale
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                const float s = -1.0f + 2.0f*frand();
-
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
-
-                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // cpy f32
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // cpy f16
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
-            }
-        }
-
-        // reshape (1d->nd)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // reshape (nd->1d)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // acc 1d
-        {
-            srand(seed);
-            int64_t ne2[4] = { 1, 1, 1, 1 };
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // acc 2d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // acc 3d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 3);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 3);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                const int offset = offsets[0] + offsets[1] + offsets[2];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // acc 4d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 4; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 4);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 4);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
-                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // set_1d
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
-
-                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // set_2d
-        {
-            srand(seed);
-            int64_t ne2[4];
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 1;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
-
-                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // view_1d
-        {
-            srand(seed);
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int k0 = irand(ggml_nelements(x[0]));
-                const int k1 = irand(ggml_nelements(x[0]));
-                const int i0 = MIN(k0, k1);
-                const int i1 = MAX(k0, k1);
-
-                const int offset = i0 * sizeof(float);
-                const int nelem  = i1 - i0;
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
-
-                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // view_2d
-        {
-            srand(seed);
-            int64_t ne2[4];
-            int64_t nb2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 2);
-                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 2);
-                }
-                const int count = ne2[0]*ne2[1];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
-
-                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // view_3d
-        {
-            srand(seed);
-            int64_t ne2[4] = {1,1,1,1};
-            int64_t nb2[4] = {0,0,0,0};
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 3);
-                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 3);
-                }
-                const int count = ne2[0]*ne2[1]*ne2[2];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-                nb2[2] = nb2[1]*ne2[1];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
-
-                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // permute
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_permute will set axes of dimensions below n_dims to 1.
-                // to make ggml_permute work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int p = irand(NUM_PERMUTATIONS);
-                const int ax0 = all_permutations[p*4+0];
-                const int ax1 = all_permutations[p*4+1];
-                const int ax2 = all_permutations[p*4+2];
-                const int ax3 = all_permutations[p*4+3];
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
-
-                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // transpose
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_transpose will set axes of dimensions below n_dims to 1.
-                // to make ggml_transpose work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
-
-                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // get_rows
-        {
-            srand(seed);
-            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
-            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
-            const int nargs = 1;
-            const int ndims = 2;
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
-
-            ggml_set_param(ctx0, x[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
-
-            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-        }
-
-        // diag_mask_inf
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-        }
-
-        // diag_mask_zero
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-        }
-
-        // softmax
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                float eps = 1e-6f;
-                // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
-                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
-                struct ggml_tensor * f = ggml_sum(ctx0,
-                                            ggml_log(ctx0,
-                                                ggml_add1(ctx0,
-                                                    ggml_scale(ctx0,
-                                                        ggml_soft_max(ctx0, x[0]),
-                                                        1.0f - eps),
-                                                    ggml_new_f32(ctx0, eps))));
-
-                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY, {});
-                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
-                // this may result in different gradients too finite differences.
-                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
-                // if only the table lookup causes gradients to differ this is acceptable.
-            }
-        }
-
-        // cross_entropy_loss
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
-                // the second argument to cross_entropy_loss must sum up to 1 for each row
-                int nr = ggml_nrows(x[1]);
-                int nc = ggml_nelements(x[1]) / nr;
-                for (int ir = 0; ir < nr; ++ir) {
-                    float sum = 0;
-                    for (int ic = 0; ic < nc; ++ic) {
-                        sum += ((float *) x[1]->data)[ic + ir*nc];
-                    }
-                    for (int ic = 0; ic < nc; ++ic) {
-                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
-                    }
-                }
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
-
-                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // rope f32
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
-                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
-
-                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
-                    }
-                }
-            }
-        }
-
-        // rope f16
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
-                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
-
-                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
-                    }
-                }
-            }
-        }
-
-        // im2col f32
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 4;
-
-            for (const bool is_2D : {false, true}) {
-                int64_t ne0[ndims];
-                int64_t ne1[ndims];
-                get_random_dims(ne0, ndims);
-                get_random_dims(ne1, ndims);
-
-                // // Ensure that the output is not zero-sized:
-                ne1[0] += 8;
-                ne1[1] += 8;
-
-                if (is_2D) {
-                    ne1[2] = ne0[2];
-                } else {
-                    ne1[1] = ne0[1];
-                    ne0[3] = 1;
-                    ne1[3] = 1;
-                }
-
-                // The order of arguments is swapped because the first tensor is only used for its shape.
-                x[1] = get_random_tensor_f16(ctx0, ndims, ne0, -1.0f, 1.0f);
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne1, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int s0 =         1 + irand(2);
-                const int s1 = is_2D ? 1 + irand(2) : 0;
-                const int p0 =         0 + irand(2);
-                const int p1 = is_2D ? 0 + irand(2) : 0;
-                const int d0 =         1 + irand(2);
-                const int d1 = is_2D ? 1 + irand(2) : 0;
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_im2col(ctx0, x[1], x[0], s0, s1, p0, p1, d0, d1, is_2D, GGML_TYPE_F32));
-
-                GGML_PRINT_DEBUG("im2col f32: is_2D=%s, s0=%d, s1=%d, p0=%d, p1=%d, d0=%d, d1=%d\n", is_2D ? "yes" : "no", s0, s1, p0, p1, d0, d1);
-                check_gradient("im2col f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
-            }
-        }
-
-        // pool_2d f32
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 4;
-
-            for (const enum ggml_op_pool op : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
-                int64_t ne0[ndims];
-                get_random_dims(ne0, ndims);
-
-                ne0[0] += 8;
-                ne0[1] += 8;
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne0, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int k0 = 2 + irand(2);
-                const int k1 = 2 + irand(2);
-                const int s0 = 2 + irand(2);
-                const int s1 = 2 + irand(2);
-                const int p0 = 0 + irand(2);
-                const int p1 = 0 + irand(2);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_pool_2d(ctx0, x[0], op, k0, k1, s0, s1, p0, p1));
-
-                GGML_PRINT_DEBUG("ggml_pool_2d f32: op=%s k0=%d, k1=%d, s0=%d, s1=%d, p0=%d, p1=%d\n",
-                                 op == GGML_OP_POOL_MAX ? "max" : "avg", k0, k1, s0, s1, p0, p1);
-                std::vector<double> expected_vals;
-                if (op == GGML_OP_POOL_MAX) {
-                    expected_vals.push_back(0.0);
-                    expected_vals.push_back(1.0);
-                }
-                check_gradient("ggml_pool_2d f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, expected_vals);
-            }
-        }
-
-        // flash_attn f32
-        // TODO: adapt to ggml_flash_attn_ext() changes
-        //{
-        //    srand(seed);
-        //    const int nargs = 3;
-
-        //    int64_t ne2[4];
-
-        //    get_random_dims(ne2, 4);
-        //    int64_t D = ne2[0];
-        //    int64_t N = ne2[1];
-        //    int64_t M = ne2[2] + N;
-        //    int64_t B = ne2[3];
-
-        //    for (int masked = 0; masked <= 1; ++masked) {
-        //        for (int ndims = 2; ndims <= 4; ++ndims) {
-        //            int max_nrep = (ndims >= 3) ? 2 : 1;
-        //            for (int nrep = 1; nrep < max_nrep; ++nrep) {
-        //                int64_t neq[4] = { D, N, B*nrep, ne[3] };
-        //                int64_t nek[4] = { D, M, B, ne[3] };
-        //                int64_t nev[4] = { M, D, B, ne[3] };
-        //                if (ndims == 2) {
-        //                    neq[2] = 1; neq[3] = 1;
-        //                    nek[2] = 1; nek[3] = 1;
-        //                    nev[2] = 1; nev[3] = 1;
-        //                } else if (ndims == 3) {
-        //                    neq[3] = 1;
-        //                    nek[3] = 1;
-        //                    nev[3] = 1;
-        //                }
-        //                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-        //                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-        //                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-        //                ggml_set_param(ctx0, x[0]);
-        //                ggml_set_param(ctx0, x[1]);
-        //                ggml_set_param(ctx0, x[2]);
-
-        //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY, {});
-        //            }
-        //        }
-        //    }
-        //}
-
-        ggml_free(ctx0);
-    }
-
-    return 0;
-}

From a4200cafadebb7576a9a3905039858f5e73ce4cc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 21:35:31 +0200
Subject: [PATCH 240/396] make : add ggml-opt (#0)

ggml-ci
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index ae9a9c632..fecf1f693 100644
--- a/Makefile
+++ b/Makefile
@@ -996,6 +996,7 @@ OBJ_GGML = \
 	$(DIR_GGML)/src/ggml-alloc.o \
 	$(DIR_GGML)/src/ggml-backend.o \
 	$(DIR_GGML)/src/ggml-backend-reg.o \
+	$(DIR_GGML)/src/ggml-opt.o \
 	$(DIR_GGML)/src/ggml-quants.o \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \

From 5d9e59979c2fd91c99d03c23e8df9c50c9d55a85 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 21:38:01 +0200
Subject: [PATCH 241/396] ggml : adapt AMX to tensor->grad removal (#0)

ggml-ci
---
 ggml/src/ggml-amx/ggml-amx.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp
index 37da98539..8568e7965 100644
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@@ -317,8 +317,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
             const enum ggml_type type = src0->type;
             const int64_t ne0 = op->ne[0];
 
-            bool is_training = src0->grad || src1->grad;
-
             // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
             // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
             bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
@@ -326,7 +324,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
             bool can_use_amx =
                 is_contiguous_2d(src0) &&       // src0 must be contiguous
                 is_contiguous_2d(src1) &&       // src1 must be contiguous
-                !is_training &&                 // inference only
                 src1->type == GGML_TYPE_F32 &&  // src1 must be float32
                 has_amx_kernels &&              // with amx kernel impls
                 ne0 % (TILE_N * 2) == 0;        // out_features is 32x

From 24203e9dd7355a4a10bc32d959fd0148d37bf666 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 16 Nov 2024 23:40:39 +0200
Subject: [PATCH 242/396] ggml : inttypes.h -> cinttypes (#0)

ggml-ci
---
 ggml/src/ggml-opt.cpp        | 2 +-
 tests/test-opt.cpp           | 2 +-
 tests/test-quantize-perf.cpp | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
index 808aa0d02..040205a31 100644
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -8,7 +8,7 @@
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
-#include <inttypes.h>
+#include <cinttypes>
 #include <map>
 #include <random>
 #include <vector>
diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp
index 4abe85c74..f90c92b4b 100644
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@@ -5,7 +5,7 @@
 #include "ggml-opt.h"
 
 #include <cmath>
-#include <inttypes.h>
+#include <cinttypes>
 #include <random>
 #include <string>
 #include <thread>
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index ac0d12714..288288493 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -7,7 +7,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <functional>
-#include <inttypes.h>
 #include <math.h>
 #include <memory>
 #include <stdio.h>

From eda7e1d4f54711de1c9b40502d6c88bbc217da60 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sun, 17 Nov 2024 07:31:17 +0100
Subject: [PATCH 243/396] ggml : fix possible buffer use after free in sched
 reserve (#9930)

---
 ggml/src/ggml-backend.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 634fe38ee..9a6010d36 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1538,12 +1538,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);
 
     return true;
 }

From 467576b6cc7d2b9220f55bc635aa51469cf26fb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 17 Nov 2024 09:06:34 +0100
Subject: [PATCH 244/396] CMake: default to -arch=native for CUDA build
 (#10320)

---
 README.md                         |  4 ++--
 ggml/src/ggml-cuda/CMakeLists.txt | 15 +++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6ab6acf12..5f7933c13 100644
--- a/README.md
+++ b/README.md
@@ -459,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 
-## Other documentations
+## Other documentation
 
 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)
 
-**Development documentations**
+**Development documentation**
 
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 40ed2bdf3..860552f3a 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -6,15 +6,18 @@ if (CUDAToolkit_FOUND)
     message(STATUS "CUDA Toolkit found")
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == FP16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        # native == GPUs available at build time
+        # 52     == Maxwell, lowest CUDA 12 standard
+        # 60     == P100, FP16 CUDA intrinsics
+        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
+        # 70     == V100, FP16 tensor cores
+        # 75     == Turing, int6 tensor cores
+        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
+            set(CMAKE_CUDA_ARCHITECTURES "native")
+        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
         else()
             set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
-            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

From c3ea58aca406911eb4d409cdbfc76683393442b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 17 Nov 2024 09:09:55 +0100
Subject: [PATCH 245/396] CUDA: remove DMMV, consolidate F16 mult mat vec
 (#10318)

---
 Makefile                                 |  57 --
 docs/build.md                            |  11 -
 ggml/CMakeLists.txt                      |   5 -
 ggml/src/ggml-cuda/CMakeLists.txt        |  13 -
 ggml/src/ggml-cuda/dmmv.cu               | 699 -----------------------
 ggml/src/ggml-cuda/ggml-cuda.cu          | 204 +------
 ggml/src/ggml-cuda/mmv.cu                | 223 ++++++++
 ggml/src/ggml-cuda/{dmmv.cuh => mmv.cuh} |  16 +-
 ggml/src/ggml-hip/CMakeLists.txt         |   7 -
 ggml/src/ggml-musa/CMakeLists.txt        |  11 -
 10 files changed, 246 insertions(+), 1000 deletions(-)
 delete mode 100644 ggml/src/ggml-cuda/dmmv.cu
 create mode 100644 ggml/src/ggml-cuda/mmv.cu
 rename ggml/src/ggml-cuda/{dmmv.cuh => mmv.cuh} (55%)

diff --git a/Makefile b/Makefile
index fecf1f693..bd0bad160 100644
--- a/Makefile
+++ b/Makefile
@@ -635,10 +635,6 @@ else ifndef CUDA_POWER_ARCH
 	MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 
-ifdef GGML_CUDA_FORCE_DMMV
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
-
 ifdef GGML_CUDA_FORCE_MMQ
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
@@ -647,20 +643,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
 endif # GGML_CUDA_FORCE_CUBLAS
 
-ifdef GGML_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # GGML_CUDA_DMMV_X
-
-ifdef GGML_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-else ifdef GGML_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # GGML_CUDA_MMV_Y
-
 ifdef GGML_CUDA_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_F16
@@ -669,12 +651,6 @@ ifdef GGML_CUDA_DMMV_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_DMMV_F16
 
-ifdef GGML_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-else
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
-endif
-
 ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
@@ -783,10 +759,6 @@ ifdef GGML_HIPBLAS
 		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 
-	GGML_CUDA_DMMV_X       ?= 32
-	GGML_CUDA_MMV_Y        ?= 1
-	GGML_CUDA_KQUANTS_ITER ?= 2
-
 	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 
 ifdef GGML_HIP_UMA
@@ -800,13 +772,6 @@ endif # GGML_HIP_UMA
 	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 
 	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
-	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-
-ifdef GGML_CUDA_FORCE_DMMV
-	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
 
 ifdef GGML_CUDA_FORCE_MMQ
 	HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@@ -869,10 +834,6 @@ ifdef GGML_MUSA
 
 	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
 
-ifdef GGML_CUDA_FORCE_DMMV
-	MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
-
 ifdef GGML_CUDA_FORCE_MMQ
 	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
@@ -881,18 +842,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
 	MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
 endif # GGML_CUDA_FORCE_CUBLAS
 
-ifdef GGML_CUDA_DMMV_X
-	MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-else
-	MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # GGML_CUDA_DMMV_X
-
-ifdef GGML_CUDA_MMV_Y
-	MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-else
-	MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # GGML_CUDA_MMV_Y
-
 ifdef GGML_CUDA_F16
 	MUSAFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_F16
@@ -901,12 +850,6 @@ ifdef GGML_CUDA_DMMV_F16
 	MUSAFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_DMMV_F16
 
-ifdef GGML_CUDA_KQUANTS_ITER
-	MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-else
-	MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
-endif
-
 ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
 	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
diff --git a/docs/build.md b/docs/build.md
index 811bbb409..359952b30 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -186,13 +186,9 @@ The following compilation options are also available to tweak performance:
 
 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-| GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
 | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
 | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-| GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
 | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
 
@@ -268,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
 
 ### Vulkan
 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index fd9499826..a82818d60 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -128,14 +128,9 @@ option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"
 
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
-option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
-set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
-set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-                                            "ggml: iters./thread per block for Q2_K/Q6_K")
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 860552f3a..3dde0f366 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -54,21 +54,12 @@ if (CUDAToolkit_FOUND)
     target_link_libraries(ggml-cuda PRIVATE ggml-base)
     target_include_directories(ggml-cuda PRIVATE . ..)
 
-    # TODO: change the definitions to this target only
-
-    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
     if (GGML_CUDA_GRAPHS)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     endif()
 
-    if (GGML_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
@@ -81,10 +72,6 @@ if (CUDAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
-    if (DEFINED GGML_CUDA_DMMV_Y)
-        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
-    endif()
-
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()
diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
deleted file mode 100644
index 00e21b5d7..000000000
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ /dev/null
@@ -1,699 +0,0 @@
-#include "dmmv.cuh"
-#include "dequantize.cuh"
-#include "convert.cuh"
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
-            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-
-    const int row = blockIdx.x;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/2;  // 0...15
-    const int ix  = threadIdx.x%2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const half * x = (const half *) vx;
-    // load 2 halfs into register in a single instruction
-    const half2 x_reg = *((half2 *) &(x[ib + iqs]));
-    // automatic half -> float type cast if dfloat == float
-    v.x = __low2float(x_reg);
-    v.y = __high2float(x_reg);
-}
-
-static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
-    return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
-        type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
-        type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
-        type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
-        type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
-        type == GGML_TYPE_F16 ? convert_f16 :
-        nullptr;
-}
-
-template <ggml_type type>
-static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-    constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
-    constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
-    constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
-
-    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            if ( y_offset == 1 ) {
-                // load 2 dfloats into register in a single instruction
-                const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
-                tmp += __hmul2(v, y_reg);
-            }
-            else {
-                tmp += __hmul2(v, {
-                        y[iybs + iqs + j/qr + 0],
-                        y[iybs + iqs + j/qr + y_offset]
-                    });
-            }
-#else
-            if ( y_offset == 1 ) {
-                // load 2 dfloats into register in a single instruction
-                const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
-                tmp += v.x * y_reg.x;
-                tmp += v.y * y_reg.y;
-            }
-            else {
-                tmp += v.x * y[iybs + iqs + j/qr + 0];
-                tmp += v.y * y[iybs + iqs + j/qr + y_offset];
-            }
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q4_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q4_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32, 1, 1);
-    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_F16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-    GGML_UNUSED(ctx);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
-    half * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-        GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ABORT("fatal error");
-            break;
-    }
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_ddq_i);
-    GGML_UNUSED(src1_ncols);
-    GGML_UNUSED(src1_padded_row_size);
-}
-
-bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
-    return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
-        src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
-        src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
-        src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
-        src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
-        src0_type == GGML_TYPE_F16;
-}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 07f043328..ef56e944d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -16,11 +16,11 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
-#include "ggml-cuda/dmmv.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
+#include "ggml-cuda/mmv.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@@ -1020,114 +1020,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
 
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
-static __global__ void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi = __half2float(x[ix]);
-
-        const int row_y = col_x;
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y   = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi = __half2float(x[ix]);
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void ggml_mul_mat_p021_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
-}
-
 static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
 
@@ -1654,58 +1546,6 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    cudaStream_t main_stream = ctx.stream();
-
-    void  * src0_ddq = src0->data;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf  = (float *) dst->data;
-
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-
-static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    cudaStream_t main_stream = ctx.stream();
-
-    void  * src0_ddq = src0->data;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf  = (float *) dst->data;
-
-    const int64_t row_stride_x = nb01 / sizeof(half);
-    const int64_t channel_stride_x = nb02 / sizeof(half);
-
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-
 static __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
@@ -1879,21 +1719,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
-    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
+    bool use_mul_mat_vec   = src0->type == GGML_TYPE_F16
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
-    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
+        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool              use_mul_mat_q =  ggml_is_quantized(src0->type)
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
-    bool any_gpus_with_slow_fp16 = false;
+    bool any_gpus_with_slow_fp16   = false;
+    bool any_gpus_without_fp16_mma = false;
 
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1904,14 +1740,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }
 
-            const int cc            = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
+            const int cc              = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_available(cc);
+            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
         }
     } else {
-        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
+        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_available(cc);
+        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
     }
 
     // debug helpers
@@ -1922,18 +1760,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
-        ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
-    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
-        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
+    if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
                && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-    } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
+    } else if (use_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
new file mode 100644
index 000000000..cfe91f428
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmv.cu
@@ -0,0 +1,223 @@
+#include "common.cuh"
+#include "mmv.cuh"
+
+template <typename type_acc, int block_size>
+static __global__ void mul_mat_vec(
+        const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
+    const int64_t row     = blockIdx.x;
+    const int64_t channel = blockIdx.z;
+    const int     tid     = threadIdx.x;
+
+    x   += (channel/channel_ratio)*stride_channel_x + row*stride_row;
+    y   +=  channel               *stride_channel_y;
+    dst +=  channel               *stride_channel_dst;
+
+    const half2  * x2 = (const half2  *) x;
+    const float2 * y2 = (const float2 *) y;
+
+    extern __shared__ char data_mmv[];
+    float * buf_iw = (float *) data_mmv;
+
+    if (block_size > WARP_SIZE) {
+        if (tid < WARP_SIZE) {
+            buf_iw[tid] = 0.0f;
+        }
+        __syncthreads();
+    }
+
+    float sumf;
+
+    if (std::is_same<type_acc, float>::value) {
+        sumf = 0.0f;
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const float2 tmpx = __half22float2(x2[col2]);
+            const float2 tmpy = y2[col2];
+            sumf += tmpx.x * tmpy.x;
+            sumf += tmpx.y * tmpy.y;
+        }
+    } else {
+#ifdef FP16_AVAILABLE
+        half2 sumh2 = make_half2(0.0f, 0.0f);
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const float2 tmp = y2[col2];
+            sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
+        }
+
+        sumf = __low2float(sumh2) + __high2float(sumh2);
+#else
+        NO_DEVICE_CODE;
+#endif // FP16_AVAILABLE
+    }
+
+    sumf = warp_reduce_sum(sumf);
+
+    if (block_size > WARP_SIZE) {
+        buf_iw[tid/WARP_SIZE] = sumf;
+        __syncthreads();
+        if (tid > WARP_SIZE) {
+            return;
+        }
+        sumf = buf_iw[tid];
+        sumf = warp_reduce_sum(sumf);
+    }
+
+    if (tid != 0) {
+        return;
+    }
+
+    dst[row] = sumf;
+}
+
+template <typename type_acc>
+static void launch_mul_mat_vec_cuda(
+        const half * x, const float * y, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        cudaStream_t stream) {
+    GGML_ASSERT(ncols      % 2 == 0);
+    GGML_ASSERT(stride_row % 2 == 0);
+    GGML_ASSERT(nchannels_y % nchannels_x == 0);
+    const int64_t channel_ratio = nchannels_y / nchannels_x;
+
+    int64_t block_size_best = WARP_SIZE;
+    int64_t niter_best      = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
+    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
+        if (niter < niter_best) {
+            niter_best      = niter;
+            block_size_best = block_size;
+        }
+    }
+
+    const int smem = WARP_SIZE*sizeof(float);
+    const dim3 block_nums(nrows, 1, nchannels_y);
+    const dim3 block_dims(block_size_best, 1, 1);
+    switch (block_size_best) {
+        case   32: {
+            mul_mat_vec<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case   64: {
+            mul_mat_vec<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case   96: {
+            mul_mat_vec<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case  128: {
+            mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case  160: {
+            mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case  192: {
+            mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case  224: {
+            mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        case  256: {
+            mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+        } break;
+        default: {
+            GGML_ABORT("fatal error");
+        } break;
+    }
+}
+
+static void mul_mat_vec_cuda(
+        const half * x, const float * y, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        enum ggml_prec prec, cudaStream_t stream) {
+    switch (prec) {
+        case GGML_PREC_DEFAULT: {
+            launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
+        } break;
+        case GGML_PREC_F32: {
+            launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
+        } break;
+    }
+}
+
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+
+    GGML_ASSERT(src1->ne[1] == 1);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+    const half  * src0_d = (const half  *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       *  dst_d = (float       *)  dst->data;
+
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne12 = src1->ne[2];
+    GGML_ASSERT(dst->ne[2] == ne12);
+
+    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_ASSERT(src1->ne[3] == 1);
+    GGML_ASSERT( dst->ne[3] == 1);
+
+    const int64_t stride_row         = src0->nb[1] / ggml_type_size(src0->type);
+    const int64_t channel_stride_x   = src0->nb[2] / ggml_type_size(src0->type);
+    const int64_t channel_stride_y   = src1->nb[2] / ggml_type_size(src1->type);
+    const int64_t channel_stride_dst =  dst->nb[2] / ggml_type_size( dst->type);
+
+    mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+}
+
+void ggml_cuda_op_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    GGML_ASSERT(src1_ncols == 1);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+
+    // ggml_cuda_op provides single, contiguous matrices
+    const int64_t stride_row         = ne00;
+    const int64_t nchannels_x        = 1;
+    const int64_t nchannels_y        = 1;
+    const int64_t channel_stride_x   = 0;
+    const int64_t channel_stride_y   = 0;
+    const int64_t channel_stride_dst = 0;
+
+    mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+        nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
diff --git a/ggml/src/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/mmv.cuh
similarity index 55%
rename from ggml/src/ggml-cuda/dmmv.cuh
rename to ggml/src/ggml-cuda/mmv.cuh
index e727eb97f..78a1cd4a6 100644
--- a/ggml/src/ggml-cuda/dmmv.cuh
+++ b/ggml/src/ggml-cuda/mmv.cuh
@@ -1,20 +1,12 @@
 #include "common.cuh"
 
-// dmmv = dequantize_mul_mat_vec
+// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
+#define MMV_MAX_ROWS 512
 
-// TODO: remove this?
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
+void ggml_cuda_op_mul_mat_vec(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
-
-bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index 5ed186ded..fccf8eb84 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -75,18 +75,11 @@ target_include_directories(ggml-hip PRIVATE . ..)
 target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
 
 add_compile_definitions(GGML_USE_HIP)
-add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
 
 if (GGML_HIP_UMA)
     add_compile_definitions(GGML_HIP_UMA)
 endif()
 
-if (GGML_CUDA_FORCE_DMMV)
-    add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-endif()
-
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
index 8edc75cc5..f3c013692 100644
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -58,19 +58,12 @@ if (MUSAToolkit_FOUND)
     target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
 
     add_compile_definitions(GGML_USE_MUSA)
-    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
     if (GGML_CUDA_GRAPHS)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     endif()
 
-    if (GGML_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
@@ -83,10 +76,6 @@ if (MUSAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
 
-    if (DEFINED GGML_CUDA_DMMV_Y)
-        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
-    endif()
-
     if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
     endif()

From a43178299c2f74f14bcfc659bfd9fd32d931d1f4 Mon Sep 17 00:00:00 2001
From: FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Date: Sun, 17 Nov 2024 21:39:22 +1300
Subject: [PATCH 246/396] ggml : fix undefined reference to 'getcpu' (#10354)

https://github.com/ggerganov/llama.cpp/issues/10352
---
 ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index df6487929..0d23669c2 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2369,7 +2369,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     // figure out which node we're on
     uint current_cpu;
     int getcpu_ret = 0;
-#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
     getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
 #else
     // old glibc doesn't have a wrapper for this call. Fall back on direct syscall

From cf32a9b93ad859ea592c31785b6bd3b4b2121463 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 17 Nov 2024 11:23:01 +0200
Subject: [PATCH 247/396] metal : refactor kernel args into structs (#10238)

* metal : add kernel arg structs (wip)

* metal : fattn args

ggml-ci

* metal : cont + avoid potential int overflow [no ci]

* metal : mul mat struct (wip)

* cont : mul mat vec

* cont : pass by reference

* cont : args is first argument

* cont : use char ptr

* cont : shmem style

* cont : thread counters style

* cont : mul mm id

ggml-ci

* cont : int safety + register optimizations

ggml-ci

* metal : GGML_OP_CONCAT

ggml-ci

* metal : GGML_OP_ADD, GGML_OP_SUB, GGML_OP_MUL, GGML_OP_DIV

* metal : GGML_OP_REPEAT

* metal : GGML_OP_CPY

* metal : GGML_OP_RMS_NORM

* metal : GGML_OP_NORM

* metal : add TODOs for rest of ops

* ggml : add ggml-metal-impl.h

ggml-ci
---
 Makefile                              |    5 +-
 ggml/src/ggml-metal/CMakeLists.txt    |   18 +-
 ggml/src/ggml-metal/ggml-metal-impl.h |  249 ++
 ggml/src/ggml-metal/ggml-metal.m      |  681 ++---
 ggml/src/ggml-metal/ggml-metal.metal  | 3544 +++++++++----------------
 5 files changed, 1925 insertions(+), 2572 deletions(-)
 create mode 100644 ggml/src/ggml-metal/ggml-metal-impl.h

diff --git a/Makefile b/Makefile
index bd0bad160..95110d4eb 100644
--- a/Makefile
+++ b/Makefile
@@ -906,6 +906,7 @@ endif # GGML_METAL
 ifdef GGML_METAL
 ggml/src/ggml-metal/ggml-metal.o: \
 	ggml/src/ggml-metal/ggml-metal.m \
+	ggml/src/ggml-metal/ggml-metal-impl.h \
 	ggml/include/ggml-metal.h \
 	ggml/include/ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
@@ -913,9 +914,11 @@ ggml/src/ggml-metal/ggml-metal.o: \
 ifdef GGML_METAL_EMBED_LIBRARY
 ggml/src/ggml-metal-embed.o: \
 	ggml/src/ggml-metal/ggml-metal.metal \
+	ggml/src/ggml-metal/ggml-metal-impl.h \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
-	@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal
+	@sed -e '/__embed_ggml-common.h__/r      ggml/src/ggml-common.h'                -e '/__embed_ggml-common.h__/d'      < ggml/src/ggml-metal/ggml-metal.metal           > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
+	@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
 	@echo ".section __DATA, __ggml_metallib"                       >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 	@echo ".globl _ggml_metallib_start"                            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt
index e0992c744..b237d79f4 100644
--- a/ggml/src/ggml-metal/CMakeLists.txt
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@@ -25,9 +25,10 @@ if (GGML_METAL_USE_BF16)
     add_compile_definitions(GGML_METAL_USE_BF16)
 endif()
 
-# copy ggml-common.h and ggml-metal.metal to bin directory
-configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
-configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+# copy metal files to bin directory
+configure_file(../ggml-common.h  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h     COPYONLY)
+configure_file(ggml-metal.metal  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal  COPYONLY)
+configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
 
 if (GGML_METAL_EMBED_LIBRARY)
     enable_language(ASM)
@@ -36,24 +37,27 @@ if (GGML_METAL_EMBED_LIBRARY)
 
     set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
     set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+    set(METALLIB_IMPL   "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
 
     file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
 
     # merge ggml-common.h and ggml-metal.metal into a single file
-    set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
-    set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
+    set(METALLIB_EMBED_ASM        "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+    set(METALLIB_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
+    set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
 
     add_custom_command(
         OUTPUT ${METALLIB_EMBED_ASM}
         COMMAND echo "Embedding Metal library"
-        COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+        COMMAND sed -e '/__embed_ggml-common.h__/r         ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d'         < ${METALLIB_SOURCE}           > ${METALLIB_SOURCE_EMBED_TMP}
+        COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}'   -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
         COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
         COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
         COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
         COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
         COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
         COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
-        DEPENDS ggml-metal.metal ../ggml-common.h
+        DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
         COMMENT "Generate assembly for embedded Metal library"
     )
 
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
new file mode 100644
index 000000000..53c135496
--- /dev/null
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -0,0 +1,249 @@
+#ifndef GGML_METAL_IMPL
+#define GGML_METAL_IMPL
+
+// kernel argument structs
+//
+// - element counters (e.g. ne00) typically use int32_t to reduce register usage
+//   however, be careful from int overflows when using those in the kernel implementation
+//
+// - strides (e.g. nb00) use uint64_t
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  dim;
+} ggml_metal_kargs_concat;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    uint64_t offs;
+} ggml_metal_kargs_bin;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+} ggml_metal_kargs_repeat;
+
+typedef struct {
+    int64_t  ne00;
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int64_t  ne0;
+    int64_t  ne1;
+    int64_t  ne2;
+    int64_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+} ggml_metal_kargs_cpy;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  n_past;
+    int32_t  n_dims;
+    int32_t  n_ctx_orig;
+    float    freq_base;
+    float    freq_scale;
+    float    ext_factor;
+    float    attn_factor;
+    float    beta_fast;
+    float    beta_slow;
+} ggml_metal_kargs_rope;
+
+typedef struct {
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne11;
+    int32_t  ne_12_2; // assume K and V are same shape
+    int32_t  ne_12_3;
+    uint64_t nb_12_1;
+    uint64_t nb_12_2;
+    uint64_t nb_12_3;
+    uint64_t nb31;
+    int32_t  ne1;
+    int32_t  ne2;
+    float    scale;
+    float    max_bias;
+    float    m0;
+    float    m1;
+    uint16_t n_head_log2;
+    float    logit_softcap;
+} ggml_metal_kargs_flash_attn_ext;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne02;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int16_t  r2;
+    int16_t  r3;
+} ggml_metal_kargs_mul_mm;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int16_t  r2;
+    int16_t  r3;
+} ggml_metal_kargs_mul_mv;
+
+typedef struct {
+    int32_t  nei0;
+    int32_t  nei1;
+    uint64_t nbi1;
+    int32_t  ne00;
+    int32_t  ne02;
+    uint64_t nb01;
+    uint64_t nb02;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t  ne0;
+    int32_t  ne1;
+} ggml_metal_kargs_mul_mm_id;
+
+typedef struct {
+    int32_t  nei0;
+    int32_t  nei1;
+    uint64_t nbi1;
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t  ne0;
+    int32_t  ne1;
+    uint64_t nb1;
+} ggml_metal_kargs_mul_mv_id;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne00_4;
+    uint64_t nb01;
+    float    eps;
+} ggml_metal_kargs_norm;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne00_4;
+    uint64_t nb01;
+    float    eps;
+} ggml_metal_kargs_rms_norm;
+
+#endif // GGML_METAL_IMPL
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 95b21fbf9..58fee4bfd 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -2,6 +2,7 @@
 
 #import "ggml-impl.h"
 #import "ggml-backend-impl.h"
+#import "ggml-metal-impl.h"
 
 #import <Foundation/Foundation.h>
 
@@ -1193,35 +1194,39 @@ static void ggml_metal_encode_node(
 
                 const int32_t dim = ((const int32_t *) dst->op_params)[0];
 
+                ggml_metal_kargs_concat args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne10 =*/ ne10,
+                    /*.ne11 =*/ ne11,
+                    /*.ne12 =*/ ne12,
+                    /*.ne13 =*/ ne13,
+                    /*.nb10 =*/ nb10,
+                    /*.nb11 =*/ nb11,
+                    /*.nb12 =*/ nb12,
+                    /*.nb13 =*/ nb13,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                    /*.dim  =*/ dim,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                [encoder setBytes:&dim  length:sizeof(dim)  atIndex:27];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
 
                 const int nth = MIN(1024, ne0);
 
@@ -1239,8 +1244,6 @@ static void ggml_metal_encode_node(
 
                 bool bcast_row = false;
 
-                int64_t nb = ne00; // used by the "row" kernels
-
                 id<MTLComputePipelineState> pipeline = nil;
 
                 if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
@@ -1249,7 +1252,6 @@ static void ggml_metal_encode_node(
                     // src1 is a row
                     GGML_ASSERT(ne11 == 1);
 
-                    nb = ne00 / 4;
                     switch (dst->op) {
                         case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
                         case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
@@ -1269,36 +1271,39 @@ static void ggml_metal_encode_node(
                     }
                 }
 
+                ggml_metal_kargs_bin args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne10 =*/ ne10,
+                    /*.ne11 =*/ ne11,
+                    /*.ne12 =*/ ne12,
+                    /*.ne13 =*/ ne13,
+                    /*.nb10 =*/ nb10,
+                    /*.nb11 =*/ nb11,
+                    /*.nb12 =*/ nb12,
+                    /*.nb13 =*/ nb13,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                    /*.offs =*/ offs,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-                [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
 
                 if (bcast_row) {
                     const int64_t n = ggml_nelements(dst)/4;
@@ -1322,25 +1327,29 @@ static void ggml_metal_encode_node(
                     default: GGML_ABORT("fatal error");
                 }
 
+                ggml_metal_kargs_repeat args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
 
                 const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
@@ -1369,25 +1378,29 @@ static void ggml_metal_encode_node(
 
                     const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
 
+                    ggml_metal_kargs_cpy args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.ne03 =*/ ne03,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.ne2  =*/ ne2,
+                        /*.ne3  =*/ ne3,
+                        /*.nb0  =*/ nb0,
+                        /*.nb1  =*/ nb1,
+                        /*.nb2  =*/ nb2,
+                        /*.nb3  =*/ nb3,
+                    };
+
                     [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                    [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                    [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                    [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                    [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                    [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                    [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                    [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                    [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                    [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                    [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                    [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                    [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                    [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                    [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                    [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                    [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
 
                     const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
 
@@ -1396,35 +1409,39 @@ static void ggml_metal_encode_node(
 
                 const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
 
+                ggml_metal_kargs_bin args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ pnb1,
+                    /*.nb02 =*/ pnb2,
+                    /*.nb03 =*/ pnb3,
+                    /*.ne10 =*/ ne10,
+                    /*.ne11 =*/ ne11,
+                    /*.ne12 =*/ ne12,
+                    /*.ne13 =*/ ne13,
+                    /*.nb10 =*/ nb10,
+                    /*.nb11 =*/ nb11,
+                    /*.nb12 =*/ nb12,
+                    /*.nb13 =*/ nb13,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ pnb1,
+                    /*.nb2  =*/ pnb2,
+                    /*.nb3  =*/ pnb3,
+                    /*.offs =*/ offs,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
-                [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
-                [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
-                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
-                [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
-                [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
-                [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
 
                 const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
 
@@ -1465,10 +1482,10 @@ static void ggml_metal_encode_node(
                 memcpy(&max, ((const int32_t *) dst->op_params) + 1, sizeof(float));
 
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                [encoder setBytes:&min length:sizeof(min) atIndex:2];
-                [encoder setBytes:&max length:sizeof(max) atIndex:3];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&min   length:sizeof(min) atIndex:2];
+                [encoder setBytes:&max   length:sizeof(max) atIndex:3];
 
                 const int64_t n = ggml_nelements(dst);
 
@@ -1640,6 +1657,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -1715,6 +1733,8 @@ static void ggml_metal_encode_node(
                 const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
                 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
+                // TODO: add ggml_metal_kargs struct
+                // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                 if (id_src1) {
@@ -1731,6 +1751,7 @@ static void ggml_metal_encode_node(
                 [encoder setBytes:&m0          length:sizeof(m0)          atIndex:8];
                 [encoder setBytes:&m1          length:sizeof(m1)          atIndex:9];
                 [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
+
                 [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@@ -1747,6 +1768,7 @@ static void ggml_metal_encode_node(
                     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
                 }
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -1771,6 +1793,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
@@ -1841,6 +1864,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -1959,24 +1983,29 @@ static void ggml_metal_encode_node(
                                 default: GGML_ABORT("MUL MAT-MAT not implemented");
                             }
 
+                            ggml_metal_kargs_mul_mm args = {
+                                /*.ne00 =*/ ne00,
+                                /*.ne02 =*/ ne02,
+                                /*.nb01 =*/ nb01,
+                                /*.nb02 =*/ nb02,
+                                /*.nb03 =*/ nb03,
+                                /*.ne12 =*/ ne12,
+                                /*.nb10 =*/ nb10,
+                                /*.nb11 =*/ nb11,
+                                /*.nb12 =*/ nb12,
+                                /*.nb13 =*/ nb13,
+                                /*.ne0  =*/ ne0,
+                                /*.ne1  =*/ ne1,
+                                /*.r2   =*/ r2,
+                                /*.r3   =*/ r3,
+                            };
+
                             [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
-                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
-                            [encoder setBytes:&nb03    length:sizeof(nb03) atIndex:7];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:9];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:10];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:11];
-                            [encoder setBytes:&nb13    length:sizeof(nb13) atIndex:12];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:15];
-                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:16];
+                            [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+
                             [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                         } else {
@@ -2154,28 +2183,32 @@ static void ggml_metal_encode_node(
                                     }
                             };
 
+                            ggml_metal_kargs_mul_mv args = {
+                                /*.ne00 =*/ ne00,
+                                /*.ne01 =*/ ne01,
+                                /*.ne02 =*/ ne02,
+                                /*.nb00 =*/ nb00,
+                                /*.nb01 =*/ nb01,
+                                /*.nb02 =*/ nb02,
+                                /*.nb03 =*/ nb03,
+                                /*.ne10 =*/ ne10,
+                                /*.ne11 =*/ ne11,
+                                /*.ne12 =*/ ne12,
+                                /*.nb10 =*/ nb10,
+                                /*.nb11 =*/ nb11,
+                                /*.nb12 =*/ nb12,
+                                /*.nb13 =*/ nb13,
+                                /*.ne0  =*/ ne0,
+                                /*.ne1  =*/ ne1,
+                                /*.r2   =*/ r2,
+                                /*.r3   =*/ r3,
+                            };
+
                             [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:13];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:14];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:15];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:16];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:18];
-                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:19];
-                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:20];
+                            [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
 
                             if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
                                 src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
@@ -2288,27 +2321,30 @@ static void ggml_metal_encode_node(
                         default: GGML_ABORT("MUL_MAT_ID not implemented");
                     }
 
+                    ggml_metal_kargs_mul_mm_id args = {
+                        /*.nei0 =*/ ne20,
+                        /*.nei1 =*/ ne21,
+                        /*.nbi1 =*/ nb21,
+                        /*.ne00 =*/ ne00,
+                        /*.ne02 =*/ ne02,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.ne13 =*/ ne13,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                    };
+
                     [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:3];
-                    [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                    [encoder setBytes:&ne21    length:sizeof(ne21) atIndex:5];
-                    [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                    [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:7];
-                    [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:8];
-                    [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:9];
-                    [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:10];
-                    [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:11];
-                    [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:12];
-                    [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:13];
-                    [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:14];
-                    [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:15];
-                    [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:16];
-                    [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:17];
-                    [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:18];
-                    [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:19];
+                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:4];
 
                     [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
 
@@ -2467,30 +2503,34 @@ static void ggml_metal_encode_node(
                         GGML_ASSERT(ne00 >= nth0*nth1);
                     }
 
+                    ggml_metal_kargs_mul_mv_id args = {
+                        /*.nei0 =*/ ne20,
+                        /*.nei1 =*/ ne21,
+                        /*.nbi1 =*/ nb21,
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.ne10 =*/ ne10,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.ne13 =*/ ne13,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.nb1  =*/ nb1,
+                    };
+
                     [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
-                    [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                    [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                    [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
-                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
-                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8];
-                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9];
-                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10];
-                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11];
-                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12];
-                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13];
-                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14];
-                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15];
-                    [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16];
-                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17];
-                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18];
-                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19];
-                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:20];
-                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:21];
-                    [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:22];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];
 
                     const int64_t _ne1 = 1;
                     const int tgz = dst_rows;
@@ -2563,6 +2603,7 @@ static void ggml_metal_encode_node(
                     default: GGML_ABORT("not implemented");
                 }
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
@@ -2586,20 +2627,28 @@ static void ggml_metal_encode_node(
                 float eps;
                 memcpy(&eps, dst->op_params, sizeof(float));
 
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+
                 int nth = 32; // SIMD width
 
-                while (nth < ne00/4 && nth < 1024) {
+                while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
                     nth *= 2;
                 }
 
-                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+                nth = MIN(nth, ne00/4);
+
+                ggml_metal_kargs_rms_norm args = {
+                    /*.ne00   =*/ ne00,
+                    /*.ne00_4 =*/ ne00/4,
+                    /*.nb01   =*/ nb01,
+                    /*.eps    =*/ eps,
+                };
 
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
                 [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                 const int64_t nrows = ggml_nrows(src0);
@@ -2624,6 +2673,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
                 [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
@@ -2641,22 +2691,35 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_NORM:
             {
+                GGML_ASSERT(ne00 % 4 == 0);
                 GGML_ASSERT(ggml_is_contiguous_1(src0));
 
                 float eps;
                 memcpy(&eps, dst->op_params, sizeof(float));
 
-                const int nth = MIN(256, ne00);
-
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
 
+                int nth = 32; // SIMD width
+
+                while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                    nth *= 2;
+                }
+
+                nth = MIN(nth, ne00/4);
+
+                ggml_metal_kargs_norm args = {
+                    /*.ne00   =*/ ne00,
+                    /*.ne00_4 =*/ ne00/4,
+                    /*.nb01   =*/ nb01,
+                    /*.eps    =*/ eps,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                 const int64_t nrows = ggml_nrows(src0);
 
@@ -2706,40 +2769,44 @@ static void ggml_metal_encode_node(
                     };
                 }
 
+                ggml_metal_kargs_rope args = {
+                    /*.ne00        =*/ ne00,
+                    /*.ne01        =*/ ne01,
+                    /*.ne02        =*/ ne02,
+                    /*.ne03        =*/ ne03,
+                    /*.nb00        =*/ nb00,
+                    /*.nb01        =*/ nb01,
+                    /*.nb02        =*/ nb02,
+                    /*.nb03        =*/ nb03,
+                    /*.ne0         =*/ ne0,
+                    /*.ne1         =*/ ne1,
+                    /*.ne2         =*/ ne2,
+                    /*.ne3         =*/ ne3,
+                    /*.nb0         =*/ nb0,
+                    /*.nb1         =*/ nb1,
+                    /*.nb2         =*/ nb2,
+                    /*.nb3         =*/ nb3,
+                    /*.n_past      =*/ n_past,
+                    /*.n_dims      =*/ n_dims,
+                    /*.n_ctx_orig  =*/ n_ctx_orig,
+                    /*.freq_base   =*/ freq_base,
+                    /*.freq_scale  =*/ freq_scale,
+                    /*.ext_factor  =*/ ext_factor,
+                    /*.attn_factor =*/ attn_factor,
+                    /*.beta_fast   =*/ beta_fast,
+                    /*.beta_slow   =*/ beta_slow,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
+                [encoder setBytes:&args length:sizeof(args)     atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0     atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1     atIndex:2];
                 if (id_src2 != nil) {
-                    [encoder setBuffer:id_src2 offset:offs_src2        atIndex:2];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
                 } else {
-                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:2];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
                 }
-                [encoder setBuffer:id_dst      offset:offs_dst         atIndex:3];
-                [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:4];
-                [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:5];
-                [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:6];
-                [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:7];
-                [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:8];
-                [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:9];
-                [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:10];
-                [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:11];
-                [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:12];
-                [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:13];
-                [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:14];
-                [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:15];
-                [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:16];
-                [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:17];
-                [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:18];
-                [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:19];
-                [encoder setBytes:&n_past      length:sizeof(     int) atIndex:20];
-                [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:21];
-                [encoder setBytes:&n_ctx_orig  length:sizeof(     int) atIndex:22];
-                [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
+                [encoder setBuffer:id_dst  offset:offs_dst      atIndex:4];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
@@ -2796,6 +2863,7 @@ static void ggml_metal_encode_node(
                     default: GGML_ABORT("fatal error");
                 };
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src1 offset:offs_src1       atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
@@ -2836,6 +2904,7 @@ static void ggml_metal_encode_node(
 
                 const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -2870,6 +2939,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -2906,6 +2976,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_dst  offset:offs_dst    atIndex:0];
                 [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
@@ -2927,6 +2998,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -2965,6 +3037,7 @@ static void ggml_metal_encode_node(
                     default: GGML_ABORT("fatal error");
                 };
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
                 [encoder setBuffer:id_dst      offset:offs_dst         atIndex:1];
@@ -2983,6 +3056,7 @@ static void ggml_metal_encode_node(
 
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
@@ -3224,37 +3298,41 @@ static void ggml_metal_encode_node(
                     }
                 }
 
+                ggml_metal_kargs_flash_attn_ext args = {
+                    /*.ne01          =*/ ne01,
+                    /*.ne02          =*/ ne02,
+                    /*.ne03          =*/ ne03,
+                    /*.nb01          =*/ nb01,
+                    /*.nb02          =*/ nb02,
+                    /*.nb03          =*/ nb03,
+                    /*.ne11          =*/ ne11,
+                    /*.ne_12_2       =*/ ne12,
+                    /*.ne_12_3       =*/ ne13,
+                    /*.nb_12_1       =*/ nb11,
+                    /*.nb_12_2       =*/ nb12,
+                    /*.nb_12_3       =*/ nb13,
+                    /*.nb31          =*/ nb31,
+                    /*.ne1           =*/ ne1,
+                    /*.ne2           =*/ ne2,
+                    /*.scale         =*/ scale,
+                    /*.max_bias      =*/ max_bias,
+                    /*.m0            =*/ m0,
+                    /*.m1            =*/ m1,
+                    /*.n_head_log2   =*/ n_head_log2,
+                    /*.logit_softcap =*/ logit_softcap,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0     offset:offs_src0           atIndex:0];
-                [encoder setBuffer:id_src1     offset:offs_src1           atIndex:1];
-                [encoder setBuffer:id_src2     offset:offs_src2           atIndex:2];
+                [encoder setBytes:&args length:sizeof(args)     atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0     atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1     atIndex:2];
+                [encoder setBuffer:id_src2 offset:offs_src2     atIndex:3];
                 if (id_src3) {
-                    [encoder setBuffer:id_src3     offset:offs_src3           atIndex:3];
+                    [encoder setBuffer:id_src3 offset:offs_src3 atIndex:4];
                 } else {
-                    [encoder setBuffer:id_src0     offset:offs_src0           atIndex:3];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:4];
                 }
-                [encoder setBuffer:id_dst        offset:offs_dst              atIndex:4];
-                [encoder setBytes:&ne01          length:sizeof( int64_t)      atIndex:5];
-                [encoder setBytes:&ne02          length:sizeof( int64_t)      atIndex:6];
-                [encoder setBytes:&ne03          length:sizeof( int64_t)      atIndex:7];
-                [encoder setBytes:&nb01          length:sizeof(uint64_t)      atIndex:8];
-                [encoder setBytes:&nb02          length:sizeof(uint64_t)      atIndex:9];
-                [encoder setBytes:&nb03          length:sizeof(uint64_t)      atIndex:10];
-                [encoder setBytes:&ne11          length:sizeof( int64_t)      atIndex:11];
-                [encoder setBytes:&ne12          length:sizeof( int64_t)      atIndex:12];
-                [encoder setBytes:&ne13          length:sizeof( int64_t)      atIndex:13];
-                [encoder setBytes:&nb11          length:sizeof(uint64_t)      atIndex:14];
-                [encoder setBytes:&nb12          length:sizeof(uint64_t)      atIndex:15];
-                [encoder setBytes:&nb13          length:sizeof(uint64_t)      atIndex:16];
-                [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:17];
-                [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:18];
-                [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:19];
-                [encoder setBytes:&scale         length:sizeof(   float)      atIndex:20];
-                [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:21];
-                [encoder setBytes:&m0            length:sizeof(m0)            atIndex:22];
-                [encoder setBytes:&m1            length:sizeof(m1)            atIndex:23];
-                [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:24];
-                [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:25];
+                [encoder setBuffer:id_dst offset:offs_dst       atIndex:5];
 
                 if (!use_vec_kernel) {
                     // half8x8 kernel
@@ -3389,25 +3467,29 @@ static void ggml_metal_encode_node(
                     default: GGML_ABORT("not implemented");
                 }
 
+                ggml_metal_kargs_cpy args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                };
+
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
@@ -3452,6 +3534,7 @@ static void ggml_metal_encode_node(
                 const int64_t n_threads = MIN((int64_t)[pipeline maxTotalThreadsPerThreadgroup], parallel_elements);
                 const int64_t n_tg = (parallel_elements + n_threads - 1) / n_threads;
 
+                // TODO: add ggml_metal_kargs struct
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0       atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8c7fcb113..86fdf1c18 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -6,6 +6,7 @@ __embed_ggml-common.h__
 // TODO: this should not be a relative path, but can't figure out how to set Metal include paths in Package.swift
 #include "../ggml-common.h"
 #endif
+#include "ggml-metal-impl.h"
 
 #include <metal_stdlib>
 
@@ -497,240 +498,131 @@ enum ggml_sort_order {
 // pros: works for non-contiguous tensors, supports broadcast across all dims
 // cons: not very efficient
 kernel void kernel_add(
+        constant ggml_metal_kargs_bin & args,
         device const char * src0,
         device const char * src1,
         device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        constant  int64_t & offs,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
 
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
 
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;
 
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10));
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) + *((device float *)(src1_ptr + i10*args.nb10));
     }
 }
 
 kernel void kernel_sub(
+        constant ggml_metal_kargs_bin & args,
         device const char * src0,
         device const char * src1,
         device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        constant  int64_t & offs,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
 
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
 
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;
 
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) - *((device float *)(src1_ptr + i10*args.nb10));
     }
 }
 
 kernel void kernel_mul(
+        constant ggml_metal_kargs_bin & args,
         device const char * src0,
         device const char * src1,
         device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
 
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
 
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1;
 
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10));
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10));
     }
 }
 
 kernel void kernel_div(
+        constant ggml_metal_kargs_bin & args,
         device const char * src0,
         device const char * src1,
         device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
 
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
 
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1;
 
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10));
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10));
     }
 }
 
 template<typename T>
 kernel void kernel_repeat(
+        constant ggml_metal_kargs_repeat & args,
         device const char * src0,
         device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i3 = tgpig.z;
+    const int i2 = tgpig.y;
+    const int i1 = tgpig.x;
 
-    const int64_t i03 = i3 % ne03;
-    const int64_t i02 = i2 % ne02;
-    const int64_t i01 = i1 % ne01;
+    const int i03 = i3%args.ne03;
+    const int i02 = i2%args.ne02;
+    const int i01 = i1%args.ne01;
 
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device       char * dst_ptr  = dst  +  i3*nb3  +  i2*nb2  +  i1*nb1 ;
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+    device       char * dst_ptr  = dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1;
 
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i00 = i0 % ne00;
-        *((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00));
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i00 = i0%args.ne00;
+        *((device T *)(dst_ptr + i0*args.nb0)) = *((device T *)(src0_ptr + i00*args.nb00));
     }
 }
 
@@ -744,38 +636,42 @@ template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_add_row(
+        constant ggml_metal_kargs_bin & args,
         device const float4 * src0,
         device const float4 * src1,
         device       float4 * dst,
-        constant   uint64_t & nb [[buffer(28)]],
         uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
     dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 
 kernel void kernel_sub_row(
+        constant ggml_metal_kargs_bin & args,
         device const float4 * src0,
         device const float4 * src1,
         device       float4 * dst,
-        constant   uint64_t & nb [[buffer(28)]],
         uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
     dst[tpig] = src0[tpig] - src1[tpig % nb];
 }
 
 kernel void kernel_mul_row(
+        constant ggml_metal_kargs_bin & args,
         device const float4 * src0,
         device const float4 * src1,
         device       float4 * dst,
-        constant   uint64_t & nb  [[buffer(28)]],
         uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
     dst[tpig] = src0[tpig] * src1[tpig % nb];
 }
 
 kernel void kernel_div_row(
+        constant ggml_metal_kargs_bin & args,
         device const float4 * src0,
         device const float4 * src1,
         device       float4 * dst,
-        constant   uint64_t & nb  [[buffer(28)]],
         uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
     dst[tpig] = src0[tpig] / src1[tpig % nb];
 }
 
@@ -1345,102 +1241,112 @@ kernel void kernel_ssm_scan_f32(
 }
 
 kernel void kernel_norm(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant     float & eps,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
-    // MEAN
-    // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00];
+        constant ggml_metal_kargs_norm & args,
+        device const char * src0,
+        device       char * dst,
+        threadgroup float * shmem_f32 [[threadgroup(0)]],
+        uint   tgpig[[threadgroup_position_in_grid]],
+        ushort tpitg[[thread_position_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort   ntg[[threads_per_threadgroup]]) {
+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
     }
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-    const float mean  = sum[0] / ne00;
 
-    // recenter and VARIANCE
+    device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
+
+    float4 sumf4(0.0f);
+
+    float sumf = 0.0f;
+
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        sumf4 += x[i00];
+    }
+    sumf = sumf4[0] + sumf4[1] + sumf4[2] + sumf4[3];
+    sumf = simd_sum(sumf);
+
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    device float * y = dst + tgpig*ne00;
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    const float mean = sumf/args.ne00;
+
+    device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
+
+    sumf = 0.0f;
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
         y[i00] = x[i00] - mean;
-        sum[tpitg] += y[i00] * y[i00];
+        sumf += dot(y[i00], y[i00]);
     }
+    sumf = simd_sum(sumf);
 
-    // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-    const float variance = sum[0] / ne00;
 
-    const float scale = 1.0f/sqrt(variance + eps);
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    const float variance = sumf/args.ne00;
+
+    const float scale = 1.0f/sqrt(variance + args.eps);
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
         y[i00] = y[i00] * scale;
     }
 }
 
 kernel void kernel_rms_norm(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant     float & eps,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+        constant ggml_metal_kargs_rms_norm & args,
+        device const char * src0,
+        device       char * dst,
+        threadgroup float * shmem_f32 [[threadgroup(0)]],
+        uint   tgpig[[threadgroup_position_in_grid]],
+        ushort tpitg[[thread_position_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort   ntg[[threads_per_threadgroup]]) {
+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
+    }
 
-    float4 sumf = 0;
-    float all_sum = 0;
+    device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
+
+    float sumf = 0.0f;
 
     // parallel sum
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        sumf += x[i00] * x[i00];
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        sumf += dot(x[i00], x[i00]);
     }
-    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
-    all_sum = simd_sum(all_sum);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
+    sumf = simd_sum(sumf);
 
-        threadgroup_barrier(mem_flags::mem_threadgroup);
+    threadgroup_barrier(mem_flags::mem_threadgroup);
 
-        if (tiisg == 0) {
-            buf[sgitg] = all_sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        all_sum = buf[tiisg];
-        all_sum = simd_sum(all_sum);
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
     }
 
-    const float mean  = all_sum/ne00;
-    const float scale = 1.0f/sqrt(mean + eps);
+    threadgroup_barrier(mem_flags::mem_threadgroup);
 
-    device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    const float mean  = sumf/args.ne00;
+    const float scale = 1.0f/sqrt(mean + args.eps);
+
+    device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
 }
@@ -1628,31 +1534,17 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
 //      quantizations where the block size is 32. It also does not
 //      guard against the number of rows not being divisible by
 //      N_DST, so this is another explicit assumption of the implementation.
-template<typename block_q_type, int nr, int nsg, int nw>
+template<typename block_q_type, int nr, int nsg, int nw, typename args_t>
 void mul_vec_q_n_f32_impl(
-        device const void  * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                     uint3   tgpig,
-                     uint    tiisg,
-                     uint    sgitg) {
-    const int nb = ne00/QK4_0;
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const int nb = args.ne00/QK4_0;
 
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
@@ -1660,19 +1552,19 @@ void mul_vec_q_n_f32_impl(
 
     const int first_row = (r0 * nsg + sgitg) * nr;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-  //const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+  //const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-  //device const block_q_type * x = (device const block_q_type *) ((device char *) src0 + offset0);
-    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
+  //device const block_q_type * x = (device const block_q_type *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
 
     // pointers to src0 rows
     device const block_q_type * ax[nr];
     for (int row = 0; row < nr; ++row) {
-        const uint offset0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
 
         ax[row] = (device const block_q_type *) ((device char *) src0 + offset0);
     }
@@ -1680,10 +1572,10 @@ void mul_vec_q_n_f32_impl(
     float yl[16]; // src1 vector cache
     float sumf[nr] = {0.f};
 
-    const int ix = (tiisg/2);
-    const int il = (tiisg%2)*8;
+    const short ix = (tiisg/2);
+    const short il = (tiisg%2)*8;
 
-    device const float * yb = y + ix * QK4_0 + il;
+    device const float * yb = y + ix*QK4_0 + il;
 
     // each thread in a SIMD group deals with half a block.
     for (int ib = ix; ib < nb; ib += nw/2) {
@@ -1708,324 +1600,216 @@ void mul_vec_q_n_f32_impl(
         yb += QK4_0 * 16;
     }
 
+    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
+
     for (int row = 0; row < nr; ++row) {
         const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0 && first_row + row < ne01) {
-            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
+
+        if (tiisg == 0 && first_row + row < args.ne01) {
+            dst_f32[first_row + row] = tot;
         }
     }
 }
 
 kernel void kernel_mul_mv_q4_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 kernel void kernel_mul_mv_q4_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 kernel void kernel_mul_mv_q5_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 kernel void kernel_mul_mv_q5_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
-
 #define NB_Q8_0 8
 
+template<typename args_t>
 void kernel_mul_mv_q8_0_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
     const int nr  = N_DST;
     const int nsg = N_SIMDGROUP;
     const int nw  = N_SIMDWIDTH;
 
-    const int nb = ne00/QK8_0;
+    const int nb = args.ne00/QK8_0;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
-    const int first_row = (r0 * nsg + sgitg) * nr;
+    const int first_row = (r0*nsg + sgitg)*nr;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-  //const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+  //const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-  //device const block_q8_0 * x = (device const block_q8_0 *) ((device char *) src0 + offset0);
-    device const float      * y = (device const float      *) ((device char *) src1 + offset1);
+  //device const block_q8_0 * x = (device const block_q8_0 *) (src0 + offset0);
+    device const float      * y = (device const float      *) (src1 + offset1);
 
     // pointers to src0 rows
     device const block_q8_0 * ax[nr];
     for (int row = 0; row < nr; ++row) {
-        const uint offset0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
 
         ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0);
     }
 
     float yl[NB_Q8_0];
-    float sumf[nr]={0.f};
+    float sumf[nr] = { 0.f };
 
-    const int ix = tiisg/4;
-    const int il = tiisg%4;
+    const short ix = tiisg/4;
+    const short il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
+    device const float * yb = y + ix*QK8_0 + il*NB_Q8_0;
 
     // each thread in a SIMD group deals with NB_Q8_0 quants at a time
     for (int ib = ix; ib < nb; ib += nw/4) {
-        for (int i = 0; i < NB_Q8_0; ++i) {
+        for (short i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = ax[row][ib].qs + NB_Q8_0*il;
+            device const int8_t * qs = ax[row][ib].qs + il*NB_Q8_0;
             float sumq = 0.f;
-            for (int iq = 0; iq < NB_Q8_0; ++iq) {
+            for (short iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*ax[row][ib].d;
         }
 
-        yb += NB_Q8_0 * nw;
+        yb += nw*NB_Q8_0;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < nr; ++row) {
         const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+
+        if (tiisg == 0 && first_row + row < args.ne01) {
+            dst_f32[first_row + row] = tot;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_q8_0_f32")]]
 kernel void kernel_mul_mv_q8_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    kernel_mul_mv_q8_0_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 #define N_MV_T_T 4
 
-template<typename T0, typename T04, typename T1, typename T14>
+template<typename T0, typename T04, typename T1, typename T14, typename args_t>
 void kernel_mul_mv_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb00,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne11,
-                   int64_t   ne12,
-                  uint64_t   nb10,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-                   uint3     tgpig,
-                   uint      tiisg) {
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_MV_T_T;
-    const int64_t im = tgpig.z;
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig,
+        ushort tiisg) {
+    const int r0 = tgpig.x;
+    const int rb = tgpig.y*N_MV_T_T;
+    const int im = tgpig.z;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
 
     device const T0 * x = (device const T0 *) (src0 + offset0);
 
-    if (ne00 < 128) {
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1;
+
+    if (args.ne00 < 128) {
         for (int row = 0; row < N_MV_T_T; ++row) {
             int r1 = rb + row;
-            if (r1 >= ne11) {
+            if (r1 >= args.ne11) {
                 break;
             }
 
-            const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+            const uint64_t offset1 = r1*args.nb11 + (i12   )*args.nb12 + (i13   )*args.nb13;
 
             device const T1 * y = (device const T1 *) (src1 + offset1);
 
             float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
+            for (int i = tiisg; i < args.ne00; i += 32) {
                 sumf += (T0) x[i] * (T1) y[i];
             }
 
             float all_sum = simd_sum(sumf);
             if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+                dst_f32[(uint64_t)r1*args.ne0 + r0] = all_sum;
             }
         }
     } else {
         device const T04 * x4 = (device const T04 *) x;
         for (int row = 0; row < N_MV_T_T; ++row) {
             int r1 = rb + row;
-            if (r1 >= ne11) {
+            if (r1 >= args.ne11) {
                 break;
             }
 
-            const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+            const uint64_t offset1 = r1*args.nb11 + (i12   )*args.nb12 + (i13   )*args.nb13;
 
             device const T1  * y  = (device const T1  *) (src1 + offset1);
             device const T14 * y4 = (device const T14 *) y;
 
             float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (float) (x4[i][k] * y4[i][k]);
+            for (int i = tiisg; i < args.ne00/4; i += 32) {
+                sumf += dot((float4) x4[i], (float4) y4[i]);
             }
 
             float all_sum = simd_sum(sumf);
             if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) (x[i] * y[i]);
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+                for (int i = 4*(args.ne00/4); i < args.ne00; ++i) all_sum += (float) (x[i] * y[i]);
+                dst_f32[(uint64_t)r1*args.ne0 + r0] = all_sum;
             }
         }
     }
@@ -2033,51 +1817,17 @@ void kernel_mul_mv_impl(
 
 template<typename T0, typename T04, typename T1, typename T14>
 kernel void kernel_mul_mv(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_impl<T0, T04, T1, T14>(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_impl<T0, T04, T1, T14, constant ggml_metal_kargs_mul_mv &>(
+        args,
         src0,
         src1,
         dst,
-        ne00,
-        ne01,
-        ne02,
-        nb00,
-        nb01,
-        nb02,
-        nb03,
-        ne10,
-        ne11,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        nb13,
-        ne0,
-        ne1,
-        r2,
-        r3,
         tgpig,
         tiisg);
 }
@@ -2094,65 +1844,50 @@ template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv<
 
 template<typename T, typename T4>
 kernel void kernel_mul_mv_1row(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
 
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
     device const T     * x = (device const T     *) (src0 + offset0);
     device const float * y = (device const float *) (src1 + offset1);
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     float sumf = 0;
-    if (ne00 < 128) {
-        for (int i = tiisg; i < ne00; i += 32) {
+    if (args.ne00 < 128) {
+        for (int i = tiisg; i < args.ne00; i += 32) {
             sumf += (float) x[i] * (float) y[i];
         }
         float all_sum = simd_sum(sumf);
         if (tiisg == 0) {
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            dst_f32[r0] = all_sum;
         }
     } else {
         device const T4     * x4 = (device const T4     *) x;
         device const float4 * y4 = (device const float4 *) y;
 
-        for (int i = tiisg; i < ne00/4; i += 32) {
-            for (int k = 0; k < 4; ++k) sumf += (float) (x4[i][k] * y4[i][k]);
+        for (int i = tiisg; i < args.ne00/4; i += 32) {
+            sumf += dot((float4) x4[i], y4[i]);
         }
 
         float all_sum = simd_sum(sumf);
 
         if (tiisg == 0) {
-            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) (x[i] * y[i]);
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            for (int i = 4*(args.ne00/4); i < args.ne00; ++i) all_sum += (float) (x[i] * y[i]);
+            dst_f32[r0] = all_sum;
         }
     }
 }
@@ -2167,54 +1902,39 @@ template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kerne
 // Assumes row size (ne00) is a multiple of 4
 template<typename T, typename T4>
 kernel void kernel_mul_mv_l4(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
 
-    const int nrows = ne11;
-    const int64_t r0 = tgpig.x;
-    const int64_t im = tgpig.z;
+    const int nrows = args.ne11;
+    const int r0 = tgpig.x;
+    const int im = tgpig.z;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
 
     device const T4 * x4 = (device const T4 *) (src0 + offset0);
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1;
+
     for (int r1 = 0; r1 < nrows; ++r1) {
-        const uint offset1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+        const uint64_t offset1 = r1*args.nb11 + (i12   )*args.nb12 + (i13   )*args.nb13;
 
         device const float4 * y4 = (device const float4 *) (src1 + offset1);
 
         float sumf = 0;
-        for (int i = tiisg; i < ne00/4; i += 32) {
-            for (int k = 0; k < 4; ++k) sumf += (float) (x4[i][k] * y4[i][k]);
+        for (int i = tiisg; i < args.ne00/4; i += 32) {
+            sumf += dot((float4) x4[i], y4[i]);
         }
 
         float all_sum = simd_sum(sumf);
         if (tiisg == 0) {
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            dst_f32[(uint64_t)r1*args.ne0 + r0] = all_sum;
         }
     }
 }
@@ -2234,7 +1954,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
 // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
 // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
 static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    float theta_extrap, float freq_scale, float corr_dims[2], int i0, float ext_factor, float mscale,
     thread float * cos_theta, thread float * sin_theta) {
     // Get n-d rotational scaling corrected for extrapolation
     float theta_interp = freq_scale * theta_extrap;
@@ -2266,65 +1986,41 @@ static void rope_yarn_corr_dims(
 
 template<typename T>
 kernel void kernel_rope_norm(
-        device const    void * src0,
-        device const int32_t * src1,
-        device const   float * src2,
-        device         float * dst,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant     int64_t & ne03,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant    uint64_t & nb03,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant     int64_t & ne2,
-        constant     int64_t & ne3,
-        constant    uint64_t & nb0,
-        constant    uint64_t & nb1,
-        constant    uint64_t & nb2,
-        constant    uint64_t & nb3,
-        constant         int & n_past,
-        constant         int & n_dims,
-        constant         int & n_ctx_orig,
-        constant       float & freq_base,
-        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]) {
-    const int64_t i3 = tgpig[2];
-    const int64_t i2 = tgpig[1];
-    const int64_t i1 = tgpig[0];
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
 
     float corr_dims[2];
-    rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
 
-    device const int32_t * pos = src1;
+    device const int32_t * pos = (device const int32_t *) src1;
 
     const float theta_base = (float) pos[i2];
-    const float inv_ndims = -1.f/n_dims;
+    const float inv_ndims = -1.f/args.n_dims;
 
     float cos_theta;
     float sin_theta;
 
-    for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
-        if (i0 < n_dims) {
-            const int64_t ic = i0/2;
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < args.n_dims) {
+            const int ic = i0/2;
 
-            const float theta = theta_base * pow(freq_base, inv_ndims*i0);
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
 
-            const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
 
-            rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
 
-            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
 
             const float x0 = src[0];
             const float x1 = src[1];
@@ -2332,8 +2028,8 @@ kernel void kernel_rope_norm(
             dst_data[0] = x0*cos_theta - x1*sin_theta;
             dst_data[1] = x0*sin_theta + x1*cos_theta;
         } else {
-            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
 
             dst_data[0] = src[0];
             dst_data[1] = src[1];
@@ -2343,74 +2039,50 @@ kernel void kernel_rope_norm(
 
 template<typename T>
 kernel void kernel_rope_neox(
-        device const    void * src0,
-        device const int32_t * src1,
-        device const   float * src2,
-        device         float * dst,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant     int64_t & ne03,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant    uint64_t & nb03,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant     int64_t & ne2,
-        constant     int64_t & ne3,
-        constant    uint64_t & nb0,
-        constant    uint64_t & nb1,
-        constant    uint64_t & nb2,
-        constant    uint64_t & nb3,
-        constant         int & n_past,
-        constant         int & n_dims,
-        constant         int & n_ctx_orig,
-        constant       float & freq_base,
-        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]) {
-    const int64_t i3 = tgpig[2];
-    const int64_t i2 = tgpig[1];
-    const int64_t i1 = tgpig[0];
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
 
     float corr_dims[2];
-    rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
 
-    device const int32_t * pos = src1;
+    device const int32_t * pos = (device const int32_t *) src1;
 
     const float theta_base = (float) pos[i2];
-    const float inv_ndims = -1.f/n_dims;
+    const float inv_ndims = -1.f/args.n_dims;
 
     float cos_theta;
     float sin_theta;
 
-    for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
-        if (i0 < n_dims) {
-            const int64_t ic = i0/2;
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < args.n_dims) {
+            const int ic = i0/2;
 
-            const float theta = theta_base * pow(freq_base, inv_ndims*i0);
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
 
-            const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
 
-            rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
 
-            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
 
             const float x0 = src[0];
-            const float x1 = src[n_dims/2];
+            const float x1 = src[args.n_dims/2];
 
-            dst_data[0]        = x0*cos_theta - x1*sin_theta;
-            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
         } else {
-            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
 
             dst_data[0] = src[0];
             dst_data[1] = src[1];
@@ -2808,37 +2480,17 @@ template<
     short KV = 8,    // key/value processed per each simdgroup
     short C  = 32>   // cache items per threadgroup
 kernel void kernel_flash_attn_ext(
-        device const  char * q,
-        device const  char * k,
-        device const  char * v,
-        device const  char * mask,
-        device       float * dst,
-        constant   int32_t & ne01,
-        constant   int32_t & ne02,
-        constant   int32_t & ne03,
-        constant  uint32_t & nb01,
-        constant  uint32_t & nb02,
-        constant  uint32_t & nb03,
-        constant   int32_t & ne11,
-        constant   int32_t & ne_12_2, // assume K and V are same shape
-        constant   int32_t & ne_12_3,
-        constant  uint32_t & nb_12_1,
-        constant  uint32_t & nb_12_2,
-        constant  uint32_t & nb_12_3,
-        constant  uint32_t & nb31,
-        constant   int32_t & ne1,
-        constant   int32_t & ne2,
-        constant     float & scale,
-        constant     float & max_bias,
-        constant     float & m0,
-        constant     float & m1,
-        constant  uint16_t & n_head_log2,
-        constant     float & logit_softcap,
-        threadgroup   half * shared [[threadgroup(0)]],
-        ushort3  tgpig[[threadgroup_position_in_grid]],
-        ushort3    ntg[[threads_per_threadgroup]],
-        ushort   tiisg[[thread_index_in_simdgroup]],
-        ushort   sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_flash_attn_ext & args,
+        device const char * q,
+        device const char * k,
+        device const char * v,
+        device const char * mask,
+        device       char * dst,
+        threadgroup  half * shmem_f16 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3   ntg[[threads_per_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
     const short nsg = ntg.y; // number of simdgroups
 
     const int iq3 = tgpig[2];
@@ -2854,27 +2506,27 @@ kernel void kernel_flash_attn_ext(
     const short TS = nsg*SH;   // shared memory size per query in (s_t == float)
     const short T  = D + 2*TS; // shared memory size per query in (half)
 
-    threadgroup q_t  * sq  = (threadgroup q_t  *) (shared +              0*D); // holds the query data
-    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shared +              0*D); // same as above but in q4_t
-    threadgroup o_t  * so  = (threadgroup o_t  *) (shared +              0*D); // reuse query data for accumulation
-    threadgroup o4_t * so4 = (threadgroup o4_t *) (shared +              0*D); // same as above but in o4_t
-    threadgroup s_t  * ss  = (threadgroup s_t  *) (shared + 2*sgitg*SH + Q*D); // scratch buffer for attention, mask and diagonal matrix
+    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 +              0*D); // holds the query data
+    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 +              0*D); // same as above but in q4_t
+    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 +              0*D); // reuse query data for accumulation
+    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 +              0*D); // same as above but in o4_t
+    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + Q*D); // scratch buffer for attention, mask and diagonal matrix
 
-    threadgroup k_t    * sk    = (threadgroup k_t    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
-    threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t
+    threadgroup k_t    * sk    = (threadgroup k_t    *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
+    threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t
 
-    threadgroup v_t    * sv    = (threadgroup v_t    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load V in shared memory
-    threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in v4x4_t
+    threadgroup v_t    * sv    = (threadgroup v_t    *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load V in shared memory
+    threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in v4x4_t
 
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
     o8x8_t lo[D8];
 
     // load heads from Q to shared memory
     for (short j = sgitg; j < Q; j += nsg) {
-        device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
+        device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*args.nb01 + iq2*args.nb02 + iq3*args.nb03));
 
         for (short i = tiisg; i < D4; i += NW) {
-            if (iq1 + j < ne01) {
+            if (iq1 + j < args.ne01) {
                 sq4[j*D4 + i] = (q4_t) q4[i];
             } else {
                 sq4[j*D4 + i] = (q4_t) 0.0f;
@@ -2907,11 +2559,11 @@ kernel void kernel_flash_attn_ext(
         const short ty = tiisg/4;
 
         // broadcast kv
-        //const short rk2 = ne02/ne12;
-        //const short rk3 = ne03/ne13;
+        //const short rk2 = args.ne02/args.ne12;
+        //const short rk3 = args.ne03/args.ne13;
 
-        const short ikv2 = iq2/(ne02/ne_12_2);
-        const short ikv3 = iq3/(ne03/ne_12_3);
+        const short ikv2 = iq2/(args.ne02/args.ne_12_2);
+        const short ikv3 = iq3/(args.ne03/args.ne_12_3);
 
         // load the queries from shared memory into local memory
         q8x8_t mq[D8];
@@ -2925,20 +2577,20 @@ kernel void kernel_flash_attn_ext(
         half slope = 1.0f;
 
         // ALiBi
-        if (max_bias > 0.0f) {
+        if (args.max_bias > 0.0f) {
             const short h = iq2;
 
-            const half  base = h < n_head_log2 ? m0 : m1;
-            const short exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+            const half  base = h < args.n_head_log2 ? args.m0 : args.m1;
+            const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
 
             slope = pow(base, exph);
         }
 
         // loop over the KV cache
         // each simdgroup handles blocks of Q rows and C columns
-        for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
+        for (int ic0 = 0; ic0 < args.ne11; ic0 += C*nsg) {
             const int ic = ic0 + C*sgitg;
-            if (ic >= ne11) {
+            if (ic >= args.ne11) {
                 break;
             }
 
@@ -2949,7 +2601,7 @@ kernel void kernel_flash_attn_ext(
                 // load the mask in shared memory
                 #pragma unroll(Q)
                 for (short j = 0; j < Q; ++j) {
-                    device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*nb31);
+                    device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31);
 
                     const half m = pm[ic + tiisg];
 
@@ -2972,18 +2624,18 @@ kernel void kernel_flash_attn_ext(
                     // this is compile-time check, so it does not have runtime overhead
                     if (is_same<kd4x4_t, k4x4_t>::value) {
                         // we can read directly from global memory
-                        device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
+                        device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
 
                         #pragma unroll(D8)
                         for (short i = 0; i < D8; ++i) {
                             k8x8_t mk;
-                            simdgroup_load(mk, pk + i*8, nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
+                            simdgroup_load(mk, pk + i*8, args.nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
 
                             simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
                         }
                     } else {
                         for (short ii = 0; ii < D16; ii += 4) {
-                            device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
+                            device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
 
                             if (D16%4 == 0) {
                                 // the head is evenly divisible by 4*16 = 64, so no need for bound checks
@@ -3042,10 +2694,10 @@ kernel void kernel_flash_attn_ext(
                     const half m = M[j];
 
                     // scale and apply the logitcap / mask
-                    half s = ss[j*TS + tiisg]*scale;
+                    half s = ss[j*TS + tiisg]*args.scale;
 
-                    if (logit_softcap != 0.0f) {
-                        s = logit_softcap*precise::tanh(s);
+                    if (args.logit_softcap != 0.0f) {
+                        s = args.logit_softcap*precise::tanh(s);
                     }
 
                     // mqk = mqk + mask*slope
@@ -3087,18 +2739,18 @@ kernel void kernel_flash_attn_ext(
 
                     if (is_same<vd4x4_t, v4x4_t>::value) {
                         // we can read directly from global memory
-                        device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
+                        device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
 
                         #pragma unroll(D8)
                         for (short i = 0; i < D8; ++i) {
                             v8x8_t mv;
-                            simdgroup_load(mv, pv + i*8, nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
+                            simdgroup_load(mv, pv + i*8, args.nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
 
                             simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
                         }
                     } else {
                         for (short ii = 0; ii < D16; ii += 4) {
-                            device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
+                            device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
 
                             if (D16%4 == 0) {
                                 // no need for bound checks
@@ -3227,11 +2879,11 @@ kernel void kernel_flash_attn_ext(
 
     // final rescale with 1/S and store to global memory
     if (sgitg == 0) {
-        for (short j = 0; j < Q && iq1 + j < ne01; ++j) {
+        for (short j = 0; j < Q && iq1 + j < args.ne01; ++j) {
             const float S = ss[j*TS + 0];
 
             for (short i = tiisg; i < D4; i += NW) {
-                dst4[((int64_t)iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) so4[j*D4 + i]/S;
+                dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*D4 + i] = (float4) so4[j*D4 + i]/S;
             }
         }
     }
@@ -3323,38 +2975,17 @@ template<
     short Q  = 1,    // queries per threadgroup
     short C  = 32>   // cache items per threadgroup
 kernel void kernel_flash_attn_ext_vec(
-        device const  char * q,
-        device const  char * k,
-        device const  char * v,
-        device const  char * mask,
-        device       float * dst,
-        constant   int32_t & ne01,
-        constant   int32_t & ne02,
-        constant   int32_t & ne03,
-        constant  uint32_t & nb01,
-        constant  uint32_t & nb02,
-        constant  uint32_t & nb03,
-        constant   int32_t & ne11,
-        constant   int32_t & ne_12_2, // assume K and V are same shape
-        constant   int32_t & ne_12_3,
-        constant  uint32_t & nb_12_1,
-        constant  uint32_t & nb_12_2,
-        constant  uint32_t & nb_12_3,
-        constant  uint32_t & nb31,
-        constant   int32_t & ne1,
-        constant   int32_t & ne2,
-        constant     float & scale,
-        constant     float & max_bias,
-        constant     float & m0,
-        constant     float & m1,
-        constant  uint16_t & n_head_log2,
-        constant     float & logit_softcap,
-        threadgroup   half * shared [[threadgroup(0)]],
-        ushort3  tgpig[[threadgroup_position_in_grid]],
-        ushort3  tpitg[[thread_position_in_threadgroup]],
-        ushort3    ntg[[threads_per_threadgroup]],
-        ushort   tiisg[[thread_index_in_simdgroup]],
-        ushort   sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_flash_attn_ext & args,
+        device const char * q,
+        device const char * k,
+        device const char * v,
+        device const char * mask,
+        device       char * dst,
+        threadgroup  half * shmem_f16 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3   ntg[[threads_per_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
     const short nsg = ntg.y; // number of simdgroups
 
     const int iq3 = tgpig[2];
@@ -3369,22 +3000,22 @@ kernel void kernel_flash_attn_ext_vec(
 
     const short T = D + nsg*SH; // shared memory size per query in (half)
 
-  //threadgroup q_t    * sq    = (threadgroup q_t    *) (shared +                0*D); // holds the query data
-    threadgroup q4_t   * sq4   = (threadgroup q4_t   *) (shared +                0*D); // same as above but in q4_t
-    threadgroup q4x4_t * sq4x4 = (threadgroup q4x4_t *) (shared +                0*D); // same as above but in q4x4_t
-    threadgroup s_t    * ss    = (threadgroup s_t    *) (shared + sgitg*SH     + Q*D); // scratch buffer for attention
-    threadgroup s4_t   * ss4   = (threadgroup s4_t   *) (shared + sgitg*SH     + Q*D); // same as above but in s4_t
-    threadgroup half   * sm    = (threadgroup half   *) (shared + sgitg*SH + C + Q*D); // scratch buffer for mask
-    threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shared + sgitg*D      + Q*T); // scratch buffer for the results
+  //threadgroup q_t    * sq    = (threadgroup q_t    *) (shmem_f16 +                0*D); // holds the query data
+    threadgroup q4_t   * sq4   = (threadgroup q4_t   *) (shmem_f16 +                0*D); // same as above but in q4_t
+    threadgroup q4x4_t * sq4x4 = (threadgroup q4x4_t *) (shmem_f16 +                0*D); // same as above but in q4x4_t
+    threadgroup s_t    * ss    = (threadgroup s_t    *) (shmem_f16 + sgitg*SH     + Q*D); // scratch buffer for attention
+    threadgroup s4_t   * ss4   = (threadgroup s4_t   *) (shmem_f16 + sgitg*SH     + Q*D); // same as above but in s4_t
+    threadgroup half   * sm    = (threadgroup half   *) (shmem_f16 + sgitg*SH + C + Q*D); // scratch buffer for mask
+    threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shmem_f16 + sgitg*D      + Q*T); // scratch buffer for the results
 
     // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
     o4x4_t lo[D16/NL];
 
     // load heads from Q to shared memory
-    device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03));
+    device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03));
 
     for (short i = tiisg; i < D4; i += NW) {
-        if (iq1 < ne01) {
+        if (iq1 < args.ne01) {
             sq4[i] = (q4_t) q4[i];
         } else {
             sq4[i] = (q4_t) 0.0f;
@@ -3412,11 +3043,11 @@ kernel void kernel_flash_attn_ext_vec(
         const short ty = tiisg/NL;
 
         // broadcast kv
-        //const short rk2 = ne02/ne12;
-        //const short rk3 = ne03/ne13;
+        //const short rk2 = args.ne02/args.ne12;
+        //const short rk3 = args.ne03/args.ne13;
 
-        const short ikv2 = iq2/(ne02/ne_12_2);
-        const short ikv3 = iq3/(ne03/ne_12_3);
+        const short ikv2 = iq2/(args.ne02/args.ne_12_2);
+        const short ikv3 = iq3/(args.ne03/args.ne_12_3);
 
         // load the queries from shared memory into local memory
         q4x4_t mq[D16/NL];
@@ -3429,25 +3060,25 @@ kernel void kernel_flash_attn_ext_vec(
         const bool has_mask = mask != q;
 
         // pointer to the mask
-        device const half * pm = (device const half *) (mask + iq1*nb31);
+        device const half * pm = (device const half *) (mask + iq1*args.nb31);
 
         half slope = 1.0f;
 
         // ALiBi
-        if (max_bias > 0.0f) {
+        if (args.max_bias > 0.0f) {
             const short h = iq2;
 
-            const half  base = h < n_head_log2 ? m0 : m1;
-            const short exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+            const half  base = h < args.n_head_log2 ? args.m0 : args.m1;
+            const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
 
             slope = pow(base, exph);
         }
 
         // loop over the KV cache
         // each simdgroup handles blocks of Q rows and C columns
-        for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
+        for (int ic0 = 0; ic0 < args.ne11; ic0 += C*nsg) {
             const int ic = ic0 + C*sgitg;
-            if (ic >= ne11) {
+            if (ic >= args.ne11) {
                 break;
             }
 
@@ -3461,7 +3092,7 @@ kernel void kernel_flash_attn_ext_vec(
                 for (short cc = 0; cc < C/4; ++cc) {
                     qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 };
 
-                    device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
+                    device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
 
                     #pragma unroll(D16/NL)
                     for (short ii = 0; ii < D16; ii += NL) {
@@ -3497,10 +3128,10 @@ kernel void kernel_flash_attn_ext_vec(
 
                     // mqk = mqk*scale + mask*slope
                     if (tx == 0) {
-                        mqk *= scale;
+                        mqk *= args.scale;
 
-                        if (logit_softcap != 0.0f) {
-                            mqk = logit_softcap*precise::tanh(mqk);
+                        if (args.logit_softcap != 0.0f) {
+                            mqk = args.logit_softcap*precise::tanh(mqk);
                         }
 
                         mqk += sm[4*cc + ty]*slope;
@@ -3539,7 +3170,7 @@ kernel void kernel_flash_attn_ext_vec(
             // O = O + (Q*K^T)*V
             {
                 for (short cc = 0; cc < C/4; ++cc) {
-                    device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
+                    device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
 
                     const s4x4_t ms(ss[4*cc + ty]);
 
@@ -3644,7 +3275,7 @@ kernel void kernel_flash_attn_ext_vec(
         const float S = ss[0];
 
         for (short i = tiisg; i < D16; i += NW) {
-            dst44[((int64_t)iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = (float4x4) sr4x4[i]/S;
+            dst44[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)iq1*args.ne1)*D16 + i] = (float4x4) sr4x4[i]/S;
         }
     }
 }
@@ -3686,42 +3317,27 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_
 
 template<typename T0, typename T1>
 kernel void kernel_cpy(
-        device  const void * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device  const char * src0,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
 
-    device T1 * dst_data = (device T1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const T0 * src = (device T0 *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
+        device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
         dst_data[i00] = (T1) src[0];
     }
 }
@@ -3741,42 +3357,27 @@ template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy<bf
 #endif
 
 kernel void kernel_cpy_f32_q8_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0;
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK8_0;
 
-    device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device block_q8_0 * dst_data = (device block_q8_0 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x*QK8_0; i00 < args.ne00; i00 += ntg.x*QK8_0) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
 
         float amax = 0.0f; // absolute max
 
@@ -3799,42 +3400,27 @@ kernel void kernel_cpy_f32_q8_0(
 }
 
 kernel void kernel_cpy_f32_q4_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0;
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK4_0;
 
-    device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device block_q4_0 * dst_data = (device block_q4_0 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x*QK4_0; i00 < args.ne00; i00 += ntg.x*QK4_0) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
 
         float amax = 0.0f; // absolute max
         float max  = 0.0f;
@@ -3866,42 +3452,27 @@ kernel void kernel_cpy_f32_q4_0(
 }
 
 kernel void kernel_cpy_f32_q4_1(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1;
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK4_1;
 
-    device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device block_q4_1 * dst_data = (device block_q4_1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x*QK4_1; i00 < args.ne00; i00 += ntg.x*QK4_1) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
 
         float min = FLT_MAX;
         float max = -FLT_MAX;
@@ -3932,42 +3503,27 @@ kernel void kernel_cpy_f32_q4_1(
 }
 
 kernel void kernel_cpy_f32_q5_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK5_0;
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK5_0;
 
-    device block_q5_0 * dst_data = (device block_q5_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device block_q5_0 * dst_data = (device block_q5_0 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x*QK5_0; i00 < ne00; i00 += ntg.x*QK5_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x*QK5_0; i00 < args.ne00; i00 += ntg.x*QK5_0) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
 
         float amax = 0.0f; // absolute max
         float max  = 0.0f;
@@ -4005,42 +3561,27 @@ kernel void kernel_cpy_f32_q5_0(
 }
 
 kernel void kernel_cpy_f32_q5_1(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK5_1;
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK5_1;
 
-    device block_q5_1 * dst_data = (device block_q5_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device block_q5_1 * dst_data = (device block_q5_1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x*QK5_1; i00 < ne00; i00 += ntg.x*QK5_1) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x*QK5_1; i00 < args.ne00; i00 += ntg.x*QK5_1) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
 
         float max = src[0];
         float min = src[0];
@@ -4088,42 +3629,27 @@ static inline int best_index_int8(int n, constant float * val, float x) {
 }
 
 kernel void kernel_cpy_f32_iq4_nl(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
 
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
 
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_NL;
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK4_NL;
 
-    device block_iq4_nl * dst_data = (device block_iq4_nl *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    device block_iq4_nl * dst_data = (device block_iq4_nl *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
-    for (int64_t i00 = tpitg.x*QK4_NL; i00 < ne00; i00 += ntg.x*QK4_NL) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+    for (int64_t i00 = tpitg.x*QK4_NL; i00 < args.ne00; i00 += ntg.x*QK4_NL) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
 
         float amax = 0.0f; // absolute max
         float max  = 0.0f;
@@ -4159,104 +3685,66 @@ kernel void kernel_cpy_f32_iq4_nl(
         }
 
         dst_data[i00/QK4_NL].d = sumq2 > 0 ? sumqx/sumq2 : d;
-
     }
 }
 
 kernel void kernel_concat(
+    constant ggml_metal_kargs_concat & args,
     device  const char * src0,
     device  const char * src1,
     device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne10,
-    constant   int64_t & ne11,
-    constant   int64_t & ne12,
-    constant   int64_t & ne13,
-    constant  uint64_t & nb10,
-    constant  uint64_t & nb11,
-    constant  uint64_t & nb12,
-    constant  uint64_t & nb13,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    constant   int32_t & dim,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    ushort3 tpitg[[thread_position_in_threadgroup]],
+    ushort3   ntg[[threads_per_threadgroup]]) {
 
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
+    const int i3 = tgpig.z;
+    const int i2 = tgpig.y;
+    const int i1 = tgpig.x;
 
-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+    int o[4] = {0, 0, 0, 0};
+    o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
 
     device const float * x;
 
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-            x = (device const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
+            x = (device const float *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
         } else {
-            x = (device const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+            x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
         }
 
-        device float * y = (device float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+        device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
 
         *y = *x;
     }
 }
 
+template<typename args_t>
 void kernel_mul_mv_q2_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_q2_K * x = (device const block_q2_K *) ((device char *) src0 + offset0);
-    device const float      * y = (device const float      *) ((device char *) src1 + offset1);
+    device const block_q2_K * x = (device const block_q2_K *) (src0 + offset0);
+    device const float      * y = (device const float      *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -4305,92 +3793,64 @@ void kernel_mul_mv_q2_K_f32_impl(
                                  (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
                          dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
 
-            qs += nb01/2;
-            sc += nb01;
-            dh += nb01/2;
+            qs += args.nb01/2;
+            sc += args.nb01;
+            dh += args.nb01/2;
         }
 
         y4 += 4 * QK_K;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_q2_K_f32")]]
 kernel void kernel_mul_mv_q2_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q2_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
+template<typename args_t>
 void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
 
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_q3_K * x = (device const block_q3_K *) ((device char *) src0 + offset0);
-    device const float     * yy = (device const float      *) ((device char *) src1 + offset1);
+    device const block_q3_K * x = (device const block_q3_K *) (src0 + offset0);
+    device const float     * yy = (device const float      *) (src1 + offset1);
 
     float yl[32];
 
@@ -4420,9 +3880,10 @@ void kernel_mul_mv_q3_K_f32_impl(
 
     const ushort4 hm = mm[2*ip + il/2];
 
-    const int shift = 2*il;
-    const float    v1 = il == 0 ? 4.f : 64.f;
-    const float    v2 = 4.f * v1;
+    const short shift = 2*il;
+
+    const float v1 = il == 0 ? 4.f : 64.f;
+    const float v2 = 4.f * v1;
 
     const uint16_t s_shift1 = 4*ip;
     const uint16_t s_shift2 = s_shift1 + il;
@@ -4491,10 +3952,10 @@ void kernel_mul_mv_q3_K_f32_impl(
             sumf1[row] += d1 * (scales[1] - 32);
             sumf2[row] += d2 * (scales[3] - 32);
 
-            q  += nb01/2;
-            h  += nb01/2;
-            a  += nb01/2;
-            dh += nb01/2;
+            q  += args.nb01/2;
+            h  += args.nb01/2;
+            a  += args.nb01/2;
+            dh += args.nb01/2;
         }
 
         y1 += 4 * QK_K;
@@ -4504,66 +3965,39 @@ void kernel_mul_mv_q3_K_f32_impl(
         const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
         sumf1[row] = simd_sum(sumf);
     }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     if (tiisg == 0) {
         for (int row = 0; row < 2; ++row) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row];
+            dst_f32[first_row + row] = sumf1[row];
         }
     }
 }
 
 [[host_name("kernel_mul_mv_q3_K_f32")]]
 kernel void kernel_mul_mv_q3_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q3_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
+template<typename args_t>
 void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
@@ -4574,21 +4008,21 @@ void kernel_mul_mv_q4_K_f32_impl(
     const int iq = it/4;     // 0 or 1
     const int ir = it%4;     // 0...3
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
     //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
     const int first_row = r0 * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_q4_K * x = (device const block_q4_K *) ((device char *) src0 + offset0);
-    device const float      * y = (device const float      *) ((device char *) src1 + offset1);
+    device const block_q4_K * x = (device const block_q4_K *) (src0 + offset0);
+    device const float      * y = (device const float      *) (src1 + offset1);
 
     float yl[16];
     float yh[16];
@@ -4641,92 +4075,64 @@ void kernel_mul_mv_q4_K_f32_impl(
                                  (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
                          dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
 
-            q1 += nb01/2;
-            sc += nb01/2;
-            dh += nb01/2;
+            q1 += args.nb01/2;
+            sc += args.nb01/2;
+            dh += args.nb01/2;
         }
 
         y4 += 4 * QK_K;
     }
 
+    device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_q4_K_f32")]]
 kernel void kernel_mul_mv_q4_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q4_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
+template<typename args_t>
 void kernel_mul_mv_q5_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
 
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_q5_K * x = (device const block_q5_K *) ((device char *) src0 + offset0);
-    device const float     * yy = (device const float      *) ((device char *) src1 + offset1);
+    device const block_q5_K * x = (device const block_q5_K *) (src0 + offset0);
+    device const float     * yy = (device const float      *) (src1 + offset1);
 
     float sumf[2]={0.f};
 
@@ -4800,98 +4206,70 @@ void kernel_mul_mv_q5_K_f32_impl(
                                  sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
                          dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
 
-            q1 += nb01;
-            qh += nb01;
-            dh += nb01/2;
-            a  += nb01/2;
+            q1 += args.nb01;
+            qh += args.nb01;
+            dh += args.nb01/2;
+            a  += args.nb01/2;
         }
 
         y1 += 4 * QK_K;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < 2; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+            dst_f32[first_row + row] = tot;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_q5_K_f32")]]
 kernel void kernel_mul_mv_q5_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q5_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
+template <typename args_t>
 void kernel_mul_mv_q6_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
     const uint8_t kmask1 = 0x03;
     const uint8_t kmask2 = 0x0C;
     const uint8_t kmask3 = 0x30;
     const uint8_t kmask4 = 0xC0;
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
 
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int     im = tgpig.z;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
 
-    const int row = 2 * r0 + sgitg;
+    const int row = 2*r0 + sgitg;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =  r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =  r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_q6_K * x = (device const block_q6_K *) ((device char *) src0 + offset0);
-    device const float     * yy = (device const float      *) ((device char *) src1 + offset1);
+    device const block_q6_K * x = (device const block_q6_K *) (src0 + offset0);
+    device const float     * yy = (device const float      *) (src1 + offset1);
 
     float sumf = 0;
 
@@ -4908,7 +4286,6 @@ void kernel_mul_mv_q6_K_f32_impl(
     const int q_offset_h = 32*ip + l0;
 
     for (int i = ix; i < nb; i += 2) {
-
         device const uint8_t * q1 = x[i].ql + q_offset_l;
         device const uint8_t * q2 = q1 + 32;
         device const uint8_t * qh = x[i].qh + q_offset_h;
@@ -4930,98 +4307,70 @@ void kernel_mul_mv_q6_K_f32_impl(
 
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     const float tot = simd_sum(sumf);
     if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+        dst_f32[row] = tot;
     }
 }
 
 [[host_name("kernel_mul_mv_q6_K_f32")]]
 kernel void kernel_mul_mv_q6_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_q6_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 // ======================= "True" 2-bit
 
+template<typename args_t>
 void kernel_mul_mv_iq2_xxs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq2_xxs * x = (device const block_iq2_xxs *) ((device char *) src0 + offset0);
-    device const float         * y = (device const float         *) ((device char *) src1 + offset1);
+    device const block_iq2_xxs * x = (device const block_iq2_xxs *) (src0 + offset0);
+    device const float         * y = (device const float         *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
     const int nb32 = nb * (QK_K / 32);
 
-    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
+    threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem);
+    threadgroup uint8_t  * ssigns  = (threadgroup uint8_t  *)(svalues + 256);
     {
         int nval = 4;
         int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i];
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xxs_grid[pos + i];
         nval = 2;
         pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i];
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
@@ -5051,114 +4400,85 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
 
             float sum = 0;
             for (int l = 0; l < 4; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + aux8[l]);
-                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + aux8[l]);
+                const uint8_t signs = ssigns[(aux32 >> 7*l) & 127];
                 for (int j = 0; j < 8; ++j) {
                     sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
                 }
             }
             sumf[row] += d * sum;
 
-            dh += nb01/2;
-            q2 += nb01/2;
+            dh += args.nb01/2;
+            q2 += args.nb01/2;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
+            dst_f32[first_row + row] = all_sum * 0.25f;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_iq2_xxs_f32")]]
 kernel void kernel_mul_mv_iq2_xxs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    kernel_mul_mv_iq2_xxs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
+template<typename args_t>
 void kernel_mul_mv_iq2_xs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq2_xs * x = (device const block_iq2_xs *) ((device char *) src0 + offset0);
-    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
+    device const block_iq2_xs * x = (device const block_iq2_xs *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
     const int nb32 = nb * (QK_K / 32);
 
-    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 512);
+    threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem);
+    threadgroup uint8_t  * ssigns  = (threadgroup uint8_t  *)(svalues + 512);
     {
         int nval = 8;
         int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i];
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xs_grid[pos + i];
         nval = 2;
         pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i];
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
@@ -5190,122 +4510,94 @@ void kernel_mul_mv_iq2_xs_f32_impl(
 
             float sum1 = 0, sum2 = 0;
             for (int l = 0; l < 2; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
-                const uint8_t signs = shared_signs[(q2[l] >> 9)];
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511));
+                const uint8_t signs = ssigns[(q2[l] >> 9)];
                 for (int j = 0; j < 8; ++j) {
                     sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
                 }
             }
             for (int l = 2; l < 4; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
-                const uint8_t signs = shared_signs[(q2[l] >> 9)];
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511));
+                const uint8_t signs = ssigns[(q2[l] >> 9)];
                 for (int j = 0; j < 8; ++j) {
                     sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
                 }
             }
             sumf[row] += d1 * sum1 + d2 * sum2;
 
-            dh += nb01/2;
-            q2 += nb01/2;
-            sc += nb01;
+            dh += args.nb01/2;
+            q2 += args.nb01/2;
+            sc += args.nb01;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
+            dst_f32[first_row + row] = all_sum * 0.25f;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_iq2_xs_f32")]]
 kernel void kernel_mul_mv_iq2_xs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq2_xs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
+template <typename args_t>
 void kernel_mul_mv_iq3_xxs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq3_xxs * x = (device const block_iq3_xxs *) ((device char *) src0 + offset0);
-    device const float         * y = (device const float         *) ((device char *) src1 + offset1);
+    device const block_iq3_xxs * x = (device const block_iq3_xxs *) (src0 + offset0);
+    device const float         * y = (device const float         *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
     const int nb32 = nb * (QK_K / 32);
 
-    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
+    threadgroup uint32_t * svalues = (threadgroup uint32_t *)(shmem);
+    threadgroup uint8_t  * ssigns  = (threadgroup uint8_t  *)(svalues + 256);
     {
         int nval = 4;
         int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i];
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3xxs_grid[pos + i];
         nval = 2;
         pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i];
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
@@ -5314,7 +4606,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
     device const float * y4 = y + 32 * ix;
 
     for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
         for (int i = 0; i < 32; ++i) {
             yl[i] = y4[i];
         }
@@ -5328,16 +4619,15 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
         device const half * dh = &xr->d;
 
         for (int row = 0; row < N_DST; row++) {
-
             const float db = dh[0];
             const uint32_t aux32 = gas[0] | (gas[1] << 16);
             const float d = db * (0.5f + (aux32 >> 28));
 
             float2 sum = {0};
             for (int l = 0; l < 4; ++l) {
-                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]);
-                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]);
-                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
+                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + q3[2*l+0]);
+                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + q3[2*l+1]);
+                const uint8_t signs = ssigns[(aux32 >> 7*l) & 127];
                 for (int j = 0; j < 4; ++j) {
                     sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
                     sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -5345,103 +4635,75 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
             }
             sumf[row] += d * (sum[0] + sum[1]);
 
-            dh  += nb01/2;
-            q3  += nb01;
-            gas += nb01/2;
+            dh  += args.nb01/2;
+            q3  += args.nb01;
+            gas += args.nb01/2;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
+            dst_f32[first_row + row] = all_sum * 0.5f;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_iq3_xxs_f32")]]
 kernel void kernel_mul_mv_iq3_xxs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq3_xxs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
+template<typename args_t>
 void kernel_mul_mv_iq3_s_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq3_s * x = (device const block_iq3_s *) ((device char *) src0 + offset0);
-    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
+    device const block_iq3_s * x = (device const block_iq3_s *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
     const int nb32 = nb * (QK_K / 32);
 
-    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
+    threadgroup uint32_t * svalues = (threadgroup uint32_t *) shmem;
     {
         int nval = 8;
         int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i];
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3s_grid[pos + i];
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
@@ -5472,8 +4734,8 @@ void kernel_mul_mv_iq3_s_f32_impl(
 
             float2 sum = {0};
             for (int l = 0; l < 4; ++l) {
-                const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values;
-                const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values;
+                const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? svalues + 256 : svalues;
+                const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? svalues + 256 : svalues;
                 const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
                 const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
                 for (int j = 0; j < 4; ++j) {
@@ -5483,105 +4745,77 @@ void kernel_mul_mv_iq3_s_f32_impl(
             }
             sumf[row] += d * (sum[0] + sum[1]);
 
-            dh    += nb01/2;
-            qs    += nb01;
-            qh    += nb01;
-            sc    += nb01;
-            signs += nb01;
+            dh    += args.nb01/2;
+            qs    += args.nb01;
+            qh    += args.nb01;
+            sc    += args.nb01;
+            signs += args.nb01;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_iq3_s_f32")]]
 kernel void kernel_mul_mv_iq3_s_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq3_s_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
+template <typename args_t>
 void kernel_mul_mv_iq2_s_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq2_s * x = (device const block_iq2_s *) ((device char *) src0 + offset0);
-    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
+    device const block_iq2_s * x = (device const block_iq2_s *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
     const int nb32 = nb * (QK_K / 32);
 
-    //threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
+    //threadgroup uint64_t * svalues = (threadgroup uint64_t *) shmem;
     //{
     //    int nval = 32;
     //    int pos  = (32*sgitg + tiisg)*nval;
-    //    for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
+    //    for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2s_grid[pos + i];
     //    threadgroup_barrier(mem_flags::mem_threadgroup);
     //}
 
@@ -5613,8 +4847,8 @@ void kernel_mul_mv_iq2_s_f32_impl(
 
             float2 sum = {0};
             for (int l = 0; l < 2; ++l) {
-                //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
-                //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
+                //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
+                //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
                 constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
                 constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
                 for (int j = 0; j < 8; ++j) {
@@ -5624,94 +4858,66 @@ void kernel_mul_mv_iq2_s_f32_impl(
             }
             sumf[row] += d1 * sum[0] + d2 * sum[1];
 
-            dh    += nb01/2;
-            qs    += nb01;
-            qh    += nb01;
-            sc    += nb01;
-            signs += nb01;
+            dh    += args.nb01/2;
+            qs    += args.nb01;
+            qh    += args.nb01;
+            sc    += args.nb01;
+            signs += args.nb01;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
+            dst_f32[first_row + row] = all_sum * 0.25f;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_iq2_s_f32")]]
 kernel void kernel_mul_mv_iq2_s_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq2_s_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
+template<typename args_t>
 void kernel_mul_mv_iq1_s_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_value,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq1_s * x = (device const block_iq1_s *) ((device char *) src0 + offset0);
-    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
+    device const block_iq1_s * x = (device const block_iq1_s *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -5754,61 +4960,50 @@ void kernel_mul_mv_iq1_s_f32_impl(
             }
             sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1);
 
-            dh += nb01/2;
-            qs += nb01;
-            qh += nb01/2;
+            dh += args.nb01/2;
+            qs += args.nb01;
+            qh += args.nb01/2;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
+template <typename args_t>
 void kernel_mul_mv_iq1_m_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_value,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    const int nb = ne00/QK_K;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
     const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq1_m * x = (device const block_iq1_m *) ((device char *) src0 + offset0);
-    device const float       * y = (device const float       *) ((device char *) src1 + offset1);
+    device const block_iq1_m * x = (device const block_iq1_m *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
 
     float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
@@ -5860,66 +5055,55 @@ void kernel_mul_mv_iq1_m_f32_impl(
             sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
                                              (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
 
-            sc += nb01/2;
-            qs += nb01;
-            qh += nb01;
+            sc += args.nb01/2;
+            qs += args.nb01;
+            qh += args.nb01;
         }
 
         y4 += 32 * 32;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
+template<typename args_t>
 void kernel_mul_mv_iq4_nl_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values_i8,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    threadgroup float * shared_values = (threadgroup float *)shared_values_i8;
-    const int nb = ne00/QK4_NL;
+    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
+    const int nb = args.ne00/QK4_NL;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
     const int first_row = (r0 * 2 + sgitg) * 2;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq4_nl * x = (device const block_iq4_nl *) ((device char *) src0 + offset0);
-    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
+    device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
 
     const int ix = tiisg/2;  // 0...15
     const int it = tiisg%2;  // 0 or 1
 
-    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
+    shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16];
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     float4 yl[4];
@@ -5937,7 +5121,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         device const float4 * y4 = (device const float4 *)yb;
         yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
 
-        for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
+        for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
 
             device const block_iq4_nl & xb = x[row*nb + ib];
             device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
@@ -5947,16 +5131,16 @@ void kernel_mul_mv_iq4_nl_f32_impl(
             aux32[0] = q4[0] | (q4[1] << 16);
             aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
             aux32[0] &= 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
             acc1 += yl[0] * qf1;
             acc2 += yl[1] * qf2;
 
             aux32[0] = q4[2] | (q4[3] << 16);
             aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
             aux32[0] &= 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
             acc1 += yl[2] * qf1;
             acc2 += yl[3] * qf2;
 
@@ -5969,60 +5153,49 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         yb += 16 * QK4_NL;
     }
 
-    for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
+template<typename args_t>
 void kernel_mul_mv_iq4_xs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values_i8,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg) {
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
 
-    threadgroup float * shared_values = (threadgroup float *)shared_values_i8;
-    const int nb = ne00/QK_K;
+    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
+    const int nb = args.ne00/QK_K;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
     const int first_row = (r0 * 2 + sgitg) * 2;
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
 
-    const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    const uint offset1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
 
-    device const block_iq4_xs * x = (device const block_iq4_xs *) ((device char *) src0 + offset0);
-    device const float        * y = (device const float        *) ((device char *) src1 + offset1);
+    device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
 
     const int ix = tiisg/16;  // 0 or 1
     const int it = tiisg%16;  // 0...15
     const int ib = it/2;
     const int il = it%2;
 
-    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
+    shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16];
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     float4 yl[4];
@@ -6036,28 +5209,26 @@ void kernel_mul_mv_iq4_xs_f32_impl(
     float4 qf1, qf2;
 
     for (int ibl = ix; ibl < nb; ibl += 2) {
-
         device const float4 * y4 = (device const float4 *)yb;
         yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
 
         for (int row = 0; row < 2; ++row) {
-
             device const block_iq4_xs & xb = x[row*nb + ibl];
             device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);
 
             float4 acc1 = {0.f}, acc2 = {0.f};
 
-            aux32[0] = q4[0] & 0x0f0f0f0f;
+            aux32[0] = (q4[0]     ) & 0x0f0f0f0f;
             aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
             acc1 += yl[0] * qf1;
             acc2 += yl[1] * qf2;
 
-            aux32[0] = q4[1] & 0x0f0f0f0f;
+            aux32[0] = (q4[1]     ) & 0x0f0f0f0f;
             aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
             acc1 += yl[2] * qf1;
             acc2 += yl[3] * qf2;
 
@@ -6071,134 +5242,68 @@ void kernel_mul_mv_iq4_xs_f32_impl(
         yb += 2 * QK_K;
     }
 
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
     for (int row = 0; row < 2; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+            dst_f32[first_row + row] = all_sum;
         }
     }
 }
 
 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq1_s_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 [[host_name("kernel_mul_mv_iq1_m_f32")]]
 kernel void kernel_mul_mv_iq1_m_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq1_m_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
 [[host_name("kernel_mul_mv_iq4_nl_f32")]]
 kernel void kernel_mul_mv_iq4_nl_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq4_nl_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
 [[host_name("kernel_mul_mv_iq4_xs_f32")]]
 kernel void kernel_mul_mv_iq4_xs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant  uint64_t & nb13,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+    kernel_mul_mv_iq4_xs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6302,38 +5407,26 @@ kernel void kernel_get_rows_i32(
 
 // each block_q contains 16*nl weights
 template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
-kernel void kernel_mul_mm(device const  uchar * src0,
-                          device const  uchar * src1,
-                          device        float * dst,
-                          constant    int64_t & ne00,
-                          constant    int64_t & ne02,
-                          constant   uint64_t & nb01,
-                          constant   uint64_t & nb02,
-                          constant   uint64_t & nb03,
-                          constant    int64_t & ne12,
-                          constant   uint64_t & nb10,
-                          constant   uint64_t & nb11,
-                          constant   uint64_t & nb12,
-                          constant   uint64_t & nb13,
-                          constant    int64_t & ne0,
-                          constant    int64_t & ne1,
-                          constant       uint & r2,
-                          constant       uint & r3,
-                          threadgroup   uchar * shared_memory [[threadgroup(0)]],
-                          uint3                 tgpig[[threadgroup_position_in_grid]],
-                          uint                  tiitg[[thread_index_in_threadgroup]],
-                          uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
+kernel void kernel_mul_mm(
+        constant ggml_metal_kargs_mul_mm & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    threadgroup T     * sa = (threadgroup T     *)(shared_memory);
-    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
+    threadgroup T     * sa = (threadgroup T     *)(shmem);
+    threadgroup float * sb = (threadgroup float *)(shmem + 4096);
 
-    const uint r0 = tgpig.y;
-    const uint r1 = tgpig.x;
-    const uint im = tgpig.z;
+    const int r0 = tgpig.y;
+    const int r1 = tgpig.x;
+    const int im = tgpig.z;
 
     // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
 
     // a thread shouldn't load data outside of the matrix
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
@@ -6349,20 +5442,20 @@ kernel void kernel_mul_mm(device const  uchar * src0,
 
     short il = (tiitg % THREAD_PER_ROW);
 
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
+    const int i12 = im%args.ne12;
+    const int i13 = im/args.ne12;
 
-    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*nb03;
-    ushort offset1 = il/nl;
+    uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    short    offset1 = il/nl;
 
-    device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*nb01 + offset0) + offset1;
+    device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
     device const float   * y = (device const float   *)(src1
-        + nb13 * i13
-        + nb12 * i12
-        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
-        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+        + args.nb13*i13
+        + args.nb12*i12
+        + args.nb11*(r1 * BLOCK_SIZE_N + thread_col)
+        + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
-    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
+    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
         // load data and store to threadgroup memory
         T4x4 temp_a;
         dequantize_func(x, il, temp_a);
@@ -6409,16 +5502,18 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         }
     }
 
-    if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
-                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
+    if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
+        device float * C = (device float *) dst +
+            (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) + \
+            (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
+
         for (short i = 0; i < 8; i++) {
-            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
+            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
         }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *) shared_memory) \
+        threadgroup float * temp_str = ((threadgroup float *) shmem) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M;
         for (short i = 0; i < 8; i++) {
             simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
@@ -6428,7 +5523,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
 
         if (sgitg == 0) {
             for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                device float  * D  = dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*ne0 + im*ne1*ne0;
+                device float  * D  = (device float  *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.ne0 + im*args.ne1*args.ne0;
                 device float4 * D4 = (device float4 *) D;
 
                 threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
@@ -6449,36 +5544,37 @@ kernel void kernel_mul_mm(device const  uchar * src0,
 }
 
 // same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids
+// TODO: this kernel needs to be reimplemented from scratch for better performance
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
 void kernel_mul_mm_id_impl(
-        device const  uchar * src0,
-        device const  uchar * src1,
+        int32_t  ne00,
+        int32_t  ne02,
+        uint64_t nb01,
+        uint64_t nb02,
+        int32_t  ne11,
+        int32_t  ne12,
+        uint64_t nb10,
+        uint64_t nb11,
+        uint64_t nb12,
+        int32_t  ne0,
+        int32_t  ne1,
+        int64_t  ne0ne1,
+        device   const char * src0,
+        device   const char * src1,
         threadgroup ushort2 * rowids,
-        device        float * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne02,
-        constant   uint64_t & nb01,
-        constant   uint64_t & nb02,
-        constant    int64_t & ne11,
-        constant    int64_t & ne12,
-        constant   uint64_t & nb10,
-        constant   uint64_t & nb11,
-        constant   uint64_t & nb12,
-        constant    int64_t & ne0,
-                    int64_t   ne1,
-                    int64_t   ne0ne1,
-        threadgroup   uchar * shared_memory,
-        uint3                 tgpig[[threadgroup_position_in_grid]],
-        uint                  tiitg[[thread_index_in_threadgroup]],
-        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
+        device         char * dst,
+        threadgroup    char * shmem,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
-    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
-    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
+    threadgroup half  * sa = (threadgroup half  *)(shmem);
+    threadgroup float * sb = (threadgroup float *)(shmem + 4096);
 
-    const uint r0 = tgpig.y;
-    const uint r1 = tgpig.x;
+    const int r0 = tgpig.y;
+    const int r1 = tgpig.x;
 
-    if (r1 * BLOCK_SIZE_N >= ne1) return;
+    if (r1*BLOCK_SIZE_N >= ne1) return;
 
     // if this block is of 64x32 shape or smaller
     short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
@@ -6490,9 +5586,9 @@ void kernel_mul_mm_id_impl(
 
     simdgroup_half8x8  ma[4];
     simdgroup_float8x8 mb[2];
-    simdgroup_float8x8 c_res[8];
+    simdgroup_float8x8 mc[8];
     for (int i = 0; i < 8; i++){
-        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
     }
     short il = (tiitg % THREAD_PER_ROW);
 
@@ -6530,11 +5626,14 @@ void kernel_mul_mm_id_impl(
         threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
         threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
 
+        #pragma unroll(BLOCK_SIZE_K/8)
         for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+            #pragma unroll(4)
             for (int i = 0; i < 4; i++) {
                 simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
             }
             simdgroup_barrier(mem_flags::mem_none);
+            #pragma unroll(2)
             for (int i = 0; i < 2; i++) {
                 simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
             }
@@ -6542,29 +5641,42 @@ void kernel_mul_mm_id_impl(
             lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
             lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
 
+            #pragma unroll(8)
             for (int i = 0; i < 8; i++){
-                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
+                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
             }
         }
     }
 
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
+        threadgroup float * temp_str = ((threadgroup float *) shmem) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+            simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
-        device float * C = dst + (BLOCK_SIZE_M * r0);
         if (sgitg == 0) {
             for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
                 threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j];
-                int joff =  jid[0] * ne0 + jid[1] * ne0ne1;
-                for (int i = 0; i < n_rows; i++) {
-                    *(C + i + joff) = *(temp_str + i + j * BLOCK_SIZE_M);
+                int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1;
+
+                device float  * D  = (device float  *) dst + (r0*BLOCK_SIZE_M) + joff;
+                device float4 * D4 = (device float4 *) D;
+
+                threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
+                threadgroup float4 * C4 = (threadgroup float4 *) C;
+
+                int i = 0;
+                for (; i < n_rows/4; i++) {
+                    *(D4 + i) = *(C4 + i);
+                }
+
+                i *= 4;
+                for (; i < n_rows; i++) {
+                    *(D + i) = *(C + i);
                 }
             }
         }
@@ -6573,48 +5685,34 @@ void kernel_mul_mm_id_impl(
 
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
 kernel void kernel_mul_mm_id(
-        device const   uchar * src0s,
-        device const   uchar * src1,
-        device         float * dst,
-        device const   uchar * ids,
-        constant     int64_t & nei0,
-        constant     int64_t & nei1,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        threadgroup    uchar * shared_memory [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
+        constant ggml_metal_kargs_mul_mm_id & args,
+        device const char * src0s,
+        device const char * src1,
+        device       char * dst,
+        device const char * ids,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
 
     const int32_t i02 = tgpig.z;
+
     tgpig.z = 0;
 
-    device const uchar * src0 = src0s + i02*nb02;
+    device const char * src0 = src0s + i02*args.nb02;
 
     // row indices
-    threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shared_memory + 8192);
+    threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192);
 
     // TODO: parallelize this loop
-    int64_t _ne1 = 0;
-    for (ushort ii1 = 0; ii1 < nei1; ii1++) {
-        for (ushort ii0 = 0; ii0 < nei0; ii0++) {
-            int32_t id = ((device int32_t *) (ids + ii1*nbi1))[ii0];
+    int32_t _ne1 = 0;
+    for (ushort ii1 = 0; ii1 < args.nei1; ii1++) {
+        for (ushort ii0 = 0; ii0 < args.nei0; ii0++) {
+            int32_t id = ((device int32_t *) (ids + ii1*args.nbi1))[ii0];
             if (id == i02) {
-                //if (tiitg == 0) {
+                if (tiitg == 0) {
                     rowids[_ne1] = ushort2(ii0, ii1);
-                //}
+                }
                 _ne1++;
             }
         }
@@ -6623,23 +5721,23 @@ kernel void kernel_mul_mm_id(
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
+        args.ne00,
+        args.ne02,
+        args.nb01,
+        args.nb02,
+        args.ne11,
+        args.ne12,
+        args.nb10,
+        args.nb11,
+        args.nb12,
+        args.ne0,
+        _ne1,
+        (int64_t)args.ne0*args.ne1,
         src0,
         src1,
         rowids,
         dst,
-        ne00,
-        ne02,
-        nb01,
-        nb02,
-        ne11,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        _ne1,
-        ne0*ne1,
-        shared_memory,
+        shmem,
         tgpig,
         tiitg,
         sgitg);
@@ -6748,194 +5846,110 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel
 //
 
 typedef void (kernel_mul_mv_impl_t)(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb00,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne11,
-                   int64_t   ne12,
-                  uint64_t   nb10,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-                   uint3     tgpig,
-                   uint      tiisg);
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig,
+        ushort tiisg);
 
 typedef void (kernel_mul_mv2_impl_t)(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                  uint64_t   nb01,
-                  uint64_t   nb02,
-                  uint64_t   nb03,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                  uint64_t   nb11,
-                  uint64_t   nb12,
-                  uint64_t   nb13,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-        threadgroup int8_t * shared_values,
-                   uint3     tgpig,
-                   uint      tiisg,
-                   uint      sgitg);
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg);
 
 template<kernel_mul_mv_impl_t impl_fn>
 void mmv_fn(
-        device const    char * src0,
-        device const    char * src1,
-        device         float * dst,
-                     int64_t   ne00,
-                     int64_t   ne01,
-                     int64_t   ne02,
-                    uint64_t   nb00,
-                    uint64_t   nb01,
-                    uint64_t   nb02,
-                    uint64_t   nb03,
-                     int64_t   ne10,
-                     int64_t   ne11,
-                     int64_t   ne12,
-                     int64_t   ne13,
-                    uint64_t   nb10,
-                    uint64_t   nb11,
-                    uint64_t   nb12,
-                    uint64_t   nb13,
-                     int64_t   ne0,
-                     int64_t   ne1,
-                    uint64_t   nb1,
-                        uint   r2,
-                        uint   r3,
-        threadgroup int8_t   * shared_values,
-        uint3                  tgpig,
-        uint                   tiitg,
-        uint                   tiisg,
-        uint                   sgitg) {
-    impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,nb03,ne10,ne11,ne12,nb10,nb11,nb12,nb13,ne0,ne1,r2,r3,tgpig,tiisg);
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiitg,
+        ushort tiisg,
+        ushort sgitg) {
+    impl_fn(args, src0, src1, dst, tgpig, tiisg);
 }
 
 template<kernel_mul_mv2_impl_t impl_fn>
 void mmv_fn(
-        device const    char * src0,
-        device const    char * src1,
-        device         float * dst,
-                     int64_t   ne00,
-                     int64_t   ne01,
-                     int64_t   ne02,
-                    uint64_t   nb00,
-                    uint64_t   nb01,
-                    uint64_t   nb02,
-                    uint64_t   nb03,
-                     int64_t   ne10,
-                     int64_t   ne11,
-                     int64_t   ne12,
-                     int64_t   ne13,
-                    uint64_t   nb10,
-                    uint64_t   nb11,
-                    uint64_t   nb12,
-                    uint64_t   nb13,
-                     int64_t   ne0,
-                     int64_t   ne1,
-                    uint64_t   nb1,
-                        uint   r2,
-                        uint   r3,
-        threadgroup int8_t   * shared_values,
-        uint3                  tgpig,
-        uint                   tiitg,
-        uint                   tiisg,
-        uint                   sgitg) {
-    impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg);
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiitg,
+        ushort tiisg,
+        ushort sgitg) {
+    impl_fn(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }
 
-typedef decltype(mmv_fn<kernel_mul_mv_impl<half, half4, half, half4>>) mul_mv_impl_fn_t;
+typedef decltype(mmv_fn<kernel_mul_mv_impl<half, half4, half, half4, ggml_metal_kargs_mul_mv>>) mul_mv_impl_fn_t;
 
 template<mul_mv_impl_fn_t impl_fn>
 kernel void kernel_mul_mv_id(
-        device const    char * src0s,
-        device const    char * src1,
-        device         float * dst,
-        device const    char * ids,
-        constant     int64_t & nei0,
-        constant     int64_t & nei1,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    const int iid1 = tgpig.z/nei0;
-    const int idx = tgpig.z%nei0;
+        constant ggml_metal_kargs_mul_mv_id & args,
+        device const char * src0s,
+        device const char * src1,
+        device       char * dst,
+        device const char * ids,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    const int iid1 = tgpig.z/args.nei0;
+    const int idx  = tgpig.z%args.nei0;
 
     tgpig.z = 0;
 
-    const int32_t i02 = ((device const int32_t *) (ids + iid1*nbi1))[idx];
+    const int32_t i02 = ((device const int32_t *) (ids + iid1*args.nbi1))[idx];
 
-    const int64_t i11 = idx % ne11;
+    const int64_t i11 = idx % args.ne11;
     const int64_t i12 = iid1;
 
     const int64_t i1 = idx;
     const int64_t i2 = i12;
 
-    device const char * src0_cur = src0s + i02*nb02;
-    device const char * src1_cur = src1  + i11*nb11 + i12*nb12;
-    device      float *  dst_cur = dst   + i1*ne0   + i2*ne1*ne0;
+    device const char * src0_cur = src0s + i02*args.nb02;
+    device const char * src1_cur = src1  + i11*args.nb11 + i12*args.nb12;
+
+    device char * dst_cur = dst + (i1*args.ne0 + i2*args.ne1*args.ne0)*sizeof(float);
+
+    ggml_metal_kargs_mul_mv args0 = {
+        /*.ne00 =*/ args.ne00,
+        /*.ne01 =*/ args.ne01,
+        /*.ne02 =*/ 1, // args.ne02,
+        /*.nb00 =*/ args.nb00,
+        /*.nb01 =*/ args.nb01,
+        /*.nb02 =*/ args.nb02,
+        /*.nb03 =*/ args.nb02, // args.ne02 == 1
+        /*.ne10 =*/ args.ne10,
+        /*.ne11 =*/ 1, // args.ne11,
+        /*.ne12 =*/ 1, // args.ne12,
+        /*.nb10 =*/ args.nb10,
+        /*.nb11 =*/ args.nb11,
+        /*.nb12 =*/ args.nb12,
+        /*.nb13 =*/ args.nb12, // ne12 == 1
+        /*.ne0  =*/ args.ne0,
+        /*.ne1  =*/ 1, // args.ne1,
+        /*.r2   =*/ 1,
+        /*.r3   =*/ 1,
+    };
 
     impl_fn(
+        args0,
         /* src0 */ src0_cur,
         /* src1 */ src1_cur,
         /* dst  */ dst_cur,
-        /* ne00 */ ne00,
-        /* ne01 */ ne01,
-        /* ne02 */ 1, // ne02,
-        /* nb00 */ nb00,
-        /* nb01 */ nb01,
-        /* nb02 */ nb02,
-        /* nb03 */ nb02, // ne02 == 1
-        /* ne10 */ ne10,
-        /* ne11 */ 1, // ne11,
-        /* ne12 */ 1, // ne12,
-        /* ne13 */ 1, // ne13,
-        /* nb10 */ nb10,
-        /* nb11 */ nb11,
-        /* nb12 */ nb12,
-        /* ne13 */ nb12, // ne12 == 1
-        /* ne0  */ ne0,
-        /* ne1  */ 1, // ne1,
-        /* nb1  */ nb1,
-        /* r2   */ 1,
-        /* r3   */ 1,
-        shared_values,
+        shmem,
         tgpig,
         tiitg,
         tiisg,

From 20a780c7b64c38071fe70ad23e16e80c23c0147b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 17 Nov 2024 13:12:22 +0200
Subject: [PATCH 248/396] gitignore : ignore local run scripts [no ci]

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 0b56bcc7a..307c065f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,7 @@ poetry.toml
 
 # Test models for lora adapters
 /lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh

From be5caccef945546ee9fd25a151330a88d785faf9 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sun, 17 Nov 2024 12:25:45 +0100
Subject: [PATCH 249/396] llama : only use default buffer types for the KV
 cache (#10358)

---
 ggml/src/ggml-backend.cpp |  9 +++++----
 src/llama.cpp             | 16 ++++------------
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 9a6010d36..9dcde8d11 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -689,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }
 
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -722,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML
 
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
-    // TODO: use supports_op to check if the backend supports the op
-
     // assign pre-allocated nodes to their backend
     int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
@@ -742,7 +740,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
-        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
     }
 
     // graph input
@@ -886,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
diff --git a/src/llama.cpp b/src/llama.cpp
index 1703104fb..de96959f2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3460,21 +3460,13 @@ static bool llama_kv_cache_init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-        const llama_model::buft_list_t * buft_list;
+        ggml_backend_buffer_type_t buft;
         if (offload) {
-            buft_list = model.dev_layer.at(i).buft_list;
+            auto * dev = model.dev_layer.at(i).dev;
+            buft = ggml_backend_dev_buffer_type(dev);
         } else {
-            buft_list = &model.cpu_buft_list;
+            buft = ggml_backend_cpu_buffer_type();
         }
-        ggml_backend_buffer_type_t buft = select_buft(*buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-                if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
-                    return k;
-                }
-                ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-                return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
-            });
         ggml_context * ctx = ctx_for_buft(buft);
 
         if (!ctx) {

From ce2e59ba107cf71ed566040ff20a15d1c58e09c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 17 Nov 2024 12:59:38 +0100
Subject: [PATCH 250/396] CMake: fix typo in comment [no ci] (#10360)

---
 ggml/src/ggml-cuda/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 3dde0f366..e592f7989 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -11,7 +11,7 @@ if (CUDAToolkit_FOUND)
         # 60     == P100, FP16 CUDA intrinsics
         # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
         # 70     == V100, FP16 tensor cores
-        # 75     == Turing, int6 tensor cores
+        # 75     == Turing, int8 tensor cores
         if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
             set(CMAKE_CUDA_ARCHITECTURES "native")
         elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)

From 76e9e58b7847112848aa1f40b65d1cbcd6d5f5a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 17 Nov 2024 23:20:42 +0100
Subject: [PATCH 251/396] CUDA: fix MMV kernel being used for FP16 src1
 (#10357)

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ef56e944d..dd94ab03d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1760,11 +1760,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
+        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
                && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch without FlashAttention
+        // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);

From 75207b3a887f91f813de1eb6e9fd135d3cb2b8c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Mon, 18 Nov 2024 00:21:53 +0100
Subject: [PATCH 252/396] docker: use GGML_NATIVE=OFF (#10368)

---
 .devops/full-cuda.Dockerfile           | 2 +-
 .devops/full-musa.Dockerfile           | 2 +-
 .devops/llama-cli-cann.Dockerfile      | 2 +-
 .devops/llama-cli-cuda.Dockerfile      | 2 +-
 .devops/llama-cli-intel.Dockerfile     | 2 +-
 .devops/llama-cli-musa.Dockerfile      | 2 +-
 .devops/llama-cli-vulkan.Dockerfile    | 2 +-
 .devops/llama-server-cuda.Dockerfile   | 2 +-
 .devops/llama-server-intel.Dockerfile  | 2 +-
 .devops/llama-server-musa.Dockerfile   | 2 +-
 .devops/llama-server-vulkan.Dockerfile | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index d5acd35e2..05bff1bdf 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .
 
diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile
index 34ba856d3..575e81b48 100644
--- a/.devops/full-musa.Dockerfile
+++ b/.devops/full-musa.Dockerfile
@@ -19,7 +19,7 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .
 
diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile
index 45c0585b0..02dce501c 100644
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
     cmake --build build --config Release --target llama-cli
 
 # TODO: use image with NNRT
diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile
index 3279c8da4..7796891d5 100644
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -22,7 +22,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-cli -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 1f4ce0730..0706f732a 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
     echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
     ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli
 
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
index 1edf75cf2..3372749be 100644
--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -15,7 +15,7 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-cli -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile
index 9b0dad8bf..92a6e0479 100644
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
     cmake --build build --config Release --target llama-cli
 
 # Clean up
diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile
index ea07a4d52..bf8a198f9 100644
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -22,7 +22,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-server -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index 773f030a7..b503b8cfe 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
     echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
index 259877468..eb67201c1 100644
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@@ -15,7 +15,7 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-server -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile
index 93c5e0c26..6aa786779 100644
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
     cmake --build build --config Release --target llama-server
 
 # Clean up

From 9b75f03cd2ec9cc482084049d87a0f08f9f01517 Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Mon, 18 Nov 2024 11:02:43 +0100
Subject: [PATCH 253/396] Vulkan: Fix device info output format specifiers
 (#10366)

* Vulkan: Fix device info output format specifiers

* Vulkan: Use zu printf specifier for size_t instead of ld
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 19951d9b6..ef067356e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1764,11 +1764,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n",
-              idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size);
+    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu\n",
+              idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size);
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
-        std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
+        GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
     }
 }
 
@@ -1937,8 +1937,7 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(0);
         }
     }
-    GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size());
-
+    GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
 
     for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         ggml_vk_print_gpu_info(i);

From 2eb76b2a5e4ea395b971a419c95b473ab6f253e4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 18 Nov 2024 16:08:20 +0200
Subject: [PATCH 254/396] flake.lock: Update (#10346)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/4aa36568d413aca0ea84a1684d2d46f55dbabad7?narHash=sha256-Zwl8YgTVJTEum%2BL%2B0zVAWvXAGbWAuXHax3KzuejaDyo%3D' (2024-11-05)
  → 'github:NixOS/nixpkgs/5e4fbfb6b3de1aa2872b76d49fafc942626e2add?narHash=sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg%3D' (2024-11-15)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 470363a2f..ee8cf07e3 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1730785428,
-        "narHash": "sha256-Zwl8YgTVJTEum+L+0zVAWvXAGbWAuXHax3KzuejaDyo=",
+        "lastModified": 1731676054,
+        "narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "4aa36568d413aca0ea84a1684d2d46f55dbabad7",
+        "rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add",
         "type": "github"
       },
       "original": {

From f139d2ea611c5604395c95160d3c53f7c4eaf220 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Mon, 18 Nov 2024 08:28:42 -0600
Subject: [PATCH 255/396] vulkan: remove use of null initializer (#10372)

Seems like this isn't working for vulkan-over-metal when the array is sized
by a spec constant. Maybe a spirv-cross limitation?
---
 ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index ddbac5d2c..970aac6ef 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -5,8 +5,6 @@
 #endif
 #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
 
-#extension GL_EXT_null_initializer : enable
-
 #include "mul_mat_vec_base.comp"
 
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
@@ -57,7 +55,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
 
     y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
 
-    FLOAT_TYPE temp[NUM_ROWS] = {};
+    FLOAT_TYPE temp[NUM_ROWS];
+
+    for (uint i = 0; i < NUM_ROWS; ++i) {
+        temp[i] = FLOAT_TYPE(0);
+    }
 
     const int unroll_count = 8;
 

From 531cb1c233800e6acb021dc56d69595e314db072 Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Mon, 18 Nov 2024 11:23:58 -0400
Subject: [PATCH 256/396] Skip searching root path for cross-compile builds
 (#10383)

---
 cmake/llama-config.cmake.in | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index ef68417b4..a7f1efb88 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -66,11 +66,15 @@ endif()
 
 find_library(ggml_LIBRARY ggml
     REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
+    HINTS ${LLAMA_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)
 
 find_library(llama_LIBRARY llama
     REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
+    HINTS ${LLAMA_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)
 
 set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
 set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")

From d3481e631661b5e9517f78908cdd58cee63c4903 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 18 Nov 2024 18:43:40 +0100
Subject: [PATCH 257/396] cuda : only use native when supported by cmake
 (#10389)

---
 ggml/src/ggml-cuda/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index e592f7989..25858ece8 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -12,7 +12,7 @@ if (CUDAToolkit_FOUND)
         # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
         # 70     == V100, FP16 tensor cores
         # 75     == Turing, int8 tensor cores
-        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
+        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
             set(CMAKE_CUDA_ARCHITECTURES "native")
         elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")

From 557924f22237c76387a39c4db5abae154d57e754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?=
 <alberto.cabrera@codeplay.com>
Date: Tue, 19 Nov 2024 00:50:04 +0000
Subject: [PATCH 258/396] sycl: Revert MUL_MAT_OP support changes (#10385)

---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 2dba15d23..255bc64c6 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4350,10 +4350,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 if (op->op == GGML_OP_MUL_MAT) {
                     a = op->src[0];
                     b = op->src[1];
-                    if (ggml_is_permuted(a) || ggml_is_permuted(b)) {
-                        // TODO: fix like https://github.com/ggerganov/llama.cpp/pull/10021
-                        return false;
-                    }
                 } else {
                     a = op->src[2];
                     b = op->src[1];

From b3e585988fc65d3a8083c6d94dfc0629f9ce226d Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Tue, 19 Nov 2024 01:25:17 -0600
Subject: [PATCH 259/396] vulkan: Optimize soft_max (#10301)

* vulkan: Optimize soft_max

Large soft_max could already saturate memory, but small/medium sizes were
pretty slow. The bulk of the gains for them comes from using a smaller
workgroup size, and making the workgroup size match the subgroup size also
makes the barriers much cheaper.

Cache some values in locals to avoid refetching/recomputing. And stamp
out a few "template instantiations" so smaller cases will fully unroll.

Add a missing early return for OOB rows. This happens when there are more
than 512 rows and the dispatch is 512 x H.

* vulkan: Further soft_max optimizations

Restore the workgroup size of 512 case, use it for >1024.

Use unrollable loops for more iteration counts.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          |  13 +-
 .../ggml-vulkan/vulkan-shaders/soft_max.comp  | 114 ++++++++++++++----
 tests/test-backend-ops.cpp                    |   8 ++
 3 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ef067356e..21fee2f3d 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -218,6 +218,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_tanh_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
+    vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
     vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
@@ -388,6 +389,7 @@ struct vk_op_soft_max_push_constants {
     float m0;
     float m1;
     uint32_t n_head_log2;
+    uint32_t nrows_x;
 };
 
 struct vk_op_argsort_push_constants {
@@ -1497,8 +1499,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@@ -3932,10 +3936,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
 
         if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_soft_max_f32;
+            return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32;
         }
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_soft_max_f32_f16;
+            return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16;
         }
         return nullptr;
     case GGML_OP_ROPE:
@@ -4581,6 +4585,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
         scale, max_bias,
         m0, m1,
         n_head_log2,
+        nrows_x,
     }, dryrun);
 }
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
index 0bd51ecab..f9727679e 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
@@ -1,6 +1,7 @@
 #version 450
 
-#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_control_flow_attributes : enable
 
 layout (push_constant) uniform parameter
 {
@@ -11,14 +12,13 @@ layout (push_constant) uniform parameter
     float m0;
     float m1;
     uint n_head_log2;
+    uint nrows_x;
 } p;
 
 #include "types.comp"
 
-#extension GL_EXT_control_flow_attributes : enable
-#define BLOCK_SIZE 512
-
-layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+layout(constant_id = 0) const uint BLOCK_SIZE = 32;
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
@@ -26,11 +26,18 @@ layout (binding = 2) buffer D {D_TYPE data_d[];};
 
 shared FLOAT_TYPE vals[BLOCK_SIZE];
 
-void main() {
+// num_iters is the number of BLOCK_SIZE loop iterations we need to iterate
+// over all the columns. The main function tries to pass a constant here,
+// as if it were a template function, to allow unrolling.
+void soft_max(uint num_iters) {
     const uint tid = gl_LocalInvocationID.x;
     const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
     const uint rowy = rowx % p.KY;
 
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
     float slope = 1.0f;
 
     // ALiBi
@@ -46,19 +53,37 @@ void main() {
     // Find max
     FLOAT_TYPE max_val = uintBitsToFloat(0xFF800000);
 
-    [[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) {
+    // Cache values while we compute the max, so we don't need to read them
+    // again when we're ready to compute exp(x-max).
+    const uint DATA_CACHE_SIZE = 16;
+    FLOAT_TYPE data_cache[DATA_CACHE_SIZE];
+
+    [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
         const uint col = col0 + tid;
 
-        if (col >= p.KX) {
-            break;
+        FLOAT_TYPE a = FLOAT_TYPE(0);
+        if (col < p.KX) {
+            a = data_a[rowx * p.KX + col];
         }
 
-        max_val = max(max_val, FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)));
-    }
-    vals[tid] = max_val;
+        FLOAT_TYPE b = FLOAT_TYPE(0);
+        if (p.KY > 0 && col < p.KX) {
+            b = data_b[rowy * p.KX + col];
+        }
 
+        FLOAT_TYPE v = a * p.scale + slope * b;
+
+        max_val = max(max_val, v);
+
+        if (idx < DATA_CACHE_SIZE) {
+            data_cache[idx] = v;
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = max_val;
     barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
         if (tid < s) {
             vals[tid] = max(vals[tid], vals[tid + s]);
         }
@@ -68,39 +93,80 @@ void main() {
     max_val = vals[0];
     barrier();
 
-    // Sum up values
-    vals[tid] = FLOAT_TYPE(0.0f);
+    FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
 
-    [[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) {
+    // Compute sum{exp(x - max)}
+    [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
         const uint col = col0 + tid;
 
         if (col >= p.KX) {
             break;
         }
 
+        // compute exp(a*scale+b*slope), add it to sum, and cache the new value
+        // in data_cache if possible.
         const uint i = rowx * p.KX + col;
-        const FLOAT_TYPE val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
-        vals[tid] += val;
-        data_d[i] = D_TYPE(val);
+        FLOAT_TYPE val;
+        if (idx < DATA_CACHE_SIZE) {
+            val = exp(data_cache[idx] - max_val);
+        } else {
+            val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
+        }
+        sum += val;
+        if (idx < DATA_CACHE_SIZE) {
+            data_cache[idx] = val;
+        } else {
+            data_d[i] = D_TYPE(val);
+        }
     }
 
+    // reduce across the workgroup
+    vals[tid] = sum;
     barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
         if (tid < s) {
             vals[tid] += vals[tid + s];
         }
         barrier();
     }
+    sum = vals[0];
 
-    const D_TYPE divisor = D_TYPE(vals[0]);
+    FLOAT_TYPE rcpdivisor = 1.0/sum;
 
-    [[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) {
+    [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
         const uint col = col0 + tid;
 
         if (col >= p.KX) {
-            break;
+            continue;
         }
 
-        data_d[rowx*p.KX + col] /= divisor;
+        if (idx < DATA_CACHE_SIZE) {
+            data_d[rowx*p.KX + col] = D_TYPE(data_cache[idx] * rcpdivisor);
+        } else {
+            data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
+        }
+    }
+}
+
+void main() {
+    // instantiate the soft_max function for several different
+    // dimensions, to allow loop unrolling
+    uint num_blocks = (p.KX + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (num_blocks > 32) {
+        soft_max(num_blocks);
+    } else if (num_blocks > 16) {
+        soft_max(32);
+    } else if (num_blocks > 8) {
+        soft_max(16);
+    } else if (num_blocks > 4) {
+        soft_max(8);
+    } else if (num_blocks == 4) {
+        soft_max(4);
+    } else if (num_blocks == 3) {
+        soft_max(3);
+    } else if (num_blocks == 2) {
+        soft_max(2);
+    } else if (num_blocks == 1) {
+        soft_max(1);
     }
 }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index f8a59b6df..01ac7166e 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3823,6 +3823,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
 
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f));
+
     for (int bs : {1, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {

From 2a1507c1629975d9d20a503d6a14f44eff292c25 Mon Sep 17 00:00:00 2001
From: Romain Biessy <romain.biessy@codeplay.com>
Date: Tue, 19 Nov 2024 09:02:23 +0100
Subject: [PATCH 260/396] sycl : Add option to set the SYCL architecture for
 all targets (#10266)

* Add option to set the SYCL architecture for all targets
* Convert GGML_SYCL_HIP_TARGET to the more generic GGML_SYCL_ARCH option
* Document that setting GGML_SYCL_ARCH can improve the performance
---
 docs/backend/SYCL.md              | 12 ++++++++----
 ggml/CMakeLists.txt               |  2 ++
 ggml/src/ggml-sycl/CMakeLists.txt | 10 +++++++---
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 38185f738..e431f51f1 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -312,12 +312,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
+# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
+GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
 
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 
 # build all binary
 cmake --build build --config Release -j -v
@@ -335,8 +337,9 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE
 
 ## AMD
 # Use FP32, FP16 is not supported
-# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:'
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
+GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
 # build all binary
 cmake --build build --config Release -j -v
@@ -646,6 +649,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.FP32 path - recommended for better perforemance than FP16 on quantized model|
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index a82818d60..b16a0e9ad 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -164,6 +164,8 @@ option(GGML_SYCL                            "ggml: use SYCL"
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                             "ggml: sycl target device")
+set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
+                                            "ggml: sycl device architecture")
 
 # extra artifacts
 option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 03bf3cb92..d1d0ff83d 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -72,10 +72,14 @@ else()
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
         target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
     elseif (GGML_SYCL_TARGET STREQUAL "AMD")
-        if (GGML_SYCL_HIP_TARGET STREQUAL "")
-            message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
+        if (NOT GGML_SYCL_DEVICE_ARCH)
+            message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
         endif()
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa")
         target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
     endif()
+
+    if (GGML_SYCL_DEVICE_ARCH)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}")
+  endif()
 endif()

From a88ad007de3eb5d92cb538fd269ff94c4bf0c8d2 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Tue, 19 Nov 2024 01:04:08 -0800
Subject: [PATCH 261/396] llama : add OLMo November 2024 support (#10394)

* Add OLMo November 2024 constants

* Add OLMo November 2024 converter

* Add loading of OLMo November 2024 tensors and hyper parameters

* Add building of OLMo November 2024 model
---
 convert_hf_to_gguf.py          |   5 +
 gguf-py/gguf/constants.py      |  18 ++++
 gguf-py/gguf/tensor_mapping.py |  28 ++---
 src/llama.cpp                  | 186 +++++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+), 14 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 39afa5ef4..9f4b8154b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3040,6 +3040,11 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("Olmo1124ForCausalLM")
+class Olmo1124Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO_1124
+
+
 @Model.register("OlmoeForCausalLM")
 class OlmoeModel(Model):
     model_arch = gguf.MODEL_ARCH.OLMOE
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index bc2b649d1..d83b72f76 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -243,6 +243,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R    = auto()
     DBRX         = auto()
     OLMO         = auto()
+    OLMO_1124    = auto()
     OLMOE        = auto()
     OPENELM      = auto()
     ARCTIC       = auto()
@@ -404,6 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.COMMAND_R:      "command-r",
     MODEL_ARCH.DBRX:           "dbrx",
     MODEL_ARCH.OLMO:           "olmo",
+    MODEL_ARCH.OLMO_1124:      "olmo_1124",
     MODEL_ARCH.OLMOE:          "olmoe",
     MODEL_ARCH.OPENELM:        "openelm",
     MODEL_ARCH.ARCTIC:         "arctic",
@@ -1069,6 +1071,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.OLMO_1124: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.OLMOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f4a787c56..4cbd39e03 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron olmoe
+            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo_1124
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",               # gptneox
             "transformer.ln_f",                        # gpt2 gpt-j falcon jais exaone
-            "model.norm",                              # llama-hf baichuan internlm2 olmoe
+            "model.norm",                              # llama-hf baichuan internlm2 olmoe olmo_1124
             "norm",                                    # llama-pth
             "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
@@ -145,7 +145,7 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe
+            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo_1124
             "layers.{bid}.attention.wq",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",                  # bert
             "transformer.h.{bid}.attn.q_proj",                           # gpt-j
@@ -157,7 +157,7 @@ class TensorNameMap:
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe
+            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo_1124
             "layers.{bid}.attention.wk",                               # llama-pth
             "encoder.layer.{bid}.attention.self.key",                  # bert
             "transformer.h.{bid}.attn.k_proj",                         # gpt-j
@@ -170,7 +170,7 @@ class TensorNameMap:
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe
+            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo_1124
             "layers.{bid}.attention.wv",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",                  # bert
             "transformer.h.{bid}.attn.v_proj",                           # gpt-j
@@ -188,7 +188,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",                       # mpt
             "transformer.h.{bid}.self_attention.dense",                     # falcon
             "h.{bid}.self_attention.dense",                                 # bloom
-            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe
+            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo_1124
             "layers.{bid}.attention.wo",                                    # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                   # bert
             "transformer.h.{bid}.attn.out_proj",                            # gpt-j
@@ -215,7 +215,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",     # gemma2
+            "model.layers.{bid}.post_attention_layernorm",     # gemma2 olmo_1124
         ),
 
         # Rotary embeddings
@@ -250,7 +250,7 @@ class TensorNameMap:
 
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
-            "model.layers.{bid}.post_feedforward_layernorm", # gemma2
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,7 +273,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj",                   # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
             "h.{bid}.mlp.dense_h_to_4h",                              # bloom
-            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron
+            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo_1124
             "layers.{bid}.feed_forward.w3",                           # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                 # bert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
@@ -314,7 +314,7 @@ class TensorNameMap:
 
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo_1124
             "layers.{bid}.feed_forward.w1",               # llama-pth
             "transformer.h.{bid}.mlp.w2",                 # qwen
             "transformer.h.{bid}.mlp.c_fc2",              # jais
@@ -346,7 +346,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
             "h.{bid}.mlp.dense_4h_to_h",                              # bloom
-            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron
+            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo_1124
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
@@ -383,7 +383,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo_1124
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",                           # openelm
@@ -392,7 +392,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo_1124
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",                           # openelm
diff --git a/src/llama.cpp b/src/llama.cpp
index de96959f2..4f31f25b1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -179,6 +179,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_OLMO_1124,
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
@@ -232,6 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R,       "command-r"    },
     { LLM_ARCH_DBRX,            "dbrx"         },
     { LLM_ARCH_OLMO,            "olmo"         },
+    { LLM_ARCH_OLMO_1124,       "olmo_1124"    },
     { LLM_ARCH_OLMOE,           "olmoe"        },
     { LLM_ARCH_OPENELM,         "openelm"      },
     { LLM_ARCH_ARCTIC,          "arctic"       },
@@ -1207,6 +1209,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OLMO_1124,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_OLMOE,
         {
@@ -5877,6 +5898,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_OLMO_1124:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 16: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -8559,6 +8591,31 @@ static bool llm_load_tensors(
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_OLMO_1124:
+                {
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_OLMOE:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -14424,6 +14481,130 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_olmo_1124() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = inpL;
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // based on the build_qwen2moe() function, changes:
     //   * removed shared experts
     //   * removed bias
@@ -16616,6 +16797,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_OLMO_1124:
+            {
+                result = llm.build_olmo_1124();
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 result = llm.build_olmoe();
@@ -19885,6 +20070,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO_1124:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:

From 8e752a777b272606f22cb741b03e062de4ddb8fe Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 19 Nov 2024 13:29:26 +0200
Subject: [PATCH 262/396] llama : add check for KV cache shifts (#10401)

ggml-ci
---
 common/common.cpp | 6 ++++++
 include/llama.h   | 3 +++
 src/llama.cpp     | 6 +++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 930374621..d314523db 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+        llama_free_model(model);
+        return iparams;
+    }
+
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
diff --git a/include/llama.h b/include/llama.h
index bc268e799..90791d5f5 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -667,6 +667,9 @@ extern "C" {
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
 
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+
     //
     // State / sessions
     //
diff --git a/src/llama.cpp b/src/llama.cpp
index 4f31f25b1..c51b36e66 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18213,7 +18213,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
-        if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
+        if (!llama_kv_cache_can_shift(&lctx)) {
             GGML_ABORT("Deepseek2 does not support K-shift");
         }
 
@@ -20462,6 +20462,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }
 
+bool llama_kv_cache_can_shift(struct llama_context * ctx) {
+    return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+}
+
 // deprecated
 size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);

From 3ee6382d48b07b31e64983969c16019490e19740 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 19 Nov 2024 14:29:38 +0100
Subject: [PATCH 263/396] cuda : fix CUDA_FLAGS not being applied (#10403)

---
 ggml/src/ggml-blas/CMakeLists.txt | 1 -
 ggml/src/ggml-cuda/CMakeLists.txt | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
index ffe39e8bd..e2cbabf0d 100644
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -75,7 +75,6 @@ if (BLAS_FOUND)
 
     message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
 
-    #add_compile_options(${BLAS_LINKER_FLAGS})
     target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
 
     if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 25858ece8..e1482a269 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -149,7 +149,7 @@ if (CUDAToolkit_FOUND)
         list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
     endif()
 
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+    target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
 else()
     message(FATAL_ERROR "CUDA Toolkit not found")
 endif()

From 2a11b6b0946c1abab2ab150725610e5ee736b3af Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Tue, 19 Nov 2024 12:10:30 -0400
Subject: [PATCH 264/396] Add required ggml-base and backend libs to cmake pkg
 (#10407)

---
 cmake/llama-config.cmake.in | 52 +++++++++++++++++++++++--------------
 ggml/CMakeLists.txt         |  8 ++----
 ggml/src/CMakeLists.txt     |  2 +-
 3 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index a7f1efb88..28a8c18b6 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -3,17 +3,11 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
 
-set(GGML_BLAS       @GGML_BLAS@)
-set(GGML_CUDA       @GGML_CUDA@)
-set(GGML_METAL      @GGML_METAL@)
-set(GGML_HIP        @GGML_HIP@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
-set(GGML_VULKAN @GGML_VULKAN@)
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
 set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
 set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
 set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
-set(GGML_SYCL @GGML_SYCL@)
 set(GGML_OPENMP @GGML_OPENMP@)
 
 @PACKAGE_INIT@
@@ -22,10 +16,39 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
 
-# Ensure transient dependencies satisfied
-
 find_package(Threads REQUIRED)
 
+set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
+set(_llama_link_deps "")
+foreach(_ggml_lib ggml ggml-base)
+    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
+    find_library(${_ggml_lib_var} ${_ggml_lib}
+        REQUIRED
+        HINTS ${LLAMA_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH
+    )
+    list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
+    message(STATUS "Found ${${_ggml_lib_var}}")
+endforeach()
+
+foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
+    string(TOUPPER "GGML_${backend}" backend_id)
+    set(_ggml_lib "ggml-${backend}")
+    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
+
+    find_library(${_ggml_lib_var} ${_ggml_lib}
+        HINTS ${LLAMA_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH
+    )
+    if(${_ggml_lib_var})
+        list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
+        set(${backend_id} ON)
+        message(STATUS "Found backend ${${_ggml_lib_var}}")
+    else()
+        set(${backend_id} OFF)
+    endif()
+endforeach()
+
 if (APPLE AND GGML_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
 endif()
@@ -48,7 +71,7 @@ if (GGML_VULKAN)
     find_package(Vulkan REQUIRED)
 endif()
 
-if (GGML_HIPBLAS)
+if (GGML_HIP)
     find_package(hip REQUIRED)
     find_package(hipblas REQUIRED)
     find_package(rocblas REQUIRED)
@@ -63,24 +86,13 @@ if (GGML_OPENMP)
     find_package(OpenMP REQUIRED)
 endif()
 
-
-find_library(ggml_LIBRARY ggml
-    REQUIRED
-    HINTS ${LLAMA_LIB_DIR}
-    NO_CMAKE_FIND_ROOT_PATH
-)
-
 find_library(llama_LIBRARY llama
     REQUIRED
     HINTS ${LLAMA_LIB_DIR}
     NO_CMAKE_FIND_ROOT_PATH
 )
 
-set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
-set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
-
 add_library(llama UNKNOWN IMPORTED)
-
 set_target_properties(llama
     PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index b16a0e9ad..9ab91421a 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -235,12 +235,8 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
 #    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
 #endif()
-install(TARGETS ggml PUBLIC_HEADER)
-
-if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml      LIBRARY)
-    install(TARGETS ggml-base LIBRARY)
-endif()
+install(TARGETS ggml LIBRARY PUBLIC_HEADER)
+install(TARGETS ggml-base LIBRARY)
 
 # FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ae7d3abc8..8df0e85c0 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -239,8 +239,8 @@ function(ggml_add_backend backend)
             if (${BUILD_SHARED_LIBS})
                 target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
                 target_compile_definitions(${backend_target} PUBLIC  GGML_BACKEND_SHARED)
-                install(TARGETS ${backend_target} LIBRARY)
             endif()
+            install(TARGETS ${backend_target} LIBRARY)
             target_link_libraries(ggml PUBLIC ${backend_target})
             string(TOUPPER "GGML_USE_${backend}" backend_use)
             target_compile_definitions(ggml PUBLIC ${backend_use})

From 342397dc7edb311e0373205134d0d3a928b891b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=95=AD=E6=BE=A7=E9=82=A6?=
 <45505768+shou692199@users.noreply.github.com>
Date: Wed, 20 Nov 2024 01:42:00 +0800
Subject: [PATCH 265/396] cmake: force MSVC compiler charset to utf-8 (#9989)

---
 CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93c60ef43..994e61e45 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,6 +46,13 @@ if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()
 
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
+endif()
+
 #
 # option list
 #

From 12b0ad953a59563ea8d973708760d747321d8432 Mon Sep 17 00:00:00 2001
From: PAB <pierreantoine.bannier@gmail.com>
Date: Mon, 18 Nov 2024 10:02:49 +0100
Subject: [PATCH 266/396] metal : add `GGML_UNARY_OP_ELU` kernel (ggml/1018)

---
 ggml/src/ggml-metal/ggml-metal.m     | 15 +++++++++++++++
 ggml/src/ggml-metal/ggml-metal.metal |  8 ++++++++
 2 files changed, 23 insertions(+)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 58fee4bfd..d1abb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -126,6 +126,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
     GGML_METAL_KERNEL_TYPE_SILU,
     GGML_METAL_KERNEL_TYPE_SILU_4,
+    GGML_METAL_KERNEL_TYPE_ELU,
     GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
     GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
     GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
@@ -649,6 +650,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,                  gelu_quick_4,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                          silu,                           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4,                        silu_4,                         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU,                           elu,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                  soft_max_f16,                   has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                soft_max_f16_4,                 has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                  soft_max_f32,                   has_simdgroup_reduction);
@@ -968,6 +970,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_ELU:
                     return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
@@ -1589,6 +1592,18 @@ static void ggml_metal_encode_node(
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
+                case GGML_UNARY_OP_ELU:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ELU].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
                 default:
                 {
                     GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 86fdf1c18..819b20ba8 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -782,6 +782,14 @@ kernel void kernel_silu_4(
     dst[tpig] = x / (1.0f + exp(-x));
 }
 
+kernel void kernel_elu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+    dst[tpig] = (x > 0.0f) ? x : (exp(x) - 1.0f);
+}
+
 kernel void kernel_sqr(
         device const float * src0,
         device       float * dst,

From 611fabd7922050e1e99bd276d3544527cd46047b Mon Sep 17 00:00:00 2001
From: Plamen Minev <pacominev@gmail.com>
Date: Mon, 18 Nov 2024 15:02:27 +0200
Subject: [PATCH 267/396] metal : fox offset integer overflows in im2col
 (ggml/1015)

-- While running StableDiffusion.cpp locally with Metal some offsets overflow and results in incorrect calculations
---
 ggml/src/ggml-metal/ggml-metal.metal | 52 ++++++++++++++++++----------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 819b20ba8..971f5054b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2145,20 +2145,34 @@ kernel void kernel_im2col(
         uint3  tgpg[[threadgroups_per_grid]],
         uint3 tpitg[[thread_position_in_threadgroup]],
         uint3   ntg[[threads_per_threadgroup]]) {
-    const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0;
-    const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1;
+//    const int64_t IC = tgpg[0];
+    const int64_t OH = tgpg[1];
+    const int64_t OW = tgpg[2];
 
-    const int32_t offset_dst =
-        (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
-        (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
+//    const int64_t N  = ntg[0];
+    const int64_t KH = ntg[1];
+    const int64_t KW = ntg[2];
+
+    const int64_t in  = tpitg[0];
+    const int64_t ikh = tpitg[1];
+    const int64_t ikw = tpitg[2];
+
+    const int64_t iic = tgpig[0];
+    const int64_t ioh = tgpig[1];
+    const int64_t iow = tgpig[2];
+
+    const int64_t iiw = iow*s0 + ikw*d0 - p0;
+    const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+    const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*CHW + (iic*(KH*KW) + ikh*KW + ikw);
 
     device T * pdst = (device T *) (dst);
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         pdst[offset_dst] = 0.0f;
     } else {
-        const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
-        pdst[offset_dst] = x[offset_src + iih * IW + iiw];
+        const int64_t offset_src = in*ofs0 + iic*ofs1 + iih*IW + iiw;
+        pdst[offset_dst] = x[offset_src];
     }
 }
 
@@ -2209,25 +2223,25 @@ kernel void kernel_im2col_ext(
         uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
         uint3 tpitg[[thread_position_in_threadgroup]],
         uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
-    const int32_t KHW = KH * KW;             // KHW == ntg[1] * ntg[2], KW == ntg[2]
+    const int64_t KHW = KH * KW;             // KHW == ntg[1] * ntg[2], KW == ntg[2]
 
-    const int32_t d = tgpig[0] / CHW;
-    const int32_t chw = tgpig[0] % CHW;
-    const int32_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
-    const int32_t HW = tgpig[0] % KHW;
+    const int64_t d = tgpig[0] / CHW;
+    const int64_t chw = tgpig[0] % CHW;
+    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
+    const int64_t HW = tgpig[0] % KHW;
 
-    const int32_t tpitg_0 = (d * ntg[0]) + tpitg[0];
+    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
     if (tpitg_0 >= N) {
         return;
     }
 
-    const int32_t tpitg_1 = HW / KW;
-    const int32_t tpitg_2 = HW % KW;
+    const int64_t tpitg_1 = HW / KW;
+    const int64_t tpitg_2 = HW % KW;
 
-    const int32_t iiw = tgpig[2] * s0 + tpitg_2 * d0 - p0;
-    const int32_t iih = tgpig[1] * s1 + tpitg_1 * d1 - p1;
+    const int64_t iiw = tgpig[2] * s0 + tpitg_2 * d0 - p0;
+    const int64_t iih = tgpig[1] * s1 + tpitg_1 * d1 - p1;
 
-    const int32_t offset_dst =
+    const int64_t offset_dst =
         (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
         (tgpig_0 * KHW + tpitg_1 * KW + tpitg_2);
 
@@ -2236,7 +2250,7 @@ kernel void kernel_im2col_ext(
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         pdst[offset_dst] = 0.0f;
     } else {
-        const int32_t offset_src = tpitg_0 * ofs0 + tgpig_0 * ofs1;
+        const int64_t offset_src = tpitg_0 * ofs0 + tgpig_0 * ofs1;
         pdst[offset_dst] = x[offset_src + iih * IW + iiw];
     }
 }

From 9fe0fb062630728e3c21b5839e3bce87bff2440a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 19 Nov 2024 19:15:50 +0200
Subject: [PATCH 268/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 6ddb71ab1..e9bd2dbb0 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc
+2884dd72fea8922910fe53387c3d17ab928d3a8e

From 42ae10bbcd7b56f29a302c86796542a6dadf46c9 Mon Sep 17 00:00:00 2001
From: haopeng <657407891@qq.com>
Date: Wed, 20 Nov 2024 04:10:31 +0800
Subject: [PATCH 269/396] add cmake rvv support (#10411)

---
 ggml/CMakeLists.txt              | 1 +
 ggml/src/ggml-cpu/CMakeLists.txt | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 9ab91421a..2d32da1b6 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -109,6 +109,7 @@ if (NOT MSVC)
 endif()
 option(GGML_LASX        "ggml: enable lasx"             ON)
 option(GGML_LSX         "ggml: enable lsx"              ON)
+option(GGML_RVV         "ggml: enable rvv"              ON)
 option(GGML_SVE         "ggml: enable SVE"              OFF)
 
 if (WIN32)
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index cef41a074..288052333 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -244,6 +244,11 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     if (GGML_LSX)
         list(APPEND ARCH_FLAGS -mlsx)
     endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
+    message(STATUS "RISC-V detected")
+    if (GGML_RVV)
+        list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+    endif()
 else()
     message(STATUS "Unknown architecture")
 endif()

From 3952a221af54b8a6549bc2bd4a7363ef7ad3081e Mon Sep 17 00:00:00 2001
From: Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
Date: Tue, 19 Nov 2024 23:18:17 +0100
Subject: [PATCH 270/396] Fix missing file renames in Makefile due to changes
 in commit ae8de6d50a (#10413)

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 95110d4eb..5c8994385 100644
--- a/Makefile
+++ b/Makefile
@@ -730,10 +730,10 @@ GLSLC_CMD  = glslc
 _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
 _ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
 _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
-_ggml_vk_input_dir = ggml/src/vulkan-shaders
+_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
 _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
 
-ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
 	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
 
 $(_ggml_vk_header): $(_ggml_vk_source)
@@ -745,8 +745,8 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
 		--target-hpp $(_ggml_vk_header) \
 		--target-cpp $(_ggml_vk_source)
 
-vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
-	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 
 endif # GGML_VULKAN
 

From ad21c9e1f14d82b8c15ae369a8839019e3d498b4 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Wed, 20 Nov 2024 13:54:25 +0800
Subject: [PATCH 271/396] update rel to 4040 (#10395)

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 docs/backend/SYCL.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index e431f51f1..8d8312e91 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -34,9 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI.
 
 The following release is verified with good quality:
 
-|Commit ID|Tag|Release|Verified  Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1MTL Arc GPU/Windows 11/oneAPI 2024.1|
+|Commit ID|Tag|Release|Verified  Platform| Update date|
+|-|-|-|-|-|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1MTL Arc GPU/Windows 11/oneAPI 2024.1||
 
 
 ## News

From 1bacb9f62514b520bdf74ed6feb46c80508dad38 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 20 Nov 2024 01:11:00 -0600
Subject: [PATCH 272/396] vulkan: further optimize mul_mat_vec using larger
 loads (#10387)

* vulkan: Use pipeline_robustness to disable robustness in mul_mat_vec.

Add some early returns for nonexistent rows in mul_mat_vec shaders. These
can only be hit when dispatching a 2D grid of workgroups. Fix the logic
for the 2D grid of workgroups to round up.

Enable the pipeline robustness extension if it's available, and use it to
disable robustness for these pipelines. The instructions to do the bounds
checking contend for the same ALU resources as the bit twiddling dequant
instructions.

* vulkan: Add GLSL structure aliases for quant types to allow larger loads

In Vulkan it's not possible to cast pointer types, so instead you have to
declare an aliased binding for the memory with a different type. This
commit adds aliases for the quant formats using 16b ints, and in a few
places where the struct size is a multiple of 4 also using 32b ints.
Currently only q4_k's aliases are used, but others will be used in
subsequent commits.

* vulkan: use larger loads in q5_k and q6_k shaders.

Similar to the optimization I did in q4_k recently, this vectorizes some loads
and reduces the number of bit twiddling instructions.

* vulkan: use larger K step per iteration in mul_mat_vec.

Add vec4 dequantization functions, and use them to do K=8 per iteration in
mul_mat_vec. This uses 16b loads for the quant values and 128b loads for B
which helps reduce the load on the memory system.

The K_PER_ITER==2 logic is still there, just for F16/F32, and really only
because they support unaligned sizes.

Tweak the num_iters/unrolling logic to be simpler and catch a couple missed
unrolling opportunities.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 101 ++++++++-----
 .../vulkan-shaders/dequant_funcs.comp         |  48 ++++++
 .../vulkan-shaders/mul_mat_vec.comp           |  82 ++++++++--
 .../vulkan-shaders/mul_mat_vec_base.comp      |   3 +
 .../vulkan-shaders/mul_mat_vec_q2_k.comp      |   4 +
 .../vulkan-shaders/mul_mat_vec_q3_k.comp      |   4 +
 .../vulkan-shaders/mul_mat_vec_q4_k.comp      |  34 ++---
 .../vulkan-shaders/mul_mat_vec_q5_k.comp      | 140 ++++++++++++------
 .../vulkan-shaders/mul_mat_vec_q6_k.comp      |  69 ++++++---
 .../src/ggml-vulkan/vulkan-shaders/types.comp | 113 +++++++++++++-
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |   6 +-
 11 files changed, 457 insertions(+), 147 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 21fee2f3d..ca71da2f7 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -158,6 +158,7 @@ struct vk_device_struct {
     std::string name;
     uint64_t max_memory_allocation_size;
     bool fp16;
+    bool pipeline_robustness;
     vk::Device device;
     uint32_t vendor_id;
     vk_queue compute_queue;
@@ -654,7 +655,7 @@ static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
 static std::condition_variable compile_count_cond;
 
-static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
     VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
@@ -724,6 +725,15 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         vk::PipelineCreateFlags(),
         pipeline_shader_create_info,
         pipeline->layout);
+
+    vk::PipelineRobustnessCreateInfoEXT rci;
+
+    if (device->pipeline_robustness && disable_robustness) {
+        rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
+        rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
+        compute_pipeline_create_info.setPNext(&rci);
+    }
+
     pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
     {
@@ -1261,7 +1271,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
     std::vector<std::future<void>> compiles;
-    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align) {
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
         {
             // wait until fewer than N compiles are in progress
             uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -1271,7 +1281,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
             }
             compile_count++;
         }
-        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
     };
 
     if (device->fp16) {
@@ -1370,45 +1380,45 @@ static void ggml_vk_load_shaders(vk_device& device) {
     // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32",  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32",  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
 
     // dequant shaders
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1591,12 +1601,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         bool fp16_storage = false;
         bool fp16_compute = false;
+        bool pipeline_robustness = false;
 
         for (const auto& properties : ext_props) {
             if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
                 fp16_storage = true;
             } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
                 fp16_compute = true;
+            } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
+                pipeline_robustness = true;
             }
         }
 
@@ -1642,10 +1655,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
         vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
         vk11_features.pNext = &vk12_features;
 
+        VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features;
+        pl_robustness_features.pNext = nullptr;
+        pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT;
+        pl_robustness_features.pipelineRobustness = VK_FALSE;
+
+        if (pipeline_robustness) {
+            vk12_features.pNext = &pl_robustness_features;
+            device_extensions.push_back("VK_EXT_pipeline_robustness");
+        }
+
         vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
 
         device->fp16 = device->fp16 && vk12_features.shaderFloat16;
 
+        device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
+
         if (!vk11_features.storageBuffer16BitAccess) {
             std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
             throw std::runtime_error("Unsupported device");
@@ -3190,7 +3215,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
 
     if (ne01 > max_groups_x) {
         groups_z = 64;
-        groups_x /= groups_z;
+        groups_x = CEIL_DIV(groups_x, groups_z);
     }
 
     // compute
@@ -3767,7 +3792,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
 
     if (ne01 > max_groups_x) {
         groups_z = 64;
-        groups_x /= groups_z;
+        groups_x = CEIL_DIV(groups_x, groups_z);
     }
 
     // compute
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
index d5b989735..5fc1ba4ad 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
@@ -2,6 +2,15 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
 #endif
 
+#include "types.comp"
+
+#if defined(A_TYPE_PACKED16)
+layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
+#endif
+#if defined(A_TYPE_PACKED32)
+layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
+#endif
+
 #if defined(DATA_A_F32)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
@@ -20,6 +29,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
     return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d;
+}
 #endif
 
 #if defined(DATA_A_Q4_1)
@@ -29,6 +43,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
     return vec2(vui & 0xF, vui >> 4) * d + m;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const float m = float(data_a_packed16[a_offset + ib].m);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m;
+}
 #endif
 
 #if defined(DATA_A_Q5_0)
@@ -39,6 +59,14 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
     return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
+    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d;
+}
 #endif
 
 #if defined(DATA_A_Q5_1)
@@ -50,6 +78,15 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
     return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const float m = float(data_a_packed16[a_offset + ib].m);
+    const uint uint_qh = data_a_packed16[a_offset + ib].qh;
+    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m;
+}
 #endif
 
 #if defined(DATA_A_Q8_0)
@@ -57,6 +94,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const float d = float(data_a[a_offset + ib].d);
     return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
+    uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
+    return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d;
+}
 #endif
 
 #if defined(DATA_A_IQ4_NL)
@@ -65,4 +108,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
     return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d;
+}
 #endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index 970aac6ef..00807a060 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -3,7 +3,7 @@
 #ifdef FLOAT16
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #endif
-#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 #include "mul_mat_vec_base.comp"
 
@@ -12,16 +12,48 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 layout (constant_id = 1) const uint NUM_ROWS = 1;
 
+#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
+#define K_PER_ITER 8
+#else
+#define K_PER_ITER 2
+#endif
+
+
 uint a_offset, b_offset, d_offset, y_offset;
 
 shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
 void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
 {
-    const uint col = i*BLOCK_SIZE + 2*tid;
+    const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
     const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
     const uint iybs = col - col%QUANT_K; // y block start index
 
+#if K_PER_ITER == 8
+#if QUANT_R == 2
+    B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
+    B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
+    FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x);
+    FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x);
+    FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y);
+    FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y);
+    FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z);
+    FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z);
+    FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w);
+    FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w);
+#else
+    B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4];
+    B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1];
+    FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x);
+    FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y);
+    FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z);
+    FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w);
+    FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x);
+    FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y);
+    FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z);
+    FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w);
+#endif
+#else
     // Check if the second of the pair of elements is OOB, and don't fetch B or
     // accumulate it. We still fetch a pair of elements for A, which is fine for
     // quantized formats since they'll be within the same block. We should
@@ -34,9 +66,24 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
     if (!OOB) {
         b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
     }
+#endif
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index
 
+#if K_PER_ITER == 8
+        const vec4 v = dequantize4(ib, iqs, a_offset);
+        const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
+
+        // matrix multiplication
+        temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]);
+        temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]);
+#else
         const vec2 v = dequantize(ib, iqs, a_offset);
 
         // matrix multiplication
@@ -44,6 +91,7 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
         if (!OOB) {
             temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
         }
+#endif
     }
 }
 
@@ -61,22 +109,33 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
         temp[i] = FLOAT_TYPE(0);
     }
 
-    const int unroll_count = 8;
-
-    const uint num_iters = (p.ncols >= 2*tid) ? ((p.ncols - 2*tid + BLOCK_SIZE - 1) / BLOCK_SIZE) : 0;
-    const uint unrolled_iters = num_iters & ~(2*unroll_count - 1);
+    uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
+    if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
+        num_iters++;
+    }
+    int unroll_count = 4;
+    uint unrolled_iters = num_iters & ~(unroll_count - 1);
 
     uint i = 0;
     while (i < unrolled_iters) {
         // Manually partially unroll the loop
         [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
-            iter(temp, first_row, num_rows, tid, i, false);
-            i += 2;
+            iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
+            i++;
+        }
+    }
+    unroll_count = 2;
+    unrolled_iters = num_iters & ~(unroll_count - 1);
+    while (i < unrolled_iters) {
+        // Manually partially unroll the loop
+        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+            iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
+            i++;
         }
     }
     while (i < num_iters) {
-        iter(temp, first_row, num_rows, tid, i, true);
-        i += 2;
+        iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
+        i++;
     }
 
     // sum up partial sums and write back result
@@ -106,6 +165,9 @@ void main() {
     if (first_row + NUM_ROWS <= p.stride_d) {
         compute_outputs(first_row, NUM_ROWS);
     } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
         compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
index 5920bc936..8d0a57913 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
@@ -12,6 +12,9 @@
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
+layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
+
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 #ifdef MUL_MAT_ID
 layout (binding = 3) readonly buffer IDS {int data_ids[];};
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
index ec8eadcd5..e2625d32b 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32];
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
+    if (row >= p.stride_d) {
+        return;
+    }
+
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
index 3ca4ad85a..a28804533 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32];
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
+    if (row >= p.stride_d) {
+        return;
+    }
+
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
index b7c9b722d..5846f2e86 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
@@ -8,30 +8,14 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 
 shared FLOAT_TYPE tmp[32];
 
-// Declare aliased versions of A and B bindings that can use 16b/32b loads for
-// the quantized values, and vec4 loads for B.
-struct block_q4_K_u32
-{
-    f16vec2 d;
-    uint32_t scales[3*QUANT_K/64/4];
-    uint32_t qs[QUANT_K/2/4];
-};
-
-struct block_q4_K_u16
-{
-    f16vec2 d;
-    uint16_t scales[3*QUANT_K/64/2];
-    uint16_t qs[QUANT_K/2/2];
-};
-
-layout (binding = 0) readonly buffer A_u32 {block_q4_K_u32 data_a_u32[];};
-layout (binding = 0) readonly buffer A_u16 {block_q4_K_u16 data_a_u16[];};
-layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
-
 // This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
+    if (row >= p.stride_d) {
+        return;
+    }
+
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
@@ -64,9 +48,9 @@ void main() {
         const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
         const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
 
-        uint32_t scale0_u32 = data_a_u16[ib0 + i].scales[v_im    ];
-        uint32_t scale4_u32 = data_a_u16[ib0 + i].scales[v_im + 2];
-        uint32_t scale8_u32 = data_a_u16[ib0 + i].scales[v_im + 4];
+        uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
+        uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
+        uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
         uvec4 scale0 = uvec4(unpack8(scale0_u32));
         uvec4 scale4 = uvec4(unpack8(scale4_u32));
         uvec4 scale8 = uvec4(unpack8(scale8_u32));
@@ -80,8 +64,8 @@ void main() {
         const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
         const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
 
-        uint32_t qs0_u32 = data_a_u32[ib0 + i].qs[q_offset / 4];
-        uint32_t qs64_u32 = data_a_u32[ib0 + i].qs[q_offset / 4 + 16];
+        uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
+        uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
 
         uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
         uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
index 2306785af..22a6bfae4 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
@@ -1,5 +1,7 @@
 #version 450
 
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
 #include "mul_mat_vec_base.comp"
 
 layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
@@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32];
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
+    if (row >= p.stride_d) {
+        return;
+    }
+
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
@@ -31,70 +37,106 @@ void main() {
     const uint8_t hm1 = uint8_t(1 << (2*v_im));
     const uint8_t hm2 = uint8_t(hm1 << 4);
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
         const uint y1_idx = i * QUANT_K + y_offset;
         const uint y2_idx = y1_idx + 128;
 
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
+        f16vec2 d = data_a[ib0 + i].d;
+        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
 
-        const uint8_t sc0 = uint8_t(  data_a[ib0 + i].scales[v_im * 2    ]       & 0x3f);
-        const uint8_t sc1 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 1]       & 0x3f);
-        const uint8_t sc2 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 4]       & 0x3f);
-        const uint8_t sc3 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 5]       & 0x3f);
-        const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2    ] & 0xc0) >> 2));
-        const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2));
-        const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2));
-        const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2));
+        uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
+        uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
+        uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
+        uvec4 scale0 = uvec4(unpack8(scale0_u32));
+        uvec4 scale4 = uvec4(unpack8(scale4_u32));
+        uvec4 scale8 = uvec4(unpack8(scale8_u32));
 
-        const uint8_t q4_0  = uint8_t(data_a[ib0 + i].qs[q_offset     ] & 0xf);
-        const uint8_t q4_1  = uint8_t(data_a[ib0 + i].qs[q_offset +  1] & 0xf);
-        const uint8_t q4_2  = uint8_t(data_a[ib0 + i].qs[q_offset + 16] & 0xf);
-        const uint8_t q4_3  = uint8_t(data_a[ib0 + i].qs[q_offset + 17] & 0xf);
-        const uint8_t q4_4  = uint8_t(data_a[ib0 + i].qs[q_offset     ]  >> 4);
-        const uint8_t q4_5  = uint8_t(data_a[ib0 + i].qs[q_offset +  1]  >> 4);
-        const uint8_t q4_6  = uint8_t(data_a[ib0 + i].qs[q_offset + 16]  >> 4);
-        const uint8_t q4_7  = uint8_t(data_a[ib0 + i].qs[q_offset + 17]  >> 4);
-        const uint8_t q4_8  = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
-        const uint8_t q4_9  = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
-        const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] & 0xf);
-        const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] & 0xf);
-        const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64]  >> 4);
-        const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65]  >> 4);
-        const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80]  >> 4);
-        const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81]  >> 4);
+        const uint32_t sc0 = (  scale0.x       & 0x3f);
+        const uint32_t sc1 = (  scale0.y       & 0x3f);
+        const uint32_t sc2 = (  scale4.x       & 0x3f);
+        const uint32_t sc3 = (  scale4.y       & 0x3f);
+        const uint32_t sc4 = (( scale8.x       & 0x0f) | ((scale0.x & 0xc0) >> 2));
+        const uint32_t sc5 = (( scale8.y       & 0x0f) | ((scale0.y & 0xc0) >> 2));
+        const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
+        const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
+
+        uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
+        uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
+
+        uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
+        uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
+        uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
+        uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
+
+        uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
+        uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
+        uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
+        uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
+
+        const uint32_t q4_0  = qs0_16_lo4.x;
+        const uint32_t q4_1  = qs0_16_lo4.y;
+        const uint32_t q4_2  = qs0_16_lo4.z;
+        const uint32_t q4_3  = qs0_16_lo4.w;
+        const uint32_t q4_4  = qs0_16_hi4.x;
+        const uint32_t q4_5  = qs0_16_hi4.y;
+        const uint32_t q4_6  = qs0_16_hi4.z;
+        const uint32_t q4_7  = qs0_16_hi4.w;
+        const uint32_t q4_8  = qs64_80_lo4.x;
+        const uint32_t q4_9  = qs64_80_lo4.y;
+        const uint32_t q4_10 = qs64_80_lo4.z;
+        const uint32_t q4_11 = qs64_80_lo4.w;
+        const uint32_t q4_12 = qs64_80_hi4.x;
+        const uint32_t q4_13 = qs64_80_hi4.y;
+        const uint32_t q4_14 = qs64_80_hi4.z;
+        const uint32_t q4_15 = qs64_80_hi4.w;
+
+        B_TYPE_VEC2 by10 =  data_b_v2[(b_offset + y1_idx) / 2];
+        B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
+        B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
+        B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
+        B_TYPE_VEC2 by20 =  data_b_v2[(b_offset + y2_idx) / 2];
+        B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
+        B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
+        B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
+
+        uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2];
+        uint32_t qh1 = qh0 >> 8;
+        uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8];
+        uint32_t qh17 = qh16 >> 8;
 
         const FLOAT_TYPE sx =
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx     ]), (q4_0 + (((data_a[ib0 + i].qh[l0     ] & hm1) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx +  1]), (q4_1 + (((data_a[ib0 + i].qh[l0 +  1] & hm1) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)),
-             FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)),
+             FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0)))));
         const FLOAT_TYPE sy =
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0     ] & (hm1 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 +  1] & (hm1 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)),
-             FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)),
+             FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0)))));
         const FLOAT_TYPE sz =
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx     ]), (q4_8  + (((data_a[ib0 + i].qh[l0     ] & hm2) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx +  1]), (q4_9  + (((data_a[ib0 + i].qh[l0 +  1] & hm2) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)),
-             FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by20.x), (q4_8  + (((qh0 & hm2) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by20.y), (q4_9  + (((qh1 & hm2) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)),
+             FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0)))));
         const FLOAT_TYPE sw =
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0     ] & (hm2 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 +  1] & (hm2 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)),
-             FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)),
+          fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)),
+             FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0)))));
         const FLOAT_TYPE smin =
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx     ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2,
-          fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3,
-          fma(FLOAT_TYPE(data_b[b_offset + y2_idx     ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6,
-              (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7)));
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
+          fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
+          fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
+          fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
+              (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
+        temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
index 95c286eeb..0b392d68d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -1,5 +1,7 @@
 #version 450
 
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
 #include "mul_mat_vec_base.comp"
 
 layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
@@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32];
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
+    if (row >= p.stride_d) {
+        return;
+    }
+
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
@@ -36,41 +42,66 @@ void main() {
     const uint s_offset  =  8*v_im + is;
     const uint y_offset = 128*v_im + l0;
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         const uint y_idx   = i * QUANT_K + y_offset;
 
         const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
 
-#if K_QUANTS_PER_ITERATION == 1
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx +  0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset +  0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0x03) << 4)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0x0c) << 2)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset +  0]  >> 4) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0x30) >> 0)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16]  >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32]  >> 4) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0xc0) >> 2)) - 32),
-                       fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48]  >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx]))))))));
-#else
+        FLOAT_TYPE scales[4];
+        scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
+        scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
+        scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
+        scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
+
+        uint32_t ql0_u32 =  uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
+        uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
+
+        uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
+        uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
+        uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
+        uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
+
+        uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
+        uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
+        uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
+        uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
+        uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
+
+        uint32_t q0_u32 = ql0_u32_lo4  | qh0_u32;
+        uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
+        uint32_t q2_u32 = ql0_u32_hi4  | qh4_u32;
+        uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
+
+        uvec4 q0 = uvec4(unpack8(q0_u32));
+        uvec4 q1 = uvec4(unpack8(q1_u32));
+        uvec4 q2 = uvec4(unpack8(q2_u32));
+        uvec4 q3 = uvec4(unpack8(q3_u32));
+
+        B_TYPE_VEC4 by0  = data_b_v4[(b_offset + y_idx) / 4];
+        B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
+        B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
+        B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
+
         FLOAT_TYPE sum = FLOAT_TYPE(0.0);
         [[unroll]] for (int l = 0; l < 4; ++l) {
-            sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0]  >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32]  >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum))));
+            sum = fma(FLOAT_TYPE(by0[l])  * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
+                  fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
+                  fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
+                  fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
         }
-        tmp[16 * ix + tid] += sum;
-#endif
+        temp += sum * d;
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
-       }
+        }
         barrier();
     }
     if (tid == 0) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
index 21dce72fc..7a34820bc 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -1,6 +1,8 @@
-#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
-#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
-#endif
+
+#if !defined(GGML_TYPES_COMP)
+#define GGML_TYPES_COMP
+
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 #if defined(DATA_A_F32)
 #define QUANT_K 1
@@ -38,8 +40,14 @@ struct block_q4_0
     float16_t d;
     uint8_t qs[16];
 };
+struct block_q4_0_packed16
+{
+    float16_t d;
+    uint16_t qs[16/2];
+};
 
 #define A_TYPE block_q4_0
+#define A_TYPE_PACKED16 block_q4_0_packed16
 #endif
 
 #if defined(DATA_A_Q4_1)
@@ -54,7 +62,15 @@ struct block_q4_1
     uint8_t qs[16];
 };
 
+struct block_q4_1_packed16
+{
+    float16_t d;
+    float16_t m;
+    uint16_t qs[16/2];
+};
+
 #define A_TYPE block_q4_1
+#define A_TYPE_PACKED16 block_q4_1_packed16
 #endif
 
 #if defined(DATA_A_Q5_0)
@@ -70,7 +86,15 @@ struct block_q5_0
     uint8_t qs[16];
 };
 
+struct block_q5_0_packed16
+{
+    float16_t d;
+    uint16_t qh[2];
+    uint16_t qs[16/2];
+};
+
 #define A_TYPE block_q5_0
+#define A_TYPE_PACKED16 block_q5_0_packed16
 #endif
 
 #if defined(DATA_A_Q5_1)
@@ -87,7 +111,16 @@ struct block_q5_1
     uint8_t qs[16];
 };
 
+struct block_q5_1_packed16
+{
+    float16_t d;
+    float16_t m;
+    uint qh;
+    uint16_t qs[16/2];
+};
+
 #define A_TYPE block_q5_1
+#define A_TYPE_PACKED16 block_q5_1_packed16
 #endif
 
 #if defined(DATA_A_Q8_0)
@@ -100,8 +133,14 @@ struct block_q8_0
     float16_t d;
     int8_t qs[32];
 };
+struct block_q8_0_packed16
+{
+    float16_t d;
+    uint16_t qs[32/2];
+};
 
 #define A_TYPE block_q8_0
+#define A_TYPE_PACKED16 block_q8_0_packed16
 #endif
 
 // K-quants
@@ -116,7 +155,23 @@ struct block_q2_K
     f16vec2 d;
 };
 
+struct block_q2_K_packed16
+{
+    uint16_t scales[QUANT_K/16/2];
+    uint16_t qs[QUANT_K/4/2];
+    f16vec2 d;
+};
+
+struct block_q2_K_packed32
+{
+    uint32_t scales[QUANT_K/16/4];
+    uint32_t qs[QUANT_K/4/4];
+    f16vec2 d;
+};
+
 #define A_TYPE block_q2_K
+#define A_TYPE_PACKED16 block_q2_K_packed16
+#define A_TYPE_PACKED32 block_q2_K_packed32
 #endif
 
 #if defined(DATA_A_Q3_K)
@@ -131,7 +186,16 @@ struct block_q3_K
     float16_t d;
 };
 
+struct block_q3_K_packed16
+{
+    uint16_t hmask[QUANT_K/8/2];
+    uint16_t qs[QUANT_K/4/2];
+    uint16_t scales[12/2];
+    float16_t d;
+};
+
 #define A_TYPE block_q3_K
+#define A_TYPE_PACKED16 block_q3_K_packed16
 #endif
 
 #if defined(DATA_A_Q4_K)
@@ -145,7 +209,23 @@ struct block_q4_K
     uint8_t qs[QUANT_K/2];
 };
 
+struct block_q4_K_packed16
+{
+    f16vec2 d;
+    uint16_t scales[3*QUANT_K/64/2];
+    uint16_t qs[QUANT_K/2/2];
+};
+
+struct block_q4_K_packed32
+{
+    f16vec2 d;
+    uint32_t scales[3*QUANT_K/64/4];
+    uint32_t qs[QUANT_K/2/4];
+};
+
 #define A_TYPE block_q4_K
+#define A_TYPE_PACKED16 block_q4_K_packed16
+#define A_TYPE_PACKED32 block_q4_K_packed32
 #endif
 
 #if defined(DATA_A_Q5_K)
@@ -160,7 +240,16 @@ struct block_q5_K
     uint8_t qs[QUANT_K/2];
 };
 
+struct block_q5_K_packed16
+{
+    f16vec2 d;
+    uint16_t scales[12/2];
+    uint16_t qh[QUANT_K/8/2];
+    uint16_t qs[QUANT_K/2/2];
+};
+
 #define A_TYPE block_q5_K
+#define A_TYPE_PACKED16 block_q5_K_packed16
 #endif
 
 #if defined(DATA_A_Q6_K)
@@ -175,7 +264,16 @@ struct block_q6_K
     float16_t d;
 };
 
+struct block_q6_K_packed16
+{
+    uint16_t ql[QUANT_K/2/2];
+    uint16_t qh[QUANT_K/4/2];
+    int8_t scales[QUANT_K/16];
+    float16_t d;
+};
+
 #define A_TYPE block_q6_K
+#define A_TYPE_PACKED16 block_q6_K_packed16
 #endif
 
 // IQuants
@@ -191,10 +289,19 @@ struct block_iq4_nl
     uint8_t qs[QUANT_K/2];
 };
 
+struct block_iq4_nl_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K/2/2];
+};
+
 #define A_TYPE block_iq4_nl
+#define A_TYPE_PACKED16 block_iq4_nl_packed16
 
 const int8_t kvalues_iq4nl[16] = {
     int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
     int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
 };
 #endif
+
+#endif // !defined(GGML_TYPES_COMP)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index fe3e4cb39..f75310955 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -317,10 +317,10 @@ void process_shaders() {
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
 
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
 
         // Dequant shaders
         if (tname != "f16") {

From 8fd4b7fa29c3061b2e02e897d818dfcbc593430a Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 20 Nov 2024 01:40:18 -0600
Subject: [PATCH 273/396] vulkan: copy iq4_nl LUT into shared memory (#10409)

---
 .../ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp  |  2 ++
 .../ggml-vulkan/vulkan-shaders/get_rows_quant.comp  |  4 ++++
 .../src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp |  4 ++++
 ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp     |  4 ++++
 ggml/src/ggml-vulkan/vulkan-shaders/types.comp      | 13 ++++++++++++-
 .../vulkan-shaders/vulkan-shaders-gen.cpp           |  6 +++---
 6 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
index 34ef3da30..8de14fc03 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
@@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
     const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
 
+    init_iq4nl_shmem();
+
     const uint tid = gl_LocalInvocationID.x % 64;
     const uint il  = tid/32;
     const uint ir  = tid%32;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
index 8d30b63c1..7f608315b 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
@@ -12,6 +12,10 @@ void main() {
     const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
     const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
 
+#if defined(DATA_A_IQ4_NL)
+    init_iq4nl_shmem();
+#endif
+
     if (i00 >= p.ne00) {
         return;
     }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index 00807a060..2d5b8e466 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -161,6 +161,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
 void main() {
     const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
 
+#if defined(DATA_A_IQ4_NL)
+    init_iq4nl_shmem();
+#endif
+
     // do NUM_ROWS at a time, unless there aren't enough remaining rows
     if (first_row + NUM_ROWS <= p.stride_d) {
         compute_outputs(first_row, NUM_ROWS);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
index fffdd1818..2ff5c4305 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -75,6 +75,10 @@ shared u16vec2 row_ids[3072];
 #endif
 
 void main() {
+#if defined(DATA_A_IQ4_NL)
+    init_iq4nl_shmem();
+#endif
+
 #ifdef MUL_MAT_ID
     const uint expert_idx = gl_GlobalInvocationID.z;
 #else
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
index 7a34820bc..bc28e0ab8 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -298,10 +298,21 @@ struct block_iq4_nl_packed16
 #define A_TYPE block_iq4_nl
 #define A_TYPE_PACKED16 block_iq4_nl_packed16
 
-const int8_t kvalues_iq4nl[16] = {
+const int8_t kvalues_iq4nl_const[16] = {
     int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
     int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
 };
+
+shared FLOAT_TYPE kvalues_iq4nl[16];
+
+void init_iq4nl_shmem()
+{
+    // copy the table into shared memory and sync
+    if (gl_LocalInvocationIndex.x < 16) {
+        kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
+    }
+    barrier();
+}
 #endif
 
 #endif // !defined(GGML_TYPES_COMP)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index f75310955..6bbe8e96e 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -331,11 +331,11 @@ void process_shaders() {
             shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
 
             if (tname == "f16") {
-                string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+                string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
             } else {
-                string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
+                string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
             }
-            string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
+            string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
         }
     }
 

From fab5d30ff6729ff6ff615c41e8c0215d6bc30393 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 20 Nov 2024 12:57:53 +0100
Subject: [PATCH 274/396] llama : add .clang-format file (#10415)

---
 .clang-format                        | 161 ++++++
 examples/llama-bench/llama-bench.cpp | 700 +++++++++++++++------------
 2 files changed, 550 insertions(+), 311 deletions(-)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..45232b80e
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,161 @@
+---
+Language:        Cpp
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: Left
+AlignConsecutiveAssignments: AcrossComments
+AlignConsecutiveBitFields: AcrossComments
+AlignConsecutiveDeclarations: AcrossComments
+AlignConsecutiveMacros: AcrossComments
+# AlignConsecutiveShortCaseStatements: AcrossComments
+AlignEscapedNewlines: Left # LeftWithLastLine
+AlignOperands:   Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 1
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: true
+BinPackArguments: true
+BinPackParameters: true # OnePerLine
+BitFieldColonSpacing: Both
+BreakBeforeBraces: Custom # Attach
+BraceWrapping:
+  AfterCaseLabel:  true
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+# BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Never
+BreakBeforeBinaryOperators: None
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: false
+# BreakBinaryOperations: Never
+BreakConstructorInitializers: AfterColon
+# BreakFunctionDefinitionParameters: false
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+# BreakTemplateDeclarations: Yes
+ColumnLimit:     120
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineBeforeAccessModifier: Leave
+EmptyLineAfterAccessModifier: Never
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+InsertBraces:    true # NOTE: may lead to incorrect formatting
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: CurrentLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Middle
+QualifierAlignment: Left
+#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+ReferenceAlignment: Middle
+ReflowComments:  false # IndentOnly
+SeparateDefinitionBlocks: Always
+SortIncludes:    CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        c++17
+TabWidth:        4
+UseTab:          Never
+WhitespaceSensitiveMacros: ['STRINGIZE']
+...
+
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 8f4e0e206..3dc84a75c 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -6,28 +6,28 @@
 #include <clocale>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <ctime>
-#include <cstdlib>
 #include <iterator>
 #include <map>
 #include <numeric>
 #include <regex>
 #include <sstream>
 #include <string>
-#include <vector>
 #include <thread>
+#include <vector>
 
+#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "common.h"
 
 #ifdef _WIN32
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
 #endif
 
 // utils
@@ -36,8 +36,7 @@ static uint64_t get_time_ns() {
     return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }
 
-template<class T>
-static std::string join(const std::vector<T> & values, const std::string & delim) {
+template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
     std::ostringstream str;
     for (size_t i = 0; i < values.size(); i++) {
         str << values[i];
@@ -48,38 +47,35 @@ static std::string join(const std::vector<T> & values, const std::string & delim
     return str.str();
 }
 
-template<typename T, typename F>
-static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
     std::vector<std::string> str_values;
     std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
     return str_values;
 }
 
-template<typename T>
-static T avg(const std::vector<T> & v) {
+template <typename T> static T avg(const std::vector<T> & v) {
     if (v.empty()) {
         return 0;
     }
     T sum = std::accumulate(v.begin(), v.end(), T(0));
-    return sum / (T)v.size();
+    return sum / (T) v.size();
 }
 
-template<typename T>
-static T stdev(const std::vector<T> & v) {
+template <typename T> static T stdev(const std::vector<T> & v) {
     if (v.size() <= 1) {
         return 0;
     }
-    T mean = avg(v);
+    T mean   = avg(v);
     T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
-    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+    T stdev  = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
     return stdev;
 }
 
 static std::string get_cpu_info() {
     std::vector<std::string> cpu_list;
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-        auto * dev = ggml_backend_dev_get(i);
-        auto dev_type = ggml_backend_dev_type(dev);
+        auto * dev      = ggml_backend_dev_get(i);
+        auto   dev_type = ggml_backend_dev_type(dev);
         if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
             cpu_list.push_back(ggml_backend_dev_description(dev));
         }
@@ -90,8 +86,8 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
     std::vector<std::string> gpu_list;
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-        auto * dev = ggml_backend_dev_get(i);
-        auto dev_type = ggml_backend_dev_type(dev);
+        auto * dev      = ggml_backend_dev_get(i);
+        auto   dev_type = ggml_backend_dev_type(dev);
         if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
             gpu_list.push_back(ggml_backend_dev_description(dev));
         }
@@ -100,17 +96,24 @@ static std::string get_gpu_info() {
 }
 
 // command line params
-enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
+enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
 
 static const char * output_format_str(output_formats format) {
     switch (format) {
-        case NONE:     return "none";
-        case CSV:      return "csv";
-        case JSON:     return "json";
-        case JSONL:    return "jsonl";
-        case MARKDOWN: return "md";
-        case SQL:      return "sql";
-        default: GGML_ABORT("invalid output format");
+        case NONE:
+            return "none";
+        case CSV:
+            return "csv";
+        case JSON:
+            return "json";
+        case JSONL:
+            return "jsonl";
+        case MARKDOWN:
+            return "md";
+        case SQL:
+            return "sql";
+        default:
+            GGML_ABORT("invalid output format");
     }
 }
 
@@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
 
 static const char * split_mode_str(llama_split_mode mode) {
     switch (mode) {
-        case LLAMA_SPLIT_MODE_NONE:  return "none";
-        case LLAMA_SPLIT_MODE_LAYER: return "layer";
-        case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ABORT("invalid split mode");
+        case LLAMA_SPLIT_MODE_NONE:
+            return "none";
+        case LLAMA_SPLIT_MODE_LAYER:
+            return "layer";
+        case LLAMA_SPLIT_MODE_ROW:
+            return "row";
+        default:
+            GGML_ABORT("invalid split mode");
     }
 }
 
@@ -149,59 +156,59 @@ static std::string pair_str(const std::pair<int, int> & p) {
 }
 
 struct cmd_params {
-    std::vector<std::string> model;
-    std::vector<int> n_prompt;
-    std::vector<int> n_gen;
+    std::vector<std::string>         model;
+    std::vector<int>                 n_prompt;
+    std::vector<int>                 n_gen;
     std::vector<std::pair<int, int>> n_pg;
-    std::vector<int> n_batch;
-    std::vector<int> n_ubatch;
-    std::vector<ggml_type> type_k;
-    std::vector<ggml_type> type_v;
-    std::vector<int> n_threads;
-    std::vector<std::string> cpu_mask;
-    std::vector<bool> cpu_strict;
-    std::vector<int> poll;
-    std::vector<int> n_gpu_layers;
-    std::vector<std::string> rpc_servers;
-    std::vector<llama_split_mode> split_mode;
-    std::vector<int> main_gpu;
-    std::vector<bool> no_kv_offload;
-    std::vector<bool> flash_attn;
-    std::vector<std::vector<float>> tensor_split;
-    std::vector<bool> use_mmap;
-    std::vector<bool> embeddings;
-    ggml_numa_strategy numa;
-    int reps;
-    ggml_sched_priority prio;
-    int delay;
-    bool verbose;
-    bool progress;
-    output_formats output_format;
-    output_formats output_format_stderr;
+    std::vector<int>                 n_batch;
+    std::vector<int>                 n_ubatch;
+    std::vector<ggml_type>           type_k;
+    std::vector<ggml_type>           type_v;
+    std::vector<int>                 n_threads;
+    std::vector<std::string>         cpu_mask;
+    std::vector<bool>                cpu_strict;
+    std::vector<int>                 poll;
+    std::vector<int>                 n_gpu_layers;
+    std::vector<std::string>         rpc_servers;
+    std::vector<llama_split_mode>    split_mode;
+    std::vector<int>                 main_gpu;
+    std::vector<bool>                no_kv_offload;
+    std::vector<bool>                flash_attn;
+    std::vector<std::vector<float>>  tensor_split;
+    std::vector<bool>                use_mmap;
+    std::vector<bool>                embeddings;
+    ggml_numa_strategy               numa;
+    int                              reps;
+    ggml_sched_priority              prio;
+    int                              delay;
+    bool                             verbose;
+    bool                             progress;
+    output_formats                   output_format;
+    output_formats                   output_format_stderr;
 };
 
 static const cmd_params cmd_params_defaults = {
-    /* model                */ {"models/7B/ggml-model-q4_0.gguf"},
-    /* n_prompt             */ {512},
-    /* n_gen                */ {128},
+    /* model                */ { "models/7B/ggml-model-q4_0.gguf" },
+    /* n_prompt             */ { 512 },
+    /* n_gen                */ { 128 },
     /* n_pg                 */ {},
-    /* n_batch              */ {2048},
-    /* n_ubatch             */ {512},
-    /* type_k               */ {GGML_TYPE_F16},
-    /* type_v               */ {GGML_TYPE_F16},
-    /* n_threads            */ {cpu_get_num_math()},
-    /* cpu_mask             */ {"0x0"},
-    /* cpu_strict           */ {false},
-    /* poll                 */ {50},
-    /* n_gpu_layers         */ {99},
-    /* rpc_servers          */ {""},
-    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
-    /* main_gpu             */ {0},
-    /* no_kv_offload        */ {false},
-    /* flash_attn           */ {false},
-    /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
-    /* use_mmap             */ {true},
-    /* embeddings           */ {false},
+    /* n_batch              */ { 2048 },
+    /* n_ubatch             */ { 512 },
+    /* type_k               */ { GGML_TYPE_F16 },
+    /* type_v               */ { GGML_TYPE_F16 },
+    /* n_threads            */ { cpu_get_num_math() },
+    /* cpu_mask             */ { "0x0" },
+    /* cpu_strict           */ { false },
+    /* poll                 */ { 50 },
+    /* n_gpu_layers         */ { 99 },
+    /* rpc_servers          */ { "" },
+    /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
+    /* main_gpu             */ { 0 },
+    /* no_kv_offload        */ { false },
+    /* flash_attn           */ { false },
+    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
+    /* use_mmap             */ { true },
+    /* embeddings           */ { false },
     /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps                 */ 5,
     /* prio                 */ GGML_SCHED_PRIO_NORMAL,
@@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("options:\n");
     printf("  -h, --help\n");
     printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                        (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -p, --n-prompt <n>                        (default: %s)\n",
+           join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -pg <pp,tg>                               (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
-    printf("  -b, --batch-size <n>                      (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub, --ubatch-size <n>                    (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -t, --threads <n>                         (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
-    printf("  --cpu-strict <0|1>                        (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf("  -pg <pp,tg>                               (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf("  -b, --batch-size <n>                      (default: %s)\n",
+           join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
+           join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -t, --threads <n>                         (default: %s)\n",
+           join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
+           join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf("  --cpu-strict <0|1>                        (default: %s)\n",
+           join(cmd_params_defaults.cpu_strict, ",").c_str());
     printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
-    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n",
+           join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     if (llama_supports_rpc()) {
-        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n",
+               join(cmd_params_defaults.rpc_servers, ",").c_str());
     }
-    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf("  -mg, --main-gpu <i>                       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
-    printf("  -mmp, --mmap <0|1>                        (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+    printf("  -mg, --main-gpu <i>                       (default: %s)\n",
+           join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n",
+           join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n",
+           join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
+           join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl>       (default: disabled)\n");
-    printf("  -embd, --embeddings <0|1>                 (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+    printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
+           join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
     printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
     printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
     printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
-    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n",
+           output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
+           output_format_str(cmd_params_defaults.output_format_stderr));
     printf("  -v, --verbose                             (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("  --progress                                (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
     printf("\n");
-    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+    printf(
+        "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
+        "multiple times.\n");
 }
 
 static ggml_type ggml_type_from_name(const std::string & s) {
@@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     return GGML_TYPE_COUNT;
 }
 
-
 static cmd_params parse_cmd_params(int argc, char ** argv) {
-    cmd_params params;
-    std::string arg;
-    bool invalid_param = false;
-    const std::string arg_prefix = "--";
-    const char split_delim = ',';
+    cmd_params        params;
+    std::string       arg;
+    bool              invalid_param = false;
+    const std::string arg_prefix    = "--";
+    const char        split_delim   = ',';
 
-    params.verbose = cmd_params_defaults.verbose;
-    params.output_format = cmd_params_defaults.output_format;
+    params.verbose              = cmd_params_defaults.verbose;
+    params.output_format        = cmd_params_defaults.output_format;
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
-    params.reps = cmd_params_defaults.reps;
-    params.numa = cmd_params_defaults.numa;
-    params.prio = cmd_params_defaults.prio;
-    params.delay = cmd_params_defaults.delay;
-    params.progress = cmd_params_defaults.progress;
+    params.reps                 = cmd_params_defaults.reps;
+    params.numa                 = cmd_params_defaults.numa;
+    params.prio                 = cmd_params_defaults.prio;
+    params.delay                = cmd_params_defaults.delay;
+    params.progress             = cmd_params_defaults.progress;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
+            params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto                   p = string_split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
             for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
@@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto                   p = string_split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
             for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
@@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto                          p = string_split<std::string>(argv[i], split_delim);
             std::vector<llama_split_mode> modes;
             for (const auto & m : p) {
                 llama_split_mode mode;
@@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             } else {
                 std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate")                    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl")                    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-                else { invalid_param = true; break; }
+                /**/ if (value == "distribute" || value == "") {
+                    params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
+                } else if (value == "isolate") {
+                    params.numa = GGML_NUMA_STRATEGY_ISOLATE;
+                } else if (value == "numactl") {
+                    params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+                } else {
+                    invalid_param = true;
+                    break;
+                }
             }
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
@@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             for (auto ts : string_split<std::string>(argv[i], split_delim)) {
                 // split string by ; and /
-                const std::regex regex{R"([;/]+)"};
-                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
-                std::vector<std::string> split_arg{it, {}};
+                const std::regex           regex{ R"([;/]+)" };
+                std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
+                std::vector<std::string>   split_arg{ it, {} };
                 GGML_ASSERT(split_arg.size() <= llama_max_devices());
 
                 std::vector<float> tensor_split(llama_max_devices());
@@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     }
 
     // set defaults
-    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
-    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
-    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
-    if (params.n_pg.empty())         { params.n_pg = cmd_params_defaults.n_pg; }
-    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.n_ubatch.empty())     { params.n_ubatch = cmd_params_defaults.n_ubatch; }
-    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
-    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
-    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
-    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
-    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
-    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
-    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.flash_attn.empty())   { params.flash_attn = cmd_params_defaults.flash_attn; }
-    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
-    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
-    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
-    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
-    if (params.cpu_mask.empty())     { params.cpu_mask  = cmd_params_defaults.cpu_mask;  }
-    if (params.cpu_strict.empty())   { params.cpu_strict = cmd_params_defaults.cpu_strict; }
-    if (params.poll.empty())         { params.poll = cmd_params_defaults.poll; }
+    if (params.model.empty()) {
+        params.model = cmd_params_defaults.model;
+    }
+    if (params.n_prompt.empty()) {
+        params.n_prompt = cmd_params_defaults.n_prompt;
+    }
+    if (params.n_gen.empty()) {
+        params.n_gen = cmd_params_defaults.n_gen;
+    }
+    if (params.n_pg.empty()) {
+        params.n_pg = cmd_params_defaults.n_pg;
+    }
+    if (params.n_batch.empty()) {
+        params.n_batch = cmd_params_defaults.n_batch;
+    }
+    if (params.n_ubatch.empty()) {
+        params.n_ubatch = cmd_params_defaults.n_ubatch;
+    }
+    if (params.type_k.empty()) {
+        params.type_k = cmd_params_defaults.type_k;
+    }
+    if (params.type_v.empty()) {
+        params.type_v = cmd_params_defaults.type_v;
+    }
+    if (params.n_gpu_layers.empty()) {
+        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
+    }
+    if (params.rpc_servers.empty()) {
+        params.rpc_servers = cmd_params_defaults.rpc_servers;
+    }
+    if (params.split_mode.empty()) {
+        params.split_mode = cmd_params_defaults.split_mode;
+    }
+    if (params.main_gpu.empty()) {
+        params.main_gpu = cmd_params_defaults.main_gpu;
+    }
+    if (params.no_kv_offload.empty()) {
+        params.no_kv_offload = cmd_params_defaults.no_kv_offload;
+    }
+    if (params.flash_attn.empty()) {
+        params.flash_attn = cmd_params_defaults.flash_attn;
+    }
+    if (params.tensor_split.empty()) {
+        params.tensor_split = cmd_params_defaults.tensor_split;
+    }
+    if (params.use_mmap.empty()) {
+        params.use_mmap = cmd_params_defaults.use_mmap;
+    }
+    if (params.embeddings.empty()) {
+        params.embeddings = cmd_params_defaults.embeddings;
+    }
+    if (params.n_threads.empty()) {
+        params.n_threads = cmd_params_defaults.n_threads;
+    }
+    if (params.cpu_mask.empty()) {
+        params.cpu_mask = cmd_params_defaults.cpu_mask;
+    }
+    if (params.cpu_strict.empty()) {
+        params.cpu_strict = cmd_params_defaults.cpu_strict;
+    }
+    if (params.poll.empty()) {
+        params.poll = cmd_params_defaults.poll;
+    }
 
     return params;
 }
 
 struct cmd_params_instance {
-    std::string model;
-    int n_prompt;
-    int n_gen;
-    int n_batch;
-    int n_ubatch;
-    ggml_type type_k;
-    ggml_type type_v;
-    int n_threads;
-    std::string cpu_mask;
-    bool cpu_strict;
-    int poll;
-    int n_gpu_layers;
-    std::string rpc_servers;
-    llama_split_mode split_mode;
-    int main_gpu;
-    bool no_kv_offload;
-    bool flash_attn;
+    std::string        model;
+    int                n_prompt;
+    int                n_gen;
+    int                n_batch;
+    int                n_ubatch;
+    ggml_type          type_k;
+    ggml_type          type_v;
+    int                n_threads;
+    std::string        cpu_mask;
+    bool               cpu_strict;
+    int                poll;
+    int                n_gpu_layers;
+    std::string        rpc_servers;
+    llama_split_mode   split_mode;
+    int                main_gpu;
+    bool               no_kv_offload;
+    bool               flash_attn;
     std::vector<float> tensor_split;
-    bool use_mmap;
-    bool embeddings;
+    bool               use_mmap;
+    bool               embeddings;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -624,35 +699,31 @@ struct cmd_params_instance {
         if (!rpc_servers.empty()) {
             mparams.rpc_servers = rpc_servers.c_str();
         }
-        mparams.split_mode = split_mode;
-        mparams.main_gpu = main_gpu;
+        mparams.split_mode   = split_mode;
+        mparams.main_gpu     = main_gpu;
         mparams.tensor_split = tensor_split.data();
-        mparams.use_mmap = use_mmap;
+        mparams.use_mmap     = use_mmap;
 
         return mparams;
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model &&
-               n_gpu_layers == other.n_gpu_layers &&
-               rpc_servers == other.rpc_servers &&
-               split_mode == other.split_mode &&
-               main_gpu == other.main_gpu &&
-               use_mmap == other.use_mmap &&
+        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
+               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
                tensor_split == other.tensor_split;
     }
 
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
 
-        cparams.n_ctx = n_prompt + n_gen;
-        cparams.n_batch = n_batch;
-        cparams.n_ubatch = n_ubatch;
-        cparams.type_k = type_k;
-        cparams.type_v = type_v;
+        cparams.n_ctx       = n_prompt + n_gen;
+        cparams.n_batch     = n_batch;
+        cparams.n_ubatch    = n_ubatch;
+        cparams.type_k      = type_k;
+        cparams.type_v      = type_v;
         cparams.offload_kqv = !no_kv_offload;
-        cparams.flash_attn = flash_attn;
-        cparams.embeddings = embeddings;
+        cparams.flash_attn  = flash_attn;
+        cparams.embeddings  = embeddings;
 
         return cparams;
     }
@@ -662,6 +733,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     std::vector<cmd_params_instance> instances;
 
     // this ordering minimizes the number of times that each model needs to be reloaded
+    // clang-format off
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & rpc : params.rpc_servers)
@@ -767,100 +839,94 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             instances.push_back(instance);
         }
     }
+    // clang-format on
 
     return instances;
 }
 
 struct test {
     static const std::string build_commit;
-    static const int build_number;
+    static const int         build_number;
     static const std::string cpu_info;
     static const std::string gpu_info;
-    std::string model_filename;
-    std::string model_type;
-    uint64_t model_size;
-    uint64_t model_n_params;
-    int n_batch;
-    int n_ubatch;
-    int n_threads;
-    std::string cpu_mask;
-    bool cpu_strict;
-    int poll;
-    ggml_type type_k;
-    ggml_type type_v;
-    int n_gpu_layers;
-    llama_split_mode split_mode;
-    int main_gpu;
-    bool no_kv_offload;
-    bool flash_attn;
-    std::vector<float> tensor_split;
-    bool use_mmap;
-    bool embeddings;
-    int n_prompt;
-    int n_gen;
-    std::string test_time;
-    std::vector<uint64_t> samples_ns;
+    std::string              model_filename;
+    std::string              model_type;
+    uint64_t                 model_size;
+    uint64_t                 model_n_params;
+    int                      n_batch;
+    int                      n_ubatch;
+    int                      n_threads;
+    std::string              cpu_mask;
+    bool                     cpu_strict;
+    int                      poll;
+    ggml_type                type_k;
+    ggml_type                type_v;
+    int                      n_gpu_layers;
+    llama_split_mode         split_mode;
+    int                      main_gpu;
+    bool                     no_kv_offload;
+    bool                     flash_attn;
+    std::vector<float>       tensor_split;
+    bool                     use_mmap;
+    bool                     embeddings;
+    int                      n_prompt;
+    int                      n_gen;
+    std::string              test_time;
+    std::vector<uint64_t>    samples_ns;
 
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
         model_filename = inst.model;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
-        model_type = buf;
-        model_size = llama_model_size(lmodel);
+        model_type     = buf;
+        model_size     = llama_model_size(lmodel);
         model_n_params = llama_model_n_params(lmodel);
-        n_batch = inst.n_batch;
-        n_ubatch = inst.n_ubatch;
-        n_threads = inst.n_threads;
-        cpu_mask = inst.cpu_mask;
-        cpu_strict = inst.cpu_strict;
-        poll = inst.poll;
-        type_k = inst.type_k;
-        type_v = inst.type_v;
-        n_gpu_layers = inst.n_gpu_layers;
-        split_mode = inst.split_mode;
-        main_gpu = inst.main_gpu;
-        no_kv_offload = inst.no_kv_offload;
-        flash_attn = inst.flash_attn;
-        tensor_split = inst.tensor_split;
-        use_mmap = inst.use_mmap;
-        embeddings = inst.embeddings;
-        n_prompt = inst.n_prompt;
-        n_gen = inst.n_gen;
+        n_batch        = inst.n_batch;
+        n_ubatch       = inst.n_ubatch;
+        n_threads      = inst.n_threads;
+        cpu_mask       = inst.cpu_mask;
+        cpu_strict     = inst.cpu_strict;
+        poll           = inst.poll;
+        type_k         = inst.type_k;
+        type_v         = inst.type_v;
+        n_gpu_layers   = inst.n_gpu_layers;
+        split_mode     = inst.split_mode;
+        main_gpu       = inst.main_gpu;
+        no_kv_offload  = inst.no_kv_offload;
+        flash_attn     = inst.flash_attn;
+        tensor_split   = inst.tensor_split;
+        use_mmap       = inst.use_mmap;
+        embeddings     = inst.embeddings;
+        n_prompt       = inst.n_prompt;
+        n_gen          = inst.n_gen;
         // RFC 3339 date-time format
-        time_t t = time(NULL);
+        time_t t       = time(NULL);
         std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
         test_time = buf;
 
         (void) ctx;
     }
 
-    uint64_t avg_ns() const {
-        return ::avg(samples_ns);
-    }
+    uint64_t avg_ns() const { return ::avg(samples_ns); }
 
-    uint64_t stdev_ns() const {
-        return ::stdev(samples_ns);
-    }
+    uint64_t stdev_ns() const { return ::stdev(samples_ns); }
 
     std::vector<double> get_ts() const {
-        int n_tokens = n_prompt + n_gen;
+        int                 n_tokens = n_prompt + n_gen;
         std::vector<double> ts;
-        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
+        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
+                       [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
         return ts;
     }
 
-    double avg_ts() const {
-        return ::avg(get_ts());
-    }
+    double avg_ts() const { return ::avg(get_ts()); }
 
-    double stdev_ts() const {
-        return ::stdev(get_ts());
-    }
+    double stdev_ts() const { return ::stdev(get_ts()); }
 
     static std::string get_backend() {
         std::vector<std::string> backends;
         for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
-            auto * reg = ggml_backend_reg_get(i);
+            auto *      reg  = ggml_backend_reg_get(i);
             std::string name = ggml_backend_reg_name(reg);
             if (name != "CPU") {
                 backends.push_back(ggml_backend_reg_name(reg));
@@ -871,36 +937,27 @@ struct test {
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number",
-            "cpu_info", "gpu_info", "backends",
-            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_ubatch",
-            "n_threads", "cpu_mask", "cpu_strict", "poll",
-            "type_k", "type_v",
-            "n_gpu_layers", "split_mode",
-            "main_gpu", "no_kv_offload", "flash_attn",
-            "tensor_split", "use_mmap", "embeddings",
-            "n_prompt", "n_gen", "test_time",
-            "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts",
+            "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
+            "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
+            "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
+            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
+            "embeddings",   "n_prompt",     "n_gen",          "test_time",  "avg_ns",       "stddev_ns",
+            "avg_ts",       "stddev_ts",
         };
         return fields;
     }
 
-    enum field_type {STRING, BOOL, INT, FLOAT};
+    enum field_type { STRING, BOOL, INT, FLOAT };
 
     static field_type get_field_type(const std::string & field) {
-        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" || field == "poll" ||
-            field == "model_size" || field == "model_n_params" ||
-            field == "n_gpu_layers" || field == "main_gpu" ||
-            field == "n_prompt" || field == "n_gen" ||
-            field == "avg_ns" || field == "stddev_ns") {
+        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
+            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
+            field == "stddev_ns") {
             return INT;
         }
-        if (field == "f16_kv" || field == "no_kv_offload" ||
-            field == "cpu_strict" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
+            field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -911,7 +968,7 @@ struct test {
 
     std::vector<std::string> get_values() const {
         std::string tensor_split_str;
-        int max_nonzero = 0;
+        int         max_nonzero = 0;
         for (size_t i = 0; i < llama_max_devices(); i++) {
             if (tensor_split[i] > 0) {
                 max_nonzero = i;
@@ -925,29 +982,47 @@ struct test {
                 tensor_split_str += "/";
             }
         }
-        std::vector<std::string> values = {
-            build_commit, std::to_string(build_number),
-            cpu_info, gpu_info, get_backend(),
-            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
-            ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
-            std::to_string(n_prompt), std::to_string(n_gen), test_time,
-            std::to_string(avg_ns()), std::to_string(stdev_ns()),
-            std::to_string(avg_ts()), std::to_string(stdev_ts())
-        };
+        std::vector<std::string> values = { build_commit,
+                                            std::to_string(build_number),
+                                            cpu_info,
+                                            gpu_info,
+                                            get_backend(),
+                                            model_filename,
+                                            model_type,
+                                            std::to_string(model_size),
+                                            std::to_string(model_n_params),
+                                            std::to_string(n_batch),
+                                            std::to_string(n_ubatch),
+                                            std::to_string(n_threads),
+                                            cpu_mask,
+                                            std::to_string(cpu_strict),
+                                            std::to_string(poll),
+                                            ggml_type_name(type_k),
+                                            ggml_type_name(type_v),
+                                            std::to_string(n_gpu_layers),
+                                            split_mode_str(split_mode),
+                                            std::to_string(main_gpu),
+                                            std::to_string(no_kv_offload),
+                                            std::to_string(flash_attn),
+                                            tensor_split_str,
+                                            std::to_string(use_mmap),
+                                            std::to_string(embeddings),
+                                            std::to_string(n_prompt),
+                                            std::to_string(n_gen),
+                                            test_time,
+                                            std::to_string(avg_ns()),
+                                            std::to_string(stdev_ns()),
+                                            std::to_string(avg_ts()),
+                                            std::to_string(stdev_ts()) };
         return values;
     }
 
     std::map<std::string, std::string> get_map() const {
         std::map<std::string, std::string> map;
-        auto fields = get_fields();
-        auto values = get_values();
-        std::transform(fields.begin(), fields.end(), values.begin(),
-                std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
+        auto                               fields = get_fields();
+        auto                               values = get_values();
+        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
+                       std::make_pair<const std::string &, const std::string &>);
         return map;
     }
 };
@@ -961,9 +1036,12 @@ struct printer {
     virtual ~printer() {}
 
     FILE * fout;
+
     virtual void print_header(const cmd_params & params) { (void) params; }
+
     virtual void print_test(const test & t) = 0;
-    virtual void print_footer() { }
+
+    virtual void print_footer() {}
 };
 
 struct csv_printer : public printer {
@@ -979,7 +1057,7 @@ struct csv_printer : public printer {
         return escaped;
     }
 
-    void print_header(const cmd_params & params) override  {
+    void print_header(const cmd_params & params) override {
         std::vector<std::string> fields = test::get_fields();
         fprintf(fout, "%s\n", join(fields, ",").c_str());
         (void) params;
@@ -992,7 +1070,6 @@ struct csv_printer : public printer {
     }
 };
 
-
 static std::string escape_json(const std::string & value) {
     std::string escaped;
     for (auto c : value) {
@@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) {
             escaped += "\\\"";
         } else if (c == '\\') {
             escaped += "\\\\";
-        } else  if (c <= 0x1f) {
+        } else if (c <= 0x1f) {
             char buf[8];
             snprintf(buf, sizeof(buf), "\\u%04x", c);
             escaped += buf;
@@ -1033,7 +1110,8 @@ struct json_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(),
+                    format_json_value(fields.at(i), values.at(i)).c_str());
         }
     }
 
@@ -1051,12 +1129,9 @@ struct json_printer : public printer {
         fflush(fout);
     }
 
-    void print_footer() override {
-        fprintf(fout, "\n]\n");
-    }
+    void print_footer() override { fprintf(fout, "\n]\n"); }
 };
 
-
 struct jsonl_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
@@ -1116,7 +1191,7 @@ struct markdown_printer : public printer {
             return 13;
         }
 
-        int width = std::max((int)field.length(), 10);
+        int width = std::max((int) field.length(), 10);
 
         if (test::get_field_type(field) == test::STRING) {
             return -width;
@@ -1230,18 +1305,18 @@ struct markdown_printer : public printer {
         fprintf(fout, "|");
         for (const auto & field : fields) {
             std::string value;
-            char buf[128];
+            char        buf[128];
             if (field == "model") {
                 value = t.model_type;
             } else if (field == "size") {
-                if (t.model_size < 1024*1024*1024) {
+                if (t.model_size < 1024 * 1024 * 1024) {
                     snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                 } else {
                     snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                 }
                 value = buf;
             } else if (field == "params") {
-                if (t.model_n_params < 1000*1000*1000) {
+                if (t.model_n_params < 1000 * 1000 * 1000) {
                     snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                 } else {
                     snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
@@ -1303,7 +1378,8 @@ struct sql_printer : public printer {
         std::vector<std::string> fields = test::get_fields();
         fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),  i < fields.size() - 1 ? "," : "");
+            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
+                    i < fields.size() - 1 ? "," : "");
         }
         fprintf(fout, ");\n");
         fprintf(fout, "\n");
@@ -1324,8 +1400,8 @@ struct sql_printer : public printer {
 static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    const llama_model * model = llama_get_model(ctx);
-    const int32_t n_vocab = llama_n_vocab(model);
+    const llama_model * model   = llama_get_model(ctx);
+    const int32_t       n_vocab = llama_n_vocab(model);
 
     std::vector<llama_token> tokens(n_batch);
 
@@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
 
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+        tokens[0]    = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
@@ -1347,8 +1423,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
 static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    const llama_model * model = llama_get_model(ctx);
-    const int32_t n_vocab = llama_n_vocab(model);
+    const llama_model * model   = llama_get_model(ctx);
+    const int32_t       n_vocab = llama_n_vocab(model);
 
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
@@ -1411,7 +1487,7 @@ int main(int argc, char ** argv) {
     set_process_priority(params.prio);
 
     // initialize printer
-    std::unique_ptr<printer> p = create_printer(params.output_format);
+    std::unique_ptr<printer> p     = create_printer(params.output_format);
     std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
 
     if (p) {
@@ -1426,13 +1502,13 @@ int main(int argc, char ** argv) {
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
-    llama_model * lmodel = nullptr;
+    llama_model *               lmodel    = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
-    int params_idx = 0;
+    int  params_idx   = 0;
     auto params_count = params_instances.size();
     for (const auto & inst : params_instances) {
-        params_idx ++;
+        params_idx++;
         if (params.progress) {
             fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
         }
@@ -1475,7 +1551,7 @@ int main(int argc, char ** argv) {
         tpp.poll       = t.poll;
         tpp.prio       = params.prio;
 
-        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1505,13 +1581,15 @@ int main(int argc, char ** argv) {
 
             if (t.n_prompt > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
                 }
                 test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
                 }
                 test_gen(ctx, t.n_gen, t.n_threads);
             }

From f95caa79546271722ada703da20ffb1cfcd21fed Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Wed, 20 Nov 2024 12:22:19 -0400
Subject: [PATCH 275/396] cmake: add link dependencies to cmake find pkg
 (#10433)

* cmake pkg: find accelerate, openmp, memkind libs

* cmake pkg: find BLAS libs

* try BLAS_LIBRARIES instead

* Add BLAS link opts

* Add more link deps. and set GGML_ vars
---
 cmake/llama-config.cmake.in | 136 +++++++++++++++++++++++++++---------
 1 file changed, 104 insertions(+), 32 deletions(-)

diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index 28a8c18b6..5c55bc6b8 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -3,12 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
 
+set(GGML_STATIC @GGML_STATIC@)
+set(GGML_NATIVE @GGML_NATIVE@)
+set(GGML_LTO    @GGML_LTO@)
+set(GGML_CCACHE @GGML_CCACHE@)
+set(GGML_AVX    @GGML_AVX@)
+set(GGML_AVX2   @GGML_AVX2@)
+set(GGML_AVX512 @GGML_AVX512@)
+set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
+set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
+set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
+set(GGML_AMX_TILE @GGML_AMX_TILE@)
+set(GGML_AMX_INT8 @GGML_AMX_INT8@)
+set(GGML_AMX_BF16 @GGML_AMX_BF16@)
+set(GGML_FMA  @GGML_FMA@)
+set(GGML_LASX @GGML_LASX@)
+set(GGML_LSX  @GGML_LSX@)
+set(GGML_RVV  @GGML_RVV@)
+set(GGML_SVE  @GGML_SVE@)
+
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
+set(GGML_OPENMP  @GGML_OPENMP@)
+set(GGML_CPU_HBM @GGML_CPU_HBM@)
+set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
+
+set(GGML_CUDA_FORCE_MMQ    @GGML_CUDA_FORCE_MMQ@)
+set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
+set(GGML_CUDA_F16          @GGML_CUDA_F16@)
+set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
+set(GGML_CUDA_NO_PEER_COPY  @GGML_CUDA_NO_PEER_COPY@)
+set(GGML_CUDA_NO_VMM        @GGML_CUDA_NO_VMM@)
+set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
+set(GGML_CUDA_GRAPHS        @GGML_CUDA_GRAPHS@)
+
+set(GGML_HIP_UMA @GGML_HIP_UMA@)
+
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
-set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
-set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
-set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
-set(GGML_OPENMP @GGML_OPENMP@)
+set(GGML_VULKAN_DEBUG         @GGML_VULKAN_DEBUG@)
+set(GGML_VULKAN_MEMORY_DEBUG  @GGML_VULKAN_MEMORY_DEBUG@)
+set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
+set(GGML_VULKAN_PERF      @GGML_VULKAN_PERF@)
+set(GGML_VULKAN_VALIDATE  @GGML_VULKAN_VALIDATE@)
+set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
+
+set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
+set(GGML_METAL_NDEBUG   @GGML_METAL_NDEBUG@)
+set(GGML_METAL_SHADER_DEBUG  @GGML_METAL_SHADER_DEBUG@)
+set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
+set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
+set(GGML_METAL_STD @GGML_METAL_STD@)
+
+set(GGML_SYCL_F16    @GGML_SYCL_F16@)
+set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
+set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
+
 
 @PACKAGE_INIT@
 
@@ -20,6 +68,7 @@ find_package(Threads REQUIRED)
 
 set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
 set(_llama_link_deps "")
+set(_llama_link_opts "")
 foreach(_ggml_lib ggml ggml-base)
     string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
     find_library(${_ggml_lib_var} ${_ggml_lib}
@@ -49,41 +98,63 @@ foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
     endif()
 endforeach()
 
-if (APPLE AND GGML_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
-endif()
+if (NOT LLAMA_SHARED_LIB)
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+        list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
+    endif()
 
-if (GGML_BLAS)
-    find_package(BLAS REQUIRED)
-endif()
+    if (GGML_OPENMP)
+        find_package(OpenMP REQUIRED)
+        list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()
 
-if (GGML_CUDA)
-    find_package(CUDAToolkit REQUIRED)
-endif()
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+        list(APPEND _llama_link_deps memkind)
+    endif()
 
-if (GGML_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-endif()
+    if (GGML_BLAS)
+        find_package(BLAS REQUIRED)
+        list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
+        list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
+    endif()
 
-if (GGML_VULKAN)
-    find_package(Vulkan REQUIRED)
-endif()
+    if (GGML_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    endif()
 
-if (GGML_HIP)
-    find_package(hip REQUIRED)
-    find_package(hipblas REQUIRED)
-    find_package(rocblas REQUIRED)
-endif()
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+        find_library(METAL_FRAMEWORK    Metal REQUIRED)
+        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+        list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
+                                     ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()
 
-if (GGML_SYCL)
-    find_package(IntelSYCL REQUIRED)
-    find_package(MKL REQUIRED)
-endif()
+    if (GGML_VULKAN)
+        find_package(Vulkan REQUIRED)
+        list(APPEND _llama_link_deps Vulkan::Vulkan)
+    endif()
 
-if (GGML_OPENMP)
-    find_package(OpenMP REQUIRED)
+    if (GGML_HIP)
+        find_package(hip     REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(rocblas REQUIRED)
+        list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND _llama_link_deps DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_package(IntelSYCL REQUIRED)
+            find_package(MKL       REQUIRED)
+            list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
 endif()
 
 find_library(llama_LIBRARY llama
@@ -97,6 +168,7 @@ set_target_properties(llama
     PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+        INTERFACE_LINK_OPTIONS   "${_llama_link_opts}"
         INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
         IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
         IMPORTED_LOCATION "${llama_LIBRARY}"

From 9abe9eeae98b11fa93b82632b264126a010225ff Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 20 Nov 2024 13:47:36 -0600
Subject: [PATCH 276/396] vulkan: predicate max operation in soft_max
 shaders/soft_max (#10437)

Fixes #10434
---
 ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
index f9727679e..6e20b6411 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
@@ -73,7 +73,9 @@ void soft_max(uint num_iters) {
 
         FLOAT_TYPE v = a * p.scale + slope * b;
 
-        max_val = max(max_val, v);
+        if (col < p.KX) {
+            max_val = max(max_val, v);
+        }
 
         if (idx < DATA_CACHE_SIZE) {
             data_cache[idx] = v;

From 02e4eaf22f229a114054b053a9eff61483653670 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Wed, 20 Nov 2024 14:56:04 +0100
Subject: [PATCH 277/396] ggml-opt: fix data corruption (ggml/1022)

---
 ggml/src/ggml-backend.cpp  |   2 +
 ggml/src/ggml-impl.h       |   3 +
 ggml/src/ggml-opt.cpp      | 147 +++++++++++++++++--------------------
 ggml/src/ggml.c            |  94 ++++++++++++++----------
 tests/test-backend-ops.cpp |   1 -
 5 files changed, 129 insertions(+), 118 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 9dcde8d11..3433d082e 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 }
 
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
@@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 92a64fe5a..3965be787 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -295,6 +295,9 @@ struct ggml_cgraph {
     enum ggml_cgraph_eval_order order;
 };
 
+// returns a slice of cgraph with nodes [i0, i1)
+// the slice does not have leafs or gradients
+// if you need the gradients, get them from the original graph
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
 // Memory allocation
diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
index 040205a31..7c3e24103 100644
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -14,51 +14,51 @@
 #include <vector>
 
 struct ggml_opt_dataset {
-    struct ggml_context   * ctx;
-    ggml_backend_buffer_t   buf;
-    struct ggml_tensor    * data;
-    struct ggml_tensor    * labels;
+    struct ggml_context   * ctx    = nullptr;
+    ggml_backend_buffer_t   buf    = nullptr;
+    struct ggml_tensor    * data   = nullptr;
+    struct ggml_tensor    * labels = nullptr;
 
-    int64_t ndata;
-    int64_t ndata_shard;
-    size_t  nbs_data;
-    size_t  nbs_labels;
+    int64_t ndata       = -1;
+    int64_t ndata_shard = -1;
+    size_t  nbs_data    = -1;
+    size_t  nbs_labels  = -1;
 
     std::vector<int64_t> permutation;
 };
 
 struct ggml_opt_context {
-    ggml_backend_sched_t    backend_sched;
-    ggml_cgraph           * allocated_graph;
-    ggml_cgraph           * allocated_graph_copy;
-    struct ggml_context   * ctx_static;
-    struct ggml_context   * ctx_static_cpu;
-    struct ggml_context   * ctx_compute;
-    struct ggml_context   * ctx_copy;
-    ggml_backend_buffer_t   buf_static;
-    ggml_backend_buffer_t   buf_static_cpu;
+    ggml_backend_sched_t    backend_sched        = nullptr;
+    ggml_cgraph           * allocated_graph      = nullptr;
+    ggml_cgraph           * allocated_graph_copy = nullptr;
+    struct ggml_context   * ctx_static           = nullptr;
+    struct ggml_context   * ctx_static_cpu       = nullptr;
+    struct ggml_context   * ctx_compute          = nullptr;
+    struct ggml_context   * ctx_copy             = nullptr;
+    ggml_backend_buffer_t   buf_static           = nullptr;
+    ggml_backend_buffer_t   buf_static_cpu       = nullptr;
     std::mt19937            rng;
 
-    struct ggml_tensor * inputs;
-    struct ggml_tensor * outputs;
-    struct ggml_tensor * labels;
+    struct ggml_tensor * inputs  = nullptr;
+    struct ggml_tensor * outputs = nullptr;
+    struct ggml_tensor * labels  = nullptr;
 
-    struct ggml_tensor * loss;
-    struct ggml_tensor * pred;
-    struct ggml_tensor * ncorrect;
+    struct ggml_tensor * loss     = nullptr;
+    struct ggml_tensor * pred     = nullptr;
+    struct ggml_tensor * ncorrect = nullptr;
 
-    struct ggml_cgraph * gf;
-    struct ggml_cgraph * gb_grad;
-    struct ggml_cgraph * gb_opt;
+    struct ggml_cgraph * gf      = nullptr;
+    struct ggml_cgraph * gb_grad = nullptr;
+    struct ggml_cgraph * gb_opt  = nullptr;
 
-    int64_t iter;
-    int32_t opt_period;
-    int32_t opt_i;
-    bool    loss_per_datapoint;
+    int64_t iter               = 1;
+    int32_t opt_period         = 1;
+    int32_t opt_i              = 0;
+    bool    loss_per_datapoint = false;
 
-    ggml_opt_get_optimizer_params get_opt_pars;
-    void * get_opt_pars_ud;
-    struct ggml_tensor * adamw_params;
+    ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+    void * get_opt_pars_ud                     = nullptr;
+    struct ggml_tensor * adamw_params          = nullptr;
 };
 
 struct ggml_opt_result {
@@ -67,8 +67,8 @@ struct ggml_opt_result {
     std::vector<int32_t> pred;
     int64_t              ncorrect = 0;
 
-    bool loss_per_datapoint = false;
-    int64_t opt_period = -1;
+    int64_t opt_period         = -1;
+    bool    loss_per_datapoint = false;
 };
 
 // ====== Dataset ======
@@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
 }
 
 struct ggml_opt_params ggml_opt_default_params(
-        ggml_backend_sched_t backend_sched,
-        struct ggml_context * ctx_compute,
-        struct ggml_tensor * inputs,
-        struct ggml_tensor * outputs,
-        enum ggml_opt_loss_type loss_type) {
+        ggml_backend_sched_t      backend_sched,
+        struct ggml_context     * ctx_compute,
+        struct ggml_tensor      * inputs,
+        struct ggml_tensor      * outputs,
+        enum ggml_opt_loss_type   loss_type) {
     return {
         /*backend_sched   =*/ backend_sched,
         /*ctx_compute     =*/ ctx_compute,
@@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_
     return new_tensor;
 }
 
-static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) {
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
     std::map<ggml_tensor *, ggml_tensor *> tensor_map;
 
-    ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
+    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);
 
-    for (int i = 0; i < graph->n_leafs; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i]));
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
     }
-    for (int i = 0; i < graph->n_nodes; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i]));
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
     }
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]);
-        const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]);
-        graph->grads[igrad_dst]     = new_graph->grads[igrad_src];
-        graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src];
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+    for (int i = 0; i < src->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+        dst->grads[igrad_dst]     = src->grads[igrad_src];
+        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
     }
 
-    return new_graph;
+    return dst;
 }
 
 static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
@@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph
 
 ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     ggml_opt_context_t result = new struct ggml_opt_context;
-    result->backend_sched        = params.backend_sched;
-    result->allocated_graph      = nullptr;
-    result->allocated_graph_copy = nullptr;
-    result->ctx_compute          = params.ctx_compute;
-    result->ctx_copy             = nullptr;
-    result->inputs               = params.inputs;
-    result->outputs              = params.outputs;
-    result->iter                 = 1;
-    result->opt_period           = params.opt_period;
-    result->opt_i                = 0;
-    result->get_opt_pars         = params.get_opt_pars;
-    result->get_opt_pars_ud      = params.get_opt_pars_ud;
+    result->backend_sched   = params.backend_sched;
+    result->ctx_compute     = params.ctx_compute;
+    result->inputs          = params.inputs;
+    result->outputs         = params.outputs;
+    result->opt_period      = params.opt_period;
+    result->get_opt_pars    = params.get_opt_pars;
+    result->get_opt_pars_ud = params.get_opt_pars_ud;
 
     GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
     GGML_ASSERT(result->opt_period >= 1);
@@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
 
     switch (params.loss_type) {
         case GGML_OPT_LOSS_TYPE_MEAN: {
-            result->labels = nullptr;
             result->loss = ggml_sum(result->ctx_static, result->outputs);
             ggml_set_name(result->loss, "loss_sum");
             const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
@@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
             break;
         }
         case GGML_OPT_LOSS_TYPE_SUM: {
-            result->labels = nullptr;
             result->loss = ggml_sum(result->ctx_static, result->outputs);
             ggml_set_name(result->loss, "loss_sum");
             result->loss_per_datapoint = false;
@@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     }
 
     if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->gb_grad = nullptr;
-        result->gb_opt  = nullptr;
-
         result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gf);
-
         return result;
     }
 
@@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
     ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
 
     if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
-        result->gb_opt  = nullptr;
-
         result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gb_grad);
         ggml_graph_reset(result->gb_grad);
-
         return result;
     }
 
@@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
 
     result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
 
-    ggml_opt_alloc_graph(result, result->gb_opt);
     ggml_graph_reset(result->gb_opt);
 
     return result;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index ee72a173e..719d75c70 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5019,8 +5019,10 @@ static void ggml_hash_map_free(struct hash_map * map) {
 }
 
 // utility functions to change gradients
-// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
-// else if a is in zero_table, replace a
+// isrc is the index of tensor in cgraph->visited_has_set.keys
+// the corresponding gradient (accumulators) are also at position isrc
+// if tensor has a gradient accumulator, modify that accumulator in-place
+// else if there is no gradient for tensor, set the corresponding value
 // else, just add/subtract/etc. the gradients
 
 static void ggml_add_or_set(
@@ -5028,11 +5030,14 @@ static void ggml_add_or_set(
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
         struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
-        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = tensor;
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5040,18 +5045,20 @@ static void ggml_acc_or_set(
         struct ggml_context * ctx,
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
-        struct ggml_tensor  * src,
         struct ggml_tensor  * tensor,
         const  size_t         nb1,
         const  size_t         nb2,
         const  size_t         nb3,
         const  size_t         offset) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
     } else {
         struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
         cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5059,13 +5066,15 @@ static void ggml_add1_or_set(
         struct ggml_context * ctx,
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
-        struct ggml_tensor  * src,
         struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5074,11 +5083,14 @@ static void ggml_sub_or_set(
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
         struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_neg(ctx, tensor);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 
@@ -5095,12 +5107,12 @@ static void ggml_compute_backward(
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
     struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
-    const size_t isrc0 = ggml_hash_find(hash_set, src0);
-    const size_t isrc1 = ggml_hash_find(hash_set, src1);
-    const size_t isrc2 = ggml_hash_find(hash_set, src2);
-    const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
-    const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
-    const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
+    const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
+    const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
+    const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
+    const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
 
     switch (tensor->op) {
         case GGML_OP_DUP: {
@@ -5200,7 +5212,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_SUM: {
             if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
+                ggml_add1_or_set(ctx, cgraph, isrc0, grad);
             }
         } break;
         case GGML_OP_SUM_ROWS: {
@@ -5210,7 +5222,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MEAN: {
             if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
             }
         } break;
         case GGML_OP_REPEAT: {
@@ -5363,7 +5375,7 @@ static void ggml_compute_backward(
                     nb3 = (nb3 / n0) * ng;
                 }
 
-                ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
+                ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
             }
         } break;
         case GGML_OP_PERMUTE: {
@@ -5597,10 +5609,9 @@ void ggml_build_backward_expand(
 
     const int n_nodes_f = cgraph->n_nodes;
 
-    const size_t hash_size = ggml_hash_size(2*cgraph->size);
-    memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
-    memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
-    bool * grads_needed = calloc(hash_size, sizeof(bool));
+    memset(cgraph->grads,     0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
 
     {
         bool any_params = false;
@@ -5621,7 +5632,7 @@ void ggml_build_backward_expand(
             continue;
         }
 
-        bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
+        bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
         bool ignore_src[GGML_MAX_SRC] = {false};
         switch (node->op) {
             // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -5638,7 +5649,7 @@ void ggml_build_backward_expand(
             } break;
 
             // gradients in node->src[1] for one reason or another have no effect on output gradients
-            case GGML_OP_CPY:           // gradients in CPY target  are irrelevant
+            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
             case GGML_OP_GET_ROWS:      // row indices not differentiable
             case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
             case GGML_OP_ROPE:          // positions not differentiable
@@ -5665,9 +5676,12 @@ void ggml_build_backward_expand(
             node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
 
         const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+        GGML_ASSERT(igrad != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
         if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
-            cgraph->grads[igrad]     = ggml_dup_tensor(ctx_static, node);
-            cgraph->grad_accs[igrad] = cgraph->grads[igrad];
+            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
+            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
+            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
         }
         grads_needed[igrad] = true;
     }
@@ -5761,15 +5775,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.grad_accs    =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL, NULL },
-        /*.order        =*/ cgraph0->order,
+        /*.size             =*/ 0,
+        /*.n_nodes          =*/ i1 - i0,
+        /*.n_leafs          =*/ 0,
+        /*.nodes            =*/ cgraph0->nodes + i0,
+        /*.grads            =*/ NULL, // gradients would need visited_hash_set
+        /*.grad_accs        =*/ NULL,
+        /*.leafs            =*/ NULL,
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
     };
 
     return cgraph;
@@ -5799,12 +5813,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
         }
     }
 
+    if (dst->grads) {
+        memset(dst->grads,     0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    }
     if (src->grads) {
         GGML_ASSERT(dst->grads     != NULL);
         GGML_ASSERT(dst->grad_accs != NULL);
         for (int i = 0; i < src->n_nodes; ++i) {
             const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
             const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
             dst->grads[igrad_dst]     = src->grads[igrad_src];
             dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
         }
@@ -5839,12 +5863,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 
         if (node->op == GGML_OP_OPT_STEP_ADAMW) {
             // clear momenta
-            if (node->src[2]->data) {
-                ggml_set_zero(node->src[2]);
-            }
-            if (node->src[3]->data) {
-                ggml_set_zero(node->src[3]);
-            }
+            ggml_set_zero(node->src[2]);
+            ggml_set_zero(node->src[3]);
         }
 
         // initial gradients of loss should be 1, 0 otherwise
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 01ac7166e..37342c156 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -819,7 +819,6 @@ struct test_case {
             }
         }
 
-        // TODO: refactor so that this check is only needed once
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (!ggml_backend_supports_op(backend, t)) {
                 printf("not supported [%s] ", ggml_backend_name(backend));

From 59b917282236eadfb82bf1f46a31eb119941da08 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Wed, 20 Nov 2024 13:25:08 +0100
Subject: [PATCH 278/396] ggml/sched : do not skip views in pre-assignments

---
 ggml/src/ggml-backend.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 3433d082e..45da0c27d 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -886,9 +886,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
-        if (ggml_is_view_op(node->op)) {
-            continue;
-        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);

From 87a533be57e602f8ca469d14ad15ee851265b655 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 21 Nov 2024 09:22:11 +0200
Subject: [PATCH 279/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index e9bd2dbb0..d101d2b57 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-2884dd72fea8922910fe53387c3d17ab928d3a8e
+6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58

From 1bb30bf28cb5a7adf111bc41c935bdaf128397e7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 21 Nov 2024 10:22:47 +0200
Subject: [PATCH 280/396] llama : handle KV shift for recurrent models (#10402)

ggml-ci
---
 src/llama.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index c51b36e66..001711037 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18211,13 +18211,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     bool need_reserve = false;
 
-    // apply K-shift if needed
-    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
+    if (lctx.kv_self.has_shift) {
         if (!llama_kv_cache_can_shift(&lctx)) {
-            GGML_ABORT("Deepseek2 does not support K-shift");
+            GGML_ABORT("The current context does not support K-shift");
         }
 
-        {
+        // apply K-shift if needed
+        if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
             ggml_backend_sched_reset(lctx.sched.get());
 
             ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -20463,7 +20463,7 @@ void llama_kv_cache_update(struct llama_context * ctx) {
 }
 
 bool llama_kv_cache_can_shift(struct llama_context * ctx) {
-    return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+    return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
 }
 
 // deprecated

From a5e47592b6171ae21f3eaa1aba6fb2b707875063 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 21 Nov 2024 18:18:50 +0100
Subject: [PATCH 281/396] cuda : optimize argmax (#10441)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* cuda : optimize argmax

* remove unused parameter

ggml-ci

* fixup : use full warps

ggml-ci

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* fix ub

* ggml : check ne00 <= INT32_MAX in argmax and argsort

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/argmax.cu   | 108 ++++++++++++++++++---------------
 ggml/src/ggml-cuda/common.cuh  |  30 ++++-----
 ggml/src/ggml-cuda/quantize.cu |   8 +--
 ggml/src/ggml.c                |   2 +
 tests/test-backend-ops.cpp     |  29 +++++++++
 5 files changed, 110 insertions(+), 67 deletions(-)

diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu
index aab04eca7..5340eedc0 100644
--- a/ggml/src/ggml-cuda/argmax.cu
+++ b/ggml/src/ggml-cuda/argmax.cu
@@ -1,57 +1,69 @@
-#include "common.cuh"
-#include "argmax.cuh"
-#include "sum.cuh"
-
+#include <algorithm>
 #include <cstdint>
 
-static __global__ void argmax_f32(
-    const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) {
+#include "argmax.cuh"
+#include "common.cuh"
+#include "sum.cuh"
 
-    int argmax_thread = 0;
-    const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE;
+static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
+    const int64_t row = blockIdx.x;
 
-#pragma unroll
-    for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) {
-        const int64_t row = row0 + row1;
+    float maxval = -FLT_MAX;
+    int   argmax = -1;
+    const float * rowx = x + row * ncols;
 
-        if (row >= nrows) {
-            break;
+    for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
+        const float val = rowx[col];
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
         }
-
-        float maxval = -FLT_MAX;
-        int   argmax = -1;
-
-        for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) {
-            const float val        = x[row*ncols + col];
-            const int   bigger     = val > maxval;
-            const int   not_bigger = bigger ^ 0x00000001;
-
-            maxval = maxval*not_bigger + val*bigger;
-            argmax = argmax*not_bigger + col*bigger;
-        }
-
-#pragma unroll
-        for (int mask = 16; mask > 0; mask >>= 1) {
-            const float val        = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE);
-            const int   col        = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE);
-            const int   bigger     = val > maxval;
-            const int   not_bigger = bigger ^ 0x00000001;
-
-            maxval = maxval*not_bigger + val*bigger;
-            argmax = argmax*not_bigger + col*bigger;
-        }
-
-        const int store = row1 == threadIdx.x;
-        argmax_thread += store*argmax;
     }
 
-    const int row = row0 + threadIdx.x;
-
-    if (row >= nrows) {
-        return;
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
     }
 
-    dst[row] = argmax_thread;
+    const int n_warps = blockDim.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    if (n_warps > 1) {
+        constexpr int    max_warps = 1024 / WARP_SIZE;
+        __shared__ float shared_maxval[max_warps];
+        __shared__ int   shared_argmax[max_warps];
+        if (lane_id == 0) {
+            shared_maxval[warp_id] = maxval;
+            shared_argmax[warp_id] = argmax;
+        }
+
+        __syncthreads();
+
+        if (warp_id == 0) {
+            if (lane_id < n_warps) {
+                maxval = shared_maxval[lane_id];
+                argmax = shared_argmax[lane_id];
+            }
+#pragma unroll
+            for (int offset = 16; offset > 0; offset >>= 1) {
+                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+                if (val > maxval) {
+                    maxval = val;
+                    argmax = col;
+                }
+            }
+        }
+    }
+
+    if (warp_id == 0 && lane_id == 0) {
+        dst[row] = argmax;
+    }
 }
 
 void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -70,10 +82,10 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     cudaStream_t stream = ctx.stream();
 
-    const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE;
-
-    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const int64_t num_blocks = nrows;
+    const int64_t num_threads = std::min<int64_t>(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE);
+    const dim3 blocks_dim(num_threads, 1, 1);
     const dim3 blocks_num(num_blocks, 1, 1);
 
-    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows);
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00);
 }
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index e146c691c..b0dd16066 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
@@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
 }
 
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
     }
     return a;
 }
@@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
         reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
         reinterpret_cast<half&>(a.y) += __high2half(a_other);
     }
     return a;
 #else
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
     }
     return a;
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
@@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
     }
     return x;
 }
@@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+   for (int offset = 16; offset > 0; offset >>= 1) {
+       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
    }
    return x;
 #else
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 45408ce86..1702e4ce2 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -69,8 +69,8 @@ static __global__ void quantize_mmq_q8_1(
 
     // Exchange max. abs. value between vals_per_scale/4 threads.
 #pragma unroll
-    for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
+    for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
     }
 
     float sum;
@@ -79,8 +79,8 @@ static __global__ void quantize_mmq_q8_1(
 
         // Exchange calculate sum across vals_per_sum/4 threads.
 #pragma unroll
-        for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) {
-            sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE);
+        for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
+            sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
         }
     }
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 719d75c70..78e7874de 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2255,6 +2255,7 @@ struct ggml_tensor * ggml_argmax(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
     GGML_ASSERT(ggml_is_matrix(a));
+    GGML_ASSERT(a->ne[0] <= INT32_MAX);
 
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 
@@ -4138,6 +4139,7 @@ struct ggml_tensor * ggml_argsort(
         struct ggml_context  * ctx,
         struct ggml_tensor   * a,
         enum ggml_sort_order   order) {
+    GGML_ASSERT(a->ne[0] <= INT32_MAX);
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 37342c156..b2b570524 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1154,6 +1154,26 @@ struct test_argmax : public test_case {
         return out;
     }
 
+    void initialize_tensors(ggml_context * ctx) override {
+        std::random_device rd;
+        std::default_random_engine rng(rd());
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_F32) {
+                // initialize with unique values to avoid ties
+                for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                    std::vector<float> data(t->ne[0]);
+                    for (int i = 0; i < t->ne[0]; i++) {
+                        data[i] = i;
+                    }
+                    std::shuffle(data.begin(), data.end(), rng);
+                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
+                }
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+
     double max_nmse_err() override {
         return 0.0;
     }
@@ -3440,6 +3460,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
     test_cases.emplace_back(new test_argmax());
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1}));
+
     test_cases.emplace_back(new test_count_equal());
 
     for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
@@ -3830,6 +3855,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f));
 
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
+
     for (int bs : {1, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {

From c18610b4ee29ca056bb4f2d375a4ad1b16f44ef7 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Fri, 22 Nov 2024 14:07:20 +0800
Subject: [PATCH 282/396] CANN: Support Ascend310P to accelerate F32 and F16
 Model (#10216)

* CANN Support Ascend310P to accelerate F32 and F16 Model

* Add compile option soc type macro ASCEND_310P to ggml-cann lib

* Remove unused code

* Remove the ascend soc_type hard code compile option in CMakelist.txt
---
 ggml/src/ggml-cann/CMakeLists.txt           | 29 ++++++++++++++++
 ggml/src/ggml-cann/aclnn_ops.cpp            | 18 ++++++++++
 ggml/src/ggml-cann/kernels/CMakeLists.txt   |  7 ++--
 ggml/src/ggml-cann/kernels/dup.cpp          | 32 +++++++++++++-----
 ggml/src/ggml-cann/kernels/get_row_f16.cpp  | 37 +++++++++++++--------
 ggml/src/ggml-cann/kernels/get_row_f32.cpp  | 36 ++++++++++++--------
 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp |  5 ++-
 7 files changed, 123 insertions(+), 41 deletions(-)

diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
index c8e15c6d4..756200b89 100644
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
     message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
 endif()
 
+# Auto-detech Soc type and Soc version, if detect failed, will abort build
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info
+        RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR ${npu_result})
+        message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
+else()
+    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
+endif()
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
     if (NOT UNIX)
@@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
     target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
     target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
 
+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
     message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
     message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
 else()
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index a4ec8418e..1f4ee986c 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     switch (src0->type) {
         case GGML_TYPE_F32:
+        {
+#ifdef ASCEND_310P
+             // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f32(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_F16:
+        {
+#ifdef ASCEND_310P
+             // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f16(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_Q4_0:
             aclrtlaunch_ascendc_get_row_q4_0(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt
index 5b4fef91b..6a4e17cce 100644
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -1,7 +1,3 @@
-if (NOT SOC_TYPE)
-    set (SOC_TYPE "Ascend910B3")
-endif()
-
 file(GLOB SRC_FILES
     get_row_f32.cpp
     get_row_f16.cpp
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
     dup.cpp
 )
 
-string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
 
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
+message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp
index e2c651152..99f03e058 100644
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -5,6 +5,7 @@
 using namespace AscendC;
 
 #define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
 
 template <typename SRC_T, typename DST_T>
 class DupByRows {
@@ -19,6 +20,7 @@ class DupByRows {
         // Input has four dims.
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
+        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@@ -51,24 +53,36 @@ class DupByRows {
 
     __aicore__ inline void copy_in() {
         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
         src_queue.EnQue(src_local);
     }
 
     __aicore__ inline void copy_out() {
         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
         DataCopyExtParams dataCopyParams;
         dataCopyParams.blockCount = 1;
         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
         DataCopyPad(dst_gm, dst_local, dataCopyParams);
-
+#endif
         dst_queue.FreeTensor(dst_local);
     }
 
diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
index c704b5b2e..416b45104 100644
--- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO, use template for F16/f32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(half);
-            DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
index 9db080af3..02116905b 100644
--- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
                                 int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
index a80bfeec2..377211096 100644
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: 310P support quantification
+#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);
 

From 599b3e0cd40432cd1975a8906f3db70bbe53b627 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 22 Nov 2024 08:32:40 +0100
Subject: [PATCH 283/396] GitHub: ask for more info in issue templates (#10426)

* GitHub: ask for more info in issues [no ci]

* refactor issue templates to be component-specific

* more understandable issue description

* add dropdown for llama.cpp module
---
 .github/ISSUE_TEMPLATE/01-bug-low.yml         | 50 ----------
 .../ISSUE_TEMPLATE/010-bug-compilation.yml    | 73 ++++++++++++++
 .github/ISSUE_TEMPLATE/011-bug-results.yml    | 98 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/019-bug-misc.yml       | 78 +++++++++++++++
 .github/ISSUE_TEMPLATE/02-bug-medium.yml      | 50 ----------
 ...05-enhancement.yml => 020-enhancement.yml} |  2 +-
 .github/ISSUE_TEMPLATE/03-bug-high.yml        | 50 ----------
 .../{06-research.yml => 030-research.yml}     |  2 +-
 .github/ISSUE_TEMPLATE/04-bug-critical.yml    | 50 ----------
 .../{07-refactor.yml => 040-refactor.yml}     |  2 +-
 10 files changed, 252 insertions(+), 203 deletions(-)
 delete mode 100644 .github/ISSUE_TEMPLATE/01-bug-low.yml
 create mode 100644 .github/ISSUE_TEMPLATE/010-bug-compilation.yml
 create mode 100644 .github/ISSUE_TEMPLATE/011-bug-results.yml
 create mode 100644 .github/ISSUE_TEMPLATE/019-bug-misc.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/02-bug-medium.yml
 rename .github/ISSUE_TEMPLATE/{05-enhancement.yml => 020-enhancement.yml} (97%)
 delete mode 100644 .github/ISSUE_TEMPLATE/03-bug-high.yml
 rename .github/ISSUE_TEMPLATE/{06-research.yml => 030-research.yml} (97%)
 delete mode 100644 .github/ISSUE_TEMPLATE/04-bug-critical.yml
 rename .github/ISSUE_TEMPLATE/{07-refactor.yml => 040-refactor.yml} (95%)

diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml
deleted file mode 100644
index 54785854f..000000000
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
-title: "Bug: "
-labels: ["bug-unconfirmed", "low severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
new file mode 100644
index 000000000..550ee1b49
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -0,0 +1,73 @@
+name: Bug (compilation)
+description: Something goes wrong when trying to compile llama.cpp.
+title: "Compile bug: "
+labels: ["bug-unconfirmed", "compilation"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for bug reports where the compilation of llama.cpp fails.
+        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
+        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
+        by clearing `~/.cache/ccache` (on Linux).
+  - type: textarea
+    id: commit
+    attributes:
+      label: Git commit
+      description: Which commit are you trying to compile?
+      placeholder: |
+        $git rev-parse HEAD
+        84a07a17b1b08cf2b9747c633a2372782848a27f
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        multiple: true
+  - type: textarea
+    id: steps_to_reproduce
+    attributes:
+      label: Steps to Reproduce
+      description: >
+        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
+      placeholder: >
+        Here are the exact commands that I used: ...
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
new file mode 100644
index 000000000..1adb162b7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -0,0 +1,98 @@
+name: Bug (model use)
+description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
+title: "Eval bug: "
+labels: ["bug-unconfirmed", "model evaluation"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for bug reports where the model evaluation results
+        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
+        If you encountered the issue while using an external UI (e.g. ollama),
+        please reproduce your issue using one of the examples/binaries in this repository.
+        The `llama-cli` binary can be used for simple and reproducible model inference.
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        multiple: true
+  - type: textarea
+    id: hardware
+    attributes:
+      label: Hardware
+      description: Which CPUs/GPUs are you using?
+      placeholder: >
+        e.g. Ryzen 5950X + 2x RTX 4090
+    validations:
+      required: true
+  - type: textarea
+    id: model
+    attributes:
+      label: Model
+      description: >
+        Which model at which quantization were you using when encountering the bug?
+        If you downloaded a GGUF file off of Huggingface, please provide a link.
+      placeholder: >
+        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
+    validations:
+      required: false
+  - type: textarea
+    id: steps_to_reproduce
+    attributes:
+      label: Steps to Reproduce
+      description: >
+        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
+        that information would be very much appreciated by us.
+      placeholder: >
+        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
+        When I use -ngl 0 it works correctly.
+        Here are the exact commands that I used: ...
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
new file mode 100644
index 000000000..124cdee91
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -0,0 +1,78 @@
+name: Bug (misc.)
+description: Something is not working the way it should (and it's not covered by any of the above cases).
+title: "Misc. bug: "
+labels: ["bug-unconfirmed"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for miscellaneous bugs that don't fit into any other category.
+        If you encountered the issue while using an external UI (e.g. ollama),
+        please reproduce your issue using one of the examples/binaries in this repository.
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: module
+    attributes:
+      label: Which llama.cpp modules do you know to be affected?
+      multiple: true
+      options:
+        - libllama (core library)
+        - llama-cli
+        - llama-server
+        - llama-bench
+        - llama-quantize
+        - Python/Bash scripts
+        - Other (Please specify in the next section)
+    validations:
+      required: true
+  - type: textarea
+    id: steps_to_reproduce
+    attributes:
+      label: Steps to Reproduce
+      description: >
+        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
deleted file mode 100644
index a6285c6f0..000000000
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
-title: "Bug: "
-labels: ["bug-unconfirmed", "medium severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml
similarity index 97%
rename from .github/ISSUE_TEMPLATE/05-enhancement.yml
rename to .github/ISSUE_TEMPLATE/020-enhancement.yml
index 58fca7318..02dd4f575 100644
--- a/.github/ISSUE_TEMPLATE/05-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -1,5 +1,5 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for llama.cpp.
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml
deleted file mode 100644
index ff816b937..000000000
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
-title: "Bug: "
-labels: ["bug-unconfirmed", "high severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml
similarity index 97%
rename from .github/ISSUE_TEMPLATE/06-research.yml
rename to .github/ISSUE_TEMPLATE/030-research.yml
index 3ae4e9f8c..18975dbbf 100644
--- a/.github/ISSUE_TEMPLATE/06-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -1,5 +1,5 @@
 name: Research
-description: Track new technical research area
+description: Track new technical research area.
 title: "Research: "
 labels: ["research 🔬"]
 body:
diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
deleted file mode 100644
index 7af42a80b..000000000
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
-title: "Bug: "
-labels: ["bug-unconfirmed", "critical severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml
similarity index 95%
rename from .github/ISSUE_TEMPLATE/07-refactor.yml
rename to .github/ISSUE_TEMPLATE/040-refactor.yml
index 3a68d3d53..b6e6ab36d 100644
--- a/.github/ISSUE_TEMPLATE/07-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -1,5 +1,5 @@
 name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
+description: Used to track refactoring opportunities.
 title: "Refactor: "
 labels: ["refactor"]
 body:

From 6dfcfef0787e9902df29f510b63621f60a09a50b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=95=AD=E6=BE=A7=E9=82=A6?=
 <45505768+shou692199@users.noreply.github.com>
Date: Fri, 22 Nov 2024 17:44:08 +0800
Subject: [PATCH 284/396] ci: Update oneAPI runtime dll packaging (#10428)

This is the minimum runtime dll dependencies for oneAPI 2025.0
---
 .github/workflows/build.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6ef0770f3..572f91643 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -986,13 +986,14 @@ jobs:
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
           echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
 
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin

From 55ed008b2de01592659b9eba068ea01bb2f72160 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sat, 23 Nov 2024 14:41:12 +0100
Subject: [PATCH 285/396] ggml : do not use ARM features not included in the
 build (#10457)

---
 ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 0d23669c2..4b58254e7 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -13896,7 +13896,7 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return ggml_arm_arch_features.has_neon;
 #else
     return 0;
@@ -13904,7 +13904,7 @@ int ggml_cpu_has_neon(void) {
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.has_sve;
 #else
     return 0;
@@ -13912,7 +13912,7 @@ int ggml_cpu_has_sve(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
     return ggml_arm_arch_features.has_i8mm;
 #else
     return 0;
@@ -13920,7 +13920,7 @@ int ggml_cpu_has_matmul_int8(void) {
 }
 
 int ggml_cpu_get_sve_cnt(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.sve_cnt;
 #else
     return 0;

From 96fa2c5e2d6cb5319f34c3a7fb0cec05694b22f1 Mon Sep 17 00:00:00 2001
From: momonga <146910567+mmngays@users.noreply.github.com>
Date: Sun, 24 Nov 2024 09:09:22 +0900
Subject: [PATCH 286/396] fix gguf-py:  Conversion error when multiple licenses
 are configured (#9807)

* fix general.license list to str

* fix join license list

---------

Co-authored-by: momonga <115213907+mmnga@users.noreply.github.com>
---
 gguf-py/gguf/metadata.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 321cbcd4c..962c27b20 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -545,7 +545,10 @@ class Metadata:
             gguf_writer.add_size_label(self.size_label)
 
         if self.license is not None:
-            gguf_writer.add_license(self.license)
+            if isinstance(self.license, list):
+                gguf_writer.add_license(",".join(self.license))
+            else:
+                gguf_writer.add_license(self.license)
         if self.license_name is not None:
             gguf_writer.add_license_name(self.license_name)
         if self.license_link is not None:

From 9336db462c0c34bbe2055413fe4e16442626c38b Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Sun, 24 Nov 2024 02:02:34 -0700
Subject: [PATCH 287/396] convert : XLMRoberta Type Vocab Size (#10458)

This matches the key in common bert-based embedding models and may have a
value other than 1 in it.

Branch: XLMRobertaTypeVocabSize

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9f4b8154b..80a179b86 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2707,7 +2707,7 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
         self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
         self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
         if precompiled_charsmap:
             self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)

From dc39012cbaf8752fabecaeb60af78ccdd1dfb73b Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sun, 24 Nov 2024 16:10:26 +0100
Subject: [PATCH 288/396] llama : fix op mul check with command-r-plus (#10476)

---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 001711037..20df09b13 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7181,12 +7181,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_ADD:
             {
-                ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_add(ctx, a, w);
             } break;
         case GGML_OP_MUL:
             {
-                ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul(ctx, a, w);
             } break;
         case GGML_OP_DIV:

From cce5a9007572c6e9fa522296b77571d2e5071357 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 24 Nov 2024 18:03:25 +0200
Subject: [PATCH 289/396] flake.lock: Update (#10470)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/5e4fbfb6b3de1aa2872b76d49fafc942626e2add?narHash=sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg%3D' (2024-11-15)
  → 'github:NixOS/nixpkgs/23e89b7da85c3640bbc2173fe04f4bd114342367?narHash=sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w%3D' (2024-11-19)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index ee8cf07e3..d114f4422 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1731676054,
-        "narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=",
+        "lastModified": 1732014248,
+        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add",
+        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
         "type": "github"
       },
       "original": {

From d9d54e498d38ec99bbc0031022f9c92711e97bbc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 09:58:41 +0200
Subject: [PATCH 290/396] speculative : refactor and add a simpler example
 (#10362)

* speculative : refactor and add a simpler example

ggml-ci

* speculative : clean-up and add comments and TODOs [no ci]

* speculative : manage context in common_speculative

ggml-ci

* speculative : simplify

ggml-ci

* speculative : simplify (cont)

ggml-ci

* speculative : add --draft-min CLI arg

* speculative : minor fixup

* make : build fixes

* speculative : do not redraft previous drafts

ggml-ci

* speculative : fix the draft sampling

ggml-ci

* speculative : fix compile warning

* common : refactor args

ggml-ci

* common : change defaults [no ci]

* common : final touches

ggml-ci
---
 Makefile                                      |   1 +
 common/CMakeLists.txt                         |   2 +
 common/arg.cpp                                | 438 +++++++++---------
 common/common.cpp                             |  76 ++-
 common/common.h                               |  41 +-
 common/sampling.cpp                           |  45 +-
 common/sampling.h                             |  23 +-
 common/speculative.cpp                        | 269 +++++++++++
 common/speculative.h                          |  28 ++
 examples/CMakeLists.txt                       |   1 +
 examples/batched/batched.cpp                  |   8 +-
 examples/infill/infill.cpp                    |   2 +-
 examples/llava/llava-cli.cpp                  |   2 +-
 examples/llava/minicpmv-cli.cpp               |   2 +-
 examples/lookahead/lookahead.cpp              |   2 +-
 examples/lookup/lookup-stats.cpp              |   3 +-
 examples/lookup/lookup.cpp                    |   4 +-
 examples/main/main.cpp                        |   2 +-
 examples/parallel/parallel.cpp                |   2 +-
 examples/retrieval/retrieval.cpp              |   4 +-
 examples/save-load-state/save-load-state.cpp  |   8 +-
 examples/server/server.cpp                    |  10 +-
 examples/server/utils.hpp                     |  57 ---
 examples/speculative-simple/CMakeLists.txt    |   5 +
 examples/speculative-simple/README.md         |  12 +
 .../speculative-simple/speculative-simple.cpp | 273 +++++++++++
 examples/speculative/speculative.cpp          |  30 +-
 tests/test-arg-parser.cpp                     |   4 +-
 28 files changed, 1028 insertions(+), 326 deletions(-)
 create mode 100644 common/speculative.cpp
 create mode 100644 common/speculative.h
 create mode 100644 examples/speculative-simple/CMakeLists.txt
 create mode 100644 examples/speculative-simple/README.md
 create mode 100644 examples/speculative-simple/speculative-simple.cpp

diff --git a/Makefile b/Makefile
index 5c8994385..dd6d864ad 100644
--- a/Makefile
+++ b/Makefile
@@ -966,6 +966,7 @@ OBJ_COMMON = \
 	$(DIR_COMMON)/console.o \
 	$(DIR_COMMON)/ngram-cache.o \
 	$(DIR_COMMON)/sampling.o \
+	$(DIR_COMMON)/speculative.o \
 	$(DIR_COMMON)/build-info.o \
 	$(DIR_COMMON)/json-schema-to-grammar.o
 
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 5ab1ffa19..62a8a7db5 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -66,6 +66,8 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     sampling.cpp
     sampling.h
+    speculative.cpp
+    speculative.h
     )
 
 if (BUILD_SHARED_LIBS)
diff --git a/common/arg.cpp b/common/arg.cpp
index 4115b2f75..32240f21f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     }
 
-    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams,       nullptr);
     postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
+    postprocess_cpu_params(params.speculative.cpuparams,       &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);
 
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
@@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & antiprompt : params.antiprompt) {
             string_process_escapes(antiprompt);
         }
-        for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
     }
@@ -329,7 +330,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto & sampler : params.sparams.samplers) {
+    for (const auto & sampler : params.sampling.samplers) {
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
@@ -407,26 +408,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
-    add_opt(common_arg(
-        {"-td", "--threads-draft"}, "N",
-        "number of threads to use during generation (default: same as --threads)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.n_threads = value;
-            if (params.draft_cpuparams.n_threads <= 0) {
-                params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-tbd", "--threads-batch-draft"}, "N",
-        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.n_threads = value;
-            if (params.draft_cpuparams_batch.n_threads <= 0) {
-                params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-C", "--cpu-mask"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
@@ -515,108 +496,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams_batch.poll = value;
         }
     ));
-    add_opt(common_arg(
-        {"-Cd", "--cpu-mask-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crd", "--cpu-range-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid range");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-batch-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-batch-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-batch-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: --poll-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--draft"}, "N",
-        string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
-        [](common_params & params, int value) {
-            params.n_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(common_arg(
-        {"-ps", "--p-split"}, "N",
-        string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
-        [](common_params & params, const std::string & value) {
-            params.p_split = std::stof(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-lcs", "--lookup-cache-static"}, "FNAME",
         "path to static lookup cache to use for lookup decoding (not updated by generation)",
@@ -701,7 +580,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
         [](common_params & params) {
             params.no_perf = true;
-            params.sparams.no_perf = true;
+            params.sampling.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(common_arg(
@@ -883,155 +762,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split<std::string>(value, ';');
-            params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
-        string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
         [](common_params & params, const std::string & value) {
-            params.sparams.seed = std::stoul(value);
+            params.sampling.seed = std::stoul(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
-            params.sparams.samplers = common_sampler_types_from_chars(value);
+            params.sampling.samplers = common_sampler_types_from_chars(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--ignore-eos"},
         "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
         [](common_params & params) {
-            params.sparams.ignore_eos = true;
+            params.sampling.ignore_eos = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
+        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
         [](common_params & params) {
-            params.sparams.penalize_nl = true;
+            params.sampling.penalize_nl = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
-        string_format("temperature (default: %.1f)", (double)params.sparams.temp),
+        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
         [](common_params & params, const std::string & value) {
-            params.sparams.temp = std::stof(value);
-            params.sparams.temp = std::max(params.sparams.temp, 0.0f);
+            params.sampling.temp = std::stof(value);
+            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-k"}, "N",
-        string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
         [](common_params & params, int value) {
-            params.sparams.top_k = value;
+            params.sampling.top_k = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-p"}, "N",
-        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.top_p = std::stof(value);
+            params.sampling.top_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
-        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.min_p = std::stof(value);
+            params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
-        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
-            params.sparams.xtc_probability = std::stof(value);
+            params.sampling.xtc_probability = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-threshold"}, "N",
-        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
-            params.sparams.xtc_threshold = std::stof(value);
+            params.sampling.xtc_threshold = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
-        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.typ_p = std::stof(value);
+            params.sampling.typ_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-last-n"}, "N",
-        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
-            params.sparams.penalty_last_n = value;
-            params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
+            params.sampling.penalty_last_n = value;
+            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_repeat = std::stof(value);
+            params.sampling.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--presence-penalty"}, "N",
-        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_present = std::stof(value);
+            params.sampling.penalty_present = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_freq = std::stof(value);
+            params.sampling.penalty_freq = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-multiplier"}, "N",
-        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
         [](common_params & params, const std::string & value) {
-            params.sparams.dry_multiplier = std::stof(value);
+            params.sampling.dry_multiplier = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-base"}, "N",
-        string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
         [](common_params & params, const std::string & value) {
             float potential_base = std::stof(value);
             if (potential_base >= 1.0f)
             {
-                params.sparams.dry_base = potential_base;
+                params.sampling.dry_base = potential_base;
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-allowed-length"}, "N",
-        string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
         [](common_params & params, int value) {
-            params.sparams.dry_allowed_length = value;
+            params.sampling.dry_allowed_length = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-penalty-last-n"}, "N",
-        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
-            params.sparams.dry_penalty_last_n = value;
+            params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-sequence-breaker"}, "STRING",
         string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
-            params.sparams.dry_sequence_breakers.empty() ? "none" :
-            std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
-                params.sparams.dry_sequence_breakers.end(),
-                std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
                 [](const std::string& a, const std::string& b) {
                     std::string formatted_b = (b == "\n") ? "\\n" : b;
                     return a + ", '" + formatted_b + "'";
@@ -1040,51 +919,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             static bool defaults_cleared = false;
 
             if (!defaults_cleared) {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
                 defaults_cleared = true;
             }
 
             if (value == "none") {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
             } else {
-                params.sparams.dry_sequence_breakers.emplace_back(value);
+                params.sampling.dry_sequence_breakers.emplace_back(value);
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
         [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_range = std::stof(value);
+            params.sampling.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
         [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_exponent = std::stof(value);
+            params.sampling.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
         string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
-        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
+        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
         [](common_params & params, int value) {
-            params.sparams.mirostat = value;
+            params.sampling.mirostat = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_eta = std::stof(value);
+            params.sampling.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_tau = std::stof(value);
+            params.sampling.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1100,7 +979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                     const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                    params.sparams.logit_bias.push_back({key, bias});
+                    params.sampling.logit_bias.push_back({key, bias});
                 } else {
                     throw std::invalid_argument("invalid input format");
                 }
@@ -1111,9 +990,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
-        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
         [](common_params & params, const std::string & value) {
-            params.sparams.grammar = value;
+            params.sampling.grammar = value;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1127,7 +1006,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.sparams.grammar)
+                std::back_inserter(params.sampling.grammar)
             );
         }
     ).set_sparam());
@@ -1135,7 +1014,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
-            params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1444,17 +1323,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
-    add_opt(common_arg(
-        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](common_params & params, int value) {
-            params.n_gpu_layers_draft = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1593,13 +1461,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
-    add_opt(common_arg(
-        {"-md", "--model-draft"}, "FNAME",
-        "draft model for speculative decoding (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.model_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -2037,5 +1898,168 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
 
+    // speculative parameters
+    add_opt(common_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.n_threads = value;
+            if (params.speculative.cpuparams.n_threads <= 0) {
+                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.n_threads = value;
+            if (params.speculative.cpuparams_batch.n_threads <= 0) {
+                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-batch-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: --poll-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-max", "--draft", "--draft-n"}, "N",
+        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+        [](common_params & params, int value) {
+            params.speculative.n_max = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-min", "--draft-n-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+        [](common_params & params, int value) {
+            params.speculative.n_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-p-split"}, "P",
+        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-p-min"}, "P",
+        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_min = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-cd", "--ctx-size-draft"}, "N",
+        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+        [](common_params & params, int value) {
+            params.speculative.n_ctx = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](common_params & params, int value) {
+            params.speculative.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-md", "--model-draft"}, "FNAME",
+        "draft model for speculative decoding (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
diff --git a/common/common.cpp b/common/common.cpp
index d314523db..c398329d0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
                     [](const unsigned char c) { return !std::isprint(c); }),
                 detokenized.end());
 
-        buf << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
+        buf << "\n"          << std::to_string(i)
+            << ", token '"   << detokenized << "'"
+            << ", pos "      << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id "   << std::to_string(batch.seq_id[i][0])
+            << ", logits "   << std::to_string(batch.logits[i]);
     }
 
     buf << " ]";
@@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sparams.ignore_eos = false;
+        params.sampling.ignore_eos = false;
     }
 
     if (params.warmup) {
@@ -1490,6 +1490,66 @@ void common_batch_add(
     batch.n_tokens++;
 }
 
+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
+
 //
 // Vocab utils
 //
diff --git a/common/common.h b/common/common.h
index 7977cc7a9..5c579b5ab 100644
--- a/common/common.h
+++ b/common/common.h
@@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -101,8 +103,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev             = 64;    // number of previous tokens to remember
@@ -153,19 +155,30 @@ struct common_sampler_params {
     std::string print() const;
 };
 
+struct common_params_speculative {
+    int32_t n_ctx        =     0; // draft context size
+    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float   p_split      =  0.1f; // speculative decoding split probability
+    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding                          // NOLINT
+};
+
 struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =  4096; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                =     0; // number of tokens to keep from initial prompt
-    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
     int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel            =     1; // number of parallel sequences to decode
     int32_t n_sequences           =     1; // number of sequences to decode
-    float   p_split               =  0.1f; // speculative decoding split probability
     int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
     float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n            =     1; // group-attention factor
@@ -182,8 +195,6 @@ struct common_params {
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
@@ -195,10 +206,10 @@ struct common_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model                = ""; // model path                                                    // NOLINT
-    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
     std::string model_alias          = "unknown"; // model alias                                            // NOLINT
     std::string model_url            = ""; // model url to download                                         // NOLINT
     std::string hf_token             = ""; // HF token                                                      // NOLINT
@@ -461,7 +472,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+//
 // Batch utils
+//
 
 void common_batch_clear(struct llama_batch & batch);
 
@@ -472,6 +485,16 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
                                bool   logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 7922fde47..0c4699a89 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -99,7 +99,7 @@ struct ring_buffer {
 };
 
 struct common_sampler {
-    common_sampler_params params;
+    common_params_sampling params;
 
     struct llama_sampler * grmr;
     struct llama_sampler * chain;
@@ -125,7 +125,7 @@ struct common_sampler {
     }
 };
 
-std::string common_sampler_params::print() const {
+std::string common_params_sampling::print() const {
     char result[1024];
 
     snprintf(result, sizeof(result),
@@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
     return std::string(result);
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
@@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return cur_p.data[cur_p.selected].id;
 }
 
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
     return llama_sampler_get_seed(gsmpl->chain);
 }
diff --git a/common/sampling.h b/common/sampling.h
index d37f25ad3..348911b18 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -36,7 +36,7 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
@@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 //
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
 // helpers
diff --git a/common/speculative.cpp b/common/speculative.cpp
new file mode 100644
index 000000000..fe315a270
--- /dev/null
+++ b/common/speculative.cpp
@@ -0,0 +1,269 @@
+#include "speculative.h"
+
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+
+#include <cstring>
+
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
+struct common_speculative {
+    struct llama_context * ctx;
+    struct common_sampler * smpl;
+
+    llama_batch batch;
+    llama_tokens prompt;
+};
+
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_dft) {
+    auto * result = new common_speculative {
+        /* .ctx    = */ ctx_dft,
+        /* .smpl   = */ nullptr,
+        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt = */ {},
+    };
+
+    // TODO: optimize or pass from outside?
+#if 0
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 40;
+        params.top_p = 0.9;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+            COMMON_SAMPLER_TYPE_TOP_P,
+            COMMON_SAMPLER_TYPE_INFILL,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#else
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 10;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#endif
+
+    return result;
+}
+
+void common_speculative_free(struct common_speculative * spec) {
+    common_sampler_free(spec->smpl);
+
+    llama_batch_free(spec->batch);
+
+    delete spec;
+}
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
+    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
+    const struct llama_model * model_dft = llama_get_model(ctx_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
+                     "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        return false;
+    }
+
+    if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
+    ) {
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        return false;
+    }
+
+    {
+        const int n_vocab_tgt = llama_n_vocab(model_tgt);
+        const int n_vocab_dft = llama_n_vocab(model_dft);
+
+        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
+                         "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return false;
+        }
+
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
+            const char * token_text_dft = llama_token_get_text(model_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                LOG_ERR("%s: draft model vocab must match target model to use speculation but "
+                             "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt_tgt,
+        llama_token id_last) {
+    auto & batch  = spec->batch;
+    auto & ctx    = spec->ctx;
+    auto & smpl   = spec->smpl;
+    auto & prompt = spec->prompt;
+
+    int reuse_i = 0;
+    int reuse_n = 0;
+
+    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+
+    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
+
+    // reuse as much as possible from the old draft context
+    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
+    for (int i = 0; i < (int) prompt.size(); ++i) {
+        int cur = 0;
+        while (i_start + cur < (int) prompt_tgt.size() &&
+               i       + cur < (int) prompt.size() &&
+               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+            cur++;
+        }
+
+        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
+            reuse_i = i;
+            reuse_n = cur;
+        }
+    }
+
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+
+    llama_tokens result;
+    result.reserve(params.n_draft);
+
+    if (reuse_n == 0) {
+        llama_kv_cache_clear(ctx);
+
+        prompt.clear();
+    } else {
+        // this happens when a previous draft has been discarded (for example, due to being too small), but the
+        // target model agreed with it. in this case, we simply pass back the previous results to save compute
+        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
+                result.push_back(prompt[i]);
+
+                if (params.n_draft <= (int) result.size()) {
+                    break;
+                }
+            }
+
+            return result;
+        }
+
+        if (reuse_i > 0) {
+            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+
+            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+        }
+
+        if (reuse_n < (int) prompt.size()) {
+            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+
+            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        }
+    }
+
+    // prepare a batch to evaluate any new tokens in the prompt
+    common_batch_clear(batch);
+
+    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
+        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
+        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+
+        prompt.push_back(prompt_tgt[i]);
+    }
+
+    // we should rarely end-up here during normal decoding
+    if (batch.n_tokens > 0) {
+        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+
+        llama_decode(ctx, batch);
+    }
+
+    const llama_pos n_past = prompt.size();
+
+    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
+
+    common_batch_clear(batch);
+    common_batch_add  (batch, id_last, n_past, { 0 }, true);
+
+    prompt.push_back(id_last);
+
+    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+
+    llama_decode(ctx, batch);
+
+    common_sampler_reset(smpl);
+
+    // sample n_draft tokens from the draft model
+    for (int i = 0; i < params.n_draft; ++i) {
+        common_batch_clear(batch);
+
+        common_sampler_sample(smpl, ctx, 0, true);
+
+        const auto * cur_p = common_sampler_get_candidates(smpl);
+
+        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+        }
+
+        // add drafted token for each sequence
+        const llama_token id = cur_p->data[0].id;
+
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
+        common_sampler_accept(smpl, id, true);
+
+        result.push_back(id);
+
+        if (params.n_draft <= (int) result.size()) {
+            break;
+        }
+
+        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+
+        // evaluate the drafted tokens on the draft model
+        llama_decode(ctx, batch);
+
+        prompt.push_back(id);
+    }
+
+    return result;
+}
diff --git a/common/speculative.h b/common/speculative.h
new file mode 100644
index 000000000..50ec03446
--- /dev/null
+++ b/common/speculative.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "llama.h"
+#include "common.h"
+
+struct common_speculative;
+
+struct common_speculative_params {
+    int n_draft = 16;  // max drafted tokens
+    int n_reuse = 256;
+
+    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+};
+
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+
+void common_speculative_free(struct common_speculative * spec);
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+               struct common_speculative * spec,
+        struct common_speculative_params   params,
+                      const llama_tokens & prompt,
+                             llama_token   id_last);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d63a96c1c..9bd099d4e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -50,5 +50,6 @@ else()
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
+    add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 3b554033e..ba219cd4b 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -68,10 +68,10 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
 
     if (ctx == NULL) {
         LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 15b358dc4..ef7008957 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -73,7 +73,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;
 
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 161098585..2691c6e6b 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     LOG("\n");
 
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
     if (!smpl) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index cbecec343..e9cbb51ed 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
 
     LOG_INF("\n");
 
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
     return smpl;
 }
 
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 3c0ccfea2..8d0ef8b3d 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 7faebe7ba..dff07c075 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -21,7 +21,7 @@ int main(int argc, char ** argv){
 
     common_init();
 
-    const int n_draft = params.n_draft;
+    const int n_draft = params.speculative.n_max;
 
     // init llama.cpp
     llama_backend_init();
@@ -40,6 +40,7 @@ int main(int argc, char ** argv){
     common_ngram_cache ngram_cache_context;
     common_ngram_cache ngram_cache_dynamic;
     common_ngram_cache ngram_cache_static;
+
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index a04728b18..4d92bb238 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -22,7 +22,7 @@ int main(int argc, char ** argv){
     common_init();
 
     // max. number of additional tokens to draft if match is found
-    const int n_draft = params.n_draft;
+    const int n_draft = params.speculative.n_max;
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     std::vector<llama_token> draft;
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7c4ce4be2..957451af7 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;
 
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 43c8f3ed5..fd2b1c011 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.smpl = common_sampler_init(model, params.sparams);
+        client.smpl = common_sampler_init(model, params.sampling);
     }
 
     std::vector<llama_token> tokens_system;
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 1768aae51..e78a8596d 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -282,8 +282,8 @@ int main(int argc, char ** argv) {
                 return a.second > b.second;
             });
 
-            LOG("Top %d similar chunks:\n", params.sparams.top_k);
-            for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
+            LOG("Top %d similar chunks:\n", params.sampling.top_k);
+            for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
                 LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
                 LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
                 LOG("similarity: %f\n", similarities[i].second);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 8c49a52a6..2f0cf9baa 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -9,7 +9,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.prompt = "The quick brown fox";
-    params.sparams.seed = 1234;
+    params.sampling.seed = 1234;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
@@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
 
     // tokenize prompt
     auto tokens = common_tokenize(ctx, params.prompt, true);
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsecond run: %s", params.prompt.c_str());
 
@@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b8e003be9..6c55d65c0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -175,7 +175,7 @@ struct server_slot {
     // sampling
     json json_schema;
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sparams;
     struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
@@ -687,7 +687,7 @@ struct server_context {
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
-            slot.sparams = params.sparams;
+            slot.sparams = params.sampling;
 
             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -743,7 +743,7 @@ struct server_context {
                 }
 
                 // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-                int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+                int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
 
                 // fraction of the common subsequence length compared to the current slot's prompt length
                 float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@@ -788,7 +788,7 @@ struct server_context {
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
+        auto default_sparams = params.sampling;
         const auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
@@ -1960,7 +1960,7 @@ struct server_context {
 
                             if (slot.params.cache_prompt) {
                                 // reuse any previously computed tokens that are common with the new prompt
-                                slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                                slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
 
                                 // reuse chunks from the cached prompt by shifting their KV cache in the new position
                                 if (params.n_cache_reuse > 0) {
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index c47ed3e47..1665e9dc3 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -24,7 +24,6 @@
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
 using json = nlohmann::ordered_json;
-using llama_tokens = std::vector<llama_token>;
 
 #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
 #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
@@ -439,62 +438,6 @@ static std::string gen_chatcmplid() {
 // other common utils
 //
 
-static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
-    return i;
-}
-
-static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
-    // check for empty sequences
-    if (a.empty() || b.empty()) {
-        return 0;
-    }
-
-    // get the lengths of the input sequences
-    size_t a_len = a.size();
-    size_t b_len = b.size();
-
-    // initialize the maximum length of the longest common subsequence (LCS)
-    size_t max_length = 0;
-
-    // use two rows instead of a 2D matrix to optimize space
-    std::vector<size_t> prev_row(b_len + 1, 0);
-    std::vector<size_t> curr_row(b_len + 1, 0);
-
-    // iterate through the elements of a
-    for (size_t i = 1; i <= a_len; i++) {
-        // iterate through the elements of b
-        for (size_t j = 1; j <= b_len; j++) {
-            // if elements at the current positions match
-            if (a[i - 1] == b[j - 1]) {
-                // if it's the first element of either sequences, set LCS length to 1
-                if (i == 1 || j == 1) {
-                    curr_row[j] = 1;
-                } else {
-                    // increment LCS length by 1 compared to the previous element
-                    curr_row[j] = prev_row[j - 1] + 1;
-                }
-
-                // update max_length if necessary
-                if (curr_row[j] > max_length) {
-                    max_length = curr_row[j];
-                }
-            } else {
-                // reset LCS length if elements don't match
-                curr_row[j] = 0;
-            }
-        }
-
-        // update the previous row for the next iteration
-        prev_row = curr_row;
-    }
-
-    // return the maximum length of the LCS
-    return max_length;
-}
-
 static bool ends_with(const std::string & str, const std::string & suffix) {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt
new file mode 100644
index 000000000..7a3a141c2
--- /dev/null
+++ b/examples/speculative-simple/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-speculative-simple)
+add_executable(${TARGET} speculative-simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/speculative-simple/README.md b/examples/speculative-simple/README.md
new file mode 100644
index 000000000..e3a6c6b4a
--- /dev/null
+++ b/examples/speculative-simple/README.md
@@ -0,0 +1,12 @@
+# llama.cpp/examples/speculative-simple
+
+Demonstration of basic greedy speculative decoding
+
+```bash
+./bin/llama-speculative-simple \
+    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
+    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
+    -f test.txt -c 0 -ngl 99 --color \
+    --sampling-seq k --top-k 1 -fa --temp 0.0 \
+    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+```
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
new file mode 100644
index 000000000..1bc7f428c
--- /dev/null
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -0,0 +1,273 @@
+#include "arg.h"
+#include "common.h"
+#include "sampling.h"
+#include "speculative.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+        return 1;
+    }
+
+    if (params.n_predict < -1) {
+        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
+        return 1;
+    }
+
+    common_init();
+
+    if (params.speculative.model.empty()) {
+        LOG_ERR("%s: --model-draft is required\n", __func__);
+        return 1;
+    }
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model_tgt = NULL;
+    llama_model * model_dft = NULL;
+
+    llama_context * ctx_tgt = NULL;
+    llama_context * ctx_dft = NULL;
+
+    // load the target model
+    common_init_result llama_init_tgt = common_init_from_params(params);
+
+    model_tgt = llama_init_tgt.model;
+    ctx_tgt   = llama_init_tgt.context;
+
+    // load the draft model
+    params.model        = params.speculative.model;
+    params.n_ctx        = params.speculative.n_ctx;
+    params.n_batch      = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
+    }
+
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+    common_init_result llama_init_dft = common_init_from_params(params);
+
+    model_dft = llama_init_dft.model;
+    ctx_dft   = llama_init_dft.context;
+
+    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
+        return 1;
+    }
+
+    // Tokenize the prompt
+    std::vector<llama_token> inp;
+    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
+
+    if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
+        LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
+
+        return 1;
+    }
+
+    if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
+        LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
+
+        return 1;
+    }
+
+    LOG("\n\n");
+
+    for (auto id : inp) {
+        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
+    }
+
+    // how many tokens to draft each time
+    int n_draft     = params.speculative.n_max;
+    int n_draft_min = params.speculative.n_min;
+
+    float p_min = params.speculative.p_min;
+
+    int n_predict = 0;
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    // used to determine end of generation
+    bool has_eos = false;
+
+    // ================================================
+    // everything until here is standard initialization
+    // the relevant stuff for speculative decoding starts here
+
+    const auto t_enc_start = ggml_time_us();
+
+    // target model sampling context
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+
+    // eval the prompt
+    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+
+    // note: keep the last token separate!
+    llama_token id_last = inp.back();
+
+    // all tokens currently in the target context
+    auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
+
+    int n_past = inp.size() - 1;
+
+    // init the speculator
+    struct common_speculative_params params_spec;
+    params_spec.n_draft = n_draft;
+    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
+    params_spec.p_min   = p_min;
+
+    struct common_speculative * spec = common_speculative_init(ctx_dft);
+
+    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
+
+    const auto t_enc_end = ggml_time_us();
+
+    const auto t_dec_start = ggml_time_us();
+
+    while (true) {
+        // optionally, generate draft tokens that can be appended to the target batch
+        //
+        // this is the most important part of the speculation. the more probable tokens that are provided here
+        // the better the performance will be. in theory, this computation can be performed asynchronously and even
+        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
+        // from a cache or lookup tables.
+        //
+        llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
+
+        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
+
+        // always have a token to evaluate from before - id_last
+        common_batch_clear(batch_tgt);
+        common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);
+
+        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
+        {
+            // do not waste time on small drafts
+            if (draft.size() < n_draft_min) {
+                draft.clear();
+            }
+
+            for (size_t i = 0; i < draft.size(); ++i) {
+                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+            }
+
+            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
+
+            llama_decode(ctx_tgt, batch_tgt);
+        }
+
+        // sample from the full target batch and return the accepted tokens based on the target sampler
+        //
+        // for each token to be accepted, the sampler would have to sample that same token
+        // in such cases, instead of decoding the sampled token as we normally do, we simply continue with the
+        // available logits from the batch and sample the next token until we run out of logits or the sampler
+        // disagrees with the draft
+        //
+        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
+
+        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
+
+        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
+
+        n_past    += ids.size() - 1;
+        n_drafted += batch_tgt.n_tokens - 1;
+        n_accept  += ids.size() - 1;
+
+        // process the accepted tokens and update contexts
+        //
+        // this is the standard token post-processing that we normally do
+        // in this case, we do it for a group of accepted tokens at once
+        //
+        {
+            llama_token id;
+            std::string token_str;
+
+            for (size_t i = 0; i < ids.size(); ++i) {
+                id = ids[i];
+
+                ++n_predict;
+
+                if (llama_token_is_eog(model_tgt, id)) {
+                    has_eos = true;
+                    break;
+                }
+
+                token_str = common_token_to_piece(ctx_tgt, id);
+
+                if (params.use_color && i + 1 < ids.size()) {
+                    LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
+                } else {
+                    LOG("%s", token_str.c_str());
+                }
+            }
+
+            if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+                break;
+            }
+
+            LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
+
+            {
+                LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
+
+                llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+            }
+
+            prompt_tgt.push_back(id_last);
+            prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
+
+            // remember the last accepted token for the next iteration
+            id_last = id;
+        }
+    }
+
+    auto t_dec_end = ggml_time_us();
+
+    const int n_input = inp.size();
+
+    LOG("\n\n");
+
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+
+    LOG_INF("\n");
+    LOG_INF("n_draft   = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    LOG_INF("\n");
+    LOG_INF("draft:\n\n");
+
+    llama_perf_context_print(ctx_dft);
+
+    LOG_INF("\n");
+    LOG_INF("target:\n\n");
+    common_perf_print(ctx_tgt, smpl);
+
+    common_sampler_free(smpl);
+    common_speculative_free(spec);
+
+    llama_free(ctx_tgt);
+    llama_free_model(model_tgt);
+
+    llama_free(ctx_dft);
+    llama_free_model(model_dft);
+
+    llama_backend_free();
+
+    LOG("\n\n");
+
+    return 0;
+}
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 6cafd8a83..eb8bb2de5 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -12,7 +12,7 @@
 #include <string>
 #include <vector>
 
-#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
 
 struct seq_draft {
@@ -33,7 +33,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     // needed to get candidate probs even for temp <= 0.0
-    params.sparams.n_probs = 128;
+    params.sampling.n_probs = 128;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.model_draft.empty()) {
+    if (params.speculative.model.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
@@ -55,9 +55,9 @@ int main(int argc, char ** argv) {
     const int n_seq_dft = params.n_parallel;
 
     // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
-    const float p_split  = params.p_split;
+    const float p_draft_split = params.speculative.p_split;
 
-    std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
+    std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
     std::uniform_real_distribution<> u_dist;
 
     // init llama.cpp
@@ -76,13 +76,13 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
-    params.model = params.model_draft;
-    params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.draft_cpuparams.n_threads > 0) {
-        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
+    params.model = params.speculative.model;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
     }
 
-    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     common_init_result llama_init_dft = common_init_from_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;
@@ -170,7 +170,7 @@ int main(int argc, char ** argv) {
     //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
 
     // how many tokens to draft each time
-    int n_draft = params.n_draft;
+    int n_draft = params.speculative.n_max;
 
     int n_predict = 0;
     int n_drafted = 0;
@@ -183,14 +183,14 @@ int main(int argc, char ** argv) {
     bool has_eos = false;
 
     // target model sampling context (reuse the llama_context's sampling instance)
-    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
 
     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
 
     for (int s = 0; s < n_seq_dft; ++s) {
         // allocate llama_sampler for each draft sequence
-        drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
+        drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
     }
 
     llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
@@ -230,7 +230,7 @@ int main(int argc, char ** argv) {
             // for stochastic sampling, attempt to match the token with the drafted tokens
             {
                 bool accept = false;
-                if (params.sparams.temp > 0) {
+                if (params.sampling.temp > 0) {
                     // stochastic verification
                     common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
 
@@ -494,7 +494,7 @@ int main(int argc, char ** argv) {
 
                 // attempt to split the branch if the probability is high enough
                 for (int f = 1; f < 8; ++f) {
-                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
+                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                         LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
                         llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index 3665238b5..69604b87c 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -70,7 +70,7 @@ int main(void) {
 
     // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
     argv = {"binary_name", "--draft", "123"};
-    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));
 
 
     printf("test-arg-parser: test valid usage\n\n");
@@ -96,7 +96,7 @@ int main(void) {
     // --draft cannot be used outside llama-speculative
     argv = {"binary_name", "--draft", "123"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
-    assert(params.n_draft == 123);
+    assert(params.speculative.n_max == 123);
 
 // skip this part on windows, because setenv is not supported
 #ifdef _WIN32

From 5a8987793f3e7c1fbfa6806bfcd17d578071b6c9 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Mon, 25 Nov 2024 17:31:10 +0800
Subject: [PATCH 291/396] [SYCL] Fix building Win package for oneAPI 2025.0
 update (#10483)

* fix build package for 2025.0

* debug

* debug

* fix

* rm debug

---------

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 .github/workflows/build.yml | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 572f91643..abaf2c504 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -952,7 +952,7 @@ jobs:
 
     env:
       WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone
@@ -962,7 +962,8 @@ jobs:
           fetch-depth: 0
 
       - name: Install
-        run:  scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
 
       - name: Build
         id: cmake_build
@@ -981,27 +982,34 @@ jobs:
             echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
           fi
 
-      - name: Pack artifacts
+      - name: Build the release package
         id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
         run: |
           echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
 
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
           echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
 
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
         uses: actions/upload-artifact@v4
         with:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip

From b756441104ca8384640d6df22ba4ea6dab7ad799 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 15:08:04 +0200
Subject: [PATCH 292/396] metal : minor code formatting

---
 ggml/src/ggml-metal/ggml-metal.m     | 594 +++++++++++++--------------
 ggml/src/ggml-metal/ggml-metal.metal |  43 +-
 2 files changed, 321 insertions(+), 316 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index d1abb3cef..3a533d7f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1951,316 +1951,316 @@ static void ggml_metal_encode_node(
                         }
 #endif
 
-                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        if ([device supportsFamily:MTLGPUFamilyApple7] &&
-                                !ggml_is_transposed(src0) &&
-                                !ggml_is_transposed(src1) &&
-                                src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 && ne00 >= 64 &&
-                                (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
-                            //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                if ([device supportsFamily:MTLGPUFamilyApple7] &&
+                        !ggml_is_transposed(src0) &&
+                        !ggml_is_transposed(src1) &&
+                        src1t == GGML_TYPE_F32 &&
+                        ne00 % 32 == 0 && ne00 >= 64 &&
+                        (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
+                    //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
-                            // some Metal matrix data types require aligned pointers
-                            // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
-                                case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
-                                case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
-                                default: break;
-                            }
+                    // some Metal matrix data types require aligned pointers
+                    // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+                        case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+                        case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
+                        default: break;
+                    }
 
-                            id<MTLComputePipelineState> pipeline = nil;
+                    id<MTLComputePipelineState> pipeline = nil;
 
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
-                                case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ABORT("MUL MAT-MAT not implemented");
-                            }
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
+                        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
+                        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
+                        default: GGML_ABORT("MUL MAT-MAT not implemented");
+                    }
 
-                            ggml_metal_kargs_mul_mm args = {
-                                /*.ne00 =*/ ne00,
-                                /*.ne02 =*/ ne02,
-                                /*.nb01 =*/ nb01,
-                                /*.nb02 =*/ nb02,
-                                /*.nb03 =*/ nb03,
-                                /*.ne12 =*/ ne12,
-                                /*.nb10 =*/ nb10,
-                                /*.nb11 =*/ nb11,
-                                /*.nb12 =*/ nb12,
-                                /*.nb13 =*/ nb13,
-                                /*.ne0  =*/ ne0,
-                                /*.ne1  =*/ ne1,
-                                /*.r2   =*/ r2,
-                                /*.r3   =*/ r3,
-                            };
+                    ggml_metal_kargs_mul_mm args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne02 =*/ ne02,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne12 =*/ ne12,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.nb13 =*/ nb13,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.r2   =*/ r2,
+                        /*.r3   =*/ r3,
+                    };
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBytes:&args    length:sizeof(args) atIndex:0];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
 
-                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                        } else {
-                            int nth0 = 32;
-                            int nth1 = 1;
-                            int nrows = 1;
-                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+                    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                    [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                } else {
+                    int nth0 = 32;
+                    int nth1 = 1;
+                    int nrows = 1;
+                    //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
-                            id<MTLComputePipelineState> pipeline = nil;
+                    id<MTLComputePipelineState> pipeline = nil;
 
-                            // use custom matrix x vector kernel
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                    // use custom matrix x vector kernel
+                    switch (src0t) {
+                        case GGML_TYPE_F32:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                                nrows = 4;
+                            } break;
+                        case GGML_TYPE_F16:
+                            {
+                                nth0 = 32;
+                                nth1 = 1;
+                                if (src1t == GGML_TYPE_F32) {
+                                    if (ne11 * ne12 < 4) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
+                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
+                                        nrows = ne11;
+                                    } else {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
                                         nrows = 4;
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        if (src1t == GGML_TYPE_F32) {
-                                            if (ne11 * ne12 < 4) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
-                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
-                                                nrows = ne11;
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } else {
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
-                                            nrows = 4;
-                                        }
-                                    } break;
-                                case GGML_TYPE_BF16:
-                                    {
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        if (src1t == GGML_TYPE_F32) {
-                                            if (ne11 * ne12 < 4) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
-                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
-                                                nrows = ne11;
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } else {
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
-                                            nrows = 4;
-                                        }
-                                    } break;
-                                case GGML_TYPE_Q4_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q8_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q2_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q3_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_K:
-                                    {
-                                        nth0 = 4; //1;
-                                        nth1 = 8; //32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q6_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_M:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_NL:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
-                                    } break;
-                                default:
-                                    {
-                                        GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                        GGML_ABORT("not implemented");
                                     }
-                            };
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
+                                    nrows = 4;
+                                }
+                            } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                nth0 = 32;
+                                nth1 = 1;
+                                if (src1t == GGML_TYPE_F32) {
+                                    if (ne11 * ne12 < 4) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
+                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
+                                        nrows = ne11;
+                                    } else {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
+                                        nrows = 4;
+                                    }
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
+                                    nrows = 4;
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q2_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q3_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            {
+                                nth0 = 4; //1;
+                                nth1 = 8; //32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_M:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
+                            } break;
+                        default:
+                            {
+                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                GGML_ABORT("not implemented");
+                            }
+                    };
 
-                            ggml_metal_kargs_mul_mv args = {
-                                /*.ne00 =*/ ne00,
-                                /*.ne01 =*/ ne01,
-                                /*.ne02 =*/ ne02,
-                                /*.nb00 =*/ nb00,
-                                /*.nb01 =*/ nb01,
-                                /*.nb02 =*/ nb02,
-                                /*.nb03 =*/ nb03,
-                                /*.ne10 =*/ ne10,
-                                /*.ne11 =*/ ne11,
-                                /*.ne12 =*/ ne12,
-                                /*.nb10 =*/ nb10,
-                                /*.nb11 =*/ nb11,
-                                /*.nb12 =*/ nb12,
-                                /*.nb13 =*/ nb13,
-                                /*.ne0  =*/ ne0,
-                                /*.ne1  =*/ ne1,
-                                /*.r2   =*/ r2,
-                                /*.r3   =*/ r3,
-                            };
+                    ggml_metal_kargs_mul_mv args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne10 =*/ ne10,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.nb13 =*/ nb13,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.r2   =*/ r2,
+                        /*.r3   =*/ r3,
+                    };
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
 
-                            if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
-                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
-                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
-                                const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
-                                const int mem_size = 32*sizeof(float);
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q3_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            } else {
-                                const int64_t ny = (ne11 + nrows - 1)/nrows;
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                        }
+                    if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
+                        src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                        src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                        const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                        const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
+                        const int mem_size = 32*sizeof(float);
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q4_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q3_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q5_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q6_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    } else {
+                        const int64_t ny = (ne11 + nrows - 1)/nrows;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                }
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 971f5054b..eaca38864 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm(
     const int im = tgpig.z;
 
     // if this block is of 64x32 shape or smaller
-    short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
 
     // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
 
     simdgroup_T8x8     ma[4];
     simdgroup_float8x8 mb[2];
@@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm(
     const int i12 = im%args.ne12;
     const int i13 = im/args.ne12;
 
-    uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
-    short    offset1 = il/nl;
+    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const short    offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0
+        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
 
-    device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
     device const float   * y = (device const float   *)(src1
         + args.nb13*i13
         + args.nb12*i12
-        + args.nb11*(r1 * BLOCK_SIZE_N + thread_col)
+        + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
         + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
     for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
         // load data and store to threadgroup memory
         T4x4 temp_a;
         dequantize_func(x, il, temp_a);
+
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
         #pragma unroll(16)
@@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm(
             +                     (tiitg/THREAD_PER_ROW)%8  + (i&7)*8) = temp_a[i/4][i%4];
         }
 
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
+        *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
 
         il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
+        x  = (il < 2) ? x + (2 + nl - 1)/nl : x;
         y += BLOCK_SIZE_K;
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
         // load matrices from threadgroup memory and conduct outer products
-        threadgroup T     * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
+        threadgroup const T     * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
+        threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
 
         #pragma unroll(4)
-        for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
             #pragma unroll(4)
             for (short i = 0; i < 4; i++) {
                 simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
             }
+
             simdgroup_barrier(mem_flags::mem_none);
+
             #pragma unroll(2)
             for (short i = 0; i < 2; i++) {
                 simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
             }
 
-            lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
-
             #pragma unroll(8)
             for (short i = 0; i < 8; i++){
                 simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
             }
+
+            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
         }
     }
 
     if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
         device float * C = (device float *) dst +
-            (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) + \
-            (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
+            (BLOCK_SIZE_M * r0 + 32*(sgitg &  1)) + \
+            (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
 
         for (short i = 0; i < 8; i++) {
             simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
@@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm(
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
         threadgroup float * temp_str = ((threadgroup float *) shmem) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M;
+                                     + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
         for (short i = 0; i < 8; i++) {
             simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
         }

From f6d12e7df8fe64384f1939976871252e6422a01e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 15:17:32 +0200
Subject: [PATCH 293/396] tests : fix compile warning

---
 tests/test-quantize-fns.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 8d0bf0470..c77c8ed13 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 }
 
 // Total dot product error
-static float dot_product_error(
-    const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
-) {
+static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
+    GGML_UNUSED(qfns);
+
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 

From 5931c1f233c616083d64e41a228249d58e039aa5 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 25 Nov 2024 15:13:39 +0100
Subject: [PATCH 294/396] ggml : add support for dynamic loading of backends
 (#10469)

* ggml : add support for dynamic loading of backends

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 Makefile                               |   3 +-
 Package.swift                          |   3 +-
 common/common.cpp                      |   3 +
 examples/CMakeLists.txt                |  27 +--
 examples/eval-callback/CMakeLists.txt  |   3 +-
 examples/llama-bench/llama-bench.cpp   |  15 +-
 examples/main/main.cpp                 |  12 +-
 examples/simple-chat/simple-chat.cpp   |   3 +
 examples/simple/simple.cpp             |   4 +
 ggml/CMakeLists.txt                    |   1 +
 ggml/include/ggml-backend.h            |  15 ++
 ggml/include/ggml-cpu.h                |  38 +---
 ggml/include/ggml.h                    |  31 +++
 ggml/src/CMakeLists.txt                |  41 +++-
 ggml/src/ggml-amx/CMakeLists.txt       |  10 +-
 ggml/src/ggml-amx/ggml-amx.cpp         |   7 +-
 ggml/src/ggml-backend-impl.h           |  44 +++--
 ggml/src/ggml-backend-reg.cpp          | 252 +++++++++++++++++++++++--
 ggml/src/ggml-blas/CMakeLists.txt      |   9 +-
 ggml/src/ggml-blas/ggml-blas.cpp       |   7 +-
 ggml/src/ggml-cann/CMakeLists.txt      |   6 +-
 ggml/src/ggml-cann/ggml-cann.cpp       |  13 +-
 ggml/src/ggml-cpu/CMakeLists.txt       |  19 +-
 ggml/src/ggml-cpu/ggml-cpu.c           |  23 ---
 ggml/src/ggml-cpu/ggml-cpu.cpp         |  50 +++--
 ggml/src/ggml-cuda/CMakeLists.txt      |  11 +-
 ggml/src/ggml-cuda/ggml-cuda.cu        |  71 ++++++-
 ggml/src/ggml-hip/CMakeLists.txt       |  10 +-
 ggml/src/ggml-kompute/CMakeLists.txt   |  10 +-
 ggml/src/ggml-kompute/ggml-kompute.cpp |   7 +-
 ggml/src/ggml-metal/CMakeLists.txt     |   9 +-
 ggml/src/ggml-metal/ggml-metal.m       |  34 +++-
 ggml/src/ggml-musa/CMakeLists.txt      |  10 +-
 ggml/src/ggml-rpc/CMakeLists.txt       |   8 +-
 ggml/src/ggml-rpc/ggml-rpc.cpp         |   7 +-
 ggml/src/ggml-sycl/CMakeLists.txt      |  10 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp       |  12 +-
 ggml/src/ggml-vulkan/CMakeLists.txt    |  12 +-
 ggml/src/ggml-vulkan/ggml-vulkan.cpp   |   7 +-
 ggml/src/ggml.c                        |  23 +++
 pocs/CMakeLists.txt                    |   4 +-
 src/llama.cpp                          |  77 ++++----
 tests/CMakeLists.txt                   |  13 +-
 tests/test-backend-ops.cpp             |  26 +--
 44 files changed, 728 insertions(+), 272 deletions(-)

diff --git a/Makefile b/Makefile
index dd6d864ad..14c05e93e 100644
--- a/Makefile
+++ b/Makefile
@@ -251,7 +251,7 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
@@ -290,6 +290,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	MK_CPPFLAGS += -D_GNU_SOURCE
+	MK_LDFLAGS  += -ldl
 endif
 
 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
diff --git a/Package.swift b/Package.swift
index 6b68aecde..d9e8a4e2d 100644
--- a/Package.swift
+++ b/Package.swift
@@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [
         .define("GGML_USE_ACCELERATE"),
-        .define("GGML_USE_METAL")
+        .define("GGML_USE_METAL"),
+        .define("GGML_USE_CPU")
     ]
 )
 #endif
diff --git a/common/common.cpp b/common/common.cpp
index c398329d0..98524f746 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -377,6 +377,9 @@ void common_init() {
 #endif
 
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+
+    // load dynamic backends
+    ggml_backend_load_all();
 }
 
 std::string common_params_get_system_info(const common_params & params) {
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 9bd099d4e..632409d55 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -27,24 +24,16 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
     if (LLAMA_BUILD_SERVER)
-    add_subdirectory(server)
-    endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
+        add_subdirectory(server)
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
@@ -52,4 +41,18 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
index a48753d38..5d1048aad 100644
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 3dc84a75c..bac606f47 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
         tpp.poll       = t.poll;
         tpp.prio       = params.prio;
 
-        struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
 
         llama_free(ctx);
 
-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
     }
 
     llama_free_model(lmodel);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 957451af7..d0c28f317 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
     struct ggml_threadpool_params tpp_batch =
             ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
     struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
 
     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }
 
-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
 
     return 0;
 }
diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp
index 5f9973163..7f4da666b 100644
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
         }
     }, nullptr);
 
+    // load dynamic backends
+    ggml_backend_load_all();
+
     // initialize the model
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 59760fe95..3288c0250 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    // load dynamic backends
+
+    ggml_backend_load_all();
+
     // initialize the model
 
     llama_model_params model_params = llama_model_default_params();
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2d32da1b6..70b5cfdf7 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
 endif()
 
 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
 
 #
 # option list
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index cef164764..19881a505 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -190,6 +190,14 @@ extern "C" {
     typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
     // Get additional buffer types provided by the device (returns a NULL-terminated array)
     typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void                         (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature {
+        const char * name;
+        const char * value;
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
 
     //
     // Backend registry
@@ -214,6 +222,13 @@ extern "C" {
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
 
+    // Load a backend from a dynamic library and register it
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Unload a backend if loaded dynamically and unregister it
+    GGML_API void               ggml_backend_unload(ggml_backend_reg_t reg);
+    // Load all known backends from dynamic libraries
+    GGML_API void               ggml_backend_load_all(void);
+
     //
     // Backend scheduler
     //
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 7571ef979..a5358d047 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -7,29 +7,6 @@
 extern "C" {
 #endif
 
-    // Scheduling priorities
-    enum ggml_sched_priority {
-        GGML_SCHED_PRIO_NORMAL,
-        GGML_SCHED_PRIO_MEDIUM,
-        GGML_SCHED_PRIO_HIGH,
-        GGML_SCHED_PRIO_REALTIME
-    };
-
-    // Threadpool params
-    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-    struct ggml_threadpool_params {
-        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-        int                 n_threads;                   // number of threads
-        enum ggml_sched_priority prio;                   // thread priority
-        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
-        bool                strict_cpu;                  // strict cpu placement
-        bool                paused;                      // start in paused state
-    };
-
-    struct ggml_threadpool;     // forward declaration, see ggml.c
-
-    typedef struct ggml_threadpool * ggml_threadpool_t;
-
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -75,14 +52,11 @@ extern "C" {
     GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
     GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
-    GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_BACKEND_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -104,10 +78,10 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
     GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
     GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
     GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
     GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
     GGML_BACKEND_API int ggml_cpu_has_fma        (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 69e6a2434..9843b09fb 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2215,6 +2215,37 @@ extern "C" {
 
     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
 
+    // ggml threadpool
+    // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+    // the goal should be to create an API that other backends can use move everything to the ggml base
+
+    // scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 8df0e85c0..071508dda 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -202,6 +202,10 @@ endif()
 
 # ggml
 
+if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
+    message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
+endif()
+
 add_library(ggml-base
             ../include/ggml.h
             ../include/ggml-alloc.h
@@ -226,6 +230,31 @@ add_library(ggml
 
 target_link_libraries(ggml PUBLIC ggml-base)
 
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(ggml PRIVATE dl)
+endif()
+
+function(ggml_add_backend_library backend)
+    if (GGML_BACKEND_DL)
+        add_library(${backend} MODULE ${ARGN})
+        # write the shared library to the output directory
+        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+    else()
+        add_library(${backend} ${ARGN})
+        target_link_libraries(ggml PUBLIC ${backend})
+        install(TARGETS ${backend} LIBRARY)
+    endif()
+
+    target_link_libraries(${backend} PRIVATE ggml-base)
+    target_include_directories(${backend} PRIVATE ..)
+
+    if (${BUILD_SHARED_LIBS})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
+        target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
+    endif()
+endfunction()
+
 function(ggml_add_backend backend)
     string(TOUPPER "GGML_${backend}" backend_id)
     if (${backend_id})
@@ -236,14 +265,10 @@ function(ggml_add_backend backend)
         # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
         if (${backend_id})
             message(STATUS "Including ${backend} backend")
-            if (${BUILD_SHARED_LIBS})
-                target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
-                target_compile_definitions(${backend_target} PUBLIC  GGML_BACKEND_SHARED)
+            if (NOT GGML_BACKEND_DL)
+                string(TOUPPER "GGML_USE_${backend}" backend_use)
+                target_compile_definitions(ggml PUBLIC ${backend_use})
             endif()
-            install(TARGETS ${backend_target} LIBRARY)
-            target_link_libraries(ggml PUBLIC ${backend_target})
-            string(TOUPPER "GGML_USE_${backend}" backend_use)
-            target_compile_definitions(ggml PUBLIC ${backend_use})
         endif()
     endif()
 endfunction()
@@ -256,10 +281,10 @@ ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
 ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
+ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
-ggml_add_backend(MUSA)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt
index d6676f3f6..cf3ade6f0 100644
--- a/ggml/src/ggml-amx/CMakeLists.txt
+++ b/ggml/src/ggml-amx/CMakeLists.txt
@@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA
 
     file(GLOB   GGML_SOURCES_AMX "*.cpp")
 
-    add_library(ggml-amx
-                ${GGML_HEADERS_AMX}
-                ${GGML_SOURCES_AMX})
-
-    target_link_libraries(ggml-amx PRIVATE ggml-base)
-    target_include_directories(ggml-amx PRIVATE . ..)
+    ggml_add_backend_library(ggml-amx
+                             ${GGML_HEADERS_AMX}
+                             ${GGML_SOURCES_AMX}
+                            )
 
     # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
     # TODO: integrate AMX backend into the CPU backend
diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp
index 8568e7965..6bfb3da27 100644
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
 
 ggml_backend_reg_t ggml_backend_amx_reg(void) {
     static struct ggml_backend_reg ggml_backend_amx_reg = {
-        /* .iface   = */ ggml_backend_amx_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_amx_reg_i,
+        /* .context     = */ NULL,
     };
 
     return &ggml_backend_amx_reg;
@@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
 }
 
 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index fa8d5b7fb..dff7749b4 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -8,6 +8,8 @@
 extern "C" {
 #endif
 
+    #define GGML_BACKEND_API_VERSION 1
+
     //
     // Backend buffer type
     //
@@ -63,20 +65,20 @@ extern "C" {
         enum ggml_backend_buffer_usage usage;
     };
 
-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
                    ggml_backend_buffer_type_t buft,
             struct ggml_backend_buffer_i      iface,
                    void *                     context,
                    size_t                     size);
 
     // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
     // multi-buffer
     // buffer that contains a collection of buffers
-    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_API bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_API void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
 
     //
     // Backend (stream)
@@ -199,17 +201,37 @@ extern "C" {
     };
 
     struct ggml_backend_reg {
-        // int api_version; // TODO: for dynamic loading
+        int api_version; // initialize to GGML_BACKEND_API_VERSION
         struct ggml_backend_reg_i iface;
         void * context;
     };
 
-
     // Internal backend registry API
-    void ggml_backend_register(ggml_backend_reg_t reg);
-    void ggml_backend_device_register(ggml_backend_dev_t device);
-    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
-    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
+    // Add backend dynamic loading support to the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+
+    #ifdef GGML_BACKEND_DL
+        #ifdef __cplusplus
+        #    define GGML_BACKEND_DL_IMPL(reg_fn)                                 \
+                extern "C" {                                                     \
+                    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+                }                                                                \
+                ggml_backend_reg_t ggml_backend_init(void) {                     \
+                    return reg_fn();                                             \
+                }
+        #else
+        #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+                ggml_backend_reg_t ggml_backend_init(void) {                 \
+                    return reg_fn();                                         \
+                }
+        #endif
+    #else
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)
+    #endif
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 63e9d8201..43d03d7fa 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -1,11 +1,29 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
-#include "ggml-cpu.h"
 #include "ggml-impl.h"
+#include <algorithm>
 #include <cstring>
+#include <string>
 #include <vector>
 
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#elif defined(__APPLE__)
+#    include <mach-o/dyld.h>
+#    include <dlfcn.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+
 // Backend registry
+#ifdef GGML_USE_CPU
+#include "ggml-cpu.h"
+#endif
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -43,8 +61,13 @@
 #include "ggml-kompute.h"
 #endif
 
+struct ggml_backend_reg_entry {
+    ggml_backend_reg_t reg;
+    void * handle;
+};
+
 struct ggml_backend_registry {
-    std::vector<ggml_backend_reg_t> backends;
+    std::vector<ggml_backend_reg_entry> backends;
     std::vector<ggml_backend_dev_t> devices;
 
     ggml_backend_registry() {
@@ -75,11 +98,19 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
-
+#ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
+#endif
     }
 
-    void register_backend(ggml_backend_reg_t reg) {
+    ~ggml_backend_registry() {
+        while (!backends.empty()) {
+            // use silent since the log system may have been destroyed at this point
+            unload_backend(backends.back().reg, true);
+        }
+    }
+
+    void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
         if (!reg) {
             return;
         }
@@ -88,7 +119,7 @@ struct ggml_backend_registry {
         GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
-        backends.push_back(reg);
+        backends.push_back({ reg, handle });
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
         }
@@ -100,6 +131,111 @@ struct ggml_backend_registry {
 #endif
         devices.push_back(device);
     }
+
+    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+#ifdef _WIN32
+        // suppress error dialogs for missing DLLs
+        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+        HMODULE handle = LoadLibraryA(path);
+
+        if (!handle) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
+            }
+            SetErrorMode(old_mode);
+            return nullptr;
+        }
+
+        ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
+
+        SetErrorMode(old_mode);
+
+        if (!backend_init) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
+            }
+            FreeLibrary(handle);
+            return nullptr;
+        }
+#else
+        void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+
+        if (!handle) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
+            }
+            return nullptr;
+        }
+
+        auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
+
+        if (!backend_init) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
+            }
+            dlclose(handle);
+            return nullptr;
+        }
+#endif
+        ggml_backend_reg_t reg = backend_init();
+
+        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
+            if (!silent) {
+                if (!reg) {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                } else {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+                                   __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                }
+            }
+#ifdef _WIN32
+            FreeLibrary(handle);
+#else
+            dlclose(handle);
+#endif
+            return nullptr;
+        }
+
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        register_backend(reg, handle);
+        return reg;
+    }
+
+    void unload_backend(ggml_backend_reg_t reg, bool silent) {
+        auto it = std::find_if(backends.begin(), backends.end(),
+                                [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
+
+        if (it == backends.end()) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: backend not found\n", __func__);
+            }
+            return;
+        }
+
+        if (!silent) {
+            GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
+        }
+
+        // remove devices
+        devices.erase(
+            std::remove_if(devices.begin(), devices.end(),
+                            [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+            devices.end());
+
+        // unload library
+        if (it->handle) {
+#ifdef _WIN32
+            FreeLibrary((HMODULE) it->handle);
+#else
+            dlclose(it->handle);
+#endif
+        }
+
+        // remove backend
+        backends.erase(it);
+    }
 };
 
 static ggml_backend_registry & get_reg() {
@@ -123,7 +259,7 @@ size_t ggml_backend_reg_count() {
 
 ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
     GGML_ASSERT(index < ggml_backend_reg_count());
-    return get_reg().backends[index];
+    return get_reg().backends[index].reg;
 }
 
 ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
@@ -133,7 +269,7 @@ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
             return reg;
         }
     }
-    return NULL;
+    return nullptr;
 }
 
 // Device enumeration
@@ -153,7 +289,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
             return dev;
         }
     }
-    return NULL;
+    return nullptr;
 }
 
 ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
@@ -163,14 +299,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
             return dev;
         }
     }
-    return NULL;
+    return nullptr;
 }
 
 // Convenience functions
 ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
     if (!dev) {
-        return NULL;
+        return nullptr;
     }
     return ggml_backend_dev_init(dev, params);
 }
@@ -178,7 +314,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
 ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
     if (!dev) {
-        return NULL;
+        return nullptr;
     }
     return ggml_backend_dev_init(dev, params);
 }
@@ -189,7 +325,97 @@ ggml_backend_t ggml_backend_init_best(void) {
         dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     }
     if (!dev) {
-        return NULL;
+        return nullptr;
     }
-    return ggml_backend_dev_init(dev, NULL);
+    return ggml_backend_dev_init(dev, nullptr);
+}
+
+// Dynamic loading
+ggml_backend_reg_t ggml_backend_load(const char * path) {
+    return get_reg().load_backend(path, false);
+}
+
+void ggml_backend_unload(ggml_backend_reg_t reg) {
+    get_reg().unload_backend(reg, true);
+}
+
+void ggml_backend_load_all() {
+    std::vector<std::string> search_prefix;
+
+    // add the executable directory to the search path
+    // FIXME: this is convenient for development, but it should probably be disabled in production
+
+#if defined(__APPLE__)
+    // get executable path
+    std::vector<char> path;
+    uint32_t size;
+    while (true) {
+        size = path.size();
+        if (_NSGetExecutablePath(path.data(), &size) == 0) {
+            break;
+        }
+        path.resize(size);
+    }
+    std::string base_path(path.data(), size);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('/');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    search_prefix.push_back(base_path + "/");
+#elif defined(__linux__)
+    std::string base_path = ".";
+    std::vector<char> path(1024);
+    while (true) {
+        // get executable path
+        ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+        if (len == -1) {
+            break;
+        }
+        if (len < (ssize_t) path.size()) {
+            base_path = std::string(path.data(), len);
+            // remove executable name
+            auto last_slash = base_path.find_last_of('/');
+            if (last_slash != std::string::npos) {
+                base_path = base_path.substr(0, last_slash);
+            }
+            break;
+        }
+        path.resize(path.size() * 2);
+    }
+
+    search_prefix.push_back(base_path + "/");
+#endif
+
+    auto & reg = get_reg();
+
+    auto try_load = [&](const std::string & name) {
+        std::string os_name;
+#ifdef _WIN32
+        os_name = "ggml-" + name + ".dll";
+#else
+        os_name = "libggml-" + name + ".so";
+#endif
+        if (reg.load_backend(os_name.c_str(), true)) {
+            return;
+        }
+        for (const auto & prefix : search_prefix) {
+            if (reg.load_backend((prefix + os_name).c_str(), true)) {
+                return;
+            }
+        }
+    };
+
+    try_load("amx");
+    try_load("blas");
+    try_load("cann");
+    try_load("cuda");
+    try_load("hip");
+    try_load("kompute");
+    try_load("metal");
+    try_load("rpc");
+    try_load("sycl");
+    try_load("vulkan");
+    try_load("musa");
+    try_load("cpu");
 }
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
index e2cbabf0d..0bf3c05d9 100644
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -11,12 +11,9 @@ find_package(BLAS)
 if (BLAS_FOUND)
     message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
 
-    add_library(ggml-blas
-                ggml-blas.cpp
-                )
-
-    target_link_libraries(ggml-blas PRIVATE ggml-base)
-    target_include_directories(ggml-blas PRIVATE . ..)
+    ggml_add_backend_library(ggml-blas
+                             ggml-blas.cpp
+                            )
 
     if (${GGML_BLAS_VENDOR} MATCHES "Apple")
         add_compile_definitions(ACCELERATE_NEW_LAPACK)
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 648c9d875..ec158dfac 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
 
 ggml_backend_reg_t ggml_backend_blas_reg(void) {
     static struct ggml_backend_reg ggml_backend_blas_reg = {
-        /* .iface   = */ ggml_backend_blas_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_blas_reg_i,
+        /* .context     = */ NULL,
     };
 
     return &ggml_backend_blas_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
index 756200b89..901327185 100644
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR)
 
     file(GLOB GGML_SOURCES_CANN "*.cpp")
 
-    add_library(ggml-cann ${GGML_SOURCES_CANN})
-    target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
-    target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
+    ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
+    target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
+    target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
     target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
 
     target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 776340881..d96f65936 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2064,16 +2064,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
                 dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
                 ggml_cann_set_device(i);
                 ggml_backend_dev_t dev = new ggml_backend_device {
-                    /* .interface = */ ggml_backend_cann_device_interface,
-                    /* .reg       = */ &reg,
-                    /* .context   = */ dev_ctx
+                    /* .iface   = */ ggml_backend_cann_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ dev_ctx
                 };
                 ctx->devices.push_back(dev);
             }
 
             reg = ggml_backend_reg {
-                /* .interface = */ ggml_backend_cann_reg_interface,
-                /* .context   = */ ctx
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_cann_reg_interface,
+                /* .context     = */ ctx
             };
         }
 
@@ -2126,3 +2127,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
     ggml_cann_set_device(device);
     ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 288052333..c2905d1fb 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -1,14 +1,13 @@
-add_library(ggml-cpu
-            ggml-cpu.c
-            ggml-cpu.cpp
-            ggml-cpu-aarch64.c
-            ggml-cpu-aarch64.h
-            ggml-cpu-quants.c
-            ggml-cpu-quants.h
-            )
+ggml_add_backend_library(ggml-cpu
+                         ggml-cpu.c
+                         ggml-cpu.cpp
+                         ggml-cpu-aarch64.c
+                         ggml-cpu-aarch64.h
+                         ggml-cpu-quants.c
+                         ggml-cpu-quants.h
+                        )
 
-target_link_libraries(ggml-cpu PRIVATE ggml-base)
-target_include_directories(ggml-cpu PRIVATE . ..)
+target_include_directories(ggml-cpu PRIVATE .)
 
 if (APPLE AND GGML_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 4b58254e7..c6ede19d9 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
 
 #endif // GGML_USE_OPENMP
 
-void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
-    p->n_threads  = n_threads;
-    p->prio       = 0;     // default priority (usually means normal or inherited)
-    p->poll       = 50;    // hybrid-polling enabled
-    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
-    p->paused     = false; // threads are ready to go
-    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
-}
-
-struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
-    struct ggml_threadpool_params p;
-    ggml_threadpool_params_init(&p, n_threads);
-    return p;
-}
-
-bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
-    if (p0->n_threads      != p1->n_threads  )    return false;
-    if (p0->prio           != p1->prio       )    return false;
-    if (p0->poll           != p1->poll       )    return false;
-    if (p0->strict_cpu     != p1->strict_cpu )    return false;
-    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
-}
-
 static struct ggml_threadpool * ggml_threadpool_new_impl(
     struct ggml_threadpool_params * tpp,
                struct ggml_cgraph * cgraph,
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 573b7c5b9..febed433a 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
     return &ggml_backend_cpu_device;
 }
 
-struct ggml_backend_feature {
-    const char * name;
-    const char * value;
-};
-
-// Not used yet
 // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
-// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API
 static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
     static std::vector<ggml_backend_feature> features = []() {
+        ggml_cpu_init();
+
         std::vector<ggml_backend_feature> features;
         if (ggml_cpu_has_sse3()) {
             features.push_back({ "SSE3", "1" });
@@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_avx()) {
             features.push_back({ "AVX", "1" });
         }
+        if (ggml_cpu_has_avx_vnni()) {
+            features.push_back({ "AVX_VNNI", "1" });
+        }
         if (ggml_cpu_has_avx2()) {
             features.push_back({ "AVX2", "1" });
         }
@@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_fma()) {
             features.push_back({ "FMA", "1" });
         }
-        if (ggml_cpu_has_avx_vnni()) {
-            features.push_back({ "AVX_VNNI", "1" });
-        }
         if (ggml_cpu_has_avx512()) {
             features.push_back({ "AVX512", "1" });
         }
@@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_llamafile()) {
             features.push_back({ "LLAMAFILE", "1" });
         }
+        // TODO: rename this
+    #ifdef GGML_USE_CPU_AARCH64
+        features.push_back({ "AARCH64_REPACK", "1" });
+    #endif
 
         features.push_back({ nullptr, nullptr });
 
@@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
         return (void *)ggml_backend_cpu_get_extra_bufts;
     }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cpu_get_features;
+    }
+    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
+        return (void *)ggml_backend_cpu_set_abort_callback;
+    }
+    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
+        return (void *)ggml_numa_init;
+    }
+    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
+        return (void *)ggml_is_numa;
+    }
+
+    // threadpool - TODO:  move to ggml-base
+    if (strcmp(name, "ggml_threadpool_new") == 0) {
+        return (void *)ggml_threadpool_new;
+    }
+    if (strcmp(name, "ggml_threadpool_free") == 0) {
+        return (void *)ggml_threadpool_free;
+    }
+    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
+        return (void *)ggml_backend_cpu_set_threadpool;
+    }
 
     return NULL;
 
@@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
     ggml_cpu_init();
 
     static struct ggml_backend_reg ggml_backend_cpu_reg = {
-        /* .iface   = */ ggml_backend_cpu_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_cpu_reg_i,
+        /* .context     = */ NULL,
     };
 
     return &ggml_backend_cpu_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index e1482a269..b0cb93e07 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND)
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
     endif()
 
-    add_library(ggml-cuda
-                ${GGML_HEADERS_CUDA}
-                ${GGML_SOURCES_CUDA}
-                )
-
-    target_link_libraries(ggml-cuda PRIVATE ggml-base)
-    target_include_directories(ggml-cuda PRIVATE . ..)
+    ggml_add_backend_library(ggml-cuda
+                             ${GGML_HEADERS_CUDA}
+                             ${GGML_SOURCES_CUDA}
+                            )
 
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index dd94ab03d..2a78a4393 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
     return ctx->devices[index];
 }
 
+static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        std::vector<ggml_backend_feature> features;
+    #define _STRINGIFY(...) #__VA_ARGS__
+    #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
+
+    #ifdef __CUDA_ARCH_LIST__
+        features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_MMQ
+        features.push_back({ "FORCE_MMQ", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_CUBLAS
+        features.push_back({ "FORCE_CUBLAS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_VMM
+        features.push_back({ "NO_VMM", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_PEER_COPY
+        features.push_back({ "NO_PEER_COPY", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_F16
+        features.push_back({ "F16", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_USE_GRAPHS
+        features.push_back({ "USE_GRAPHS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+        features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
+    #endif
+
+    #ifdef GGML_CUDA_FA_ALL_QUANTS
+        features.push_back({ "FA_ALL_QUANTS", "1" });
+    #endif
+
+    #undef _STRINGIFY
+    #undef STRINGIFY
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
 static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     GGML_UNUSED(reg);
     if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
     if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
         return (void *)ggml_backend_cuda_unregister_host_buffer;
     }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cuda_get_features;
+    }
     return nullptr;
 }
 
@@ -3169,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 dev_ctx->description = prop.name;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
-                    /* .interface = */ ggml_backend_cuda_device_interface,
-                    /* .reg       = */ &reg,
-                    /* .context   = */ dev_ctx
+                    /* .iface   = */ ggml_backend_cuda_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ dev_ctx
                 };
                 ctx->devices.push_back(dev);
             }
 
             reg = ggml_backend_reg {
-                /* .interface = */ ggml_backend_cuda_reg_interface,
-                /* .context   = */ ctx
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_cuda_reg_interface,
+                /* .context     = */ ctx
             };
         }
 
@@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
 
     return cuda_backend;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index fccf8eb84..b15fbd24d 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -64,12 +64,10 @@ else()
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
 endif()
 
-add_library(ggml-hip
-            ${GGML_HEADERS_ROCM}
-            ${GGML_SOURCES_ROCM})
-
-target_link_libraries(ggml-hip PRIVATE ggml-base)
-target_include_directories(ggml-hip PRIVATE . ..)
+ggml_add_backend_library(ggml-hip
+                         ${GGML_HEADERS_ROCM}
+                         ${GGML_SOURCES_ROCM}
+                        )
 
 # TODO: do not use CUDA definitions for HIP
 target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt
index 0bd027c7f..dc623926c 100644
--- a/ggml/src/ggml-kompute/CMakeLists.txt
+++ b/ggml/src/ggml-kompute/CMakeLists.txt
@@ -6,13 +6,13 @@ if (NOT glslc_executable)
     message(FATAL_ERROR "glslc not found")
 endif()
 
-add_library(ggml-kompute
-            ggml-kompute.cpp
-            ../../include/ggml-kompute.h
-            )
+ggml_add_backend_library(ggml-kompute
+                         ggml-kompute.cpp
+                         ../../include/ggml-kompute.h
+                        )
 
 target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
-target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
 add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
 
diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
index 2fea9e4cc..24566404d 100644
--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
 
 ggml_backend_reg_t ggml_backend_kompute_reg() {
     static ggml_backend_reg reg = {
-        /* .iface   = */ ggml_backend_kompute_reg_i,
-        /* .context = */ nullptr,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_kompute_reg_i,
+        /* .context     = */ nullptr,
     };
 
     return &reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt
index b237d79f4..1bad27206 100644
--- a/ggml/src/ggml-metal/CMakeLists.txt
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
 
 message(STATUS "Metal framework found")
 
-add_library(ggml-metal
-            ggml-metal.m
-            )
+ggml_add_backend_library(ggml-metal
+                         ggml-metal.m
+                        )
 
 target_link_libraries(ggml-metal PRIVATE
-                      ggml-base
                       ${FOUNDATION_LIBRARY}
                       ${METAL_FRAMEWORK}
                       ${METALKIT_FRAMEWORK}
                       )
 
-target_include_directories(ggml-metal PRIVATE . ..)
-
 if (GGML_METAL_NDEBUG)
     add_compile_definitions(GGML_METAL_NDEBUG)
 endif()
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 3a533d7f9..63baaf163 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4372,19 +4372,45 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
     GGML_UNUSED(index);
 }
 
+static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
+#if defined(GGML_METAL_EMBED_LIBRARY)
+    { "EMBED_LIBRARY", "1" },
+#endif
+#if defined(GGML_METAL_USE_BF16)
+    { "BF16", "1" },
+#endif
+    { nil, nil },
+};
+
+static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
+    return g_ggml_backend_metal_features;
+
+    GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_metal_get_features;
+    }
+
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
 static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
     /* .get_name         = */ ggml_backend_metal_reg_get_name,
     /* .device_count     = */ ggml_backend_metal_reg_device_count,
     /* .device_get       = */ ggml_backend_metal_reg_device_get,
-    /* .get_proc_address = */ NULL,
+    /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
 };
 
 ggml_backend_reg_t ggml_backend_metal_reg(void) {
     // TODO: make this thread-safe somehow?
     {
         g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
-            /* .iface   = */ ggml_backend_metal_reg_i,
-            /* .context = */ NULL,
+            /* .api_version = */ GGML_BACKEND_API_VERSION,
+            /* .iface       = */ ggml_backend_metal_reg_i,
+            /* .context     = */ NULL,
         };
 
         g_ggml_backend_metal_device = (struct ggml_backend_device) {
@@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
 
     return &g_ggml_backend_metal_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
index f3c013692..e1a69186e 100644
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND)
         set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
     endforeach()
 
-    add_library(ggml-musa
-                ${GGML_HEADERS_MUSA}
-                ${GGML_SOURCES_MUSA})
-
-    target_link_libraries(ggml-musa PRIVATE ggml-base)
-    target_include_directories(ggml-musa PRIVATE . ..)
+    ggml_add_backend_library(ggml-musa
+                             ${GGML_HEADERS_MUSA}
+                             ${GGML_SOURCES_MUSA}
+                            )
 
     # TODO: do not use CUDA definitions for MUSA
     target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt
index a2d6770eb..f5acb8ec2 100644
--- a/ggml/src/ggml-rpc/CMakeLists.txt
+++ b/ggml/src/ggml-rpc/CMakeLists.txt
@@ -1,10 +1,8 @@
 message(STATUS "Using RPC backend")
 
-add_library(ggml-rpc
-            ggml-rpc.cpp)
-
-target_link_libraries(ggml-rpc PRIVATE ggml-base)
-target_include_directories(ggml-rpc PRIVATE . ..)
+ggml_add_backend_library(ggml-rpc
+                         ggml-rpc.cpp
+                        )
 
 if (WIN32)
     target_link_libraries(ggml-rpc PRIVATE ws2_32)
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 47357daab..431082426 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
 
 ggml_backend_reg_t ggml_backend_rpc_reg(void) {
     static struct ggml_backend_reg ggml_backend_rpc_reg = {
-        /* .iface   = */ ggml_backend_rpc_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_rpc_reg_i,
+        /* .context     = */ NULL,
     };
 
     return &ggml_backend_rpc_reg;
@@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
 
     return dev;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index d1d0ff83d..83f223fd7 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -16,12 +16,10 @@ endif()
 message(STATUS "SYCL found")
 #todo: AOT
 
-add_library(ggml-sycl
-            ggml-sycl.cpp
-            ../../include/ggml-sycl.h)
-
-target_link_libraries(ggml-sycl PRIVATE ggml-base)
-target_include_directories(ggml-sycl PRIVATE . ..)
+ggml_add_backend_library(ggml-sycl
+                         ggml-sycl.cpp
+                         ../../include/ggml-sycl.h
+                        )
 
 if (GGML_SYCL_F16)
     if (GGML_SYCL_TARGET STREQUAL "AMD")
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 255bc64c6..b6392ed8d 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4637,16 +4637,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                 dev_ctx->description = prop.get_name();
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
-                    /* .interface = */ ggml_backend_sycl_device_interface,
-                    /* .reg       = */ &reg,
-                    /* .context   = */ dev_ctx
+                    /* .iface       = */ ggml_backend_sycl_device_interface,
+                    /* .reg         = */ &reg,
+                    /* .context     = */ dev_ctx
                 };
                 ctx->devices.push_back(dev);
             }
 
             reg = ggml_backend_reg {
-                /* .interface = */ ggml_backend_sycl_reg_interface,
-                /* .context   = */ ctx
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_sycl_reg_interface,
+                /* .context     = */ ctx
             };
         }
 
@@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     return sycl_backend;
 }
 
+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt
index 1e85dd15b..ae0485e04 100644
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED)
 if (Vulkan_FOUND)
     message(STATUS "Vulkan found")
 
-    add_library(ggml-vulkan
-                ggml-vulkan.cpp
-                ../../include/ggml-vulkan.h
-                )
+    ggml_add_backend_library(ggml-vulkan
+                             ggml-vulkan.cpp
+                             ../../include/ggml-vulkan.h
+                            )
 
-    target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan)
-    target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
     # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
     # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ca71da2f7..49527fdf4 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -6738,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
 
 ggml_backend_reg_t ggml_backend_vk_reg() {
     static ggml_backend_reg reg = {
-        /* .iface   = */ ggml_backend_vk_reg_i,
-        /* .context = */ nullptr,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_vk_reg_i,
+        /* .context     = */ nullptr,
     };
 
     return &reg;
@@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
     VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
 }
 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 78e7874de..1a2318cb1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+    if (p0->n_threads      != p1->n_threads  )    return false;
+    if (p0->prio           != p1->prio       )    return false;
+    if (p0->poll           != p1->poll       )    return false;
+    if (p0->strict_cpu     != p1->strict_cpu )    return false;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt
index 03e1d2c04..d49d14dee 100644
--- a/pocs/CMakeLists.txt
+++ b/pocs/CMakeLists.txt
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(vdot)
+    if (NOT GGML_BACKEND_DL)
+        add_subdirectory(vdot)
+    endif()
 endif()
diff --git a/src/llama.cpp b/src/llama.cpp
index 20df09b13..83bbc10a5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4866,7 +4866,9 @@ struct llama_model_loader {
             mappings.reserve(files.size());
             mmaps_used.reserve(files.size());
             for (const auto & file : files) {
-                std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+                auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+                auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+                std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
                 mmaps_used.emplace_back(mapping->size, 0);
                 if (mlock_mmaps) {
                     std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
-            dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
                     int   n_threads,
         ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
-        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
     }
 
     // set the number of threads for all the backends
@@ -19478,7 +19481,11 @@ void llama_backend_init(void) {
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
     if (numa != GGML_NUMA_STRATEGY_DISABLED) {
-        ggml_numa_init(numa);
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
     }
 }
 
@@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    ctx->abort_callback      = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
-
     ctx->logits_all = params.logits_all;
 
     // build worst-case graph for encoder if a model contains encoder
@@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model(
         }
 
         // add CPU backend
-        ctx->backend_cpu = ggml_backend_cpu_init();
+        ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
             llama_free(ctx);
@@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 
+        llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
         if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model(
             std::vector<ggml_backend_t> backend_ptrs;
             for (auto & backend : ctx->backends) {
                 auto * buft = ggml_backend_get_default_buffer_type(backend.get());
-                if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+                auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
                     // use the host buffer of the first device CPU for faster transfer of the intermediate state
                     auto * dev = model->devices[0];
                     auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model(
             // pipeline parallelism requires support for async compute and events in all devices
             if (pipeline_parallel) {
                 for (auto & backend : ctx->backends) {
-                    if (ggml_backend_is_cpu(backend.get())) {
+                    auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                    if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
                         // ignore CPU backend
                         continue;
                     }
@@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback      = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
+
+    for (auto & backend : ctx->backends) {
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+        if (set_abort_callback_fn) {
+            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+        }
+    }
 }
 
 void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }
 
 const char * llama_print_system_info(void) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;
 
-    s  = "";
-    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
-    s += "AVX_VNNI = "    + std::to_string(ggml_cpu_has_avx_vnni())    + " | ";
-    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
-    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
-    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
-    s += "AMX_INT8 = "    + std::to_string(ggml_cpu_has_amx_int8())    + " | ";
-    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
-    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
-    s += "SVE = "         + std::to_string(ggml_cpu_has_sve())         + " | ";
-    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
-    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
-    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
-    s += "RISCV_VECT = "  + std::to_string(ggml_cpu_has_riscv_v())     + " | ";
-    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
-    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
-    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
-    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = "   + std::to_string(ggml_cpu_has_llamafile())   + " | ";
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
 
     return s.c_str();
 }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b06f122e8..82373ff4e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 # llama_target_and_test(test-double-float.cpp) # SLOW
 llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
 llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)
 
-llama_target_and_test(test-rope.cpp)
-
 llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_target_and_test(test-autorelease.cpp        LABEL "model")
 
+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_target_and_test(test-barrier.cpp)
+    llama_target_and_test(test-quantize-fns.cpp)
+    llama_target_and_test(test-quantize-perf.cpp)
+    llama_target_and_test(test-rope.cpp)
+endif()
+
 # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b2b570524..6376b0e4c 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -16,7 +16,6 @@
 
 
 #include <ggml.h>
-#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
 
@@ -26,7 +25,6 @@
 #include <cstdint>
 #include <cstring>
 #include <cinttypes>
-#include <functional>
 #include <memory>
 #include <random>
 #include <stdio.h>
@@ -639,19 +637,20 @@ struct test_case {
 
         // determine number of runs
         int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
         if (op_flops(out) > 0) {
             // based on flops
             const uint64_t GFLOP = 1000 * 1000 * 1000;
             const uint64_t target_flops_cpu =   8ULL * GFLOP;
             const uint64_t target_flops_gpu = 100ULL * GFLOP;
-            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
         } else {
             // based on memory size
             const size_t GB = 1ULL << 30;
             const size_t target_size_cpu =  8 * GB;
             const size_t target_size_gpu = 32 * GB;
-            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         }
 
@@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
         auto test_cases = make_test_cases_eval();
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }
 
         size_t n_ok = 0;
         for (auto & test : test_cases) {
@@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // enumerate backends
+    // load and enumerate backends
+    ggml_backend_load_all();
+
     printf("Testing %zu devices\n\n", ggml_backend_dev_count());
 
     size_t n_ok = 0;
@@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
-        GGML_ASSERT(backend != NULL);
-
-        if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
             printf("  Skipping CPU backend\n");
-            ggml_backend_free(backend);
             n_ok++;
             continue;
         }
 
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
         ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
         auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
         if (ggml_backend_set_n_threads_fn) {

From 9ca2e677626fce759d5d95c407c03677b9c87a26 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 16:31:38 +0200
Subject: [PATCH 295/396] server : add speculative decoding support (#10455)

* server : add speculative decoding support

ggml-ci

* server : add helper function slot.can_speculate()

ggml-ci
---
 examples/server/server.cpp | 443 +++++++++++++++++++++++++------------
 1 file changed, 301 insertions(+), 142 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6c55d65c0..f9d20fee5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2,10 +2,11 @@
 
 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -121,12 +122,21 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
     std::vector<std::string> antiprompt;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 };
 
 struct server_slot {
     int id;
     int id_task = -1;
 
+    llama_batch batch_spec;
+
+    llama_context * ctx_dft = nullptr;
+
+    common_speculative * spec = nullptr;
+
     // the index relative to completion multi-task request
     size_t index = 0;
 
@@ -175,7 +185,6 @@ struct server_slot {
     // sampling
     json json_schema;
 
-    struct common_params_sampling sparams;
     struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
@@ -212,7 +221,7 @@ struct server_slot {
         generated_token_probs.clear();
     }
 
-    bool has_budget(common_params &global_params) {
+    bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -232,6 +241,10 @@ struct server_slot {
         return state != SLOT_STATE_IDLE;
     }
 
+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
     void add_token(const completion_token_output & token) {
         if (!is_processing()) {
             SLT_WRN(*this, "%s", "slot is not processing\n");
@@ -591,11 +604,14 @@ struct server_response {
 };
 
 struct server_context {
+    common_params params_base;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
     std::vector<common_lora_adapter_container> loras;
 
-    common_params params;
+    llama_model * model_dft = nullptr;
+    llama_context_params cparams_dft;
 
     llama_batch batch = {};
 
@@ -628,27 +644,41 @@ struct server_context {
             model = nullptr;
         }
 
+        if (model_dft) {
+            llama_free_model(model_dft);
+            model_dft = nullptr;
+        }
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                common_sampler_free(slot.smpl);
-            }
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
+
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
+
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
+
+            llama_batch_free(slot.batch_spec);
         }
 
         llama_batch_free(batch);
     }
 
-    bool load_model(const common_params & params_) {
-        params = params_;
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());
 
-        common_init_result llama_init = common_init_from_params(params);
+        params_base = params;
+
+        common_init_result llama_init = common_init_from_params(params_base);
 
         model = llama_init.model;
         ctx   = llama_init.context;
         loras = llama_init.lora_adapters;
 
         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }
 
@@ -657,6 +687,40 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);
 
+        if (!params_base.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+
+            auto params_dft = params_base;
+
+            params_dft.model        = params_base.speculative.model;
+            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+
+            common_init_result llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+
+                llama_free      (llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
+                return false;
+            }
+
+            cparams_dft = common_context_params_to_llama(params_base);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+
+            // the context is not needed - we will create one for each slot
+            llama_free(llama_init_dft.context);
+        }
+
         return true;
     }
 
@@ -674,20 +738,36 @@ struct server_context {
     }
 
     void init() {
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
-        SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
-        for (int i = 0; i < params.n_parallel; i++) {
+        for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;
+
+            if (model_dft) {
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
+
+                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
+
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
+            }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
-            slot.sparams = params.sampling;
+            slot.params.sampling = params_base.sampling;
 
             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -707,7 +787,7 @@ struct server_context {
             const int32_t n_batch = llama_n_batch(ctx);
 
             // only a single seq_id per token is needed
-            batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
+            batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }
 
         metrics.init();
@@ -786,9 +866,11 @@ struct server_context {
     }
 
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
-        slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sampling;
+        slot_params defaults;
+        defaults.sampling    = params_base.sampling;
+        defaults.speculative = params_base.speculative;
+
         const auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
@@ -799,42 +881,48 @@ struct server_context {
             slot.oaicompat_model = "";
         }
 
-        slot.params.stream              = json_value(data, "stream",             false);
-        slot.params.cache_prompt        = json_value(data, "cache_prompt",       false);
-        slot.params.n_predict           = json_value(data, "n_predict",          json_value(data, "max_tokens", default_params.n_predict));
-        slot.params.n_indent            = json_value(data, "n_indent",           default_params.n_indent);
-        slot.sparams.top_k              = json_value(data, "top_k",              default_sparams.top_k);
-        slot.sparams.top_p              = json_value(data, "top_p",              default_sparams.top_p);
-        slot.sparams.min_p              = json_value(data, "min_p",              default_sparams.min_p);
-        slot.sparams.xtc_probability    = json_value(data, "xtc_probability",    default_sparams.xtc_probability);
-        slot.sparams.xtc_threshold      = json_value(data, "xtc_threshold",      default_sparams.xtc_threshold);
-        slot.sparams.typ_p              = json_value(data, "typical_p",          default_sparams.typ_p);
-        slot.sparams.temp               = json_value(data, "temperature",        default_sparams.temp);
-        slot.sparams.dynatemp_range     = json_value(data, "dynatemp_range",     default_sparams.dynatemp_range);
-        slot.sparams.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  default_sparams.dynatemp_exponent);
-        slot.sparams.penalty_last_n     = json_value(data, "repeat_last_n",      default_sparams.penalty_last_n);
-        slot.sparams.penalty_repeat     = json_value(data, "repeat_penalty",     default_sparams.penalty_repeat);
-        slot.sparams.penalty_freq       = json_value(data, "frequency_penalty",  default_sparams.penalty_freq);
-        slot.sparams.penalty_present    = json_value(data, "presence_penalty",   default_sparams.penalty_present);
-        slot.sparams.dry_multiplier     = json_value(data, "dry_multiplier",     default_sparams.dry_multiplier);
-        slot.sparams.dry_base           = json_value(data, "dry_base",           default_sparams.dry_base);
-        slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
-        slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
-        slot.sparams.mirostat           = json_value(data, "mirostat",           default_sparams.mirostat);
-        slot.sparams.mirostat_tau       = json_value(data, "mirostat_tau",       default_sparams.mirostat_tau);
-        slot.sparams.mirostat_eta       = json_value(data, "mirostat_eta",       default_sparams.mirostat_eta);
-        slot.sparams.penalize_nl        = json_value(data, "penalize_nl",        default_sparams.penalize_nl);
-        slot.params.n_keep              = json_value(data, "n_keep",             default_params.n_keep);
-        slot.params.n_discard           = json_value(data, "n_discard",          default_params.n_discard);
-        slot.sparams.seed               = json_value(data, "seed",               default_sparams.seed);
-        slot.sparams.n_probs            = json_value(data, "n_probs",            default_sparams.n_probs);
-        slot.sparams.min_keep           = json_value(data, "min_keep",           default_sparams.min_keep);
-      //slot.params.t_max_prompt_ms     = json_value(data, "t_max_prompt_ms",    default_params.t_max_prompt_ms); // TODO: implement
-        slot.params.t_max_predict_ms    = json_value(data, "t_max_predict_ms",   default_params.t_max_predict_ms);
+        slot.params.stream           = json_value(data, "stream",             false);
+        slot.params.cache_prompt     = json_value(data, "cache_prompt",       false);
+        slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
+        slot.params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
+        slot.params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);
+        slot.params.n_discard        = json_value(data, "n_discard",          defaults.n_discard);
+      //slot.params.t_max_prompt_ms  = json_value(data, "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
+        slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms",   defaults.t_max_predict_ms);
 
-        if (slot.sparams.dry_base < 1.0f)
-        {
-           slot.sparams.dry_base = default_sparams.dry_base;
+        slot.params.sampling.top_k              = json_value(data, "top_k",              defaults.sampling.top_k);
+        slot.params.sampling.top_p              = json_value(data, "top_p",              defaults.sampling.top_p);
+        slot.params.sampling.min_p              = json_value(data, "min_p",              defaults.sampling.min_p);
+        slot.params.sampling.xtc_probability    = json_value(data, "xtc_probability",    defaults.sampling.xtc_probability);
+        slot.params.sampling.xtc_threshold      = json_value(data, "xtc_threshold",      defaults.sampling.xtc_threshold);
+        slot.params.sampling.typ_p              = json_value(data, "typical_p",          defaults.sampling.typ_p);
+        slot.params.sampling.temp               = json_value(data, "temperature",        defaults.sampling.temp);
+        slot.params.sampling.dynatemp_range     = json_value(data, "dynatemp_range",     defaults.sampling.dynatemp_range);
+        slot.params.sampling.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  defaults.sampling.dynatemp_exponent);
+        slot.params.sampling.penalty_last_n     = json_value(data, "repeat_last_n",      defaults.sampling.penalty_last_n);
+        slot.params.sampling.penalty_repeat     = json_value(data, "repeat_penalty",     defaults.sampling.penalty_repeat);
+        slot.params.sampling.penalty_freq       = json_value(data, "frequency_penalty",  defaults.sampling.penalty_freq);
+        slot.params.sampling.penalty_present    = json_value(data, "presence_penalty",   defaults.sampling.penalty_present);
+        slot.params.sampling.dry_multiplier     = json_value(data, "dry_multiplier",     defaults.sampling.dry_multiplier);
+        slot.params.sampling.dry_base           = json_value(data, "dry_base",           defaults.sampling.dry_base);
+        slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
+        slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
+        slot.params.sampling.mirostat           = json_value(data, "mirostat",           defaults.sampling.mirostat);
+        slot.params.sampling.mirostat_tau       = json_value(data, "mirostat_tau",       defaults.sampling.mirostat_tau);
+        slot.params.sampling.mirostat_eta       = json_value(data, "mirostat_eta",       defaults.sampling.mirostat_eta);
+        slot.params.sampling.penalize_nl        = json_value(data, "penalize_nl",        defaults.sampling.penalize_nl);
+        slot.params.sampling.seed               = json_value(data, "seed",               defaults.sampling.seed);
+        slot.params.sampling.n_probs            = json_value(data, "n_probs",            defaults.sampling.n_probs);
+        slot.params.sampling.min_keep           = json_value(data, "min_keep",           defaults.sampling.min_keep);
+
+        slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
+        slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
+        slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
+
+        slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
+
+        if (slot.params.sampling.dry_base < 1.0f) {
+           slot.params.sampling.dry_base = defaults.sampling.dry_base;
         }
 
         // sequence breakers for DRY
@@ -843,8 +931,8 @@ struct server_context {
             // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
 
             if (data.contains("dry_sequence_breakers")) {
-                slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
-                if (slot.sparams.dry_sequence_breakers.empty()) {
+                slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+                if (slot.params.sampling.dry_sequence_breakers.empty()) {
                     send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
                     return false;
                 }
@@ -858,14 +946,14 @@ struct server_context {
         }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
-                auto schema          = json_value(data, "json_schema", json::object());
-                slot.sparams.grammar = json_schema_to_grammar(schema);
+                auto schema                  = json_value(data, "json_schema", json::object());
+                slot.params.sampling.grammar = json_schema_to_grammar(schema);
             } catch (const std::exception & e) {
                 send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         } else {
-            slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+            slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
         }
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -875,10 +963,10 @@ struct server_context {
         }
 
         {
-            slot.sparams.logit_bias.clear();
+            slot.params.sampling.logit_bias.clear();
 
             if (json_value(data, "ignore_eos", false) && has_eos_token) {
-                slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+                slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
             }
 
             const auto & logit_bias = data.find("logit_bias");
@@ -899,12 +987,12 @@ struct server_context {
                         if (el[0].is_number_integer()) {
                             llama_token tok = el[0].get<llama_token>();
                             if (tok >= 0 && tok < n_vocab) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.params.sampling.logit_bias.push_back({tok, bias});
                             }
                         } else if (el[0].is_string()) {
                             auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                             for (auto tok : toks) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.params.sampling.logit_bias.push_back({tok, bias});
                             }
                         }
                     }
@@ -935,16 +1023,16 @@ struct server_context {
                             sampler_names.emplace_back(name);
                         }
                     }
-                    slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+                    slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
                 } else if (samplers->is_string()){
                     std::string sampler_string;
                     for (const auto & name : *samplers) {
                         sampler_string += name;
                     }
-                    slot.sparams.samplers = common_sampler_types_from_chars(sampler_string);
+                    slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
                 }
             } else {
-                slot.sparams.samplers = default_sparams.samplers;
+                slot.params.sampling.samplers = defaults.sampling.samplers;
             }
         }
 
@@ -953,7 +1041,7 @@ struct server_context {
                 common_sampler_free(slot.smpl);
             }
 
-            slot.smpl = common_sampler_init(model, slot.sparams);
+            slot.smpl = common_sampler_init(model, slot.params.sampling);
             if (slot.smpl == nullptr) {
                 // for now, the only error that may happen here is invalid grammar
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@@ -961,6 +1049,12 @@ struct server_context {
             }
         }
 
+        if (slot.ctx_dft) {
+            llama_batch_free(slot.batch_spec);
+
+            slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
+        }
+
         slot.state = SLOT_STATE_STARTED;
 
         SLT_INF(slot, "%s", "processing task\n");
@@ -978,7 +1072,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
         slot.sampled = result.tok;
 
         // search stop word and delete it
@@ -1043,7 +1137,7 @@ struct server_context {
         }
 
         // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stopped_limit  = true;
             slot.has_next_token = false;
 
@@ -1136,50 +1230,54 @@ struct server_context {
 
     json get_formated_generation(const server_slot & slot) const {
         std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers) {
+        samplers.reserve(slot.params.sampling.samplers.size());
+        for (const auto & sampler : slot.params.sampling.samplers) {
             samplers.emplace_back(common_sampler_type_to_str(sampler));
         }
 
         return json {
             {"n_ctx",                     slot.n_ctx},
             {"n_predict",                 slot.n_predict},     // Server configured n_predict
-            {"model",                     params.model_alias},
-            {"seed",                      slot.sparams.seed},
+            {"model",                     params_base.model_alias},
+            {"seed",                      slot.params.sampling.seed},
             {"seed_cur",                  slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
-            {"temperature",               slot.sparams.temp},
-            {"dynatemp_range",            slot.sparams.dynatemp_range},
-            {"dynatemp_exponent",         slot.sparams.dynatemp_exponent},
-            {"top_k",                     slot.sparams.top_k},
-            {"top_p",                     slot.sparams.top_p},
-            {"min_p",                     slot.sparams.min_p},
-            {"xtc_probability",           slot.sparams.xtc_probability},
-            {"xtc_threshold",             slot.sparams.xtc_threshold},
-            {"typical_p",                 slot.sparams.typ_p},
-            {"repeat_last_n",             slot.sparams.penalty_last_n},
-            {"repeat_penalty",            slot.sparams.penalty_repeat},
-            {"presence_penalty",          slot.sparams.penalty_present},
-            {"frequency_penalty",         slot.sparams.penalty_freq},
-            {"dry_multiplier",            slot.sparams.dry_multiplier},
-            {"dry_base",                  slot.sparams.dry_base},
-            {"dry_allowed_length",        slot.sparams.dry_allowed_length},
-            {"dry_penalty_last_n",        slot.sparams.dry_penalty_last_n},
-            {"dry_sequence_breakers",     slot.sparams.dry_sequence_breakers},
-            {"mirostat",                  slot.sparams.mirostat},
-            {"mirostat_tau",              slot.sparams.mirostat_tau},
-            {"mirostat_eta",              slot.sparams.mirostat_eta},
-            {"penalize_nl",               slot.sparams.penalize_nl},
+            {"temperature",               slot.params.sampling.temp},
+            {"dynatemp_range",            slot.params.sampling.dynatemp_range},
+            {"dynatemp_exponent",         slot.params.sampling.dynatemp_exponent},
+            {"top_k",                     slot.params.sampling.top_k},
+            {"top_p",                     slot.params.sampling.top_p},
+            {"min_p",                     slot.params.sampling.min_p},
+            {"xtc_probability",           slot.params.sampling.xtc_probability},
+            {"xtc_threshold",             slot.params.sampling.xtc_threshold},
+            {"typical_p",                 slot.params.sampling.typ_p},
+            {"repeat_last_n",             slot.params.sampling.penalty_last_n},
+            {"repeat_penalty",            slot.params.sampling.penalty_repeat},
+            {"presence_penalty",          slot.params.sampling.penalty_present},
+            {"frequency_penalty",         slot.params.sampling.penalty_freq},
+            {"dry_multiplier",            slot.params.sampling.dry_multiplier},
+            {"dry_base",                  slot.params.sampling.dry_base},
+            {"dry_allowed_length",        slot.params.sampling.dry_allowed_length},
+            {"dry_penalty_last_n",        slot.params.sampling.dry_penalty_last_n},
+            {"dry_sequence_breakers",     slot.params.sampling.dry_sequence_breakers},
+            {"mirostat",                  slot.params.sampling.mirostat},
+            {"mirostat_tau",              slot.params.sampling.mirostat_tau},
+            {"mirostat_eta",              slot.params.sampling.mirostat_eta},
+            {"penalize_nl",               slot.params.sampling.penalize_nl},
             {"stop",                      slot.params.antiprompt},
             {"max_tokens",                slot.params.n_predict}, // User configured n_predict
             {"n_keep",                    slot.params.n_keep},
             {"n_discard",                 slot.params.n_discard},
-            {"ignore_eos",                slot.sparams.ignore_eos},
+            {"ignore_eos",                slot.params.sampling.ignore_eos},
             {"stream",                    slot.params.stream},
-          //{"logit_bias",                slot.sparams.logit_bias},
-            {"n_probs",                   slot.sparams.n_probs},
-            {"min_keep",                  slot.sparams.min_keep},
-            {"grammar",                   slot.sparams.grammar},
+          //{"logit_bias",                slot.params.sampling.logit_bias},
+            {"n_probs",                   slot.params.sampling.n_probs},
+            {"min_keep",                  slot.params.sampling.min_keep},
+            {"grammar",                   slot.params.sampling.grammar},
             {"samplers",                  samplers},
+            {"speculative",               slot.can_speculate()},
+            {"speculative.n_max",         slot.params.speculative.n_max},
+            {"speculative.n_min",         slot.params.speculative.n_min},
+            {"speculative.p_min",         slot.params.speculative.p_min},
         };
     }
 
@@ -1216,7 +1314,7 @@ struct server_context {
             {"index",      slot.index},
         };
 
-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@@ -1249,7 +1347,7 @@ struct server_context {
             {"content",             !slot.params.stream ? slot.generated_text : ""},
             {"id_slot",             slot.id},
             {"stop",                true},
-            {"model",               params.model_alias},
+            {"model",               params_base.model_alias},
             {"tokens_predicted",    slot.n_decoded},
             {"tokens_evaluated",    slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
@@ -1265,7 +1363,7 @@ struct server_context {
             {"index",               slot.index},
         };
 
-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
                 const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@@ -1422,10 +1520,10 @@ struct server_context {
                             data.at("input_prefix"),
                             data.at("input_suffix"),
                             data.at("input_extra"),
-                            params.n_batch,
-                            params.n_predict,
+                            params_base.n_batch,
+                            params_base.n_predict,
                             slots[0].n_ctx, // TODO: there should be a better way
-                            params.spm_infill,
+                            params_base.spm_infill,
                             tokenized_prompts[i]
                         );
                         create_task(data, tokens);
@@ -1798,7 +1896,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
-                if (!params.ctx_shift) {
+                if (!params_base.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
                     slot.release();
@@ -1864,7 +1962,7 @@ struct server_context {
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
 
         // next, batch any pending prompts without exceeding n_batch
-        if (params.cont_batching || batch.n_tokens == 0) {
+        if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@@ -1917,7 +2015,7 @@ struct server_context {
                                 continue;
                             }
                         } else {
-                            if (!params.ctx_shift) {
+                            if (!params_base.ctx_shift) {
                                 // if context shift is disabled, we make sure prompt size is smaller than KV size
                                 // TODO: there should be a separate parameter that control prompt truncation
                                 //       context shift should be applied only during the generation phase
@@ -1963,11 +2061,11 @@ struct server_context {
                                 slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
 
                                 // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                                if (params.n_cache_reuse > 0) {
+                                if (params_base.n_cache_reuse > 0) {
                                     size_t head_c = slot.n_past; // cache
                                     size_t head_p = slot.n_past; // current prompt
 
-                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
 
                                     while (head_c < slot.cache_tokens.size() &&
                                            head_p < prompt_tokens.size()) {
@@ -1980,7 +2078,7 @@ struct server_context {
                                             n_match++;
                                         }
 
-                                        if (n_match >= (size_t) params.n_cache_reuse) {
+                                        if (n_match >= (size_t) params_base.n_cache_reuse) {
                                             SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                             //for (size_t i = head_p; i < head_p + n_match; i++) {
                                             //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2168,38 +2266,99 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                completion_token_output result;
-                const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                llama_token id;
 
-                common_sampler_accept(slot.smpl, id, true);
+                {
+                    completion_token_output result;
 
-                slot.n_decoded += 1;
-                if (slot.n_decoded == 1) {
-                    slot.t_start_generation = ggml_time_us();
-                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                    metrics.on_prompt_eval(slot);
+                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+
+                    slot.i_batch = -1;
+
+                    common_sampler_accept(slot.smpl, id, true);
+
+                    slot.n_decoded += 1;
+                    if (slot.n_decoded == 1) {
+                        slot.t_start_generation = ggml_time_us();
+                        slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                        metrics.on_prompt_eval(slot);
+                    }
+
+                    result.tok = id;
+
+                    const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
+                        result.probs.push_back({
+                            cur_p->data[i].id,
+                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                        });
+                    }
+
+                    if (!process_token(result, slot)) {
+                        // release slot because of stop condition
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        continue;
+                    }
                 }
 
-                result.tok = id;
-
-                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
-
-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                    result.probs.push_back({
-                        cur_p->data[i].id,
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
+                // check if the slot supports speculative decoding
+                if (!slot.can_speculate()) {
+                    continue;
                 }
 
-                if (!process_token(result, slot)) {
-                    // release slot because of stop condition
-                    slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
-                    metrics.on_prediction(slot);
+                struct common_speculative_params params_spec;
+                params_spec.n_draft   = slot.params.speculative.n_max;
+                params_spec.n_reuse   = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
+                params_spec.p_min     = slot.params.speculative.p_min;
+
+                llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+
+                // ignore small drafts
+                if (slot.params.speculative.n_min > (int) draft.size()) {
+                    continue;
                 }
 
-                slot.i_batch = -1;
+                // construct the speculation batch
+                common_batch_clear(slot.batch_spec);
+                common_batch_add  (slot.batch_spec, id, slot.n_past, { slot.id }, true);
+
+                for (size_t i = 0; i < draft.size(); ++i) {
+                    common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
+                }
+
+                llama_decode(ctx, slot.batch_spec);
+
+                // the accepted tokens from the speculation
+                const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
+
+                slot.n_past    += ids.size();
+                slot.n_decoded += ids.size();
+
+                slot.cache_tokens.push_back(id);
+                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+
+                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+
+                for (size_t i = 0; i < ids.size(); ++i) {
+                    completion_token_output result;
+
+                    result.tok = ids[i];
+
+                    if (!process_token(result, slot)) {
+                        // release slot because of stop condition
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        break;
+                    }
+                }
+
+                SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
             }
         }
 
@@ -2697,7 +2856,7 @@ int main(int argc, char ** argv) {
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots",                 ctx_server.params.n_parallel },
+            { "total_slots",                 ctx_server.params_base.n_parallel },
             { "chat_template",               llama_get_chat_template(ctx_server.model) },
         };
 
@@ -2705,7 +2864,7 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.endpoint_props) {
+        if (!ctx_server.params_base.endpoint_props) {
             res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2718,7 +2877,7 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2824,7 +2983,7 @@ int main(int argc, char ** argv) {
 
     // TODO: maybe merge this function with "handle_completions_generic"
     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -3001,7 +3160,7 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }

From a9a678a6b2264e5895533217b0cf8f250534cc58 Mon Sep 17 00:00:00 2001
From: brucepro <git@brucepro.net>
Date: Mon, 25 Nov 2024 08:11:55 -0800
Subject: [PATCH 296/396] Add download chat feature to server chat (#10481)

* Add download chat feature to server chat

Add a download feature next to the delete chat feature in the server vue chat interface.

* code style

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 examples/server/public/index.html | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 6216c0841..c54260867 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -81,7 +81,13 @@
               <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
             </svg>
           </button>
-
+          <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
+              <!-- download conversation button -->
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
+                  <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
+                  <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
+            </svg>
+          </button>
           <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
             <!-- edit config button -->
             <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
@@ -526,6 +532,23 @@
             this.fetchMessages();
           }
         },
+        downloadConv(convId) {
+          const conversation = StorageUtils.getOneConversation(convId);
+          if (!conversation) {
+            alert('Conversation not found.');
+            return;
+          }
+          const conversationJson = JSON.stringify(conversation, null, 2);
+          const blob = new Blob([conversationJson], { type: 'application/json' });
+          const url = URL.createObjectURL(blob);
+          const a = document.createElement('a');
+          a.href = url;
+          a.download = `conversation_${convId}.json`;
+          document.body.appendChild(a);
+          a.click();
+          document.body.removeChild(a);
+          URL.revokeObjectURL(url);
+        },
         async sendMessage() {
           if (!this.inputMsg) return;
           const currConvId = this.viewingConvId;

From 1f922254f0c984a8fb9fbaa0c390d7ffae49aedb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Mon, 25 Nov 2024 19:18:37 +0100
Subject: [PATCH 297/396] Github: update issue templates [no ci] (#10489)

---
 .../ISSUE_TEMPLATE/010-bug-compilation.yml    | 12 ++++++----
 .github/ISSUE_TEMPLATE/011-bug-results.yml    | 15 +++++++-----
 .github/ISSUE_TEMPLATE/019-bug-misc.yml       | 23 +++++++++++--------
 3 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
index 550ee1b49..f10b3a2b2 100644
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -24,7 +24,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -41,14 +42,17 @@ body:
         description: Which GGML backends do you know to be affected?
         options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
         multiple: true
+    validations:
+      required: true
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
       placeholder: >
+        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
         Here are the exact commands that I used: ...
     validations:
       required: true
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
index 1adb162b7..1ccef0793 100644
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -26,7 +26,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -43,6 +44,8 @@ body:
         description: Which GGML backends do you know to be affected?
         options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
         multiple: true
+    validations:
+      required: true
   - type: textarea
     id: hardware
     attributes:
@@ -55,20 +58,20 @@ body:
   - type: textarea
     id: model
     attributes:
-      label: Model
+      label: Models
       description: >
-        Which model at which quantization were you using when encountering the bug?
+        Which model(s) at which quantization were you using when encountering the bug?
         If you downloaded a GGUF file off of Huggingface, please provide a link.
       placeholder: >
         e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
     validations:
       required: false
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
         that information would be very much appreciated by us.
       placeholder: >
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
index 124cdee91..d157ea307 100644
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -14,7 +14,7 @@ body:
     id: version
     attributes:
       label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
+      description: Which version of our software is affected? (You can use `--version` to get a version string.)
       placeholder: |
         $./llama-cli --version
         version: 2999 (42b4109e)
@@ -24,7 +24,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -33,28 +34,30 @@ body:
         - BSD
         - Other? (Please let us know in description)
     validations:
-      required: true
+      required: false
   - type: dropdown
     id: module
     attributes:
       label: Which llama.cpp modules do you know to be affected?
       multiple: true
       options:
+        - Documentation/Github
         - libllama (core library)
         - llama-cli
         - llama-server
         - llama-bench
         - llama-quantize
         - Python/Bash scripts
+        - Test code
         - Other (Please specify in the next section)
     validations:
-      required: true
+      required: false
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
     validations:
       required: true
   - type: textarea
@@ -62,7 +65,7 @@ body:
     attributes:
       label: First Bad Commit
       description: >
-        If the bug was not present on an earlier version: when did it start appearing?
+        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
         If possible, please do a git bisect and identify the exact commit that introduced the bug.
     validations:
       required: false
@@ -71,8 +74,8 @@ body:
     attributes:
       label: Relevant log output
       description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
           This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:
-      required: true
+      required: false

From 10bce0450f0c4d80087e06312b9dbbab3e87f16b Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 25 Nov 2024 19:30:06 +0100
Subject: [PATCH 298/396] llama : accept a list of devices to use to offload a
 model (#10497)

* llama : accept a list of devices to use to offload a model

* accept `--dev none` to completely disable offloading

* fix dev list with dl backends

* rename env parameter to LLAMA_ARG_DEVICE for consistency
---
 common/arg.cpp                                | 60 +++++++++++++++++--
 common/common.cpp                             |  8 +--
 common/common.h                               | 14 +++--
 examples/server/server.cpp                    |  1 +
 .../speculative-simple/speculative-simple.cpp |  1 +
 examples/speculative/speculative.cpp          |  1 +
 ggml/src/ggml-backend-reg.cpp                 | 13 +++-
 include/llama.h                               |  3 +
 src/llama.cpp                                 | 30 ++++++----
 9 files changed, 104 insertions(+), 27 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 32240f21f..272492e50 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -298,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -324,6 +345,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex          = ex;
@@ -1312,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1336,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
@@ -2042,6 +2086,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
diff --git a/common/common.cpp b/common/common.cpp
index 98524f746..09ec9f238 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -377,9 +377,6 @@ void common_init() {
 #endif
 
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
-
-    // load dynamic backends
-    ggml_backend_load_all();
 }
 
 std::string common_params_get_system_info(const common_params & params) {
@@ -982,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }
 
-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
diff --git a/common/common.h b/common/common.h
index 5c579b5ab..286642db2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -156,6 +156,7 @@ struct common_params_sampling {
 };
 
 struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
@@ -178,9 +179,6 @@ struct common_params {
     int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel            =     1; // number of parallel sequences to decode
     int32_t n_sequences           =     1; // number of sequences to decode
-    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n            =     1; // group-attention factor
     int32_t grp_attn_w            =   512; // group-attention width
     int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
@@ -193,6 +191,13 @@ struct common_params {
     int32_t yarn_orig_ctx         =     0; // YaRN original context length
     float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
+    // offload params
+    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
+    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -201,7 +206,6 @@ struct common_params {
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
@@ -462,7 +466,7 @@ struct common_init_result {
 
 struct common_init_result     common_init_from_params(common_params & params);
 
-struct llama_model_params     common_model_params_to_llama  (const common_params & params);
+struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f9d20fee5..8684771e2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -692,6 +692,7 @@ struct server_context {
 
             auto params_dft = params_base;
 
+            params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
             params_dft.n_ctx        = params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 1bc7f428c..7bf9056bf 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -46,6 +46,7 @@ int main(int argc, char ** argv) {
     ctx_tgt   = llama_init_tgt.context;
 
     // load the draft model
+    params.devices      = params.speculative.devices;
     params.model        = params.speculative.model;
     params.n_ctx        = params.speculative.n_ctx;
     params.n_batch      = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index eb8bb2de5..d4ad9751e 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -76,6 +76,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.speculative.devices;
     params.model = params.speculative.model;
     params.n_gpu_layers = params.speculative.n_gpu_layers;
     if (params.speculative.cpuparams.n_threads > 0) {
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 43d03d7fa..a0e0e2c58 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -253,6 +253,15 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
 }
 
 // Backend (reg) enumeration
+static bool striequals(const char * a, const char * b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_count() {
     return get_reg().backends.size();
 }
@@ -265,7 +274,7 @@ ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
 ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
     for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
         ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
+        if (striequals(ggml_backend_reg_name(reg), name)) {
             return reg;
         }
     }
@@ -285,7 +294,7 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
+        if (striequals(ggml_backend_dev_name(dev), name)) {
             return dev;
         }
     }
diff --git a/include/llama.h b/include/llama.h
index 90791d5f5..ab5e376e6 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -272,6 +272,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 83bbc10a5..571cb68e2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19364,6 +19364,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices                     =*/ nullptr,
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
@@ -19576,19 +19577,24 @@ struct llama_model * llama_load_model_from_file(
     }
 
     // create list of devices to use with this model
-    // currently, we use all available devices
-    // TODO: rework API to give user more control over device selection
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        switch (ggml_backend_dev_type(dev)) {
-            case GGML_BACKEND_DEVICE_TYPE_CPU:
-            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                // skip CPU backends since they are handled separately
-                break;
+    if (params.devices) {
+        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
 
-            case GGML_BACKEND_DEVICE_TYPE_GPU:
-                model->devices.push_back(dev);
-                break;
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    model->devices.push_back(dev);
+                    break;
+            }
         }
     }
 

From 80acb7b430d826a7685326603a07342e9abc1b45 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Mon, 25 Nov 2024 10:36:09 -0800
Subject: [PATCH 299/396] Rename Olmo1124 to Olmo2 (#10500)

---
 convert_hf_to_gguf.py          |  6 +++---
 gguf-py/gguf/constants.py      |  6 +++---
 gguf-py/gguf/tensor_mapping.py | 28 ++++++++++++++--------------
 src/llama.cpp                  | 18 +++++++++---------
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 80a179b86..b931049d1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3040,9 +3040,9 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("Olmo1124ForCausalLM")
-class Olmo1124Model(Model):
-    model_arch = gguf.MODEL_ARCH.OLMO_1124
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2
 
 
 @Model.register("OlmoeForCausalLM")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index d83b72f76..7df23371c 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R    = auto()
     DBRX         = auto()
     OLMO         = auto()
-    OLMO_1124    = auto()
+    OLMO2        = auto()
     OLMOE        = auto()
     OPENELM      = auto()
     ARCTIC       = auto()
@@ -405,7 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.COMMAND_R:      "command-r",
     MODEL_ARCH.DBRX:           "dbrx",
     MODEL_ARCH.OLMO:           "olmo",
-    MODEL_ARCH.OLMO_1124:      "olmo_1124",
+    MODEL_ARCH.OLMO2:          "olmo2",
     MODEL_ARCH.OLMOE:          "olmoe",
     MODEL_ARCH.OPENELM:        "openelm",
     MODEL_ARCH.ARCTIC:         "arctic",
@@ -1071,7 +1071,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.OLMO_1124: [
+    MODEL_ARCH.OLMO2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 4cbd39e03..1b6a3f4ad 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo_1124
+            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",               # gptneox
             "transformer.ln_f",                        # gpt2 gpt-j falcon jais exaone
-            "model.norm",                              # llama-hf baichuan internlm2 olmoe olmo_1124
+            "model.norm",                              # llama-hf baichuan internlm2 olmoe olmo2
             "norm",                                    # llama-pth
             "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
@@ -145,7 +145,7 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wq",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",                  # bert
             "transformer.h.{bid}.attn.q_proj",                           # gpt-j
@@ -157,7 +157,7 @@ class TensorNameMap:
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wk",                               # llama-pth
             "encoder.layer.{bid}.attention.self.key",                  # bert
             "transformer.h.{bid}.attn.k_proj",                         # gpt-j
@@ -170,7 +170,7 @@ class TensorNameMap:
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wv",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",                  # bert
             "transformer.h.{bid}.attn.v_proj",                           # gpt-j
@@ -188,7 +188,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",                       # mpt
             "transformer.h.{bid}.self_attention.dense",                     # falcon
             "h.{bid}.self_attention.dense",                                 # bloom
-            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wo",                                    # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                   # bert
             "transformer.h.{bid}.attn.out_proj",                            # gpt-j
@@ -215,7 +215,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",     # gemma2 olmo_1124
+            "model.layers.{bid}.post_attention_layernorm",     # gemma2 olmo2
         ),
 
         # Rotary embeddings
@@ -250,7 +250,7 @@ class TensorNameMap:
 
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
-            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,7 +273,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj",                   # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
             "h.{bid}.mlp.dense_h_to_4h",                              # bloom
-            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo_1124
+            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                           # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                 # bert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
@@ -314,7 +314,7 @@ class TensorNameMap:
 
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo_1124
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo2
             "layers.{bid}.feed_forward.w1",               # llama-pth
             "transformer.h.{bid}.mlp.w2",                 # qwen
             "transformer.h.{bid}.mlp.c_fc2",              # jais
@@ -346,7 +346,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
             "h.{bid}.mlp.dense_4h_to_h",                              # bloom
-            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo_1124
+            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
@@ -383,7 +383,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo_1124
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",                           # openelm
@@ -392,7 +392,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo_1124
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",                           # openelm
diff --git a/src/llama.cpp b/src/llama.cpp
index 571cb68e2..af5e686e0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -179,7 +179,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
-    LLM_ARCH_OLMO_1124,
+    LLM_ARCH_OLMO2,
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
@@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R,       "command-r"    },
     { LLM_ARCH_DBRX,            "dbrx"         },
     { LLM_ARCH_OLMO,            "olmo"         },
-    { LLM_ARCH_OLMO_1124,       "olmo_1124"    },
+    { LLM_ARCH_OLMO2,           "olmo2"        },
     { LLM_ARCH_OLMOE,           "olmoe"        },
     { LLM_ARCH_OPENELM,         "openelm"      },
     { LLM_ARCH_ARCTIC,          "arctic"       },
@@ -1210,7 +1210,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         },
     },
     {
-        LLM_ARCH_OLMO_1124,
+        LLM_ARCH_OLMO2,
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
@@ -5900,7 +5900,7 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -8593,7 +8593,7 @@ static bool llm_load_tensors(
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
-            case LLM_ARCH_OLMO_1124:
+            case LLM_ARCH_OLMO2:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -14483,7 +14483,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_olmo_1124() {
+    struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16799,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
             {
-                result = llm.build_olmo_1124();
+                result = llm.build_olmo2();
             } break;
         case LLM_ARCH_OLMOE:
             {
@@ -20084,7 +20084,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:

From 106964e3d266740f571b5aad7b57545b4a901ac9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 21:49:31 +0200
Subject: [PATCH 300/396] metal : enable mat-vec kernels for bs <= 4 (#10491)

---
 ggml/src/ggml-metal/ggml-metal.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 63baaf163..c47f07a9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1927,7 +1927,7 @@ static void ggml_metal_encode_node(
 
                 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                 // to the matrix-vector kernel
-                int ne11_mm_min = 1;
+                int ne11_mm_min = 4;
 
 #if 0
                 // the numbers below are measured on M2 Ultra for 7B and 13B models

From 47f931c8f9a26c072d71224bc8013cc66ea9e445 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 21:50:07 +0200
Subject: [PATCH 301/396] server : enable cache_prompt by default (#10501)

ggml-ci
---
 examples/server/README.md  | 2 +-
 examples/server/server.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 0936e0b7b..877768c8b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -412,7 +412,7 @@ node index.js
 
     `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
 
-    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
+    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
 
     `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8684771e2..c0ea4faf7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -111,7 +111,7 @@ struct server_static_file {
 
 struct slot_params {
     bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
 
     int32_t n_keep    =  0; // number of tokens to keep from initial prompt
     int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -883,7 +883,7 @@ struct server_context {
         }
 
         slot.params.stream           = json_value(data, "stream",             false);
-        slot.params.cache_prompt     = json_value(data, "cache_prompt",       false);
+        slot.params.cache_prompt     = json_value(data, "cache_prompt",       true);
         slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
         slot.params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
         slot.params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);

From 9fd8c2687f5aa2f095ac6e12a376e1c0583888e8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 Nov 2024 22:28:27 +0200
Subject: [PATCH 302/396] server : add more information about error (#10455)

---
 common/speculative.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index fe315a270..e559675c4 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
     if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
         llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
-    ) {
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
         LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
         return false;
     }
 

From 50d5cecbda3b0d03344eed326287adc1f6c7f3ef Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 25 Nov 2024 22:05:39 +0100
Subject: [PATCH 303/396] ci : build docker images only once daily (#10503)

---
 .github/workflows/docker.yml | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index a953cdac9..9cef283d9 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -10,12 +10,10 @@
 name: Publish Docker image
 
 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
 
     runs-on: ubuntu-latest
     env:

From 0cc63754b831d3a6c37bc5d721d12ce9540ffe76 Mon Sep 17 00:00:00 2001
From: Eric Curtin <ecurtin@redhat.com>
Date: Mon, 25 Nov 2024 16:56:24 -0500
Subject: [PATCH 304/396] Introduce llama-run (#10291)

It's like simple-chat but it uses smart pointers to avoid manual
memory cleanups. Less memory leaks in the code now. Avoid printing
multiple dots. Split code into smaller functions. Uses no exception
handling.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
---
 CMakeLists.txt              |   7 +-
 Makefile                    |   6 +
 examples/CMakeLists.txt     |   1 +
 examples/run/CMakeLists.txt |   5 +
 examples/run/README.md      |   7 +
 examples/run/run.cpp        | 409 ++++++++++++++++++++++++++++++++++++
 include/llama-cpp.h         |  25 +++
 7 files changed, 458 insertions(+), 2 deletions(-)
 create mode 100644 examples/run/CMakeLists.txt
 create mode 100644 examples/run/README.md
 create mode 100644 examples/run/run.cpp
 create mode 100644 include/llama-cpp.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 994e61e45..e7d91a5b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -163,8 +163,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
 
 configure_package_config_file(
diff --git a/Makefile b/Makefile
index 14c05e93e..cfc74c1dc 100644
--- a/Makefile
+++ b/Makefile
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
+	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@@ -1167,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 632409d55..21db1f3c2 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -36,6 +36,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(save-load-state)
+    add_subdirectory(run)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt
new file mode 100644
index 000000000..084f1e92d
--- /dev/null
+++ b/examples/run/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-run)
+add_executable(${TARGET} run.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/run/README.md b/examples/run/README.md
new file mode 100644
index 000000000..6e926811f
--- /dev/null
+++ b/examples/run/README.md
@@ -0,0 +1,7 @@
+# llama.cpp/example/run
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
+
+```bash
+./llama-run Meta-Llama-3.1-8B-Instruct.gguf
+...
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
new file mode 100644
index 000000000..cac2faefc
--- /dev/null
+++ b/examples/run/run.cpp
@@ -0,0 +1,409 @@
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
+#include <climits>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "llama-cpp.h"
+
+typedef std::unique_ptr<char[]> char_array_ptr;
+
+struct Argument {
+    std::string flag;
+    std::string help_text;
+};
+
+struct Options {
+    std::string model_path, prompt_non_interactive;
+    int ngl = 99;
+    int n_ctx = 2048;
+};
+
+class ArgumentParser {
+   public:
+    ArgumentParser(const char * program_name) : program_name(program_name) {}
+
+    void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
+        string_args[flag] = &var;
+        arguments.push_back({flag, help_text});
+    }
+
+    void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
+        int_args[flag] = &var;
+        arguments.push_back({flag, help_text});
+    }
+
+    int parse(int argc, const char ** argv) {
+        for (int i = 1; i < argc; ++i) {
+            std::string arg = argv[i];
+            if (string_args.count(arg)) {
+                if (i + 1 < argc) {
+                    *string_args[arg] = argv[++i];
+                } else {
+                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
+                    print_usage();
+                    return 1;
+                }
+            } else if (int_args.count(arg)) {
+                if (i + 1 < argc) {
+                    if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
+                        fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
+                        print_usage();
+                        return 1;
+                    }
+                } else {
+                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
+                    print_usage();
+                    return 1;
+                }
+            } else {
+                fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
+                print_usage();
+                return 1;
+            }
+        }
+
+        if (string_args["-m"]->empty()) {
+            fprintf(stderr, "error: -m is required\n");
+            print_usage();
+            return 1;
+        }
+
+        return 0;
+    }
+
+   private:
+    const char * program_name;
+    std::unordered_map<std::string, std::string *> string_args;
+    std::unordered_map<std::string, int *> int_args;
+    std::vector<Argument> arguments;
+
+    int parse_int_arg(const char * arg, int & value) {
+        char * end;
+        const long val = std::strtol(arg, &end, 10);
+        if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
+            value = static_cast<int>(val);
+            return 0;
+        }
+        return 1;
+    }
+
+    void print_usage() const {
+        printf("\nUsage:\n");
+        printf("  %s [OPTIONS]\n\n", program_name);
+        printf("Options:\n");
+        for (const auto & arg : arguments) {
+            printf("  %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
+        }
+
+        printf("\n");
+    }
+};
+
+class LlamaData {
+   public:
+    llama_model_ptr model;
+    llama_sampler_ptr sampler;
+    llama_context_ptr context;
+    std::vector<llama_chat_message> messages;
+
+    int init(const Options & opt) {
+        model = initialize_model(opt.model_path, opt.ngl);
+        if (!model) {
+            return 1;
+        }
+
+        context = initialize_context(model, opt.n_ctx);
+        if (!context) {
+            return 1;
+        }
+
+        sampler = initialize_sampler();
+        return 0;
+    }
+
+   private:
+    // Initializes the model and returns a unique pointer to it
+    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
+        llama_model_params model_params = llama_model_default_params();
+        model_params.n_gpu_layers = ngl;
+
+        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
+        if (!model) {
+            fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        }
+
+        return model;
+    }
+
+    // Initializes the context with the specified parameters
+    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
+        llama_context_params ctx_params = llama_context_default_params();
+        ctx_params.n_ctx = n_ctx;
+        ctx_params.n_batch = n_ctx;
+
+        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
+        if (!context) {
+            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+        }
+
+        return context;
+    }
+
+    // Initializes and configures the sampler
+    llama_sampler_ptr initialize_sampler() {
+        llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+        return sampler;
+    }
+};
+
+// Add a message to `messages` and store its content in `owned_content`
+static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
+                        std::vector<char_array_ptr> & owned_content) {
+    char_array_ptr content(new char[text.size() + 1]);
+    std::strcpy(content.get(), text.c_str());
+    llama_data.messages.push_back({role, content.get()});
+    owned_content.push_back(std::move(content));
+}
+
+// Function to apply the chat template and resize `formatted` if needed
+static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
+    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
+                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
+    if (result > static_cast<int>(formatted.size())) {
+        formatted.resize(result);
+        result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
+                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
+    }
+
+    return result;
+}
+
+// Function to tokenize the prompt
+static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
+                           std::vector<llama_token> & prompt_tokens) {
+    const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
+    prompt_tokens.resize(n_prompt_tokens);
+    if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
+                       true) < 0) {
+        GGML_ABORT("failed to tokenize the prompt\n");
+    }
+
+    return n_prompt_tokens;
+}
+
+// Check if we have enough space in the context to evaluate this batch
+static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
+    const int n_ctx = llama_n_ctx(ctx.get());
+    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+    if (n_ctx_used + batch.n_tokens > n_ctx) {
+        printf("\033[0m\n");
+        fprintf(stderr, "context size exceeded\n");
+        return 1;
+    }
+
+    return 0;
+}
+
+// convert the token to a string
+static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
+    char buf[256];
+    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
+    if (n < 0) {
+        GGML_ABORT("failed to convert token to piece\n");
+    }
+
+    piece = std::string(buf, n);
+    return 0;
+}
+
+static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
+    printf("%s", piece.c_str());
+    fflush(stdout);
+    response += piece;
+}
+
+// helper function to evaluate a prompt and generate a response
+static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+    std::vector<llama_token> prompt_tokens;
+    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
+    if (n_prompt_tokens < 0) {
+        return 1;
+    }
+
+    // prepare a batch for the prompt
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+    llama_token new_token_id;
+    while (true) {
+        check_context_size(llama_data.context, batch);
+        if (llama_decode(llama_data.context.get(), batch)) {
+            GGML_ABORT("failed to decode\n");
+        }
+
+        // sample the next token, check is it an end of generation?
+        new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
+        if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
+            break;
+        }
+
+        std::string piece;
+        if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
+            return 1;
+        }
+
+        print_word_and_concatenate_to_response(piece, response);
+
+        // prepare the next batch with the sampled token
+        batch = llama_batch_get_one(&new_token_id, 1);
+    }
+
+    return 0;
+}
+
+static int parse_arguments(const int argc, const char ** argv, Options & opt) {
+    ArgumentParser parser(argv[0]);
+    parser.add_argument("-m", opt.model_path, "model");
+    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
+    parser.add_argument("-c", opt.n_ctx, "context_size");
+    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
+    if (parser.parse(argc, argv)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+static int read_user_input(std::string & user) {
+    std::getline(std::cin, user);
+    return user.empty();  // Indicate an error or empty input
+}
+
+// Function to generate a response based on the prompt
+static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+    // Set response color
+    printf("\033[33m");
+    if (generate(llama_data, prompt, response)) {
+        fprintf(stderr, "failed to generate response\n");
+        return 1;
+    }
+
+    // End response with color reset and newline
+    printf("\n\033[0m");
+    return 0;
+}
+
+// Helper function to apply the chat template and handle errors
+static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
+                                                   const bool is_user_input, int & output_length) {
+    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
+    if (new_len < 0) {
+        fprintf(stderr, "failed to apply the chat template\n");
+        return -1;
+    }
+
+    output_length = new_len;
+    return 0;
+}
+
+// Helper function to handle user input
+static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
+    if (!prompt_non_interactive.empty()) {
+        user_input = prompt_non_interactive;
+        return true;  // No need for interactive input
+    }
+
+    printf("\033[32m> \033[0m");
+    return !read_user_input(user_input);  // Returns false if input ends the loop
+}
+
+// Function to tokenize the prompt
+static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
+    std::vector<char_array_ptr> owned_content;
+    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
+    int prev_len = 0;
+
+    while (true) {
+        // Get user input
+        std::string user_input;
+        if (!handle_user_input(user_input, prompt_non_interactive)) {
+            break;
+        }
+
+        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
+                    owned_content);
+
+        int new_len;
+        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
+            return 1;
+        }
+
+        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
+        std::string response;
+        if (generate_response(llama_data, prompt, response)) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static void log_callback(const enum ggml_log_level level, const char * text, void *) {
+    if (level == GGML_LOG_LEVEL_ERROR) {
+        fprintf(stderr, "%s", text);
+    }
+}
+
+static bool is_stdin_a_terminal() {
+#if defined(_WIN32)
+    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
+    DWORD mode;
+    return GetConsoleMode(hStdin, &mode);
+#else
+    return isatty(STDIN_FILENO);
+#endif
+}
+
+static std::string read_pipe_data() {
+    std::ostringstream result;
+    result << std::cin.rdbuf();  // Read all data from std::cin
+    return result.str();
+}
+
+int main(int argc, const char ** argv) {
+    Options opt;
+    if (parse_arguments(argc, argv, opt)) {
+        return 1;
+    }
+
+    if (!is_stdin_a_terminal()) {
+        if (!opt.prompt_non_interactive.empty()) {
+            opt.prompt_non_interactive += "\n\n";
+        }
+
+        opt.prompt_non_interactive += read_pipe_data();
+    }
+
+    llama_log_set(log_callback, nullptr);
+    LlamaData llama_data;
+    if (llama_data.init(opt)) {
+        return 1;
+    }
+
+    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/include/llama-cpp.h b/include/llama-cpp.h
new file mode 100644
index 000000000..daa04d4d8
--- /dev/null
+++ b/include/llama-cpp.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * model) { llama_free_model(model); }
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * context) { llama_free(context); }
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+};
+
+typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;

From 0eb4e12beebabae46d37b78742f4c5d4dbe52dc1 Mon Sep 17 00:00:00 2001
From: Junil Kim <logyourself@gmail.com>
Date: Tue, 26 Nov 2024 10:47:20 +0900
Subject: [PATCH 305/396] vulkan: Fix a vulkan-shaders-gen arugment parsing
 error (#10484)

The vulkan-shaders-gen was not parsing the --no-clean argument correctly.
Because the previous code was parsing the arguments which have a value only
and the --no-clean argument does not have a value, it was not being parsed
correctly. This commit can now correctly parse arguments that don't have values.
---
 .../vulkan-shaders/vulkan-shaders-gen.cpp            | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index 6bbe8e96e..5c317b68b 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -474,9 +474,15 @@ void write_output_files() {
 
 int main(int argc, char** argv) {
     std::map<std::string, std::string> args;
-    for (int i = 1; i < argc; i += 2) {
-        if (i + 1 < argc) {
-            args[argv[i]] = argv[i + 1];
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg.rfind("--", 0) == 0) {
+            if (i + 1 < argc && argv[i + 1][0] != '-') {
+                args[arg] = argv[i + 1];
+                ++i;
+            } else {
+                args[arg] = "";
+            }
         }
     }
 

From 7066b4cce2898993e943ad6af5d8f1de5840c8e9 Mon Sep 17 00:00:00 2001
From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:31:05 +0800
Subject: [PATCH 306/396] CANN: RoPE and CANCAT operator optimization (#10488)

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 225 +++++++------------------------
 ggml/src/ggml-cann/ggml-cann.cpp |  68 ++++++++--
 2 files changed, 106 insertions(+), 187 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 1f4ee986c..6113b59f4 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -21,6 +21,7 @@
  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
 
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;
 
@@ -2859,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past     = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
@@ -2924,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);
 
-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
 
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    void* workspaceAddr = nullptr;
 
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, -1, -1, 1, 1，1，...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
     }
 
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
-    }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-
-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
     }
 
-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index d96f65936..2ef5b590a 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1669,12 +1669,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
+                case GGML_TYPE_Q8_0:
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q8_0:
-                    // TODO: fix me
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
@@ -1706,9 +1708,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            //TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -1722,17 +1776,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:

From 9a4b79bcfa4338b922fa8cf903bd5ac058aaf46f Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Tue, 26 Nov 2024 18:08:37 +0800
Subject: [PATCH 307/396] CANN: Improve the Inferencing Performance for Ascend
 NPU Device (#10454)

* improve inferencing performance for ascend npu.

Co-authored-by: Frank Mai <thxCode@thxcode0824@gmail.com>

* some modification after review

* some modifications after review

* restore some modifications

* restore some modifications

---------

Co-authored-by: shanshan shen <shanshanshen333@gmail.com>
Co-authored-by: Frank Mai <thxCode@thxcode0824@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 301 +++++++++++++++++++++++--------
 ggml/src/ggml-cann/common.h      |   9 +-
 ggml/src/ggml-cann/ggml-cann.cpp |  58 ++++--
 3 files changed, 266 insertions(+), 102 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 6113b59f4..d7472ee3a 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -33,6 +33,8 @@
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
@@ -2423,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                           aclTensor* acl_weight, aclTensor* acl_dst) {
     int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                 // fp32, atlas a2 will transpose it to HFLOAT32.
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -2441,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
         aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 /**
  * @brief Performs matrix multiplication with floating-point precision on
  * tensors using the CANN backend.
@@ -2462,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
     BCAST_MUL_MAT_SHAPE(input, weight, dst);
 
-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
 
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
     aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+    case 2:
+        aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        break;
+    case 3:
+        aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        break;
+    default:
+        aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        break;
+    }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
     ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2501,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
     float weight_elem_size;
     if (type == GGML_TYPE_Q4_0) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
         weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
         GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
     }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
-    void* input_buffer;
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
+    size_t input_nb[] = {input_elem_size,  input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2550,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
+
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
     }
 
     // output
     size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
     uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
-
     for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
         for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
             int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
             int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
 
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
 
             aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
-
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                 workspaceAddr, workspaceSize, executor, ctx.stream()));
 
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
             ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
             ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
             ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         }
     }
 
     // cast out
-    int64_t* output_cast_ne = dst->ne;
-    size_t output_cast_nb[GGML_MAX_DIMS];
-    output_cast_nb[0] = sizeof(uint16_t);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+    if (dst->type != GGML_TYPE_F16) {
+        int64_t* output_cast_ne = dst->ne;
+        size_t output_cast_nb[GGML_MAX_DIMS];
+        output_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+        }
+
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
+
+        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
     }
-
-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
-    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
-
-    ACL_CHECK(aclDestroyTensor(acl_output_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
 }
 
 void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index edfa49614..5164cb74e 100644
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
 struct ggml_backend_cann_context {
     int32_t device;                  /**< Device ID. */
     std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
     aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
-        {nullptr}}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
     explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {}
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
 
     /**
      * @brief Destructor for cleaning up resources.
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 2ef5b590a..c7a3419c7 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
         ACL_CHECK(aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
     }
 
     // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 #ifdef DEBUG_CANN_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
             return ptr;
         }
         void* ptr;
-        size_t look_ahead_size = (size_t)(1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
         ggml_cann_set_device(device);
         ACL_CHECK(
-            aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size = look_ahead_size;
-        pool_size += look_ahead_size;
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        *actual_size = size;
+        pool_size += size;
 #ifdef DEBUG_CANN_MALLOC
         GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     /**
      * @brief The maximum size of the virtual memory pool (32 GB).
      */
-    static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35;  // 32 GB
+    size_t max_size;
 
     /**
      * @brief The device ID associated with this buffer pool.
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     explicit ggml_cann_pool_vmm(int device)
         : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {}
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }
 
     /**
      * @brief Destructor to free all buffers in the virtual memory pool.
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // round up the allocation size to the alignment to ensure that all
         // allocations are aligned for all data types
         const size_t alignment = 128;
-        size = alignment * ((size + alignment - 1) / alignment);
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 
         size_t avail = pool_size - pool_used;
 
         if (size > avail) {
             // round up to the next multiple of the granularity
             size_t reserve_size = size - avail;
-            reserve_size =
-                granularity * ((reserve_size + granularity - 1) / granularity);
+            reserve_size = GGML_PAD(reserve_size, granularity);
 
-            GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_size);
 
             // allocate more physical memory
             aclrtPhysicalMemProp prop = {};
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // reserve virtual address space (if not already reserved)
             if (pool_addr == 0) {
                 ACL_CHECK(aclrtReserveMemAddress(
-                    &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
+                    &pool_addr, max_size, 0, NULL, 1));
             }
 
             // map at the end of the pool
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // add to the pool
             pool_size += reserve_size;
 
-            // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
-            // reserved %llu MB)\n",
-            //       device, (unsigned long long) (pool_size/1024/1024),
-            //       (unsigned long long) (reserve_size/1024/1024));
+#ifdef DEBUG_CANN_MALLOC
+             GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                   device, (unsigned long long) (pool_size/1024/1024),
+                   (unsigned long long) (reserve_size/1024/1024));
+#endif
         }
 
         GGML_ASSERT(pool_addr != 0);
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
 }
 
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
     static bool ggml_backend_cann_buffer_type_initialized = false;
 
     if (!ggml_backend_cann_buffer_type_initialized) {
-        for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_cann_buffer_type_interface,
-                /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
                 /* .context  = */
                  new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
         return nullptr;
     }
 
+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
     void * hostPtr = nullptr;
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
-
         GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;

From 811872a59daefb25fc0c4326bcb6d8ae893c2f7c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 26 Nov 2024 12:29:38 +0200
Subject: [PATCH 308/396] speculative : simplify the implementation (#10504)

ggml-ci
---
 .../speculative-simple/speculative-simple.cpp | 57 ++++++++-----------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 7bf9056bf..2ea49d47c 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
     llama_token id_last = inp.back();
 
     // all tokens currently in the target context
-    auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
+    llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
+    prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
 
     int n_past = inp.size() - 1;
 
@@ -181,54 +182,44 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
 
         n_past    += ids.size() - 1;
-        n_drafted += batch_tgt.n_tokens - 1;
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
         n_accept  += ids.size() - 1;
+        n_predict += ids.size();
 
         // process the accepted tokens and update contexts
         //
         // this is the standard token post-processing that we normally do
         // in this case, we do it for a group of accepted tokens at once
         //
-        {
-            llama_token id;
-            std::string token_str;
+        for (size_t i = 0; i < ids.size(); ++i) {
+            prompt_tgt.push_back(id_last);
 
-            for (size_t i = 0; i < ids.size(); ++i) {
-                id = ids[i];
+            id_last = ids[i];
 
-                ++n_predict;
-
-                if (llama_token_is_eog(model_tgt, id)) {
-                    has_eos = true;
-                    break;
-                }
-
-                token_str = common_token_to_piece(ctx_tgt, id);
-
-                if (params.use_color && i + 1 < ids.size()) {
-                    LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
-                } else {
-                    LOG("%s", token_str.c_str());
-                }
-            }
-
-            if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            if (llama_token_is_eog(model_tgt, id_last)) {
+                has_eos = true;
                 break;
             }
 
-            LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
+            const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
 
-            {
-                LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
-
-                llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+            if (params.use_color && i + 1 < ids.size()) {
+                LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
+            } else {
+                LOG("%s", token_str.c_str());
             }
+        }
 
-            prompt_tgt.push_back(id_last);
-            prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
+        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
 
-            // remember the last accepted token for the next iteration
-            id_last = id;
+        {
+            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
+
+            llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+        }
+
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            break;
         }
     }
 

From 84e1c33cde9e0a7aafcda2d4f21ba51c300482d7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 26 Nov 2024 13:36:40 +0200
Subject: [PATCH 309/396] server : fix parallel speculative decoding (#10513)

ggml-ci
---
 examples/server/server.cpp | 71 +++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c0ea4faf7..9c86407c2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2267,49 +2267,48 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                llama_token id;
+                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
-                {
-                    completion_token_output result;
+                slot.i_batch = -1;
 
-                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                common_sampler_accept(slot.smpl, id, true);
 
-                    slot.i_batch = -1;
-
-                    common_sampler_accept(slot.smpl, id, true);
-
-                    slot.n_decoded += 1;
-                    if (slot.n_decoded == 1) {
-                        slot.t_start_generation = ggml_time_us();
-                        slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                        metrics.on_prompt_eval(slot);
-                    }
-
-                    result.tok = id;
-
-                    const auto * cur_p = common_sampler_get_candidates(slot.smpl);
-
-                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
-                        result.probs.push_back({
-                            cur_p->data[i].id,
-                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                        });
-                    }
-
-                    if (!process_token(result, slot)) {
-                        // release slot because of stop condition
-                        slot.release();
-                        slot.print_timings();
-                        send_final_response(slot);
-                        metrics.on_prediction(slot);
-                        continue;
-                    }
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
                 }
 
-                // check if the slot supports speculative decoding
-                if (!slot.can_speculate()) {
+                completion_token_output result;
+                result.tok = id;
+
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
+                    result.probs.push_back({
+                        cur_p->data[i].id,
+                            i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                    });
+                }
+
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
                     continue;
                 }
+            }
+
+            // do speculative decoding
+            for (auto & slot : slots) {
+                if (!slot.is_processing() || !slot.can_speculate()) {
+                    continue;
+                }
+
+                llama_token id = slot.sampled;
 
                 struct common_speculative_params params_spec;
                 params_spec.n_draft   = slot.params.speculative.n_max;

From 25669aa92caaddff09f39b54a5173e5cb2680fa3 Mon Sep 17 00:00:00 2001
From: Charles Xu <charles.xu@arm.com>
Date: Tue, 26 Nov 2024 12:37:05 +0100
Subject: [PATCH 310/396] ggml-cpu: cmake add arm64 cpu feature check for macos
 (#10487)

* ggml-cpu: cmake add arm64 cpu feature check for macos

* use vmmlaq_s32 for compile option i8mm check
---
 ggml/src/ggml-cpu/CMakeLists.txt | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index c2905d1fb..ddc05ecef 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -96,6 +96,39 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
         endif ()
 
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    elseif (APPLE)
+        if (GGML_NATIVE)
+            set(USER_PROVIDED_MARCH FALSE)
+            foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
+                if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
+                    set(USER_PROVIDED_MARCH TRUE)
+                    break()
+                endif()
+            endforeach()
+
+            if (NOT USER_PROVIDED_MARCH)
+                set(MARCH_FLAGS "-march=armv8.2a")
+
+                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+                if (GGML_COMPILER_SUPPORT_DOTPROD)
+                    set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
+                    add_compile_definitions(__ARM_FEATURE_DOTPROD)
+                endif ()
+
+                set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
+
+                set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                    set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
+                    add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+                endif ()
+                set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+                list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
+            endif ()
+        endif ()
     else()
         check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
         if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")

From c6807b3f28cc3dbfda3ec390bcb87e69fb5634e2 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 13:05:07 +0100
Subject: [PATCH 311/396] ci : add ubuntu cuda build, build with one arch on
 windows (#10456)

---
 .github/labeler.yml               | 15 +++-----
 .github/workflows/build.yml       | 59 ++++++++++++++++++++++++++++++-
 .github/workflows/nix-ci.yml      |  2 ++
 .github/workflows/python-lint.yml |  9 ++++-
 4 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 89436740d..1b47bc968 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -3,19 +3,18 @@ Kompute:
     - changed-files:
         - any-glob-to-any-file:
             - ggml/include/ggml-kompute.h
-            - ggml/src/ggml-kompute.cpp
+            - ggml/src/ggml-kompute/**
             - README-kompute.md
 Apple Metal:
     - changed-files:
         - any-glob-to-any-file:
             - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal.cpp
+            - ggml/src/ggml-metal/**
             - README-metal.md
 SYCL:
     - changed-files:
         - any-glob-to-any-file:
             - ggml/include/ggml-sycl.h
-            - ggml/src/ggml-sycl.cpp
             - ggml/src/ggml-sycl/**
             - docs/backend/SYCL.md
             - examples/sycl/**
@@ -27,8 +26,8 @@ Nvidia GPU:
 Vulkan:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml/ggml_vk_generate_shaders.py
-            - ggml/src/ggml-vulkan*
+            - ggml/include/ggml-vulkan.h
+            - ggml/src/ggml-vulkan/**
 documentation:
     - changed-files:
         - any-glob-to-any-file:
@@ -75,11 +74,7 @@ server:
 ggml:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml/include/ggml*.h
-            - ggml/src/ggml*.c
-            - ggml/src/ggml*.cpp
-            - ggml/src/ggml*.h
-            - ggml-cuda/**
+            - ggml/**
 nix:
     - changed-files:
         - any-glob-to-any-file:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index abaf2c504..6281663ec 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -871,8 +871,65 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
           name: llama-bin-win-${{ matrix.build }}.zip
 
+  ubuntu-latest-cmake-cuda:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v4
+
+        - name: Install dependencies
+          env:
+            DEBIAN_FRONTEND: noninteractive
+          run: |
+              apt update
+              apt install -y cmake build-essential ninja-build libgomp1 git
+
+        - name: Build with CMake
+          run: |
+            cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
+            cmake --build build
+
   windows-latest-cmake-cuda:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.6.2']
+        build: ['cuda']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install CUDA toolkit
+        id: cuda-toolkit
+        uses: Jimver/cuda-toolkit@v0.2.19
+        with:
+          cuda: ${{ matrix.cuda }}
+          method: 'network'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
+          cmake --build build --config Release -t ggml-cuda
+          cmake --build build --config Release
+
+  windows-2019-cmake-cuda:
     runs-on: windows-2019
+    if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
 
     strategy:
       matrix:
@@ -1173,7 +1230,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
-      - windows-latest-cmake-cuda
+      - windows-2019-cmake-cuda
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 8ecbbe53b..3fe941576 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,8 +5,10 @@ on:
   push:
     branches:
       - master
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
   pull_request:
     types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index a8d46f31d..ddfdf73b8 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -1,6 +1,13 @@
 name: flake8 Lint
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

From 7db3846a94ce7683b3e8120abe427457edf840c9 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 13:05:20 +0100
Subject: [PATCH 312/396] ci : publish the docker images created during
 scheduled runs (#10515)

---
 .github/workflows/docker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 9cef283d9..bc2e5020d 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -114,7 +114,7 @@ jobs:
           swap-storage: true
 
       - name: Build and push Docker image (tagged + versioned)
-        if: github.event_name == 'push'
+        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: .

From ab96610b1e58684bc5e8b810130c4cf6d8252e21 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 26 Nov 2024 14:18:08 +0200
Subject: [PATCH 313/396] cmake : enable warnings in llama (#10474)

* cmake : enable warnings in llama

ggml-ci

* cmake : add llama_get_flags and respect LLAMA_FATAL_WARNINGS

* cmake : get_flags -> ggml_get_flags

* speculative-simple : fix warnings

* cmake : reuse ggml_get_flags

ggml-ci

* speculative-simple : fix compile warning

ggml-ci
---
 CMakeLists.txt                                |  1 +
 cmake/common.cmake                            | 33 +++++++++++++++++++
 common/CMakeLists.txt                         |  2 ++
 examples/CMakeLists.txt                       |  4 +++
 .../speculative-simple/speculative-simple.cpp |  6 ++--
 ggml/src/CMakeLists.txt                       |  5 +--
 ggml/src/ggml-cuda/CMakeLists.txt             |  2 +-
 src/CMakeLists.txt                            |  2 ++
 8 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 cmake/common.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7d91a5b5..0d389dccb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,7 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 
 # override ggml options
 set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
diff --git a/cmake/common.cmake b/cmake/common.cmake
new file mode 100644
index 000000000..0f54871e4
--- /dev/null
+++ b/cmake/common.cmake
@@ -0,0 +1,33 @@
+function(llama_add_compile_flags)
+    if (LLAMA_FATAL_WARNINGS)
+        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+            list(APPEND C_FLAGS   -Werror)
+            list(APPEND CXX_FLAGS -Werror)
+        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+            add_compile_options(/WX)
+        endif()
+    endif()
+
+    if (LLAMA_ALL_WARNINGS)
+        if (NOT MSVC)
+            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                -Werror=implicit-int -Werror=implicit-function-declaration)
+
+            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+
+            list(APPEND C_FLAGS   ${WARNING_FLAGS})
+            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+        else()
+            # todo : msvc
+            set(C_FLAGS   "" PARENT_SCOPE)
+            set(CXX_FLAGS "" PARENT_SCOPE)
+        endif()
+    endif()
+endfunction()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 62a8a7db5..223174884 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -2,6 +2,8 @@
 
 find_package(Threads REQUIRED)
 
+llama_add_compile_flags()
+
 # Build info header
 #
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 21db1f3c2..9210e9fea 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,6 +6,10 @@ find_package(Threads REQUIRED)
 
 # ...
 
+# flags
+
+llama_add_compile_flags()
+
 # examples
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 2ea49d47c..8ca84f7af 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -70,13 +70,13 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     inp = common_tokenize(ctx_tgt, params.prompt, true, true);
 
-    if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
+    if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
         LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
 
         return 1;
     }
 
-    if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
+    if (llama_n_batch(ctx_tgt) < (uint32_t) inp.size()) {
         LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
 
         return 1;
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
         // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
         {
             // do not waste time on small drafts
-            if (draft.size() < n_draft_min) {
+            if (draft.size() < (size_t) n_draft_min) {
                 draft.clear();
             }
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 071508dda..9022aa3ae 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -24,7 +24,7 @@ if (NOT MSVC)
     endif()
 endif()
 
-function(get_flags CCID CCVER)
+function(ggml_get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
 
@@ -41,6 +41,7 @@ function(get_flags CCID CCVER)
     elseif (CCID STREQUAL "GNU")
         set(C_FLAGS   -Wdouble-promotion)
         set(CXX_FLAGS -Wno-array-bounds)
+
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
@@ -69,7 +70,7 @@ if (GGML_ALL_WARNINGS)
         list(APPEND C_FLAGS   ${WARNING_FLAGS})
         list(APPEND CXX_FLAGS ${WARNING_FLAGS})
 
-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+        ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
 
         add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                             "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index b0cb93e07..14761650f 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -132,7 +132,7 @@ if (CUDAToolkit_FOUND)
 
         message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
 
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
+        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
         list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
     endif()
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a86624750..2f581b921 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,8 @@ if (WIN32)
     endif()
 endif()
 
+llama_add_compile_flags()
+
 #
 # libraries
 #

From 0bbd2262a3263f37385297b30de37941836e57f7 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Tue, 26 Nov 2024 21:43:47 +0800
Subject: [PATCH 314/396] restore the condistion to build & update pacakge when
 merge (#10507)

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6281663ec..c6aecec6e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1041,7 +1041,7 @@ jobs:
 
       - name: Build the release package
         id: pack_artifacts
-        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
           echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
 
@@ -1066,7 +1066,7 @@ jobs:
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
 
       - name: Upload the release package
-        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip

From 45abe0f74ee281aea6e5283c1e738061256cfcae Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 26 Nov 2024 16:20:18 +0100
Subject: [PATCH 315/396] server : replace behave with pytest (#10416)

* server : replace behave with pytest

* fix test on windows

* misc

* add more tests

* more tests

* styling

* log less, fix embd test

* added all sequential tests

* fix coding style

* fix save slot test

* add parallel completion test

* fix parallel test

* remove feature files

* update test docs

* no cache_prompt for some tests

* add test_cache_vs_nocache_prompt
---
 .devops/nix/python-scripts.nix                |    2 +-
 .github/workflows/server.yml                  |    9 +-
 examples/server/tests/.gitignore              |    1 +
 examples/server/tests/README.md               |   33 +-
 examples/server/tests/conftest.py             |   15 +
 .../server/tests/features/ctx_shift.feature   |   66 -
 .../server/tests/features/embeddings.feature  |  113 --
 examples/server/tests/features/environment.py |   71 -
 examples/server/tests/features/infill.feature |   36 -
 examples/server/tests/features/issues.feature |    5 -
 examples/server/tests/features/lora.feature   |   36 -
 .../server/tests/features/parallel.feature    |  131 --
 .../server/tests/features/passkey.feature     |   56 -
 examples/server/tests/features/rerank.feature |   42 -
 .../server/tests/features/results.feature     |  118 --
 .../server/tests/features/security.feature    |   68 -
 examples/server/tests/features/server.feature |  120 --
 .../server/tests/features/slotsave.feature    |   58 -
 examples/server/tests/features/steps/steps.py | 1518 -----------------
 .../tests/features/wrong_usages.feature       |   25 -
 examples/server/tests/requirements.txt        |    2 +-
 examples/server/tests/tests.sh                |    5 +-
 examples/server/tests/unit/test_basic.py      |   34 +
 .../server/tests/unit/test_chat_completion.py |  129 ++
 examples/server/tests/unit/test_completion.py |  223 +++
 examples/server/tests/unit/test_ctx_shift.py  |   67 +
 examples/server/tests/unit/test_embedding.py  |   99 ++
 examples/server/tests/unit/test_infill.py     |   35 +
 examples/server/tests/unit/test_lora.py       |   42 +
 examples/server/tests/unit/test_rerank.py     |   38 +
 examples/server/tests/unit/test_security.py   |   83 +
 examples/server/tests/unit/test_slot_save.py  |   98 ++
 examples/server/tests/unit/test_tokenize.py   |   59 +
 examples/server/tests/utils.py                |  377 ++++
 34 files changed, 1317 insertions(+), 2497 deletions(-)
 create mode 100644 examples/server/tests/conftest.py
 delete mode 100644 examples/server/tests/features/ctx_shift.feature
 delete mode 100644 examples/server/tests/features/embeddings.feature
 delete mode 100644 examples/server/tests/features/environment.py
 delete mode 100644 examples/server/tests/features/infill.feature
 delete mode 100644 examples/server/tests/features/issues.feature
 delete mode 100644 examples/server/tests/features/lora.feature
 delete mode 100644 examples/server/tests/features/parallel.feature
 delete mode 100644 examples/server/tests/features/passkey.feature
 delete mode 100644 examples/server/tests/features/rerank.feature
 delete mode 100644 examples/server/tests/features/results.feature
 delete mode 100644 examples/server/tests/features/security.feature
 delete mode 100644 examples/server/tests/features/server.feature
 delete mode 100644 examples/server/tests/features/slotsave.feature
 delete mode 100644 examples/server/tests/features/steps/steps.py
 delete mode 100644 examples/server/tests/features/wrong_usages.feature
 create mode 100644 examples/server/tests/unit/test_basic.py
 create mode 100644 examples/server/tests/unit/test_chat_completion.py
 create mode 100644 examples/server/tests/unit/test_completion.py
 create mode 100644 examples/server/tests/unit/test_ctx_shift.py
 create mode 100644 examples/server/tests/unit/test_embedding.py
 create mode 100644 examples/server/tests/unit/test_infill.py
 create mode 100644 examples/server/tests/unit/test_lora.py
 create mode 100644 examples/server/tests/unit/test_rerank.py
 create mode 100644 examples/server/tests/unit/test_security.py
 create mode 100644 examples/server/tests/unit/test_slot_save.py
 create mode 100644 examples/server/tests/unit/test_tokenize.py
 create mode 100644 examples/server/tests/utils.py

diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix
index 392e9ffe4..56ea18278 100644
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@@ -34,7 +34,7 @@ let
 
     # server tests
     openai
-    behave
+    pytest
     prometheus-client
   ];
 in
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 699ac095d..2e8e3348f 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -122,14 +122,14 @@ jobs:
         id: server_integration_tests
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh
+          ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+          SLOW_TESTS=1 ./tests.sh
 
 
   server-windows:
@@ -180,11 +180,12 @@ jobs:
         run: |
           cd examples/server/tests
           $env:PYTHONIOENCODING = ":replace"
-          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+          pytest -v -x
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          behave.exe --stop --no-skipped --no-capture --tags slow
+          $env:SLOW_TESTS = "1"
+          pytest -v -x
diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore
index 1d17dae13..90ee7fe6d 100644
--- a/examples/server/tests/.gitignore
+++ b/examples/server/tests/.gitignore
@@ -1 +1,2 @@
 .venv
+tmp
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 10f22c447..2930a2e0d 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,19 +1,9 @@
 # Server tests
 
-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
-and [behave](https://behave.readthedocs.io/en/latest/):
-
-* [issues.feature](./features/issues.feature) Pending issues scenario
-* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
-* [security.feature](./features/security.feature) Security, CORS and API Key
-* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
+Python based server tests scenario using [pytest](https://docs.pytest.org/en/stable/).
 
 Tests target GitHub workflows job runners with 4 vCPU.
 
-Requests are
-using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
-based http client.
-
 Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
 To mitigate it, you can increase values in `n_predict`, `kv_size`.
 
@@ -39,26 +29,19 @@ It's possible to override some scenario steps values with environment variables:
 |--------------------------|------------------------------------------------------------------------------------------------|
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
 | `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                         |
-| `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                       |
+| `DEBUG`                  | to enable steps and server verbose mode `--verbose`                                       |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                |
 
-### Run @bug, @wip or @wrong_usage annotated scenario
-
-Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
-
-- `@bug` annotation aims to link a scenario with a GitHub issue.
-- `@wrong_usage` are meant to show user issue that are actually an expected behavior
-- `@wip` to focus on a scenario working in progress
-- `@slow` heavy test, disabled by default
-
-To run a scenario annotated with `@bug`, start:
+To run slow tests:
 
 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
+SLOW_TESTS=1 ./tests.sh
 ```
 
-After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
+To run with stdout/stderr display in real time (verbose output, but useful for debugging):
 
 ```shell
-./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
+DEBUG=1 ./tests.sh -s -v -x
 ```
+
+To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
diff --git a/examples/server/tests/conftest.py b/examples/server/tests/conftest.py
new file mode 100644
index 000000000..017d1bb84
--- /dev/null
+++ b/examples/server/tests/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+from utils import *
+
+
+# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test
+@pytest.fixture(autouse=True)
+def stop_server_after_each_test():
+    # do nothing before each test
+    yield
+    # stop all servers after each test
+    instances = set(
+        server_instances
+    )  # copy the set to prevent 'Set changed size during iteration'
+    for server in instances:
+        server.stop()
diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature
deleted file mode 100644
index ae6c6b01b..000000000
--- a/examples/server/tests/features/ctx_shift.feature
+++ /dev/null
@@ -1,66 +0,0 @@
-@llama.cpp
-@ctx_shift
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model file test-model.gguf
-    And   a model alias tinyllama-2
-    And   BOS token is 1
-    And   42 as server seed
-    And   256 KV cache size
-    And   32 as batch size
-    And   2 slots
-
-    # the prompt is 301 tokens
-    # the slot context is 256/2 = 128 tokens
-    # the prompt is truncated to keep the last 109 tokens
-    # 64 tokens are generated thanks to shifting the context when it gets full
-  Scenario: Inference with context shift
-    And   64 server max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
-    And   the completion is  truncated
-    And   109 prompt tokens are processed
-
-  Scenario Outline: Inference without context shift
-    And   <n_predict> server max tokens to predict
-    And   disable context shifting
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Hi how are you
-    """
-    And   a completion request with no api error
-    Then  <n_token_output> tokens are predicted matching twind|Anna
-    And   the completion is <truncated> truncated
-    And   8 prompt tokens are processed
-    Examples:
-      | n_predict | n_token_output | truncated |
-      | 64        | 64             | not       |
-      | -1        | 120            |           |
-
-  Scenario: Inference without context shift (expected error: prompt too long)
-    And   disable context shifting
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with 400 api error
-
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
deleted file mode 100644
index f4fe2ee43..000000000
--- a/examples/server/tests/features/embeddings.feature
+++ /dev/null
@@ -1,113 +0,0 @@
-@llama.cpp
-@embeddings
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
-    And   a model file bert-bge-small.gguf
-    And   a model alias bert-bge-small
-    And   42 as server seed
-    And   2 slots
-    # the bert-bge-small model has context size of 512
-    # since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
-    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
-    And   128 as batch size
-    And   128 as ubatch size
-    And   512 KV cache size
-    And   enable embeddings endpoint
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Embedding
-    When embeddings are computed for:
-    """
-    What is the capital of Bulgaria ?
-    """
-    Then embeddings are generated
-
-  Scenario: Embedding (error: prompt too long)
-    When embeddings are computed for:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And  embeddings request with 500 api error
-
-  Scenario: OAI Embeddings compatibility
-    Given a model bert-bge-small
-    When an OAI compatible embeddings computation request for:
-    """
-    What is the capital of Spain ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility with multiple inputs
-    Given a model bert-bge-small
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    When an OAI compatible embeddings computation request for multiple inputs
-    Then embeddings are generated
-
-  Scenario: Multi users embeddings
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: Multi users OAI compatibility embeddings
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    And a prompt:
-      """
-      What is the biggest US city ?
-      """
-    And a prompt:
-      """
-      What is the capital of Bulgaria ?
-      """
-    And   a model bert-bge-small
-    Given concurrent OAI embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: All embeddings should be the same
-    Given 10 fixed prompts
-    And   a model bert-bge-small
-    Given concurrent OAI embedding requests
-    Then all embeddings are the same
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
deleted file mode 100644
index e7845dc2f..000000000
--- a/examples/server/tests/features/environment.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-import signal
-import socket
-import sys
-import time
-import traceback
-from contextlib import closing
-from subprocess import TimeoutExpired
-
-
-def before_scenario(context, scenario):
-    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
-    if context.debug:
-        print("DEBUG=ON")
-    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
-    port = 8080
-    if 'PORT' in os.environ:
-        port = int(os.environ['PORT'])
-    if is_server_listening("localhost", port):
-        assert False, "Server already started"
-
-
-def after_scenario(context, scenario):
-    try:
-        if 'server_process' not in context or context.server_process is None:
-            return
-        if scenario.status == "failed":
-            if 'GITHUB_ACTIONS' in os.environ:
-                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
-                if os.path.isfile('llama.log'):
-                    with closing(open('llama.log', 'r')) as f:
-                        for line in f:
-                            print(line)
-            if not is_server_listening(context.server_fqdn, context.server_port):
-                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
-
-        if context.server_process.poll() is not None:
-            assert False, f"Server not running pid={context.server_process.pid} ..."
-
-        server_graceful_shutdown(context)  # SIGINT
-
-        try:
-            context.server_process.wait(0.5)
-        except TimeoutExpired:
-            print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
-            context.server_process.kill()  # SIGKILL
-            context.server_process.wait()
-
-        while is_server_listening(context.server_fqdn, context.server_port):
-            time.sleep(0.1)
-    except Exception:
-        print("ignoring error in after_scenario:")
-        traceback.print_exc(file=sys.stdout)
-
-
-def server_graceful_shutdown(context):
-    print(f"shutting down server pid={context.server_process.pid} ...")
-    if os.name == 'nt':
-        interrupt = signal.CTRL_C_EVENT
-    else:
-        interrupt = signal.SIGINT
-    context.server_process.send_signal(interrupt)
-
-
-def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        _is_server_listening = result == 0
-        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...")
-        return _is_server_listening
diff --git a/examples/server/tests/features/infill.feature b/examples/server/tests/features/infill.feature
deleted file mode 100644
index a0bbfef77..000000000
--- a/examples/server/tests/features/infill.feature
+++ /dev/null
@@ -1,36 +0,0 @@
-@llama.cpp
-@infill
-Feature: llama.cpp server
-
-  # The current model is made by adding FIM tokens to the existing stories260K
-  # We may want to use a better model in the future, maybe something like SmolLM 360M
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
-    And   a model file test-model-infill.gguf
-    And   a model alias tinyllama-infill
-    And   42 as server seed
-    And   1024 as batch size
-    And   1024 as ubatch size
-    And   2048 KV cache size
-    And   64 max tokens to predict
-    And   0.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Infill without input_extra
-    Given a prompt "Complete this"
-    And   an infill input extra none none
-    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_"
-    And   an infill input suffix "}\n"
-    And   an infill request with no api error
-    Then  64 tokens are predicted matching One|day|she|saw|big|scary|bird
-
-  Scenario: Infill with input_extra
-    Given a prompt "Complete this"
-    And   an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
-    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_"
-    And   an infill input suffix "}\n"
-    And   an infill request with no api error
-    Then  64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"
diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
deleted file mode 100644
index 7b13e44ca..000000000
--- a/examples/server/tests/features/issues.feature
+++ /dev/null
@@ -1,5 +0,0 @@
-# List of ongoing issues
-# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
-@bug
-Feature: Issues
-  # No confirmed issue at the moment
diff --git a/examples/server/tests/features/lora.feature b/examples/server/tests/features/lora.feature
deleted file mode 100644
index 7b85988ac..000000000
--- a/examples/server/tests/features/lora.feature
+++ /dev/null
@@ -1,36 +0,0 @@
-@llama.cpp
-@lora
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
-    And   a model file stories15M_MOE-F16.gguf
-    And   a model alias stories15M_MOE
-    And   a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
-    And   42 as server seed
-    And   1024 as batch size
-    And   1024 as ubatch size
-    And   2048 KV cache size
-    And   64 max tokens to predict
-    And   0.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Completion LoRA disabled
-    Given switch off lora adapter 0
-    Given a prompt:
-    """
-    Look in thy glass
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching little|girl|three|years|old
-
-  Scenario: Completion LoRA enabled
-    Given switch on lora adapter 0
-    Given a prompt:
-    """
-    Look in thy glass
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching eye|love|glass|sun
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
deleted file mode 100644
index 423d0f1d4..000000000
--- a/examples/server/tests/features/parallel.feature
+++ /dev/null
@@ -1,131 +0,0 @@
-@llama.cpp
-@parallel
-Feature: Parallel
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And   a model file test-model-00001-of-00003.gguf
-    And   42 as server seed
-    And   128 as batch size
-    And   256 KV cache size
-    And   2 slots
-    And   continuous batching
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Multi users completion
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And <n_predict> max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | n_predict |
-      | 128       |
-
-  Scenario Outline: Multi users OAI completions compatibility
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users OAI completions compatibility no v1
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests no v1
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users with number of prompts exceeding number of slots
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And a prompt:
-      """
-      What is LLM?
-      """
-    And a prompt:
-      """
-      The sky is blue and I love it.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    And 128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted
diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature
deleted file mode 100644
index ff0a82cc4..000000000
--- a/examples/server/tests/features/passkey.feature
+++ /dev/null
@@ -1,56 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags passkey
-@passkey
-@slow
-Feature: Passkey / Self-extend with context shift
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-
-  # Generates a long text of junk and inserts a secret passkey number inside it.
-  # Then we query the LLM for the secret passkey.
-  # see #3856 and #4810
-  Scenario Outline: Passkey
-    Given a model file <hf_file> from HF repo <hf_repo>
-    And   <n_batch> as batch size
-    And   <n_junk> as number of junk
-    And   <n_predicted> server max tokens to predict
-    And   42 as seed
-    And   0.0 temperature
-    And   <n_ctx> KV cache size
-    And   1 slots
-    And   <n_ga> group attention factor to extend context size through self-extend
-    And   <n_ga_w> group attention width to extend context size through self-extend
-    # Can be override with N_GPU_LAYERS
-    And   <ngl> GPU offloaded layers
-    Then  the server is starting
-    # Higher timeout because the model may need to be downloaded from the internet
-    Then  the server is healthy with timeout 120 seconds
-    Given available models
-    Then  model 0 is trained on <n_ctx_train> tokens context
-    Given a prefix prompt:
-    """
-    here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
-    """
-    And a passkey prompt template:
-    """
-    The pass key is <passkey> Remember it. <passkey> is the pass key.
-    """
-    And a junk suffix prompt:
-    """
-    The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
-    """
-    And a suffix prompt:
-    """
-    What is the pass key? The pass key is
-    """
-    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
-    And  a completion request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-
-    Examples:
-      | hf_repo                         | hf_file                     | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content     |
-      | TheBloke/phi-2-GGUF             | phi-2.Q4_K_M.gguf           | 2048        | 5   | 8192  | 512     | 4    | 512    | 250    | 50    | 42      | 1           | 42             |
-      | TheBloke/phi-2-GGUF             | phi-2.Q4_K_M.gguf           | 2048        | 5   | 8192  | 512     | 2    | 512    | 250    | 50    | 42      | 1           | \b((?!42)\w)+\b  |
-      #| TheBloke/Llama-2-7B-GGUF        | llama-2-7b.Q2_K.gguf        | 4096        | 3   | 16384 | 512     | 4    | 512    | 500    | 300   | 1234    | 5           | 1234           |
-      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768       | 2   | 16384 | 512     | 4    | 512    | 500    | 100   | 0987    | 5           | 0
-      # 987           |
diff --git a/examples/server/tests/features/rerank.feature b/examples/server/tests/features/rerank.feature
deleted file mode 100644
index c36cc8e21..000000000
--- a/examples/server/tests/features/rerank.feature
+++ /dev/null
@@ -1,42 +0,0 @@
-@llama.cpp
-@rerank
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
-    And   a model file jina-reranker-v1-tiny-en.gguf
-    And   a model alias jina-reranker-v1-tiny-en
-    And   42 as server seed
-    And   2 slots
-    And   512 as batch size
-    And   512 as ubatch size
-    And   512 KV cache size
-    And   enable reranking endpoint
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Rerank
-    Given a rerank query:
-      """
-      Machine learning is
-      """
-    And   a rerank document:
-      """
-      A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
-      """
-    And   a rerank document:
-      """
-      Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
-      """
-    And   a rerank document:
-      """
-      Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
-      """
-    And   a rerank document:
-      """
-      Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
-      """
-    When  reranking request
-    Then  reranking results are returned
-    Then  reranking highest score is index 2 and lowest score is index 3
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
deleted file mode 100644
index e8e1b5414..000000000
--- a/examples/server/tests/features/results.feature
+++ /dev/null
@@ -1,118 +0,0 @@
-@llama.cpp
-@results
-Feature: Results
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And   a model file test-model-00001-of-00003.gguf
-    And   128 as batch size
-    And   1024 KV cache size
-    And   128 max tokens to predict
-    And   continuous batching
-
-  Scenario Outline: consistent results with same seed
-    Given <n_slots> slots
-    And   1.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
-
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all predictions are equal
-    Examples:
-      | n_slots |
-      | 1       |
-      # FIXME: unified KV cache nondeterminism
-      # | 2       |
-
-  Scenario Outline: different results with different seed
-    Given <n_slots> slots
-    And   1.0 temperature
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
-    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
-
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all predictions are different
-    Examples:
-      | n_slots |
-      | 1       |
-      | 2       |
-
-  Scenario Outline: consistent results with same seed and varying batch size
-    Given 4 slots
-    And   <temp> temperature
-    # And   0 as draft
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 1 prompts "Write a very long story about AI." with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then  the server is idle
-    And   all slots are idle
-
-    Given <n_parallel> prompts "Write a very long story about AI." with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And  all slots are idle
-
-    Then all predictions are equal
-    Examples:
-      | n_parallel | temp |
-      | 1          | 0.0  |
-      | 1          | 1.0  |
-      # FIXME: unified KV cache nondeterminism
-      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
-      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
-      # | 2          | 0.0  |
-      # | 4          | 0.0  |
-      # | 2          | 1.0  |
-      # | 4          | 1.0  |
-
-  Scenario Outline: consistent token probs with same seed and prompt
-    Given <n_slots> slots
-    And   <n_kv> KV cache size
-    And   1.0 temperature
-    And   <n_predict> max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given 1 prompts "The meaning of life is" with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then  the server is idle
-    And   all slots are idle
-
-    Given <n_parallel> prompts "The meaning of life is" with seed 42
-    And   concurrent completion requests
-    # Then the server is busy # Not all slots will be utilized.
-    Then the server is idle
-    And  all slots are idle
-
-    Then all token probabilities are equal
-    Examples:
-      | n_slots | n_kv | n_predict | n_parallel |
-      | 4       | 1024 | 1         | 1          |
-      # FIXME: unified KV cache nondeterminism
-      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
-      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
-      # | 4       | 1024 | 1         | 4          |
-      # | 4       | 1024 | 100       | 1          |
-      # This test still fails even the above patches; the first token probabilities are already different.
-      # | 4       | 1024 | 100       | 4          |
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
deleted file mode 100644
index ef30007c3..000000000
--- a/examples/server/tests/features/security.feature
+++ /dev/null
@@ -1,68 +0,0 @@
-@llama.cpp
-@security
-Feature: Security
-
-  Background: Server startup with an api key defined
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a server api key THIS_IS_THE_KEY
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Completion with some user api key
-    Given a prompt test
-    And   a user api key <api_key>
-    And   4 max tokens to predict
-    And   a completion request with <api_error> api error
-
-    Examples: Prompts
-      | api_key         | api_error |
-      | THIS_IS_THE_KEY | no        |
-      | THIS_IS_THE_KEY | no        |
-      | hackeme         | raised    |
-      |                 | raised    |
-
-  Scenario Outline: OAI Compatibility
-    Given a system prompt test
-    And   a user prompt test
-    And   a model test
-    And   2 max tokens to predict
-    And   streaming is disabled
-    And   a user api key <api_key>
-    Given an OAI compatible chat completions request with <api_error> api error
-
-    Examples: Prompts
-      | api_key         | api_error |
-      | THIS_IS_THE_KEY | no        |
-      | THIS_IS_THE_KEY | no        |
-      | hackme          | raised    |
-
-  Scenario Outline: OAI Compatibility (invalid response formats)
-    Given a system prompt test
-    And   a user prompt test
-    And   a response format <response_format>
-    And   a model test
-    And   2 max tokens to predict
-    And   streaming is disabled
-    Given an OAI compatible chat completions request with raised api error
-
-    Examples: Prompts
-      | response_format                                       |
-      | {"type": "sound"}                                     |
-      | {"type": "json_object", "schema": 123}                |
-      | {"type": "json_object", "schema": {"type": 123}}      |
-      | {"type": "json_object", "schema": {"type": "hiccup"}} |
-
-
-  Scenario Outline: CORS Options
-    Given a user api key THIS_IS_THE_KEY
-    When  an OPTIONS request is sent from <origin>
-    Then  CORS header <cors_header> is set to <cors_header_value>
-
-    Examples: Headers
-      | origin          | cors_header                      | cors_header_value |
-      | localhost       | Access-Control-Allow-Origin      | localhost         |
-      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
-      | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | GET, POST         |
-      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
deleted file mode 100644
index 15e24c624..000000000
--- a/examples/server/tests/features/server.feature
+++ /dev/null
@@ -1,120 +0,0 @@
-@llama.cpp
-@server
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model file test-model.gguf
-    And   a model alias tinyllama-2
-    And   BOS token is 1
-    And   42 as server seed
-      # KV Cache corresponds to the total amount of tokens
-      # that can be stored across all independent sequences: #4130
-      # see --ctx-size and #5568
-    And   256 KV cache size
-    And   32 as batch size
-    And   2 slots
-    And   64 server max tokens to predict
-    And   prometheus compatible metrics exposed
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Health
-    Then the server is ready
-    And  all slots are idle
-
-
-  Scenario Outline: Completion
-    Given a prompt <prompt>
-    And   <n_predict> max tokens to predict
-    And   a completion request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-    And   the completion is <truncated> truncated
-    And   <n_prompt> prompt tokens are processed
-    And   prometheus metrics are exposed
-    And   metric llamacpp:tokens_predicted is <n_predicted>
-
-    Examples: Prompts
-      | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
-      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
-      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |
-
-  Scenario: Completion prompt truncated
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
-    And   the completion is  truncated
-    And   109 prompt tokens are processed
-
-
-  Scenario Outline: OAI Compatibility
-    Given a model <model>
-    And   a system prompt <system_prompt>
-    And   a user prompt <user_prompt>
-    And   <max_tokens> max tokens to predict
-    And   streaming is <enable_streaming>
-    Given an OAI compatible chat completions request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-    And   <n_prompt> prompt tokens are processed
-    And   the completion is <truncated> truncated
-
-    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming | truncated |
-      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         | not       |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |
-
-
-  Scenario Outline: OAI Compatibility w/ response format
-    Given a model test
-    And   a system prompt test
-    And   a user prompt test
-    And   a response format <response_format>
-    And   10 max tokens to predict
-    Given an OAI compatible chat completions request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-
-    Examples: Prompts
-      | response_format                                                     | n_predicted | re_content             |
-      | {"type": "json_object", "schema": {"const": "42"}}                  | 6           | "42"                   |
-      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
-      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
-
-
-  Scenario: Tokenize / Detokenize
-    When tokenizing:
-    """
-    What is the capital of France ?
-    """
-    Then tokens can be detokenized
-    And  tokens do not begin with BOS
-
-  Scenario: Tokenize w/ BOS
-    Given adding special tokens
-    When  tokenizing:
-    """
-    What is the capital of Germany?
-    """
-    Then  tokens begin with BOS
-    Given first token is removed
-    Then  tokens can be detokenized
-
-  Scenario: Tokenize with pieces
-    When  tokenizing with pieces:
-    """
-    What is the capital of Germany?
-    媽
-    """
-    Then  tokens are given with pieces
-
-  Scenario: Models available
-    Given available models
-    Then  1 models are supported
-    Then  model 0 is identified by tinyllama-2
-    Then  model 0 is trained on 128 tokens context
diff --git a/examples/server/tests/features/slotsave.feature b/examples/server/tests/features/slotsave.feature
deleted file mode 100644
index 1c281c074..000000000
--- a/examples/server/tests/features/slotsave.feature
+++ /dev/null
@@ -1,58 +0,0 @@
-@llama.cpp
-@slotsave
-Feature: llama.cpp server slot management
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   prompt caching is enabled
-    And   2 slots
-    And   . as slot save path
-    And   2048 KV cache size
-    And   42 as server seed
-    And   24 max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Save and Restore Slot
-    # First prompt in slot 1 should be fully processed
-    Given a user prompt "What is the capital of France?"
-    And   using slot id 1
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   22 prompt tokens are processed
-    When  the slot 1 is saved with filename "slot1.bin"
-    Then  the server responds with status code 200
-    # Since we have cache, this should only process the last tokens
-    Given a user prompt "What is the capital of Germany?"
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special)
-    And   7 prompt tokens are processed
-    # Loading the original cache into slot 0,
-    # we should only be processing 1 prompt token and get the same output
-    When  the slot 0 is restored with filename "slot1.bin"
-    Then  the server responds with status code 200
-    Given a user prompt "What is the capital of France?"
-    And   using slot id 0
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   1 prompt tokens are processed
-    # For verification that slot 1 was not corrupted during slot 0 load, same thing
-    Given a user prompt "What is the capital of Germany?"
-    And   using slot id 1
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special)
-    And   1 prompt tokens are processed
-
-  Scenario: Erase Slot
-    Given a user prompt "What is the capital of France?"
-    And   using slot id 1
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   22 prompt tokens are processed
-    When  the slot 1 is erased
-    Then  the server responds with status code 200
-    Given a user prompt "What is the capital of France?"
-    And   a completion request with no api error
-    Then  24 tokens are predicted matching (Lily|cake)
-    And   22 prompt tokens are processed
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
deleted file mode 100644
index 687b163f4..000000000
--- a/examples/server/tests/features/steps/steps.py
+++ /dev/null
@@ -1,1518 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import asyncio
-import json
-import os
-import re
-import socket
-import subprocess
-import sys
-import threading
-import time
-import requests
-from collections.abc import Sequence
-from contextlib import closing
-from re import RegexFlag
-from typing import Any, Literal, cast
-
-import aiohttp
-import numpy as np
-import openai
-from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
-from behave.api.async_step import async_run_until_complete
-from prometheus_client import parser
-
-# pyright: reportRedeclaration=false
-
-DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
-
-@step("a server listening on {server_fqdn}:{server_port}")
-def step_server_config(context, server_fqdn: str, server_port: str):
-    context.server_fqdn = server_fqdn
-    context.server_port = int(server_port)
-    context.n_threads = None
-    context.n_gpu_layer = None
-    if 'PORT' in os.environ:
-        context.server_port = int(os.environ['PORT'])
-        print(f"$PORT set, overriding server port with to {context.server_port}")
-    if 'FQDN' in os.environ:
-        context.server_fqdn = os.environ['FQDN']
-        print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
-    if 'N_GPU_LAYERS' in os.environ:
-        context.n_gpu_layer = int(os.environ['N_GPU_LAYERS'])
-        print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}")
-
-    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
-
-    context.model_alias = None
-    context.model_file = None
-    context.model_hf_repo = None
-    context.model_hf_file = None
-    context.model_url = None
-    context.n_batch = None
-    context.n_ubatch = None
-    context.n_ctx = None
-    context.n_ga = None
-    context.n_ga_w = None
-    context.n_predict = None
-    context.n_prompts = 0
-    context.n_server_predict = None
-    context.slot_save_path = None
-    context.id_slot = None
-    context.cache_prompt = None
-    context.n_slots = None
-    context.prompt_prefix = None
-    context.prompt_suffix = None
-    context.server_api_key = None
-    context.server_continuous_batching = False
-    context.server_embeddings = False
-    context.server_reranking = False
-    context.server_metrics = False
-    context.server_process = None
-    context.seed = None
-    context.draft = None
-    context.server_seed = None
-    context.user_api_key = None
-    context.response_format = None
-    context.temperature = None
-    context.lora_file = None
-    context.disable_ctx_shift = False
-
-    # infill
-    context.infill_input_extra = None
-    context.infill_input_suffix = ''
-    context.infill_input_prefix = ''
-
-    context.tasks_result = []
-    context.concurrent_tasks = []
-    context.prompts = []
-
-    context.reranking_query = None
-    context.reranking_documents = []
-    context.reranking_results = None
-
-
-@step('a model file {hf_file} from HF repo {hf_repo}')
-def step_download_hf_model(context, hf_file: str, hf_repo: str):
-    context.model_hf_repo = hf_repo
-    context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)
-
-@step('a lora adapter file from {lora_file_url}')
-def step_download_lora_file(context, lora_file_url: str):
-    file_name = lora_file_url.split('/').pop()
-    context.lora_file = f'../../../{file_name}'
-    with open(context.lora_file, 'wb') as f:
-        f.write(requests.get(lora_file_url).content)
-
-@step('a model file {model_file}')
-def step_model_file(context, model_file: str):
-    context.model_file = model_file
-
-
-@step('a model url {model_url}')
-def step_model_url(context, model_url: str):
-    context.model_url = model_url
-
-
-@step('a model alias {model_alias}')
-def step_model_alias(context, model_alias: str):
-    context.model_alias = model_alias
-
-
-@step('{seed:d} as server seed')
-def step_seed(context, seed: int):
-    context.server_seed = seed
-
-
-@step('{ngl:d} GPU offloaded layers')
-def step_n_gpu_layer(context, ngl: int):
-    if 'N_GPU_LAYERS' in os.environ:
-        new_ngl = int(os.environ['N_GPU_LAYERS'])
-        if context.debug:
-            print(f"-ngl upgraded from {ngl} to {new_ngl}")
-        ngl = new_ngl
-    context.n_gpu_layer = ngl
-
-
-@step('{n_threads:d} threads')
-def step_n_threads(context, n_threads: int):
-    context.n_thread = n_threads
-
-
-@step('{draft:d} as draft')
-def step_draft(context, draft: int):
-    context.draft = draft
-
-
-@step('{n_ctx:d} KV cache size')
-def step_n_ctx(context, n_ctx: int):
-    context.n_ctx = n_ctx
-
-
-@step('{n_slots:d} slots')
-def step_n_slots(context, n_slots: int):
-    context.n_slots = n_slots
-
-
-@step('{n_predict:d} server max tokens to predict')
-def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 else None
-
-
-@step('{slot_save_path} as slot save path')
-def step_slot_save_path(context, slot_save_path: str):
-    context.slot_save_path = slot_save_path
-
-
-@step('using slot id {id_slot:d}')
-def step_id_slot(context, id_slot: int):
-    context.id_slot = id_slot
-
-
-@step('prompt caching is enabled')
-def step_enable_prompt_cache(context):
-    context.cache_prompt = True
-
-
-@step('continuous batching')
-def step_server_continuous_batching(context):
-    context.server_continuous_batching = True
-
-
-@step('enable embeddings endpoint')
-def step_server_embeddings(context):
-    context.server_embeddings = True
-
-@step('enable reranking endpoint')
-def step_server_reranking(context):
-    context.server_reranking = True
-
-@step('prometheus compatible metrics exposed')
-def step_server_metrics(context):
-    context.server_metrics = True
-
-@step('disable context shifting')
-def step_server_disable_ctx_shift(context):
-    context.disable_ctx_shift = True
-
-@step("the server is starting")
-def step_start_server(context):
-    start_server_background(context)
-    attempts = 0
-    max_attempts = 20
-    if 'GITHUB_ACTIONS' in os.environ:
-        max_attempts *= 2
-
-    addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM)
-    family, typ, proto, _, sockaddr = addrs[0]
-
-    while True:
-        with closing(socket.socket(family, typ, proto)) as sock:
-            result = sock.connect_ex(sockaddr)
-            if result == 0:
-                print("\x1b[33;46mserver started!\x1b[0m")
-                return
-            attempts += 1
-            if attempts > max_attempts:
-                assert False, "server not started"
-            print(f"waiting for server to start, connect error code = {result}...")
-            time.sleep(0.1)
-
-
-async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
-    match expecting_status:
-        case 'healthy':
-            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout)
-
-        case 'ready' | 'idle':
-            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout,
-                                        params={'fail_on_no_slot': 1},
-                                        slots_idle=context.n_slots,
-                                        slots_processing=0)
-        case 'busy':
-            await wait_for_slots_status(context, context.base_url, 503,
-                                        params={'fail_on_no_slot': 1},
-                                        slots_idle=0,
-                                        slots_processing=context.n_slots)
-        case _:
-            assert False, "unknown status"
-
-
-@step("the server is {expecting_status} with timeout {timeout:d} seconds")
-@async_run_until_complete
-async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
-    await wait_for_server_status_with_timeout(context, expecting_status, timeout)
-
-
-@step("the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
-    await wait_for_server_status_with_timeout(context, expecting_status, 30)
-
-
-@step('all slots are {expected_slot_status_string}')
-@async_run_until_complete
-async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
-    match expected_slot_status_string:
-        case 'idle':
-            expected_slot_status = False
-        case 'busy':
-            expected_slot_status = True
-        case _:
-            assert False, "unknown status"
-
-    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
-                      for slot_id in range(context.n_slots)]
-    await request_slots_status(context, expected_slots)
-
-
-@step('a completion request with {api_error} api error')
-@async_run_until_complete
-async def step_request_completion(context, api_error: Literal['raised'] | str):
-    expect_api_error = api_error == 'raised' or api_error != 'no'
-    seeds = await completions_seed(context, num_seeds=1)
-    completion = await request_completion(context.prompts.pop(),
-                                          seeds[0] if seeds is not None else seeds,
-                                          context.base_url,
-                                          debug=context.debug,
-                                          n_predict=context.n_predict,
-                                          cache_prompt=context.cache_prompt,
-                                          id_slot=context.id_slot,
-                                          expect_api_error=expect_api_error,
-                                          user_api_key=context.user_api_key,
-                                          temperature=context.temperature)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if api_error == 'raised':
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-    elif api_error.isdigit():
-        api_error_code = int(api_error)
-        assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"
-
-
-@step('an infill request with {api_error} api error')
-@async_run_until_complete
-async def step_request_completion(context, api_error: Literal['raised'] | str):
-    if api_error != 'no':
-        raise ValueError(f'api_error={api_error} is not yet implemented')
-    payload = {
-        "prompt": context.prompts[0],
-        "input_suffix": context.infill_input_suffix,
-        "input_prefix": context.infill_input_prefix,
-        "n_predict": context.n_predict,
-        "seed": context.seed,
-        "temperature": context.temperature,
-    }
-    if context.infill_input_extra is not None:
-        payload['input_extra'] = context.infill_input_extra
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/infill',
-                                json=payload) as response:
-            assert response.status == 200
-            context.tasks_result = [await response.json()]
-
-
-@step('{predicted_n:d} tokens are predicted matching {re_content}')
-def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    context.completion = context.tasks_result.pop()
-    assert_n_tokens_predicted(context.completion, predicted_n, re_content)
-
-
-@step('{predicted_n:d} tokens are predicted')
-def step_n_tokens_predicted(context, predicted_n):
-    context.completion = context.tasks_result.pop()
-    assert_n_tokens_predicted(context.completion, predicted_n)
-
-
-@step('all predictions are equal')
-@async_run_until_complete
-async def step_predictions_equal(context):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions >= 2, "need at least 2 completions"
-    assert_all_predictions_equal(context.tasks_result)
-    context.tasks_result = []
-
-
-@step('all predictions are different')
-@async_run_until_complete
-async def step_predictions_different(context):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions >= 2, "need at least 2 completions"
-    assert_all_predictions_different(context.tasks_result)
-    context.tasks_result = []
-
-
-@step('all token probabilities are equal')
-@async_run_until_complete
-async def step_token_probabilities_equal(context):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions >= 2, "need at least 2 completions"
-    assert_all_token_probabilities_equal(context.tasks_result)
-    context.tasks_result = []
-
-
-@step('the completion is  truncated')
-def step_assert_completion_truncated(context):
-    step_assert_completion_truncated(context, '')
-
-
-@step('the completion is {truncated} truncated')
-def step_assert_completion_truncated(context, truncated):
-    truncated = truncated != "not"
-    assert context.completion['truncated'] == truncated, f'{context.completion}'
-
-
-@step('{n_prompt:d} prompt tokens are processed')
-def step_impl(context, n_prompt):
-    assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
-
-
-@step('a user prompt {user_prompt}')
-def step_user_prompt(context, user_prompt):
-    context.prompts.append(user_prompt)
-    context.n_prompts = len(context.prompts)
-
-
-@step('a system prompt {system_prompt}')
-def step_system_prompt(context, system_prompt):
-    context.system_prompt = system_prompt
-
-
-@step('a model {model}')
-def step_model(context, model):
-    context.model = model
-
-
-@step('{max_tokens:d} max tokens to predict')
-def step_max_tokens(context, max_tokens):
-    context.n_predict = max_tokens
-
-
-@step('a response format {response_format}')
-def step_response_format(context, response_format):
-    context.response_format = json.loads(response_format)
-
-
-@step('{temperature:f} temperature')
-def step_temperature(context, temperature):
-    context.temperature = temperature
-
-
-@step('streaming is {enable_streaming}')
-def step_streaming(context, enable_streaming):
-    context.enable_streaming = enable_streaming == 'enabled'
-
-
-@step('a user api key {user_api_key}')
-def step_user_api_key(context, user_api_key):
-    context.user_api_key = user_api_key
-
-
-@step('no user api key')
-def step_no_user_api_key(context):
-    context.user_api_key = None
-
-
-@step('a user api key ')
-def step_no_user_api_key_space(context):
-    context.user_api_key = None
-
-
-@step('a server api key {server_api_key}')
-def step_server_api_key(context, server_api_key):
-    context.server_api_key = server_api_key
-
-
-@step('{n_junk:d} as number of junk')
-def step_n_junk(context, n_junk):
-    context.n_junk = n_junk
-
-
-@step('{n_batch:d} as batch size')
-def step_n_batch(context, n_batch):
-    context.n_batch = n_batch
-
-
-@step('{n_ubatch:d} as ubatch size')
-def step_n_ubatch(context, n_ubatch):
-    context.n_ubatch = n_ubatch
-
-
-@step('{seed:d} as seed')
-def step_seed(context, seed):
-    if context.seed is None:
-        context.seed = [seed]
-    else:
-        context.seed.append(seed)
-
-
-@step('BOS token is {bos:d}')
-def step_bos_token(context, bos):
-    context.bos = bos
-
-
-@step('a prefix prompt')
-def step_prompt_prefix(context):
-    context.prompt_prefix = context_text(context)
-
-
-@step('a junk suffix prompt')
-def step_prompt_junk_suffix(context):
-    context.prompt_junk_suffix = context_text(context)
-
-
-@step('a suffix prompt')
-def step_prompt_suffix(context):
-    context.prompt_suffix = context_text(context)
-
-
-@step('{n_ga:d} group attention factor'
-      ' to extend context size through self-extend')
-def step_impl(context, n_ga):
-    context.n_ga = n_ga
-
-
-@step('{n_ga_w:d} group attention width to extend context size through self-extend')
-def step_impl(context, n_ga_w):
-    context.n_ga_w = n_ga_w
-
-
-@step('a passkey prompt template')
-def step_prompt_passkey(context):
-    context.prompt_passkey = context_text(context)
-
-@step('a rerank query')
-def step_set_rerank_query(context):
-    context.reranking_query = context_text(context)
-    context.reranking_documents = []
-
-@step('a rerank document')
-def step_set_rerank_document(context):
-    context.reranking_documents.append(context_text(context))
-
-@step('{n_prompts:d} fixed prompts')
-def step_fixed_prompts(context, n_prompts):
-    context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
-    context.n_prompts = n_prompts
-
-
-@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
-def step_prompt_passkey(context, passkey, i_pos):
-    prompt = ""
-    for i in range(context.n_junk):
-        if i % context.n_junk == i_pos:
-            prompt += context.prompt_passkey # the passkey is already substituted
-        prompt += context.prompt_junk_suffix
-    if context.debug:
-        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
-        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```")
-    context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
-    context.n_prompts = len(context.prompts)
-
-
-@step('an OAI compatible chat completions request with {api_error} api error')
-@async_run_until_complete
-async def step_oai_chat_completions(context, api_error):
-    if context.debug:
-        print(f"Submitting OAI compatible completions request...")
-    expect_api_error = api_error == 'raised'
-    seeds = await completions_seed(context, num_seeds=1),
-    completion = await oai_chat_completions(context.prompts.pop(),
-                                            seeds[0] if seeds is not None else seeds,
-                                            context.system_prompt,
-                                            context.base_url,
-                                            '/v1/chat',
-                                            False,
-                                            model=context.model if hasattr(context, 'model') else None,
-
-                                            n_predict=context.n_predict
-                                            if hasattr(context, 'n_predict') else None,
-
-                                            enable_streaming=context.enable_streaming
-                                            if hasattr(context, 'enable_streaming') else None,
-
-                                            response_format=context.response_format
-                                            if hasattr(context, 'response_format') else None,
-
-                                            user_api_key=context.user_api_key
-                                            if hasattr(context, 'user_api_key') else None,
-
-                                            expect_api_error=expect_api_error)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if expect_api_error:
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-
-    if context.debug:
-        print(f"Completion response: {completion}")
-
-
-@step('a prompt')
-def step_a_prompt(context):
-    context.prompts.append(context_text(context))
-    context.n_prompts = len(context.prompts)
-
-
-@step('a prompt {prompt}')
-def step_a_prompt_prompt(context, prompt):
-    context.prompts.append(prompt)
-    context.n_prompts = len(context.prompts)
-
-
-# TODO: allow this to be repeated
-@step('an infill input extra {filename} {text}')
-def step_infill_input_extra(context, filename, text):
-    if filename == 'none':
-        context.infill_input_extra = None
-    else:
-        context.infill_input_extra = [{'filename': filename, 'text': text}]
-
-
-@step('an infill input suffix {text}')
-def step_infill_input_suffix(context, text):
-    context.infill_input_suffix = text
-
-
-@step('an infill input prefix {text}')
-def step_infill_input_prefix(context, text):
-    context.infill_input_prefix = text
-
-
-@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
-def step_many_prompts(context, num_prompts, prompt, seed):
-    if context.seed is None:
-        context.seed = []
-    for _ in range(num_prompts):
-        context.seed.append(seed)
-        context.prompts.append(prompt)
-    context.n_prompts = len(context.prompts)
-
-
-@step('concurrent completion requests')
-@async_run_until_complete()
-async def step_concurrent_completion_requests(context):
-    await concurrent_requests(
-        context,
-        request_completion,
-        # prompt is inserted automatically
-        context.base_url,
-        debug=context.debug,
-        prompt_prefix=context.prompt_prefix,
-        prompt_suffix=context.prompt_suffix,
-        n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
-        user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None,
-        temperature=context.temperature,
-    )
-
-
-@step('concurrent OAI completions requests')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/v1/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              response_format=context.response_format
-                              if hasattr(context, 'response_format') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step('concurrent OAI completions requests no v1')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              response_format=context.response_format
-                              if hasattr(context, 'response_format') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step('all prompts are predicted')
-@async_run_until_complete
-async def step_all_prompts_are_predicted(context):
-    await all_prompts_are_predicted(context)
-
-
-@step('all prompts are predicted with {n_expected_predicted:d} tokens')
-@async_run_until_complete
-async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
-    await all_prompts_are_predicted(context, n_expected_predicted)
-
-
-async def all_prompts_are_predicted(context, expected_predicted_n=None):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions > 0
-    for i in range(n_completions):
-        assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
-    assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
-
-
-@step('embeddings are computed for')
-@async_run_until_complete
-async def step_compute_embedding(context):
-    context.n_prompts = 1
-    context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
-
-
-@step('reranking request')
-@async_run_until_complete
-async def step_compute_reranking(context):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/reranking',
-                                json={
-                                    "query": context.reranking_query,
-                                    "documents": context.reranking_documents,
-                                }) as response:
-            if response.status == 200:
-                response_json = await response.json()
-                context.reranking_results = response_json['results']
-            else:
-                context.reranking_results = response.status
-
-
-@step('all embeddings are the same')
-@async_run_until_complete
-async def step_all_embeddings_are_the_same(context):
-    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests > 0
-    embeddings = []
-    for i in range(n_embedding_requests):
-        embedding = context.tasks_result.pop().pop()
-        embeddings.append(embedding)
-        assert_embeddings(embedding)
-    n = len(embeddings)
-    for i in range(n-1):
-        for j in range(i+1, n):
-            embedding1 = np.array(embeddings[i])
-            embedding2 = np.array(embeddings[j])
-            if context.debug:
-                print(f"embedding1: {embedding1[-8:]}")
-                print(f"embedding2: {embedding2[-8:]}")
-            similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
-            msg = f"Similarity between {i} and {j}: {similarity:.10f}"
-            if context.debug:
-                print(f"{msg}")
-            assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
-
-
-@step('embeddings are generated')
-def step_assert_embeddings(context):
-    assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
-                                                             f"context.n_prompts={context.n_prompts}\n"
-                                                             f"context.embeddings={context.embeddings}")
-    for embedding in context.embeddings:
-        assert_embeddings(embedding)
-
-@step('embeddings request with {api_error_code:d} api error')
-def step_assert_embeddings(context, api_error_code: int):
-    assert context.embeddings == api_error_code, f"embeddings request must return code {api_error_code}, but got {context.embeddings}"
-
-@step('an OAI compatible embeddings computation request for')
-@async_run_until_complete
-async def step_oai_compute_embeddings(context):
-    context.n_prompts = 1
-    context.embeddings = await request_oai_embeddings(context_text(context), None,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-
-
-@step('an OAI compatible embeddings computation request for multiple inputs')
-@async_run_until_complete
-async def step_oai_compute_embeddings_multiple_inputs(context):
-    context.embeddings = await request_oai_embeddings(context.prompts, None,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-    context.prompts.clear()
-
-
-@step('concurrent embedding requests')
-@async_run_until_complete()
-async def step_concurrent_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_embedding,
-                              # prompt is inserted automatically
-                              base_url=context.base_url)
-
-
-@step('concurrent OAI embedding requests')
-@async_run_until_complete()
-async def step_concurrent_oai_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_oai_embeddings,
-                              # prompt is inserted automatically
-                              base_url=context.base_url,
-                              async_client=True,
-                              model=context.model)
-
-
-@step('all embeddings are generated')
-@async_run_until_complete()
-async def all_embeddings_are_generated(context):
-    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests == context.n_prompts
-    for i in range(n_embedding_requests):
-        assert_embeddings(context.tasks_result.pop().pop())
-
-@step('reranking results are returned')
-def reranking_results_are_returned(context):
-    assert len(context.reranking_results) == len(context.reranking_documents)
-
-@step('reranking highest score is index {idx_high:d} and lowest score is index {idx_low:d}')
-def reranking_results_are_returned(context, idx_high: int, idx_low: int):
-    max_score, max_idx = 0, 0
-    min_score, min_idx = 0, 0
-    for res in context.reranking_results:
-        if max_score < res['relevance_score']:
-            max_score = res['relevance_score']
-            max_idx   = res['index']
-        if min_score > res['relevance_score']:
-            min_score = res['relevance_score']
-            min_idx   = res['index']
-    print(context.reranking_results)
-    assert max_idx == idx_high
-    assert min_idx == idx_low
-
-@step('adding special tokens')
-def step_tokenize_set_add_special(context):
-    context.tokenize_add_special = True
-
-
-@step("tokenizing with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
-        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
-        if getattr(context, "tokenize_add_special", None) is not None:
-            tokenize_args["add_special"] = context.tokenize_add_special
-
-        async with session.post(
-            f"{context.base_url}/tokenize", json=tokenize_args
-        ) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens_with_pieces = tokenize_json["tokens"]
-
-
-@step("tokens are given with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    # Verify that the response contains both token IDs and pieces
-    assert all(
-        "id" in token and "piece" in token for token in context.tokens_with_pieces
-    )
-
-
-@step('tokenizing')
-@async_run_until_complete
-async def step_tokenize(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        tokenize_args = {
-            "content": context.tokenized_text,
-        }
-        if getattr(context, 'tokenize_add_special', None) is not None:
-            tokenize_args['add_special'] = context.tokenize_add_special
-        async with session.post(f'{context.base_url}/tokenize',
-                                json=tokenize_args) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens = tokenize_json['tokens']
-
-
-@step('tokens can be detokenized')
-@async_run_until_complete
-async def step_detokenize(context):
-    assert len(context.tokens) > 0
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/detokenize',
-                                json={
-                                    "tokens": context.tokens,
-                                }) as response:
-            assert response.status == 200
-            detokenize_json = await response.json()
-            # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
-            assert context.tokenized_text == detokenize_json['content'].strip()
-
-
-@step('tokens begin with BOS')
-def step_strings_for_tokenization(context):
-    assert context.tokens[0] == context.bos
-
-
-@step('tokens do not begin with BOS')
-def step_strings_for_tokenization(context):
-    assert context.tokens[0] != context.bos
-
-
-@step('first token is removed')
-def step_strings_for_tokenization(context):
-    context.tokens = context.tokens[1:]
-
-
-@step('an OPTIONS request is sent from {origin}')
-@async_run_until_complete
-async def step_options_request(context, origin):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
-        async with session.options(f'{context.base_url}/v1/chat/completions',
-                                    headers=headers) as response:
-            assert response.status == 200
-            context.options_response = response
-
-
-@step('CORS header {cors_header} is set to {cors_header_value}')
-def step_check_options_header_value(context, cors_header, cors_header_value):
-    assert context.options_response.headers[cors_header] == cors_header_value
-
-
-@step('prometheus metrics are exposed')
-@async_run_until_complete
-async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
-            assert metrics_response.status == 200
-            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
-            metrics_raw = await metrics_response.text()
-            metric_exported = False
-            if context.debug:
-                print(f"/metrics answer:\n{metrics_raw}")
-            context.metrics = {}
-            for metric in parser.text_string_to_metric_families(metrics_raw):
-                match metric.name:
-                    case "llamacpp:kv_cache_usage_ratio":
-                        assert len(metric.samples) > 0
-                        metric_exported = True
-                context.metrics[metric.name] = metric
-            assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
-            assert metric_exported, "No metrics exported"
-
-
-@step('metric {metric_name} is {metric_value:d}')
-def step_assert_metric_value(context, metric_name, metric_value):
-    if metric_name not in context.metrics:
-        assert False, f"no metric {metric_name} in {context.metrics.keys()}"
-    assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
-
-
-@step('available models')
-def step_available_models(context):
-    # openai client always expects an api_key
-    openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
-    openai.base_url = f'{context.base_url}/v1/'
-    context.models = openai.models.list().data
-
-
-@step('{n_model:d} models are supported')
-def step_supported_models(context, n_model):
-    if context.debug:
-        print("server models available:", context.models)
-    assert len(context.models) == n_model
-
-
-@step('model {i_model:d} is {param} {preposition} {param_value}')
-def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str):
-    assert i_model < len(context.models)
-    model = context.models[i_model]
-
-    param_value = param_value.split(' ', 1)[0]
-    match param:
-        case 'identified':
-            value = model.id
-        case 'trained':
-            value = str(model.meta["n_ctx_train"])
-        case _:
-            assert False, "param {param} not supported"
-    assert param_value == value, f"model param {param} {value} != {param_value}"
-
-
-async def concurrent_requests(context, f_completion, *args, **kwargs):
-    context.n_prompts = len(context.prompts)
-    if context.debug:
-        print(f"starting {context.n_prompts} concurrent completion requests...")
-    assert context.n_prompts > 0
-    seeds = await completions_seed(context)
-    assert seeds is not None
-    for prompt_no in range(context.n_prompts):
-        shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
-        context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.01)
-
-
-@step('the slot {slot_id:d} is saved with filename "{filename}"')
-@async_run_until_complete
-async def step_save_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
-                                json={"filename": filename},
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-
-
-@step('the slot {slot_id:d} is restored with filename "{filename}"')
-@async_run_until_complete
-async def step_restore_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
-                                json={"filename": filename},
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-
-
-@step('the slot {slot_id:d} is erased')
-@async_run_until_complete
-async def step_erase_slot(context, slot_id):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-
-
-@step('switch {on_or_off} lora adapter {lora_id:d}')
-@async_run_until_complete
-async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/lora-adapters',
-                                json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
-                                headers={"Content-Type": "application/json"}) as response:
-            context.response = response
-            print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}])
-
-
-@step('the server responds with status code {status_code:d}')
-def step_server_responds_with_status_code(context, status_code):
-    assert context.response.status == status_code
-
-
-async def request_completion(prompt,
-                             seed,
-                             base_url,
-                             debug=False,
-                             prompt_prefix=None,
-                             prompt_suffix=None,
-                             n_predict=None,
-                             cache_prompt=False,
-                             id_slot=None,
-                             expect_api_error=None,
-                             user_api_key=None,
-                             temperature=None) -> int | dict[str, Any]:
-    if debug:
-        print(f"Sending completion request: {prompt}")
-    origin = "my.super.domain"
-    headers = {
-        'Origin': origin
-    }
-    if user_api_key is not None:
-        if debug:
-            print(f"Set user_api_key: {user_api_key}")
-        headers['Authorization'] = f'Bearer {user_api_key}'
-
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{base_url}/completion',
-                                json={
-                                    "input_prefix": prompt_prefix,
-                                    "prompt": prompt,
-                                    "input_suffix": prompt_suffix,
-                                    "n_predict": n_predict if n_predict is not None else -1,
-                                    "cache_prompt": cache_prompt,
-                                    "id_slot": id_slot,
-                                    "seed": seed if seed is not None else 42,
-                                    "temperature": temperature if temperature is not None else 0.8,
-                                    "n_probs": 2,
-                                },
-                                headers=headers) as response:
-            if expect_api_error is None or not expect_api_error:
-                assert response.status == 200
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                return await response.json()
-            else:
-                return response.status
-
-
-async def oai_chat_completions(user_prompt,
-                               seed,
-                               system_prompt,
-                               base_url: str,
-                               base_path: str,
-                               async_client,
-                               debug=False,
-                               temperature=None,
-                               model=None,
-                               n_predict=None,
-                               enable_streaming=None,
-                               response_format=None,
-                               user_api_key=None,
-                               expect_api_error=None) -> int | dict[str, Any]:
-    if debug:
-        print(f"Sending OAI Chat completions request: {user_prompt}")
-    # openai client always expects an api key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    seed = seed if seed is not None else 42
-    enable_streaming = enable_streaming if enable_streaming is not None else False
-    payload = {
-        "messages": [
-            {
-                "role": "system",
-                "content": system_prompt,
-            },
-            {
-                "role": "user",
-                "content": user_prompt,
-            }
-        ],
-        "model": model,
-        "max_tokens": n_predict,
-        "stream": enable_streaming,
-        "temperature": temperature if temperature is not None else 0.0,
-        "seed": seed,
-    }
-    if response_format is not None:
-        payload['response_format'] = response_format
-    completion_response = {
-        'content': '',
-        'timings': {
-            'predicted_n': 0,
-            'prompt_n': 0
-        }
-    }
-    if async_client:
-        origin = 'llama.cpp'
-        headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-            async with session.post(f'{base_url}{base_path}',
-                                    json=payload,
-                                    headers=headers) as response:
-                if enable_streaming:
-                    assert response.status == 200
-                    assert response.headers['Access-Control-Allow-Origin'] == origin
-                    assert response.headers['Content-Type'] == "text/event-stream"
-                    event_received = True
-                    while event_received:
-                        event_received = False
-                        async for line_in_bytes in response.content:
-                            line = line_in_bytes.decode('utf-8')
-                            line = line.rstrip('\n').rstrip('\r')
-                            if line == '':
-                                continue
-                            event_data = line.split(': ', 1)
-                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
-                            chunk_raw = event_data[1]
-                            if chunk_raw == '[DONE]':
-                                break
-
-                            chunk = json.loads(chunk_raw)
-                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
-                            delta = chunk['choices'][0]['delta']
-                            if 'content' in delta:
-                                completion_response['content'] += delta['content']
-                                completion_response['timings']['predicted_n'] += 1
-                else:
-                    if expect_api_error is None or not expect_api_error:
-                        assert response.status == 200
-                        assert response.headers['Access-Control-Allow-Origin'] == origin
-                        assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                        chat_completion_raw = await response.json()
-                        completion_response = {
-                            'content': chat_completion_raw['choices'][0]['message'],
-                            'timings': {
-                                'predicted_n': chat_completion_raw['usage']['completion_tokens'],
-                                'prompt_n': chat_completion_raw['usage']['prompt_tokens']
-                            }
-                        }
-                    else:
-                        return response.status
-    else:
-        try:
-            openai.api_key = user_api_key
-            openai.base_url = f'{base_url}{base_path.removesuffix("chat")}'
-            assert model is not None
-            chat_completion = openai.chat.completions.create(
-                messages=payload['messages'],
-                model=model,
-                max_tokens=n_predict,
-                stream=enable_streaming,
-                response_format=payload.get('response_format') or openai.NOT_GIVEN,
-                seed=seed,
-                temperature=payload['temperature']
-            )
-        except openai.AuthenticationError as e:
-            if expect_api_error is not None and expect_api_error:
-                return 401
-            else:
-                assert False, f'error raised: {e}'
-
-        if enable_streaming:
-            chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion)
-            for chunk in chat_completion:
-                assert len(chunk.choices) == 1
-                delta = chunk.choices[0].delta
-                if delta.content is not None:
-                    completion_response['content'] += delta.content
-                    completion_response['timings']['predicted_n'] += 1
-                completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
-        else:
-            assert len(chat_completion.choices) == 1
-            assert chat_completion.usage is not None
-            completion_response = {
-                'content': chat_completion.choices[0].message.content,
-                'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens,
-                    'prompt_n': chat_completion.usage.prompt_tokens
-                    },
-                'truncated': chat_completion.choices[0].finish_reason != 'stop'
-            }
-    if debug:
-        print("OAI response formatted to llama.cpp:", completion_response)
-    return completion_response
-
-
-async def request_embedding(content, seed, base_url=None) -> list[list[float]] | int:
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{base_url}/embedding',
-                                json={
-                                    "content": content,
-                                }) as response:
-            if response.status == 200:
-                response_json = await response.json()
-                return [response_json['embedding']]
-            else:
-                return response.status
-
-
-async def request_oai_embeddings(input, seed,
-                                 base_url=None, user_api_key=None,
-                                 model=None, async_client=False) -> list[list[float]]:
-    # openai client always expects an api_key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    if async_client:
-        origin = 'llama.cpp'
-        headers=[]
-        if user_api_key is not None:
-            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-            async with session.post(f'{base_url}/v1/embeddings',
-                                    json={
-                                        "input": input,
-                                        "model": model,
-                                    },
-                                    headers=headers) as response:
-                assert response.status == 200, f"received status code not expected: {response.status}"
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                response_json = await response.json()
-                assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
-                assert response_json['object'] == 'list'
-                if isinstance(input, Sequence):
-                    embeddings = []
-                    for an_oai_embeddings in response_json['data']:
-                        embeddings.append(an_oai_embeddings['embedding'])
-                else:
-                    embeddings = [response_json['data']['embedding']]
-                return embeddings
-    else:
-        openai.api_key = user_api_key
-        openai.base_url = f'{base_url}/v1/'
-        assert model is not None
-        oai_embeddings = openai.embeddings.create(
-            model=model,
-            input=input,
-        )
-
-        return [e.embedding for e in oai_embeddings.data]
-
-
-def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
-    content = completion_response['content']
-    n_predicted = completion_response['timings']['predicted_n']
-    assert len(content) > 0, "no token predicted"
-    if re_content is not None:
-        p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
-        matches = p.finditer(content)
-        last_match = 0
-        highlighted = ''
-        for match in matches:
-            start, end = match.span()
-            highlighted += content[last_match: start]
-            highlighted += '\x1b[33m'
-            highlighted += content[start: end]
-            highlighted += '\x1b[0m'
-            last_match = end
-        highlighted += content[last_match:]
-        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-          print(f"Checking completion response: {highlighted}")
-        assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
-    if expected_predicted_n and expected_predicted_n > 0:
-        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
-                                                     f' {n_predicted} <> {expected_predicted_n}')
-
-def assert_all_predictions_equal(completion_responses):
-    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-        for i, response_i in enumerate(completion_responses):
-            content_i = response_i['content']
-            print(f"content {i}: {content_i}")
-    for i, response_i in enumerate(completion_responses):
-        content_i = response_i['content']
-        for j, response_j in enumerate(completion_responses):
-            if i == j:
-                continue
-            content_j = response_j['content']
-            assert content_i == content_j, "contents not equal"
-
-
-def assert_all_predictions_different(completion_responses):
-    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-        for i, response_i in enumerate(completion_responses):
-            content_i = response_i['content']
-            print(f"content {i}: {content_i}")
-    for i, response_i in enumerate(completion_responses):
-        content_i = response_i['content']
-        for j, response_j in enumerate(completion_responses):
-            if i == j:
-                continue
-            content_j = response_j['content']
-            assert content_i != content_j, "contents not different"
-
-
-def assert_all_token_probabilities_equal(completion_responses):
-    n_predict = len(completion_responses[0]['completion_probabilities'])
-    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-        for pos in range(n_predict):
-            for i, response_i in enumerate(completion_responses):
-                probs_i = response_i['completion_probabilities'][pos]['probs']
-                print(f"pos {pos}, probs {i}: {probs_i}")
-    for pos in range(n_predict):
-        for i, response_i in enumerate(completion_responses):
-            probs_i = response_i['completion_probabilities'][pos]['probs']
-            for j, response_j in enumerate(completion_responses):
-                if i == j:
-                    continue
-                probs_j = response_j['completion_probabilities'][pos]['probs']
-                assert probs_i == probs_j, "contents not equal"
-
-
-async def gather_tasks_results(context):
-    n_tasks = len(context.concurrent_tasks)
-    if context.debug:
-        print(f"Waiting for all {n_tasks} tasks results...")
-    for task_no in range(n_tasks):
-        context.tasks_result.append(await context.concurrent_tasks.pop())
-    n_completions = len(context.tasks_result)
-    return n_completions
-
-
-async def wait_for_slots_status(context,
-                                base_url,
-                                expected_http_status_code,
-                                timeout=3,
-                                params=None,
-                                slots_idle=None,
-                                slots_processing=None):
-    if context.debug:
-        print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}")
-    interval = 0.5
-    counter = 0
-    if 'GITHUB_ACTIONS' in os.environ:
-        timeout *= 2
-
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        while True:
-            headers = {'Authorization': f'Bearer {context.server_api_key}'}
-            async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response:
-                status_code = slots_response.status
-                slots = await slots_response.json()
-                if context.debug:
-                    print(f"slots responses {slots}\n")
-                if status_code == 503 and status_code == expected_http_status_code:
-                    return
-                if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
-                    if ((slots_idle is None or slots_idle == n_slots_idle)
-                        and (slots_processing is None or slots_processing == n_slots_processing)):
-                        return
-            await asyncio.sleep(interval)
-
-            counter += interval
-            if counter >= timeout:
-                # Sometimes health requests are triggered after completions are predicted
-                if expected_http_status_code == 503:
-                    if len(context.tasks_result) == 0:
-                        print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
-                              " busy health check missed, probably too fast inference\x1b[0m\n")
-                        n_completions = await gather_tasks_results(context)
-                        if n_completions > 0:
-                            return
-
-                assert False, f'slots check timeout exceeded {counter}s>={timeout}'
-
-
-def assert_embeddings(embeddings):
-    assert len(embeddings) > 0
-    embeddings_computed = False
-    for emb in embeddings:
-        if not isinstance(emb, float):
-            assert False, f"Bad embeddings: {embeddings}"
-        if emb != 0:
-            embeddings_computed = True
-    assert embeddings_computed, f"Embeddings: {embeddings}"
-
-
-async def request_slots_status(context, expected_slots):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with await session.get(f'{context.base_url}/slots') as slots_response:
-            assert slots_response.status == 200
-            slots = await slots_response.json()
-            assert_slots_status(slots, expected_slots)
-
-
-def assert_slots_status(slots, expected_slots):
-    assert len(slots) == len(expected_slots)
-    for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
-        for key in expected:
-            assert expected[key] == slot[key], (f"invalid slot {slot_id}"
-                                                f" expected[{key}] != slot[{key}]"
-                                                f" = {expected[key]} != {slot[key]}")
-
-
-async def completions_seed(context, num_seeds=None):
-    if hasattr(context, "seed") and context.seed is not None:
-        assert len(context.seed) == context.n_prompts
-        if num_seeds is None:
-            num_seeds = context.n_prompts
-        assert num_seeds <= context.n_prompts
-        seeds = context.seed[:num_seeds]
-        context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None
-        return seeds
-
-    if hasattr(context, "server_seed") and context.server_seed is not None:
-        if num_seeds is None:
-            return [context.server_seed] * context.n_prompts
-        else:
-            return [context.server_seed] * num_seeds
-    return None
-
-
-def context_text(context):
-    return context.text.replace('\r', '')
-
-
-def start_server_background(context):
-    if os.name == 'nt':
-        context.server_path = '../../../build/bin/Release/llama-server.exe'
-    else:
-        context.server_path = '../../../build/bin/llama-server'
-    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
-        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
-    server_listen_addr = context.server_fqdn
-    server_args = [
-        '--slots', # requires to get slot status via /slots endpoint
-        '--host', server_listen_addr,
-        '--port', context.server_port,
-    ]
-    if context.model_file:
-        server_args.extend(['--model', context.model_file])
-    if context.model_url:
-        server_args.extend(['--model-url', context.model_url])
-    if context.model_hf_repo:
-        server_args.extend(['--hf-repo', context.model_hf_repo])
-    if context.model_hf_file:
-        server_args.extend(['--hf-file', context.model_hf_file])
-    if context.n_batch:
-        server_args.extend(['--batch-size', context.n_batch])
-    if context.n_ubatch:
-        server_args.extend(['--ubatch-size', context.n_ubatch])
-    if context.n_threads:
-        server_args.extend(['--threads', context.threads])
-    if context.n_gpu_layer:
-        server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
-    if context.draft is not None:
-        server_args.extend(['--draft', context.draft])
-    if context.server_continuous_batching:
-        server_args.append('--cont-batching')
-    if context.server_embeddings:
-        server_args.append('--embedding')
-    if context.server_reranking:
-        server_args.append('--reranking')
-    if context.server_metrics:
-        server_args.append('--metrics')
-    if context.model_alias:
-        server_args.extend(['--alias', context.model_alias])
-    if context.n_ctx:
-        server_args.extend(['--ctx-size', context.n_ctx])
-    if context.n_slots:
-        server_args.extend(['--parallel', context.n_slots])
-    if context.n_server_predict:
-        server_args.extend(['--n-predict', context.n_server_predict])
-    if context.slot_save_path:
-        server_args.extend(['--slot-save-path', context.slot_save_path])
-    if context.server_api_key:
-        server_args.extend(['--api-key', context.server_api_key])
-    if context.n_ga:
-        server_args.extend(['--grp-attn-n', context.n_ga])
-    if context.n_ga_w:
-        server_args.extend(['--grp-attn-w', context.n_ga_w])
-    if context.debug:
-        server_args.append('--verbose')
-    if context.lora_file:
-        server_args.extend(['--lora', context.lora_file])
-    if context.disable_ctx_shift:
-        server_args.extend(['--no-context-shift'])
-
-    args = [str(arg) for arg in [context.server_path, *server_args]]
-    print(f"bench: starting server with: {' '.join(args)}")
-
-    flags = 0
-    if 'nt' == os.name:
-        flags |= subprocess.DETACHED_PROCESS
-        flags |= subprocess.CREATE_NEW_PROCESS_GROUP
-        flags |= subprocess.CREATE_NO_WINDOW
-
-    pkwargs = {
-        'creationflags': flags,
-        'stdout': subprocess.PIPE,
-        'stderr': subprocess.PIPE
-    }
-    context.server_process = subprocess.Popen(
-        [str(arg) for arg in [context.server_path, *server_args]],
-        **pkwargs)  # pyright: ignore[reportArgumentType, reportCallIssue]
-
-    def server_log(in_stream, out_stream):
-        for line in iter(in_stream.readline, b''):
-            print(line.decode('utf-8'), end='', file=out_stream)
-
-    thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
-    thread_stdout.start()
-
-    thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
-    thread_stderr.start()
-
-    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature
deleted file mode 100644
index 61d5f315e..000000000
--- a/examples/server/tests/features/wrong_usages.feature
+++ /dev/null
@@ -1,25 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags wrong_usage
-@wrong_usage
-Feature: Wrong usage of llama.cpp server
-
-  #3969 The user must always set --n-predict option
-  # to cap the number of tokens any completion request can generate
-  # or pass n_predict/max_tokens in the request.
-  Scenario: Infinite loop
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   42 as server seed
-    And   2048 KV cache size
-    # Uncomment below to fix the issue
-    #And   64 server max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-      """
-      Go to: infinite loop
-      """
-    # Uncomment below to fix the issue
-    #And   128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is idle
-    Then all prompts are predicted
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 553954872..935a79114 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,5 +1,5 @@
 aiohttp~=3.9.3
-behave~=1.2.6
+pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
 openai~=1.30.3
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 72a0fbad8..1e285dcda 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -4,8 +4,7 @@ set -eu
 
 if [ $# -lt 1 ]
 then
-    # Start @llama.cpp scenario
-    behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+    pytest -v -x
 else
-    behave "$@"
+    pytest "$@"
 fi
diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py
new file mode 100644
index 000000000..84db5ca1c
--- /dev/null
+++ b/examples/server/tests/unit/test_basic.py
@@ -0,0 +1,34 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_server_start_simple():
+    global server
+    server.start()
+    res = server.make_request("GET", "/health")
+    assert res.status_code == 200
+
+
+def test_server_props():
+    global server
+    server.start()
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["total_slots"] == server.n_slots
+
+
+def test_server_models():
+    global server
+    server.start()
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    assert len(res.body["data"]) == 1
+    assert res.body["data"][0]["id"] == server.model_alias
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
new file mode 100644
index 000000000..d7aeb288d
--- /dev/null
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -0,0 +1,129 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+@pytest.mark.parametrize(
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
+    [
+        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
+    ]
+)
+def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "model": model,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body["usage"]["prompt_tokens"] == n_prompt
+    assert res.body["usage"]["completion_tokens"] == n_predicted
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert match_regex(re_content, choice["message"]["content"])
+    if truncated:
+        assert choice["finish_reason"] == "length"
+    else:
+        assert choice["finish_reason"] == "stop"
+
+
+@pytest.mark.parametrize(
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
+    [
+        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
+    ]
+)
+def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "model": model,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        choice = data["choices"][0]
+        if choice["finish_reason"] in ["stop", "length"]:
+            assert data["usage"]["prompt_tokens"] == n_prompt
+            assert data["usage"]["completion_tokens"] == n_predicted
+            assert "content" not in choice["delta"]
+            assert match_regex(re_content, content)
+            # FIXME: not sure why this is incorrect in stream mode
+            # if truncated:
+            #   assert choice["finish_reason"] == "length"
+            # else:
+            #   assert choice["finish_reason"] == "stop"
+        else:
+            assert choice["finish_reason"] is None
+            content += choice["delta"]["content"]
+
+
+def test_chat_completion_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=8,
+        seed=42,
+        temperature=0.8,
+    )
+    print(res)
+    assert res.choices[0].finish_reason == "stop"
+    assert res.choices[0].message.content is not None
+    assert match_regex("(Suddenly)+", res.choices[0].message.content)
+
+
+@pytest.mark.parametrize("response_format,n_predicted,re_content", [
+    ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
+    ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
+    ({"type": "json_object"}, 10, "(\\{|John)+"),
+    ({"type": "sound"}, 0, None),
+    # invalid response format (expected to fail)
+    ({"type": "json_object", "schema": 123}, 0, None),
+    ({"type": "json_object", "schema": {"type": 123}}, 0, None),
+    ({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None),
+])
+def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predicted,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "Write an example"},
+        ],
+        "response_format": response_format,
+    })
+    if re_content is not None:
+        assert res.status_code == 200
+        choice = res.body["choices"][0]
+        assert match_regex(re_content, choice["message"]["content"])
+    else:
+        assert res.status_code != 200
+        assert "error" in res.body
+
diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
new file mode 100644
index 000000000..2fa30dd03
--- /dev/null
+++ b/examples/server/tests/unit/test_completion.py
@@ -0,0 +1,223 @@
+import pytest
+import time
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+])
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": prompt,
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["prompt_n"] == n_prompt
+    assert res.body["timings"]["predicted_n"] == n_predicted
+    assert res.body["truncated"] == truncated
+    assert match_regex(re_content, res.body["content"])
+
+
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+])
+def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": prompt,
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        if data["stop"]:
+            assert data["timings"]["prompt_n"] == n_prompt
+            assert data["timings"]["predicted_n"] == n_predicted
+            assert data["truncated"] == truncated
+            assert match_regex(re_content, content)
+        else:
+            content += data["content"]
+
+
+@pytest.mark.parametrize("n_slots", [1, 2])
+def test_consistent_result_same_seed(n_slots: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    last_res = None
+    for _ in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": 42,
+            "temperature": 1.0,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] == last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.parametrize("n_slots", [1, 2])
+def test_different_result_different_seed(n_slots: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    last_res = None
+    for seed in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": seed,
+            "temperature": 1.0,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] != last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.parametrize("n_batch", [16, 32])
+@pytest.mark.parametrize("temperature", [0.0, 1.0])
+def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
+    global server
+    server.n_batch = n_batch
+    server.start()
+    last_res = None
+    for _ in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": 42,
+            "temperature": temperature,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] == last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
+def test_cache_vs_nocache_prompt():
+    global server
+    server.start()
+    res_cache = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "seed": 42,
+        "temperature": 1.0,
+        "cache_prompt": True,
+    })
+    res_no_cache = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "seed": 42,
+        "temperature": 1.0,
+        "cache_prompt": False,
+    })
+    assert res_cache.body["content"] == res_no_cache.body["content"]
+
+
+def test_completion_with_tokens_input():
+    global server
+    server.temperature = 0.0
+    server.start()
+    prompt_str = "I believe the meaning of life is"
+    res = server.make_request("POST", "/tokenize", data={
+        "content": prompt_str,
+        "add_special": True,
+    })
+    assert res.status_code == 200
+    tokens = res.body["tokens"]
+
+    # single completion
+    res = server.make_request("POST", "/completion", data={
+        "prompt": tokens,
+    })
+    assert res.status_code == 200
+    assert type(res.body["content"]) == str
+
+    # batch completion
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [tokens, tokens],
+    })
+    assert res.status_code == 200
+    assert type(res.body) == list
+    assert len(res.body) == 2
+    assert res.body[0]["content"] == res.body[1]["content"]
+
+    # mixed string and tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [tokens, prompt_str],
+    })
+    assert res.status_code == 200
+    assert type(res.body) == list
+    assert len(res.body) == 2
+    assert res.body[0]["content"] == res.body[1]["content"]
+
+    # mixed string and tokens in one sequence
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
+    })
+    assert res.status_code == 200
+    assert type(res.body["content"]) == str
+
+
+@pytest.mark.parametrize("n_slots,n_requests", [
+    (1, 3),
+    (2, 2),
+    (2, 4),
+    (4, 2), # some slots must be idle
+    (4, 6),
+])
+def test_completion_parallel_slots(n_slots: int, n_requests: int):
+    global server
+    server.n_slots = n_slots
+    server.temperature = 0.0
+    server.start()
+
+    PROMPTS = [
+        ("Write a very long book.", "(very|special|big)+"),
+        ("Write another a poem.", "(small|house)+"),
+        ("What is LLM?", "(Dad|said)+"),
+        ("The sky is blue and I love it.", "(climb|leaf)+"),
+        ("Write another very long music lyrics.", "(friends|step|sky)+"),
+        ("Write a very long joke.", "(cat|Whiskers)+"),
+    ]
+    def check_slots_status():
+        should_all_slots_busy = n_requests >= n_slots
+        time.sleep(0.1)
+        res = server.make_request("GET", "/slots")
+        n_busy = sum([1 for slot in res.body if slot["is_processing"]])
+        if should_all_slots_busy:
+            assert n_busy == n_slots
+        else:
+            assert n_busy <= n_slots
+
+    tasks = []
+    for i in range(n_requests):
+        prompt, re_content = PROMPTS[i % len(PROMPTS)]
+        tasks.append((server.make_request, ("POST", "/completion", {
+            "prompt": prompt,
+            "seed": 42,
+            "temperature": 1.0,
+        })))
+    tasks.append((check_slots_status, ()))
+    results = parallel_function_calls(tasks)
+
+    # check results
+    for i in range(n_requests):
+        prompt, re_content = PROMPTS[i % len(PROMPTS)]
+        res = results[i]
+        assert res.status_code == 200
+        assert type(res.body["content"]) == str
+        assert len(res.body["content"]) > 10
+        # FIXME: the result is not deterministic when using other slot than slot 0
+        # assert match_regex(re_content, res.body["content"])
diff --git a/examples/server/tests/unit/test_ctx_shift.py b/examples/server/tests/unit/test_ctx_shift.py
new file mode 100644
index 000000000..be93a6d31
--- /dev/null
+++ b/examples/server/tests/unit/test_ctx_shift.py
@@ -0,0 +1,67 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+LONG_TEXT = """
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+""".strip()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.n_ctx = 256
+    server.n_slots = 2
+
+
+def test_ctx_shift_enabled():
+    # the prompt is 301 tokens
+    # the slot context is 256/2 = 128 tokens
+    # the prompt is truncated to keep the last 109 tokens
+    # 64 tokens are generated thanks to shifting the context when it gets full
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": LONG_TEXT,
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["prompt_n"] == 109
+    assert res.body["timings"]["predicted_n"] == 64
+    assert res.body["truncated"] is True
+
+
+@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
+    (64, 64, False),
+    (-1, 120, True),
+])
+def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
+    global server
+    server.disable_ctx_shift = True
+    server.n_predict = -1
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": "Hi how are you",
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["predicted_n"] == n_token_output
+    assert res.body["truncated"] == truncated
+
+
+def test_ctx_shift_disabled_long_prompt():
+    global server
+    server.disable_ctx_shift = True
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": LONG_TEXT,
+    })
+    assert res.status_code != 200
+    assert "error" in res.body
+    assert "exceeds the available context size" in res.body["error"]["message"]
diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py
new file mode 100644
index 000000000..fc7c20064
--- /dev/null
+++ b/examples/server/tests/unit/test_embedding.py
@@ -0,0 +1,99 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.bert_bge_small()
+
+EPSILON = 1e-3
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.bert_bge_small()
+
+
+def test_embedding_single():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": "I believe the meaning of life is",
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 1
+    assert 'embedding' in res.body['data'][0]
+    assert len(res.body['data'][0]['embedding']) > 1
+
+    # make sure embedding vector is normalized
+    assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
+
+
+def test_embedding_multiple():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "Write a joke about AI from a very long prompt which will not be truncated",
+            "This is a test",
+            "This is another test",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
+def test_embedding_openai_library_single():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
+    assert len(res.data) == 1
+    assert len(res.data[0].embedding) > 1
+
+
+def test_embedding_openai_library_multiple():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.embeddings.create(model="text-embedding-3-small", input=[
+        "I believe the meaning of life is",
+        "Write a joke about AI from a very long prompt which will not be truncated",
+        "This is a test",
+        "This is another test",
+    ])
+    assert len(res.data) == 4
+    for d in res.data:
+        assert len(d.embedding) > 1
+
+
+def test_embedding_error_prompt_too_long():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": "This is a test " * 512,
+    })
+    assert res.status_code != 200
+    assert "too large" in res.body["error"]["message"]
+
+
+def test_same_prompt_give_same_result():
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 5
+    for i in range(1, len(res.body['data'])):
+        v0 = res.body['data'][0]['embedding']
+        vi = res.body['data'][i]['embedding']
+        for x, y in zip(v0, vi):
+            assert abs(x - y) < EPSILON
diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
new file mode 100644
index 000000000..38ce6c429
--- /dev/null
+++ b/examples/server/tests/unit/test_infill.py
@@ -0,0 +1,35 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama_infill()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama_infill()
+
+def test_infill_without_input_extra():
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
+
+def test_infill_with_input_extra():
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_extra": [{
+            "filename": "llama.h",
+            "text": "LLAMA_API int32_t llama_n_threads();\n"
+        }],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py
new file mode 100644
index 000000000..749615449
--- /dev/null
+++ b/examples/server/tests/unit/test_lora.py
@@ -0,0 +1,42 @@
+import pytest
+import os
+from utils import *
+
+server = ServerPreset.stories15m_moe()
+
+LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.stories15m_moe()
+    # download lora file if needed
+    file_name = LORA_FILE_URL.split('/').pop()
+    lora_file = f'../../../{file_name}'
+    if not os.path.exists(lora_file):
+        print(f"Downloading {LORA_FILE_URL} to {lora_file}")
+        with open(lora_file, 'wb') as f:
+            f.write(requests.get(LORA_FILE_URL).content)
+        print(f"Done downloading lora file")
+    server.lora_files = [lora_file]
+
+
+@pytest.mark.parametrize("scale,re_content", [
+    # without applying lora, the model should behave like a bedtime story generator
+    (0.0, "(little|girl|three|years|old)+"),
+    # with lora, the model should behave like a Shakespearean text generator
+    (1.0, "(eye|love|glass|sun)+"),
+])
+def test_lora(scale: float, re_content: str):
+    global server
+    server.start()
+    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
+        {"id": 0, "scale": scale}
+    ])
+    assert res_lora_control.status_code == 200
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "Look in thy glass",
+    })
+    assert res.status_code == 200
+    assert match_regex(re_content, res.body["content"])
+
diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py
new file mode 100644
index 000000000..3a49fd3ac
--- /dev/null
+++ b/examples/server/tests/unit/test_rerank.py
@@ -0,0 +1,38 @@
+import pytest
+from utils import *
+
+server = ServerPreset.jina_reranker_tiny()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.jina_reranker_tiny()
+
+
+def test_rerank():
+    global server
+    server.start()
+    res = server.make_request("POST", "/rerank", data={
+        "query": "Machine learning is",
+        "documents": [
+            "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
+            "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
+            "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
+            "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
+        ]
+    })
+    assert res.status_code == 200
+    assert len(res.body["results"]) == 4
+
+    most_relevant = res.body["results"][0]
+    least_relevant = res.body["results"][0]
+    for doc in res.body["results"]:
+        if doc["relevance_score"] > most_relevant["relevance_score"]:
+            most_relevant = doc
+        if doc["relevance_score"] < least_relevant["relevance_score"]:
+            least_relevant = doc
+
+    assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
+    assert most_relevant["index"] == 2
+    assert least_relevant["index"] == 3
diff --git a/examples/server/tests/unit/test_security.py b/examples/server/tests/unit/test_security.py
new file mode 100644
index 000000000..620b25376
--- /dev/null
+++ b/examples/server/tests/unit/test_security.py
@@ -0,0 +1,83 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+TEST_API_KEY = "sk-this-is-the-secret-key"
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.api_key = TEST_API_KEY
+
+
+@pytest.mark.parametrize("endpoint", ["/health", "/models"])
+def test_access_public_endpoint(endpoint: str):
+    global server
+    server.start()
+    res = server.make_request("GET", endpoint)
+    assert res.status_code == 200
+    assert "error" not in res.body
+
+
+@pytest.mark.parametrize("api_key", [None, "invalid-key"])
+def test_incorrect_api_key(api_key: str):
+    global server
+    server.start()
+    res = server.make_request("POST", "/completions", data={
+        "prompt": "I believe the meaning of life is",
+    }, headers={
+        "Authorization": f"Bearer {api_key}" if api_key else None,
+    })
+    assert res.status_code == 401
+    assert "error" in res.body
+    assert res.body["error"]["type"] == "authentication_error"
+
+
+def test_correct_api_key():
+    global server
+    server.start()
+    res = server.make_request("POST", "/completions", data={
+        "prompt": "I believe the meaning of life is",
+    }, headers={
+        "Authorization": f"Bearer {TEST_API_KEY}",
+    })
+    assert res.status_code == 200
+    assert "error" not in res.body
+    assert "content" in res.body
+
+
+def test_openai_library_correct_api_key():
+    global server
+    server.start()
+    client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a chatbot."},
+            {"role": "user", "content": "What is the meaning of life?"},
+        ],
+    )
+    assert len(res.choices) == 1
+
+
+@pytest.mark.parametrize("origin,cors_header,cors_header_value", [
+    ("localhost", "Access-Control-Allow-Origin", "localhost"),
+    ("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"),
+    ("origin", "Access-Control-Allow-Credentials", "true"),
+    ("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"),
+    ("web.mydomain.fr", "Access-Control-Allow-Headers", "*"),
+])
+def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
+    global server
+    server.start()
+    res = server.make_request("OPTIONS", "/completions", headers={
+        "Origin": origin,
+        "Access-Control-Request-Method": "POST",
+        "Access-Control-Request-Headers": "Authorization",
+    })
+    assert res.status_code == 200
+    assert cors_header in res.headers
+    assert res.headers[cors_header] == cors_header_value
diff --git a/examples/server/tests/unit/test_slot_save.py b/examples/server/tests/unit/test_slot_save.py
new file mode 100644
index 000000000..38704f5ec
--- /dev/null
+++ b/examples/server/tests/unit/test_slot_save.py
@@ -0,0 +1,98 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.slot_save_path = "./tmp"
+    server.temperature = 0.0
+
+
+def test_slot_save_restore():
+    global server
+    server.start()
+
+    # First prompt in slot 1 should be fully processed
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
+
+    # Save state of slot 1
+    res = server.make_request("POST", "/slots/1?action=save", data={
+        "filename": "slot1.bin",
+    })
+    assert res.status_code == 200
+    assert res.body["n_saved"] == 84
+
+    # Since we have cache, this should only process the last tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 6  # only different part is processed
+
+    # Loading the saved cache into slot 0
+    res = server.make_request("POST", "/slots/0?action=restore", data={
+        "filename": "slot1.bin",
+    })
+    assert res.status_code == 200
+    assert res.body["n_restored"] == 84
+
+    # Since we have cache, slot 0 should only process the last tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 0,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 6  # only different part is processed
+
+    # For verification that slot 1 was not corrupted during slot 0 load, same thing should work
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 1
+
+
+def test_slot_erase():
+    global server
+    server.start()
+
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
+
+    # erase slot 1
+    res = server.make_request("POST", "/slots/1?action=erase")
+    assert res.status_code == 200
+
+    # re-run the same prompt, it should process all tokens again
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
diff --git a/examples/server/tests/unit/test_tokenize.py b/examples/server/tests/unit/test_tokenize.py
new file mode 100644
index 000000000..382457c9d
--- /dev/null
+++ b/examples/server/tests/unit/test_tokenize.py
@@ -0,0 +1,59 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_tokenize_detokenize():
+    global server
+    server.start()
+    # tokenize
+    content = "What is the capital of France ?"
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content
+    })
+    assert res_tok.status_code == 200
+    assert len(res_tok.body["tokens"]) > 5
+    # detokenize
+    res_detok = server.make_request("POST", "/detokenize", data={
+        "tokens": res_tok.body["tokens"],
+    })
+    assert res_detok.status_code == 200
+    assert res_detok.body["content"].strip() == content
+
+
+def test_tokenize_with_bos():
+    global server
+    server.start()
+    # tokenize
+    content = "What is the capital of France ?"
+    bosId = 1
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content,
+        "add_special": True,
+    })
+    assert res_tok.status_code == 200
+    assert res_tok.body["tokens"][0] == bosId
+
+
+def test_tokenize_with_pieces():
+    global server
+    server.start()
+    # tokenize
+    content = "This is a test string with unicode 媽 and emoji 🤗"
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content,
+        "with_pieces": True,
+    })
+    assert res_tok.status_code == 200
+    for token in res_tok.body["tokens"]:
+        assert "id" in token
+        assert token["id"] > 0
+        assert "piece" in token
+        assert len(token["piece"]) > 0
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
new file mode 100644
index 000000000..bc590bcb3
--- /dev/null
+++ b/examples/server/tests/utils.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# type: ignore[reportUnusedImport]
+
+import subprocess
+import os
+import re
+import json
+import sys
+import threading
+import requests
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import (
+    Any,
+    Callable,
+    ContextManager,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Tuple,
+    Set,
+)
+from re import RegexFlag
+
+
+class ServerResponse:
+    headers: dict
+    status_code: int
+    body: dict | Any
+
+
+class ServerProcess:
+    # default options
+    debug: bool = False
+    server_port: int = 8080
+    server_host: str = "127.0.0.1"
+    model_hf_repo: str = "ggml-org/models"
+    model_hf_file: str = "tinyllamas/stories260K.gguf"
+    model_alias: str = "tinyllama-2"
+    temperature: float = 0.8
+    seed: int = 42
+
+    # custom options
+    model_alias: str | None = None
+    model_url: str | None = None
+    model_file: str | None = None
+    n_threads: int | None = None
+    n_gpu_layer: int | None = None
+    n_batch: int | None = None
+    n_ubatch: int | None = None
+    n_ctx: int | None = None
+    n_ga: int | None = None
+    n_ga_w: int | None = None
+    n_predict: int | None = None
+    n_prompts: int | None = 0
+    slot_save_path: str | None = None
+    id_slot: int | None = None
+    cache_prompt: bool | None = None
+    n_slots: int | None = None
+    server_continuous_batching: bool | None = False
+    server_embeddings: bool | None = False
+    server_reranking: bool | None = False
+    server_metrics: bool | None = False
+    draft: int | None = None
+    api_key: str | None = None
+    response_format: str | None = None
+    lora_files: List[str] | None = None
+    disable_ctx_shift: int | None = False
+
+    # session variables
+    process: subprocess.Popen | None = None
+
+    def __init__(self):
+        if "N_GPU_LAYERS" in os.environ:
+            self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"])
+        if "DEBUG" in os.environ:
+            self.debug = True
+        if "PORT" in os.environ:
+            self.server_port = int(os.environ["PORT"])
+
+    def start(self, timeout_seconds: int = 10) -> None:
+        if "LLAMA_SERVER_BIN_PATH" in os.environ:
+            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
+        elif os.name == "nt":
+            server_path = "../../../build/bin/Release/llama-server.exe"
+        else:
+            server_path = "../../../build/bin/llama-server"
+        server_args = [
+            "--slots",  # requires to get slot status via /slots endpoint
+            "--host",
+            self.server_host,
+            "--port",
+            self.server_port,
+            "--temp",
+            self.temperature,
+            "--seed",
+            self.seed,
+        ]
+        if self.model_file:
+            server_args.extend(["--model", self.model_file])
+        if self.model_url:
+            server_args.extend(["--model-url", self.model_url])
+        if self.model_hf_repo:
+            server_args.extend(["--hf-repo", self.model_hf_repo])
+        if self.model_hf_file:
+            server_args.extend(["--hf-file", self.model_hf_file])
+        if self.n_batch:
+            server_args.extend(["--batch-size", self.n_batch])
+        if self.n_ubatch:
+            server_args.extend(["--ubatch-size", self.n_ubatch])
+        if self.n_threads:
+            server_args.extend(["--threads", self.n_threads])
+        if self.n_gpu_layer:
+            server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
+        if self.draft is not None:
+            server_args.extend(["--draft", self.draft])
+        if self.server_continuous_batching:
+            server_args.append("--cont-batching")
+        if self.server_embeddings:
+            server_args.append("--embedding")
+        if self.server_reranking:
+            server_args.append("--reranking")
+        if self.server_metrics:
+            server_args.append("--metrics")
+        if self.model_alias:
+            server_args.extend(["--alias", self.model_alias])
+        if self.n_ctx:
+            server_args.extend(["--ctx-size", self.n_ctx])
+        if self.n_slots:
+            server_args.extend(["--parallel", self.n_slots])
+        if self.n_predict:
+            server_args.extend(["--n-predict", self.n_predict])
+        if self.slot_save_path:
+            server_args.extend(["--slot-save-path", self.slot_save_path])
+        if self.n_ga:
+            server_args.extend(["--grp-attn-n", self.n_ga])
+        if self.n_ga_w:
+            server_args.extend(["--grp-attn-w", self.n_ga_w])
+        if self.debug:
+            server_args.append("--verbose")
+        if self.lora_files:
+            for lora_file in self.lora_files:
+                server_args.extend(["--lora", lora_file])
+        if self.disable_ctx_shift:
+            server_args.extend(["--no-context-shift"])
+        if self.api_key:
+            server_args.extend(["--api-key", self.api_key])
+
+        args = [str(arg) for arg in [server_path, *server_args]]
+        print(f"bench: starting server with: {' '.join(args)}")
+
+        flags = 0
+        if "nt" == os.name:
+            flags |= subprocess.DETACHED_PROCESS
+            flags |= subprocess.CREATE_NEW_PROCESS_GROUP
+            flags |= subprocess.CREATE_NO_WINDOW
+
+        self.process = subprocess.Popen(
+            [str(arg) for arg in [server_path, *server_args]],
+            creationflags=flags,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env={**os.environ, "LLAMA_CACHE": "tmp"},
+        )
+        server_instances.add(self)
+
+        def server_log(in_stream, out_stream):
+            for line in iter(in_stream.readline, b""):
+                print(line.decode("utf-8"), end="", file=out_stream)
+
+        thread_stdout = threading.Thread(
+            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
+        )
+        thread_stdout.start()
+
+        thread_stderr = threading.Thread(
+            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
+        )
+        thread_stderr.start()
+
+        print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
+
+        # wait for server to start
+        start_time = time.time()
+        while time.time() - start_time < timeout_seconds:
+            try:
+                response = self.make_request("GET", "/slots", headers={
+                    "Authorization": f"Bearer {self.api_key}" if self.api_key else None
+                })
+                if response.status_code == 200:
+                    self.ready = True
+                    return  # server is ready
+            except Exception as e:
+                pass
+            print(f"Waiting for server to start...")
+            time.sleep(0.5)
+        raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
+
+    def stop(self) -> None:
+        server_instances.remove(self)
+        if self.process:
+            print(f"Stopping server with pid={self.process.pid}")
+            self.process.kill()
+            self.process = None
+
+    def make_request(
+        self,
+        method: str,
+        path: str,
+        data: dict | Any | None = None,
+        headers: dict | None = None,
+    ) -> ServerResponse:
+        url = f"http://{self.server_host}:{self.server_port}{path}"
+        parse_body = False
+        if method == "GET":
+            response = requests.get(url, headers=headers)
+            parse_body = True
+        elif method == "POST":
+            response = requests.post(url, headers=headers, json=data)
+            parse_body = True
+        elif method == "OPTIONS":
+            response = requests.options(url, headers=headers)
+        else:
+            raise ValueError(f"Unimplemented method: {method}")
+        result = ServerResponse()
+        result.headers = dict(response.headers)
+        result.status_code = response.status_code
+        result.body = response.json() if parse_body else None
+        print("Response from server", result.body)
+        return result
+
+    def make_stream_request(
+        self,
+        method: str,
+        path: str,
+        data: dict | None = None,
+        headers: dict | None = None,
+    ) -> Iterator[dict]:
+        url = f"http://{self.server_host}:{self.server_port}{path}"
+        if method == "POST":
+            response = requests.post(url, headers=headers, json=data, stream=True)
+        else:
+            raise ValueError(f"Unimplemented method: {method}")
+        for line_bytes in response.iter_lines():
+            line = line_bytes.decode("utf-8")
+            if '[DONE]' in line:
+                break
+            elif line.startswith('data: '):
+                data = json.loads(line[6:])
+                print("Partial response from server", data)
+                yield data
+
+
+server_instances: Set[ServerProcess] = set()
+
+
+class ServerPreset:
+    @staticmethod
+    def tinyllama2() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "tinyllamas/stories260K.gguf"
+        server.model_alias = "tinyllama-2"
+        server.n_ctx = 256
+        server.n_batch = 32
+        server.n_slots = 2
+        server.n_predict = 64
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def bert_bge_small() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
+        server.model_alias = "bert-bge-small"
+        server.n_ctx = 512
+        server.n_batch = 128
+        server.n_ubatch = 128
+        server.n_slots = 2
+        server.seed = 42
+        server.server_embeddings = True
+        return server
+
+    @staticmethod
+    def tinyllama_infill() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
+        server.model_alias = "tinyllama-infill"
+        server.n_ctx = 2048
+        server.n_batch = 1024
+        server.n_slots = 1
+        server.n_predict = 64
+        server.temperature = 0.0
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def stories15m_moe() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/stories15M_MOE"
+        server.model_hf_file = "stories15M_MOE-F16.gguf"
+        server.model_alias = "stories15m-moe"
+        server.n_ctx = 2048
+        server.n_batch = 1024
+        server.n_slots = 1
+        server.n_predict = 64
+        server.temperature = 0.0
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def jina_reranker_tiny() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
+        server.model_alias = "jina-reranker"
+        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
+        server.n_ctx = 512
+        server.n_batch = 512
+        server.n_slots = 1
+        server.seed = 42
+        server.server_reranking = True
+        return server
+
+
+def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
+    """
+    Run multiple functions in parallel and return results in the same order as calls. Equivalent to Promise.all in JS.
+
+    Example usage:
+
+    results = parallel_function_calls([
+        (func1, (arg1, arg2)),
+        (func2, (arg3, arg4)),
+    ])
+    """
+    results = [None] * len(function_list)
+    exceptions = []
+
+    def worker(index, func, args):
+        try:
+            result = func(*args)
+            results[index] = result
+        except Exception as e:
+            exceptions.append((index, str(e)))
+
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i, (func, args) in enumerate(function_list):
+            future = executor.submit(worker, i, func, args)
+            futures.append(future)
+
+        # Wait for all futures to complete
+        for future in as_completed(futures):
+            pass
+
+    # Check if there were any exceptions
+    if exceptions:
+        print("Exceptions occurred:")
+        for index, error in exceptions:
+            print(f"Function at index {index}: {error}")
+
+    return results
+
+
+def match_regex(regex: str, text: str) -> bool:
+    return (
+        re.compile(
+            regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL
+        ).search(text)
+        is not None
+    )

From 904109ed0d97c9b656a5e8bf612925f739bb8166 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Tue, 26 Nov 2024 09:45:05 -0600
Subject: [PATCH 316/396] vulkan: fix group_norm (#10496)

Fix bad calculation of the end of the range. Add a backend test that
covers the bad case (taken from stable diffusion).

Fixes https://github.com/leejet/stable-diffusion.cpp/issues/439.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp                | 2 +-
 ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp | 2 +-
 tests/test-backend-ops.cpp                          | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 49527fdf4..da1cfd24e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7157,7 +7157,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t max_period = tensor->op_params[1];
         tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
     } else if (tensor->op == GGML_OP_POOL_2D) {
-        enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
+        enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
         const int32_t k0 = tensor->op_params[1];
         const int32_t k1 = tensor->op_params[2];
         const int32_t s0 = tensor->op_params[3];
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
index 5ad9b28da..b6a0d5645 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
@@ -19,7 +19,7 @@ void main() {
 
     const uint tid = gl_LocalInvocationID.x;
     const uint start = gl_WorkGroupID.x * group_size + tid;
-    const uint end = start + group_size;
+    const uint end = (gl_WorkGroupID.x + 1) * group_size;
 
     tmp[tid] = 0.0f;
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6376b0e4c..da66ed856 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3796,7 +3796,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_upscale());
     test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
     test_cases.emplace_back(new test_upscale_ext());
-    test_cases.emplace_back(new test_group_norm());
+    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
+    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
     test_cases.emplace_back(new test_pad());
     test_cases.emplace_back(new test_arange());

From 249cd93da3df9c8fa78869b0522526d1625aca91 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Wed, 27 Nov 2024 00:00:41 +0800
Subject: [PATCH 317/396] mtgpu: Add MUSA_DOCKER_ARCH in Dockerfiles && update
 cmake and make (#10516)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 .devops/full-musa.Dockerfile         |  9 ++++++++-
 .devops/llama-cli-musa.Dockerfile    |  9 ++++++++-
 .devops/llama-server-musa.Dockerfile |  9 ++++++++-
 Makefile                             |  9 +++++----
 ggml/src/ggml-musa/CMakeLists.txt    | 11 ++++++++++-
 5 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile
index 575e81b48..3193fea1e 100644
--- a/.devops/full-musa.Dockerfile
+++ b/.devops/full-musa.Dockerfile
@@ -6,6 +6,9 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 
@@ -19,7 +22,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .
 
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
index 3372749be..e7c75af20 100644
--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential git cmake
 
@@ -15,7 +18,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-cli -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
index eb67201c1..cebe51d42 100644
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
@@ -15,7 +18,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release --target llama-server -j$(nproc) && \
     mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
diff --git a/Makefile b/Makefile
index cfc74c1dc..9a079a237 100644
--- a/Makefile
+++ b/Makefile
@@ -815,7 +815,7 @@ ifdef GGML_MUSA
 	else
 		MUSA_PATH ?= /opt/musa
 	endif
-	MTGPU_TARGETS ?= mp_21 mp_22
+	MUSA_ARCHITECTURES ?= 21;22
 
 	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
 	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
@@ -834,7 +834,8 @@ ifdef GGML_MUSA
 	CXX := $(MUSA_PATH)/bin/clang++
 	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
 
-	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
+	MUSAFLAGS  = -x musa -mtgpu
+	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
 
 ifdef GGML_CUDA_FORCE_MMQ
 	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
@@ -878,14 +879,14 @@ ggml/src/ggml-cuda/ggml-cuda.o: \
 	ggml/src/ggml-backend-impl.h \
 	ggml/src/ggml-common.h \
 	$(wildcard ggml/src/ggml-cuda/*.cuh)
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
 
 ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-cuda/%.cu \
 	ggml/include/ggml.h \
 	ggml/src/ggml-common.h \
 	ggml/src/ggml-cuda/common.cuh
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
 endif # GGML_MUSA
 
 ifdef GGML_METAL
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
index e1a69186e..415b2b2e0 100644
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -20,6 +20,11 @@ find_package(MUSAToolkit)
 if (MUSAToolkit_FOUND)
     message(STATUS "MUSA Toolkit found")
 
+    if (NOT DEFINED MUSA_ARCHITECTURES)
+        set(MUSA_ARCHITECTURES "21;22")
+    endif()
+    message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
+
     file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
 
@@ -44,7 +49,11 @@ if (MUSAToolkit_FOUND)
 
     set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
     foreach(SOURCE ${GGML_SOURCES_MUSA})
-        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
+        set(COMPILE_FLAGS "-x musa -mtgpu")
+        foreach(ARCH ${MUSA_ARCHITECTURES})
+            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
+        endforeach()
+        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
     endforeach()
 
     ggml_add_backend_library(ggml-musa

From be0e350c8b69632b27d5fb41fa064fa256dd7fbf Mon Sep 17 00:00:00 2001
From: Tristan Druyen <tristan@vault81.mozmail.com>
Date: Tue, 26 Nov 2024 19:27:28 +0100
Subject: [PATCH 318/396] Fix HIP flag inconsistency & build docs (#10524)

* Fix inconsistency of HIP flags in cmake & make

* Fix docs regarding GGML_HIP
---
 Makefile      | 4 ++--
 docs/build.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 9a079a237..25214ec05 100644
--- a/Makefile
+++ b/Makefile
@@ -752,7 +752,7 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 
 endif # GGML_VULKAN
 
-ifdef GGML_HIPBLAS
+ifdef GGML_HIP
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH      ?= /usr
 		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -807,7 +807,7 @@ ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-common.h \
 	ggml/src/ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # GGML_HIPBLAS
+endif # GGML_HIP
 
 ifdef GGML_MUSA
 	ifeq ($(wildcard /opt/musa),)
diff --git a/docs/build.md b/docs/build.md
index 359952b30..72b810437 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -221,7 +221,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 - Using `make`:
   ```bash
-  make GGML_HIPBLAS=1
+  make GGML_HIP=1
   ```
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
@@ -249,7 +249,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 - Using `make` (example for target gfx1030, build with 16 CPU threads):
   ```bash
-  make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+  make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
   ```
 
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):

From 30ec39832165627dd6ed98938df63adfc6e6a21a Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 21:01:47 +0100
Subject: [PATCH 319/396] llama : disable warnings for 3rd party sha1
 dependency (#10527)

---
 examples/gguf-hash/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
index 633f45535..7a494ce32 100644
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@@ -4,10 +4,17 @@ install(TARGETS ${TARGET} RUNTIME)
 
 # clibs dependencies
 include_directories(deps/)
+
 add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
 target_link_libraries(${TARGET} PRIVATE xxhash)
+
 add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
 target_link_libraries(${TARGET} PRIVATE sha1)
+if (NOT MSVC)
+    # disable warnings in 3rd party code
+    target_compile_options(sha1 PRIVATE -w)
+endif()
+
 add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
 target_link_libraries(${TARGET} PRIVATE sha256)
 

From 5a349f2809dc825960dfcfdf8f76b19cd0345be7 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 21:13:54 +0100
Subject: [PATCH 320/396] ci : remove nix workflows (#10526)

---
 .github/workflows/nix-ci-aarch64.yml    | 72 ----------------------
 .github/workflows/nix-ci.yml            | 81 -------------------------
 .github/workflows/nix-flake-update.yml  | 22 -------
 .github/workflows/nix-publish-flake.yml | 36 -----------
 4 files changed, 211 deletions(-)
 delete mode 100644 .github/workflows/nix-ci-aarch64.yml
 delete mode 100644 .github/workflows/nix-ci.yml
 delete mode 100644 .github/workflows/nix-flake-update.yml
 delete mode 100644 .github/workflows/nix-publish-flake.yml

diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
deleted file mode 100644
index 0da6acdf1..000000000
--- a/.github/workflows/nix-ci-aarch64.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-build-aarch64:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
deleted file mode 100644
index 3fe941576..000000000
--- a/.github/workflows/nix-ci.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml
deleted file mode 100644
index 3a6a96e26..000000000
--- a/.github/workflows/nix-flake-update.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml
deleted file mode 100644
index 2c3c1ebda..000000000
--- a/.github/workflows/nix-publish-flake.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"

From de5097351caffb3deaea3393633609df49ef41d0 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Tue, 26 Nov 2024 12:55:29 -0800
Subject: [PATCH 321/396] Add OLMo 2 model in docs (#10530)

* Add link to OLMo 2 model in docs

* Change link to landing page
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5f7933c13..414c5b1c0 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMo 2](https://allenai.org/olmo)
 - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

From c9b00a70b080d5c0668608024afc3e0e2fed822f Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Tue, 26 Nov 2024 22:12:10 +0100
Subject: [PATCH 322/396] ci : fix cuda releases (#10532)

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c6aecec6e..399641c4a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -728,7 +728,7 @@ jobs:
             cmake --build build --config ${{ matrix.build }} -j $(nproc)
 
   windows-latest-cmake:
-    runs-on: windows-2019
+    runs-on: windows-latest
 
     env:
       OPENBLAS_VERSION: 0.3.23
@@ -929,7 +929,7 @@ jobs:
 
   windows-2019-cmake-cuda:
     runs-on: windows-2019
-    if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
     strategy:
       matrix:

From 4a57d362e1948ada50af997a92c3cbff9711e78b Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:00:50 -0600
Subject: [PATCH 323/396] vulkan: optimize Q2_K and Q3_K mul_mat_vec (#10459)

---
 .../vulkan-shaders/mul_mat_vec_q2_k.comp      | 74 +++++++++++++------
 .../vulkan-shaders/mul_mat_vec_q3_k.comp      | 46 +++++++++---
 2 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
index e2625d32b..fcf02210e 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -1,4 +1,5 @@
 #version 450
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 #include "mul_mat_vec_base.comp"
 
@@ -32,38 +33,67 @@ void main() {
     const uint s_offset = 8*v_im;
     const uint y_offset = 128*v_im + l0;
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         const uint y_idx = i * QUANT_K + y_offset;
 
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
+        f16vec2 d = data_a[ib0 + i].d;
+        const FLOAT_TYPE dall = d.x;
+        const FLOAT_TYPE dmin = d.y;
+
+        B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
+        B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
+        B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
+        B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
+        B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
+        B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
+        B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
+        B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
+
+        uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
+        uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
+
+        uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
+        uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
+        uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
+        uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
+
+        uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
+        uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
+        uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
+        uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
+
+        uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
+        uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
+        uvec2 qs0 =  uvec2(unpack8(qs0_u16));
+        uvec2 qs16 = uvec2(unpack8(qs16_u16));
 
         FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
         FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +  0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1))))))));
-            sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +  0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF),
-                   fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2))))))));
+        [[unroll]] for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 0) & 3),
+                   fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
+                   fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 2) & 3),
+                   fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
+                   fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 4) & 3),
+                   fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
+                   fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 6) & 3),
+                   fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
+            sum2 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_hi4[0]),
+                   fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_hi4[1]),
+                   fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_hi4[2]),
+                   fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_hi4[3]),
+                   fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_hi4[0]),
+                   fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_hi4[1]),
+                   fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_hi4[2]),
+                   fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
         }
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx]));
+        temp = fma(dall, sum1, fma(-dmin, sum2, temp));
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
index a28804533..723fadde0 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -1,4 +1,5 @@
 #version 450
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 #include "mul_mat_vec_base.comp"
 
@@ -33,7 +34,7 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 128*v_im + l0;
 
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     const uint s_shift = 4 * v_im;
 
@@ -42,21 +43,44 @@ void main() {
 
         const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
 
+        B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
+        B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
+        B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
+        B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
+        B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
+        B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
+        B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
+        B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
+
+        uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
+        uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
+        uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
+        uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
+        uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
+        uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
+        u8vec2 s0 = unpack8(s0_16);
+        u8vec2 s2 = unpack8(s2_16);
+        u8vec2 s4 = unpack8(s4_16);
+        u8vec2 s6 = unpack8(s6_16);
+        u8vec2 s8 = unpack8(s8_16);
+        u8vec2 s10 = unpack8(s10_16);
+
         FLOAT_TYPE sum = FLOAT_TYPE(0.0);
         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +  0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
+            sum = fma(FLOAT_TYPE(b0[l])   * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b32[l])  * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b64[l])  * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b96[l])  * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b16[l])  * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b48[l])  * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b80[l])  * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
+                  fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
         }
-        const uint tmp_idx = 16 * ix + tid;
-        tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]);
+        temp = fma(d, sum, temp);
     }
 
+    tmp[gl_LocalInvocationID.x] = temp;
+
     // sum up partial sums and write back result
     barrier();
     [[unroll]] for (uint s = 16; s > 0; s >>= 1) {

From 71a64989a5d2e25c13507efada145f12cf358914 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:08:54 -0600
Subject: [PATCH 324/396] vulkan: skip integer div/mod in get_offsets for
 batch_idx==0 (#10506)

---
 .../vulkan-shaders/mul_mat_vec_base.comp            | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
index 8d0a57913..2ec1af5c7 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
@@ -52,13 +52,16 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
 #endif
 
 #ifndef MUL_MAT_ID
-    const uint i13 = batch_idx / p.ne12;
-    const uint i12 = batch_idx % p.ne12;
+    uint batch_idx_a = 0;
+    if (batch_idx != 0) {
+        const uint i13 = batch_idx / p.ne12;
+        const uint i12 = batch_idx % p.ne12;
 
-    const uint i03 = i13 / p.broadcast3;
-    const uint i02 = i12 / p.broadcast2;
+        const uint i03 = i13 / p.broadcast3;
+        const uint i02 = i12 / p.broadcast2;
 
-    const uint batch_idx_a = i03 * p.ne02 + i02;
+        batch_idx_a = i03 * p.ne02 + i02;
+    }
 #else
     const uint expert_id = data_ids[expert_idx];
 #endif

From 249a7902ec710c8d027b9cc0ed10219d2b4184f8 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:21:59 -0600
Subject: [PATCH 325/396] vulkan: further optimize q5_k mul_mat_vec (#10479)

---
 .../vulkan-shaders/mul_mat_vec_q5_k.comp      | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
index 22a6bfae4..b455cbd31 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
@@ -34,9 +34,6 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 64*v_im + l0;
 
-    const uint8_t hm1 = uint8_t(1 << (2*v_im));
-    const uint8_t hm2 = uint8_t(hm1 << 4);
-
     FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
@@ -71,6 +68,18 @@ void main() {
         uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
         uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
 
+        uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
+
+        uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
+        uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
+        uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
+        uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
+
+        qs0_16_u32_lo4 += qs0_16_lo4_offset16;
+        qs0_16_u32_hi4 += qs0_16_hi4_offset16;
+        qs64_80_u32_lo4 += qs64_80_lo4_offset16;
+        qs64_80_u32_hi4 += qs64_80_hi4_offset16;
+
         uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
         uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
         uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
@@ -102,31 +111,26 @@ void main() {
         B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
         B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
 
-        uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2];
-        uint32_t qh1 = qh0 >> 8;
-        uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8];
-        uint32_t qh17 = qh16 >> 8;
-
         const FLOAT_TYPE sx =
-          fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by10.x), q4_0,
+          fma(FLOAT_TYPE(by10.y), q4_1,
+          fma(FLOAT_TYPE(by116.x), q4_2,
+             FLOAT_TYPE(by116.y) * q4_3)));
         const FLOAT_TYPE sy =
-          fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by132.x), q4_4,
+          fma(FLOAT_TYPE(by132.y), q4_5,
+          fma(FLOAT_TYPE(by148.x), q4_6,
+             FLOAT_TYPE(by148.y) * q4_7)));
         const FLOAT_TYPE sz =
-          fma(FLOAT_TYPE(by20.x), (q4_8  + (((qh0 & hm2) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by20.y), (q4_9  + (((qh1 & hm2) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by20.x), q4_8,
+          fma(FLOAT_TYPE(by20.y), q4_9,
+          fma(FLOAT_TYPE(by216.x), q4_10,
+             FLOAT_TYPE(by216.y) * q4_11)));
         const FLOAT_TYPE sw =
-          fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)),
-          fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)),
-             FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0)))));
+          fma(FLOAT_TYPE(by232.x), q4_12,
+          fma(FLOAT_TYPE(by232.y), q4_13,
+          fma(FLOAT_TYPE(by248.x), q4_14,
+             FLOAT_TYPE(by248.y) * q4_15)));
         const FLOAT_TYPE smin =
           fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
           fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,

From 5b3466bedfa84aa29c6871c7254467550186ecc6 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:30:27 -0600
Subject: [PATCH 326/396] vulkan: Handle GPUs with less shared memory (#10468)

There have been reports of failure to compile on systems with <= 32KB
of shared memory (e.g. #10037). This change makes the large tile size
fall back to a smaller size if necessary, and makes mul_mat_id fall
back to CPU if there's only 16KB of shared memory.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 158 +++++++++++++++++----------
 1 file changed, 103 insertions(+), 55 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index da1cfd24e..a833007fb 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1232,8 +1232,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     std::cerr << "ggml_vulkan: Compiling shaders";
 
     // mulmat
-    std::vector<uint32_t> l_warptile, m_warptile, s_warptile, l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
-    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms;
+    std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
+                          l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
+    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms,
+                            l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms;
     uint32_t l_align, m_align, s_align;
 
     l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
@@ -1244,14 +1246,48 @@ static void ggml_vk_load_shaders(vk_device& device) {
     m_warptile_mmq = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
     s_warptile_mmq = { std::max(device->subgroup_size, 16u),  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
 
-    l_wg_denoms = {128, 128, 1 };
-    m_wg_denoms = { 64,  64, 1 };
-    s_wg_denoms = { 32,  32, 1 };
+    l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
+    m_mmq_wg_denoms = m_wg_denoms = { 64,  64, 1 };
+    s_mmq_wg_denoms = s_wg_denoms = { 32,  32, 1 };
 
     l_align = 128;
     m_align =  64;
     s_align =  32;
 
+    // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders
+    // and tile sizes, this should handle 16KB, 32KB, and 48KB+.
+    // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders.
+    // But the numbers happen to work out for 32KB shared memory size that when using the medium
+    // size there's enough room for everything, and we assert for this.
+    uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
+    if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
+        l_warptile = m_warptile;
+        l_wg_denoms = m_wg_denoms;
+        shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
+        GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+    if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+        // assert mul_mat_mat_id shaders will fit.
+        GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+
+    shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
+    if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
+        if (device->properties.limits.maxComputeSharedMemorySize == 32768) {
+            l_warptile_mmq = m_warptile_mmq;
+            l_mmq_wg_denoms = m_mmq_wg_denoms;
+        } else {
+            l_warptile_mmq = s_warptile_mmq;
+            l_mmq_wg_denoms = s_mmq_wg_denoms;
+        }
+        shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
+        GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+    if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+        // assert mul_mat_mat_id shaders will fit.
+        GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
+    }
+
     device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
 
@@ -1299,35 +1335,38 @@ static void ggml_vk_load_shaders(vk_device& device) {
         CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
         CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
+        if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        }
 #undef CREATE_MM
     } else {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
@@ -1344,35 +1383,38 @@ static void ggml_vk_load_shaders(vk_device& device) {
         CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
         CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
+        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
 
-        CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
-        CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
+        if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
+            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
+            CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
-        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
+        }
 #undef CREATE_MM
     }
 
@@ -6541,6 +6583,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
+                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+                if (op->op == GGML_OP_MUL_MAT_ID &&
+                    ggml_vk_get_device(ctx->device)->properties.limits.maxComputeSharedMemorySize < 32768) {
+                    // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
+                    return false;
+                }
                 switch (op->src[0]->type) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16:

From c31ed2abfce05c38a2a5189586bfae45a139a547 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 27 Nov 2024 01:32:54 -0600
Subject: [PATCH 327/396] vulkan: define all quant data structures in
 types.comp (#10440)

---
 .../src/ggml-vulkan/vulkan-shaders/types.comp | 147 +++++++++---------
 1 file changed, 76 insertions(+), 71 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
index bc28e0ab8..eecc47f3a 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -30,10 +30,8 @@
 #endif
 #endif
 
-#if defined(DATA_A_Q4_0)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q4_0 32
+#define QUANT_R_Q4_0 2
 
 struct block_q4_0
 {
@@ -46,14 +44,15 @@ struct block_q4_0_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q4_0)
+#define QUANT_K QUANT_K_Q4_0
+#define QUANT_R QUANT_R_Q4_0
 #define A_TYPE block_q4_0
 #define A_TYPE_PACKED16 block_q4_0_packed16
 #endif
 
-#if defined(DATA_A_Q4_1)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q4_1 32
+#define QUANT_R_Q4_1 2
 
 struct block_q4_1
 {
@@ -69,15 +68,15 @@ struct block_q4_1_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q4_1)
+#define QUANT_K QUANT_K_Q4_1
+#define QUANT_R QUANT_R_Q4_1
 #define A_TYPE block_q4_1
 #define A_TYPE_PACKED16 block_q4_1_packed16
 #endif
 
-#if defined(DATA_A_Q5_0)
-#extension GL_EXT_shader_16bit_storage : require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q5_0 32
+#define QUANT_R_Q5_0 2
 
 struct block_q5_0
 {
@@ -93,15 +92,15 @@ struct block_q5_0_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q5_0)
+#define QUANT_K QUANT_K_Q5_0
+#define QUANT_R QUANT_R_Q5_0
 #define A_TYPE block_q5_0
 #define A_TYPE_PACKED16 block_q5_0_packed16
 #endif
 
-#if defined(DATA_A_Q5_1)
-#extension GL_EXT_shader_16bit_storage : require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_Q5_1 32
+#define QUANT_R_Q5_1 2
 
 struct block_q5_1
 {
@@ -119,14 +118,15 @@ struct block_q5_1_packed16
     uint16_t qs[16/2];
 };
 
+#if defined(DATA_A_Q5_1)
+#define QUANT_K QUANT_K_Q5_1
+#define QUANT_R QUANT_R_Q5_1
 #define A_TYPE block_q5_1
 #define A_TYPE_PACKED16 block_q5_1_packed16
 #endif
 
-#if defined(DATA_A_Q8_0)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 1
+#define QUANT_K_Q8_0 32
+#define QUANT_R_Q8_0 1
 
 struct block_q8_0
 {
@@ -139,164 +139,164 @@ struct block_q8_0_packed16
     uint16_t qs[32/2];
 };
 
+#if defined(DATA_A_Q8_0)
+#define QUANT_K QUANT_K_Q8_0
+#define QUANT_R QUANT_R_Q8_0
 #define A_TYPE block_q8_0
 #define A_TYPE_PACKED16 block_q8_0_packed16
 #endif
 
 // K-quants
-#if defined(DATA_A_Q2_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q2_K 256
 
 struct block_q2_K
 {
-    uint8_t scales[QUANT_K/16];
-    uint8_t qs[QUANT_K/4];
+    uint8_t scales[QUANT_K_Q2_K/16];
+    uint8_t qs[QUANT_K_Q2_K/4];
     f16vec2 d;
 };
 
 struct block_q2_K_packed16
 {
-    uint16_t scales[QUANT_K/16/2];
-    uint16_t qs[QUANT_K/4/2];
+    uint16_t scales[QUANT_K_Q2_K/16/2];
+    uint16_t qs[QUANT_K_Q2_K/4/2];
     f16vec2 d;
 };
 
 struct block_q2_K_packed32
 {
-    uint32_t scales[QUANT_K/16/4];
-    uint32_t qs[QUANT_K/4/4];
+    uint32_t scales[QUANT_K_Q2_K/16/4];
+    uint32_t qs[QUANT_K_Q2_K/4/4];
     f16vec2 d;
 };
 
+#if defined(DATA_A_Q2_K)
+#define QUANT_K QUANT_K_Q2_K
 #define A_TYPE block_q2_K
 #define A_TYPE_PACKED16 block_q2_K_packed16
 #define A_TYPE_PACKED32 block_q2_K_packed32
 #endif
 
-#if defined(DATA_A_Q3_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q3_K 256
 
 struct block_q3_K
 {
-    uint8_t hmask[QUANT_K/8];
-    uint8_t qs[QUANT_K/4];
+    uint8_t hmask[QUANT_K_Q3_K/8];
+    uint8_t qs[QUANT_K_Q3_K/4];
     uint8_t scales[12];
     float16_t d;
 };
 
 struct block_q3_K_packed16
 {
-    uint16_t hmask[QUANT_K/8/2];
-    uint16_t qs[QUANT_K/4/2];
+    uint16_t hmask[QUANT_K_Q3_K/8/2];
+    uint16_t qs[QUANT_K_Q3_K/4/2];
     uint16_t scales[12/2];
     float16_t d;
 };
 
+#if defined(DATA_A_Q3_K)
+#define QUANT_K QUANT_K_Q3_K
 #define A_TYPE block_q3_K
 #define A_TYPE_PACKED16 block_q3_K_packed16
 #endif
 
-#if defined(DATA_A_Q4_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q4_K 256
 
 struct block_q4_K
 {
     f16vec2 d;
-    uint8_t scales[3*QUANT_K/64];
-    uint8_t qs[QUANT_K/2];
+    uint8_t scales[3*QUANT_K_Q4_K/64];
+    uint8_t qs[QUANT_K_Q4_K/2];
 };
 
 struct block_q4_K_packed16
 {
     f16vec2 d;
-    uint16_t scales[3*QUANT_K/64/2];
-    uint16_t qs[QUANT_K/2/2];
+    uint16_t scales[3*QUANT_K_Q4_K/64/2];
+    uint16_t qs[QUANT_K_Q4_K/2/2];
 };
 
 struct block_q4_K_packed32
 {
     f16vec2 d;
-    uint32_t scales[3*QUANT_K/64/4];
-    uint32_t qs[QUANT_K/2/4];
+    uint32_t scales[3*QUANT_K_Q4_K/64/4];
+    uint32_t qs[QUANT_K_Q4_K/2/4];
 };
 
+#if defined(DATA_A_Q4_K)
+#define QUANT_K QUANT_K_Q4_K
 #define A_TYPE block_q4_K
 #define A_TYPE_PACKED16 block_q4_K_packed16
 #define A_TYPE_PACKED32 block_q4_K_packed32
 #endif
 
-#if defined(DATA_A_Q5_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q5_K 256
 
 struct block_q5_K
 {
     f16vec2 d;
     uint8_t scales[12];
-    uint8_t qh[QUANT_K/8];
-    uint8_t qs[QUANT_K/2];
+    uint8_t qh[QUANT_K_Q5_K/8];
+    uint8_t qs[QUANT_K_Q5_K/2];
 };
 
 struct block_q5_K_packed16
 {
     f16vec2 d;
     uint16_t scales[12/2];
-    uint16_t qh[QUANT_K/8/2];
-    uint16_t qs[QUANT_K/2/2];
+    uint16_t qh[QUANT_K_Q5_K/8/2];
+    uint16_t qs[QUANT_K_Q5_K/2/2];
 };
 
+#if defined(DATA_A_Q5_K)
+#define QUANT_K QUANT_K_Q5_K
 #define A_TYPE block_q5_K
 #define A_TYPE_PACKED16 block_q5_K_packed16
 #endif
 
-#if defined(DATA_A_Q6_K)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 256
+#define QUANT_K_Q6_K 256
 
 struct block_q6_K
 {
-    uint8_t ql[QUANT_K/2];
-    uint8_t qh[QUANT_K/4];
-    int8_t scales[QUANT_K/16];
+    uint8_t ql[QUANT_K_Q6_K/2];
+    uint8_t qh[QUANT_K_Q6_K/4];
+    int8_t scales[QUANT_K_Q6_K/16];
     float16_t d;
 };
 
 struct block_q6_K_packed16
 {
-    uint16_t ql[QUANT_K/2/2];
-    uint16_t qh[QUANT_K/4/2];
-    int8_t scales[QUANT_K/16];
+    uint16_t ql[QUANT_K_Q6_K/2/2];
+    uint16_t qh[QUANT_K_Q6_K/4/2];
+    int8_t scales[QUANT_K_Q6_K/16];
     float16_t d;
 };
 
+#if defined(DATA_A_Q6_K)
+#define QUANT_K QUANT_K_Q6_K
 #define A_TYPE block_q6_K
 #define A_TYPE_PACKED16 block_q6_K_packed16
 #endif
 
 // IQuants
 
-#if defined(DATA_A_IQ4_NL)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
+#define QUANT_K_IQ4_NL 32
+#define QUANT_R_IQ4_NL 2
 
 struct block_iq4_nl
 {
     float16_t d;
-    uint8_t qs[QUANT_K/2];
+    uint8_t qs[QUANT_K_IQ4_NL/2];
 };
 
 struct block_iq4_nl_packed16
 {
     float16_t d;
-    uint16_t qs[QUANT_K/2/2];
+    uint16_t qs[QUANT_K_IQ4_NL/2/2];
 };
 
-#define A_TYPE block_iq4_nl
-#define A_TYPE_PACKED16 block_iq4_nl_packed16
+#if defined(DATA_A_IQ4_NL)
 
 const int8_t kvalues_iq4nl_const[16] = {
     int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
@@ -313,6 +313,11 @@ void init_iq4nl_shmem()
     }
     barrier();
 }
+
+#define QUANT_K QUANT_K_IQ4_NL
+#define QUANT_R QUANT_R_IQ4_NL
+#define A_TYPE block_iq4_nl
+#define A_TYPE_PACKED16 block_iq4_nl_packed16
 #endif
 
 #endif // !defined(GGML_TYPES_COMP)

From 9150f8fef95327474d39ccd6c6e30787e85f3529 Mon Sep 17 00:00:00 2001
From: Frankie Robertson <frankier@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:50:26 +0200
Subject: [PATCH 328/396] Do not include arm_neon.h when compiling CUDA code
 (ggml/1028)

---
 ggml/src/ggml-impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 3965be787..f39b7a88c 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -14,7 +14,7 @@
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/

From fee824a1a1e35b5c49d482f654613addade61764 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 27 Nov 2024 11:10:42 +0200
Subject: [PATCH 329/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index d101d2b57..fd71a0a46 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58
+c598cbe30621251e80acbcf3b601589a37c17f4d

From 9e2301f4a4ef1690bd99360c11de43fe830b1c8d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 27 Nov 2024 11:22:14 +0200
Subject: [PATCH 330/396] metal : fix group_norm support condition (#0)

---
 ggml/src/ggml-metal/ggml-metal.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index c47f07a9e..ae6b25ede 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -997,9 +997,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
             return has_simdgroup_reduction;
+        case GGML_OP_RMS_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
             return true;
@@ -2672,7 +2673,6 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_GROUP_NORM:
             {
-                GGML_ASSERT(ne00 % 4 == 0);
                 GGML_ASSERT(ggml_is_contiguous(src0));
 
                 float eps;

From 46c69e0e752ff16206347bb12f96ed69f4a01abf Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 27 Nov 2024 11:03:25 +0100
Subject: [PATCH 331/396] ci : faster CUDA toolkit installation method and use
 ccache (#10537)

* ci : faster CUDA toolkit installation method and use ccache

* remove fetch-depth

* only pack CUDA runtime on master
---
 .github/workflows/build.yml | 117 +++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 399641c4a..e02b5c620 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -892,12 +892,12 @@ jobs:
             cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
             cmake --build build
 
-  windows-latest-cmake-cuda:
-    runs-on: windows-latest
+  windows-2019-cmake-cuda:
+    runs-on: windows-2019
 
     strategy:
       matrix:
-        cuda: ['12.6.2']
+        cuda: ['12.4', '11.7']
         build: ['cuda']
 
     steps:
@@ -905,13 +905,66 @@ jobs:
         id: checkout
         uses: actions/checkout@v4
 
-      - name: Install CUDA toolkit
-        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.19
+      - name: Install Cuda Toolkit 11.7
+        if: ${{ matrix.cuda == '11.7' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install Cuda Toolkit 12.4
+        if: ${{ matrix.cuda == '12.4' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2
         with:
-          cuda: ${{ matrix.cuda }}
-          method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
 
       - name: Install Ninja
         id: install_ninja
@@ -922,44 +975,12 @@ jobs:
         id: cmake_build
         shell: cmd
         run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
-          cmake --build build --config Release -t ggml-cuda
+          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release
 
-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
-    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-    strategy:
-      matrix:
-        cuda: ['12.2.0', '11.7.1']
-        build: ['cuda']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install CUDA toolkit
-        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.15
-        with:
-          cuda: ${{ matrix.cuda }}
-          method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
-          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
       - name: Determine tag name
         id: tag
         shell: bash
@@ -987,10 +1008,12 @@ jobs:
           name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 
       - name: Copy and pack Cuda runtime
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
         run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          echo "Cuda install location: ${{ env.CUDA_PATH }}"
           $dst='.\build\bin\cudart\'
-          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
           7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
 
       - name: Upload Cuda runtime

From 3ad5451f3b75809e3033e4e577b9f60bcaf6676a Mon Sep 17 00:00:00 2001
From: uvos <devnull@uvos.xyz>
Date: Wed, 27 Nov 2024 17:10:08 +0100
Subject: [PATCH 332/396] Add some minimal optimizations for CDNA (#10498)

* Add some minimal optimizations for CDNA

* ggml_cuda: set launch bounds also for GCN as it helps there too
---
 ggml/src/ggml-cuda/common.cuh    | 17 ++++++++++++++---
 ggml/src/ggml-cuda/ggml-cuda.cu  | 11 ++++++++++-
 ggml/src/ggml-cuda/mmq.cu        |  2 +-
 ggml/src/ggml-cuda/mmq.cuh       |  4 ++--
 ggml/src/ggml-cuda/mmvq.cu       |  2 +-
 ggml/src/ggml-cuda/vendors/hip.h |  8 ++++++++
 6 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index b0dd16066..535118d87 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -47,9 +47,20 @@
 #define CC_TURING     750
 #define CC_AMPERE     800
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
+
+// GCN/CNDA, wave size is 64
+#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
+#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+
+// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+
 #define CC_QY1        210
 #define CC_QY2        220
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2a78a4393..d6e4bfdd0 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
 
+        cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+        if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+            cu_compute_type = CUBLAS_COMPUTE_32F;
+        }
+
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
         CUBLAS_CHECK(
             cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
@@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas(
                     &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
                                 src1_ptr,       CUDA_R_16F, ne10,
                     &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
+                    cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t      cu_data_type    = CUDA_R_16F;
 
+    if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+    }
+
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index ae5c68ab3..7f7c8c90b 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 425acb20d..8d8867121 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 735975c16..02d150983 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;
 
-    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 1f3c70c2e..3205534d6 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -95,6 +95,14 @@
 
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
+#define CDNA
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3

From 9f912511bc9414fa7a3c521378b6388cd932b58d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 27 Nov 2024 22:30:52 +0100
Subject: [PATCH 333/396] common : fix duplicated file name with hf_repo and
 hf_file (#10550)

---
 common/arg.cpp                 |  6 +++-
 common/common.cpp              | 54 +++++++++++++++++-----------------
 common/common.h                | 13 ++++++--
 examples/server/tests/utils.py |  1 -
 4 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 272492e50..a6b7a1394 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = params.hf_repo + "_" + params.hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            params.model = fs_get_cache_file(filename);
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
diff --git a/common/common.cpp b/common/common.cpp
index 09ec9f238..2b2f00098 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
 }
 
 struct llama_model * common_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
-    if (!common_download_file(model_url, path_model, hf_token)) {
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }
 
@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
             /*.no_alloc = */ true,
             /*.ctx      = */ NULL,
         };
-        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
-            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
+            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, local_path.c_str());
             return NULL;
         }
 
@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
         // Verify the first split file format
         // and extract split URL and PATH prefixes
         {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
                 return NULL;
             }
 
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
                 return NULL;
             }
         }
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(path_model, params);
+    return llama_load_model_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * repo,
-        const char * model,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url += model;
+    model_url += remote_path;
 
-    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
 #else
 
 struct llama_model * common_load_model_from_url(
-        const char * /*model_url*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * /*repo*/,
-        const char * /*model*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
diff --git a/common/common.h b/common/common.h
index 286642db2..9b1508a15 100644
--- a/common/common.h
+++ b/common/common.h
@@ -470,8 +470,17 @@ struct llama_model_params     common_model_params_to_llama  (      common_params
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index bc590bcb3..e31743c50 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -319,7 +319,6 @@ class ServerPreset:
         server.model_hf_repo = "ggml-org/models"
         server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
         server.model_alias = "jina-reranker"
-        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
         server.n_ctx = 512
         server.n_batch = 512
         server.n_slots = 1

From b7420131bf8ab3e067bc660439ab1ab18be7edbd Mon Sep 17 00:00:00 2001
From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:24:46 +0800
Subject: [PATCH 334/396] CANN: ROPE operator optimization (#10540)

* [cann] ROPE operator optimization

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 308 +++++++++++++++++++++----------
 ggml/src/ggml-cann/ggml-cann.cpp |   9 -
 2 files changed, 211 insertions(+), 106 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index d7472ee3a..d707efc5d 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -21,22 +21,23 @@
  */
 
 #include "aclnn_ops.h"
-#include "ggml-impl.h"
 
+#include <aclnnop/aclnn_addcdiv.h>
 #include <aclnnop/aclnn_avgpool2d.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_cast.h>
 #include <aclnnop/aclnn_constant_pad_nd.h>
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_div.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
-#include <aclnnop/aclnn_mm.h>
-#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
+#include <aclnnop/aclnn_mm.h>
 #include <aclnnop/aclnn_permute.h>
 #include <aclnnop/aclnn_pow_tensor_tensor.h>
 #include <aclnnop/aclnn_reduce_sum.h>
@@ -56,6 +57,7 @@
 #include <exception>
 #include <vector>
 
+#include "ggml-impl.h"
 #include "kernels/ascendc_kernels.h"
 
 #define GGML_COMMON_DECL_C
@@ -1103,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
 }
 
 /**
- * @brief Creates an ACL tensor initialized with ones using a provided buffer.
+ * @brief Creates an ACL tensor initialized with value using a provided buffer.
  *
- * This function initializes a tensor with ones using the specified buffer and
+ * This function initializes a tensor with value using the specified buffer and
  * tensor parameters.
  *
  * @param ctx The context for the CANN backend operations.
@@ -1118,12 +1120,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
  * @param type_size The size of each element in the tensor data type.
  * @param value The value to be used for initializing the tensor (default
  * is 1.0).
- * @return An ACL tensor initialized with ones.
+ * @return An ACL tensor initialized with value.
  */
-static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
-                             size_t n_bytes, int64_t* ne, int64_t dims,
-                             aclDataType type, size_t type_size,
-                             float value = 1.0f) {
+static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
+                               size_t n_bytes, int64_t* ne, int64_t dims,
+                               aclDataType type, size_t type_size,
+                               float value = 1.0f) {
     aclTensor* acl_tensor =
         aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
     float alpha_host = 1.0f;
@@ -1165,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
     ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
 
-    aclTensor* acl_gamma = aclnn_ones(
+    aclTensor* acl_gamma = aclnn_values(
         ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
         ggml_cann_type_mapping(src->type), ggml_element_size(src));
 
@@ -1209,9 +1211,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
 
     aclTensor* mask_tensor =
-        aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
-                   GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                   ggml_element_size(src), value);
+        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
+                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
+                     ggml_element_size(src), value);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -1768,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs element-wise division of tensor1 by tensor2 , multiplies the
+ result by the scalar value and adds it to self .
+ *
+ * Performs element-wise division of tensor1 by tensor2,
+ * multiplies the result by the scalar value and adds it to self .
+ * The operation is defined as:
+ * \f[
+ *     \text{out}_i = \text{selft}_i + \text{value} \times
+ \frac{\text{tensor1}_i}{\text{tensor2}_i}
+ * \f]
+
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_self The source tensor on which the addcdiv function will be
+ applied.
+ * @param tensor1 Numerator tensor.
+ * @param tensor2 Denominator tensor.
+ * @param value The value to be used for coefficient.
+ */
+static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
+                                  aclTensor* acl_self, aclTensor* tensor1,
+                                  aclTensor* tensor2, float value) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
+        acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+}
+
+/**
+ * @brief Matrix division, optionally in-place.
+ *
+ * This function division each element of the source tensor `acl_src` by the
+ * tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * If `inplace` is true, `acl_dst` will not be used and the operation is
+ * performed in-place on `acl_src`. The operation is defined as: \f[
+ *     \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor..
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                             aclTensor* acl_other, aclTensor* acl_dst,
+                             bool inplace) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    if (inplace) {
+        ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
+                                                  &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+    } else {
+        ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
+                                           &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(
+            aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    }
+}
+
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                                   ggml_tensor* dst) {
     const ggml_tensor* src = dst->src[0];
@@ -2311,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                ctx.stream()));
 
     switch (src0->type) {
-        case GGML_TYPE_F32:
-        {
+        case GGML_TYPE_F32: {
 #ifdef ASCEND_310P
-             // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            // Special operation for get_row_f32 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
             if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
+                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
                 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
 #endif
@@ -2329,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->nb);
             break;
         }
-        case GGML_TYPE_F16:
-        {
+        case GGML_TYPE_F16: {
 #ifdef ASCEND_310P
-             // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            // Special operation for get_row_f16 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
             if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                size_t dst_len =
+                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
+                    ggml_type_size(
+                        GGML_TYPE_F32);  // out is also f32, even input is f16
                 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
 #endif
@@ -2459,8 +2551,9 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
  * @param acl_dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
-                             aclTensor* acl_weight, aclTensor* acl_dst) {
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
     int8_t cube_math_type = 2;
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -2475,8 +2568,7 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
         workspaceAddr = workspace_allocator.get();
     }
 
-    ACL_CHECK(
-        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
 /**
@@ -2496,8 +2588,9 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
  * @param acl_dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
-                             aclTensor* acl_weight, aclTensor* acl_dst) {
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
     int8_t cube_math_type = 2;
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -2548,31 +2641,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
 
     aclTensor* acl_input_tensor =
         ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
-    int64_t transpose_ne[] = {
-        bcast_weight_ne[1], bcast_weight_ne[0],
-        bcast_weight_ne[2], bcast_weight_ne[3],
-        bcast_weight_ne[4], bcast_weight_ne[5]
-    };
-    size_t transpose_nb[] = {
-        bcast_weight_nb[1], bcast_weight_nb[0],
-        bcast_weight_nb[2], bcast_weight_nb[3],
-        bcast_weight_nb[4], bcast_weight_nb[5]
-    };
+    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
+                              bcast_weight_ne[2], bcast_weight_ne[3],
+                              bcast_weight_ne[4], bcast_weight_ne[5]};
+    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
+                             bcast_weight_nb[2], bcast_weight_nb[3],
+                             bcast_weight_nb[4], bcast_weight_nb[5]};
     aclTensor* acl_weight_tensor =
         ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
     aclTensor* acl_dst =
         ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
 
     switch (n_dims) {
-    case 2:
-        aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-        break;
-    case 3:
-        aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-        break;
-    default:
-        aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-        break;
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
     }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
@@ -2594,8 +2683,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
  * multiplication will be stored.
  */
 static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst,
-                                   const enum ggml_type type) {
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
@@ -2617,14 +2706,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
     // scale stored at the end of weight. Also need transpose.
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
+                         scale_elem_size};
     size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size,  input_ne[0] * input_elem_size};
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
     size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
     void* input_buffer = src1->data;
@@ -2632,7 +2722,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
+        input_buffer =
+            input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2642,9 +2733,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer,
-            ACL_FLOAT16,
-            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
+            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
+            input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
 
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2655,7 +2745,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     size_t output_elem_size = sizeof(uint16_t);
     size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
     ggml_cann_pool_alloc output_allocator(ctx.pool());
-    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    void* output_buffer =
+        output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
     size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
@@ -2679,7 +2770,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
             // first split
             int64_t weight_ne_offset = 0;
-            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t weight_ne[2] = {
+                max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
+                src0->ne[0]};
             int64_t scale_ne_offset = 0;
             int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
             int64_t output_ne_offset = 0;
@@ -2687,24 +2780,21 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type),
-                weight_elem_size, weight_ne, weight_nb, 2,
-                ACL_FORMAT_ND, weight_ne_offset);
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride,
-                ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2,
-                ACL_FORMAT_ND, scale_ne_offset);
+                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride,
-                ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2,
-                ACL_FORMAT_ND, output_ne_offset);
+                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-                nullptr, nullptr, nullptr, nullptr, QK8_0,
-                acl_output_tensor, &workspaceSize, &executor));
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
+                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
+                &workspaceSize, &executor));
             if (workspaceAddr == nullptr) {
                 workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
@@ -2717,28 +2807,29 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
             // other splits
             for (int64_t split = 1; split < split_size; split++) {
-                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
-                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                weight_ne_offset +=
+                    weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
+                                   ? src0->ne[1] - (max_elem_size * split)
+                                   : max_elem_size;
                 scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
                 scale_ne[0] = weight_ne[0];
-                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne_offset +=
+                    output_elem_size * output_ne[0] * output_ne[1];
                 output_ne[0] = weight_ne[0];
 
                 acl_weight_tensor = ggml_cann_create_tensor(
                     (char*)src0->data + batch0 * weight_stride,
-                    ggml_cann_type_mapping(type),
-                    weight_elem_size, weight_ne, weight_nb, 2,
-                    ACL_FORMAT_ND, weight_ne_offset);
+                    ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                    weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
                 acl_scale_tensor = ggml_cann_create_tensor(
-                    scale_offset + batch0 * scale_stride,
-                    ACL_FLOAT16,
-                    scale_elem_size, scale_ne, scale_nb, 2,
-                    ACL_FORMAT_ND, scale_ne_offset);
+                    scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                    scale_ne_offset);
                 acl_output_tensor = ggml_cann_create_tensor(
-                    (char*)output_buffer + batch1 * output_stride,
-                    ACL_FLOAT16,
-                    output_elem_size, output_ne, output_nb, 2,
-                    ACL_FORMAT_ND, output_ne_offset);
+                    (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                    output_ne_offset);
 
                 ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                     acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
@@ -2766,11 +2857,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-            output_buffer,
-            ACL_FLOAT16,
-            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+            output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
+            output_cast_nb, GGML_MAX_DIMS);
         aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
+                   ggml_cann_type_mapping(dst->type));
 
         ACL_CHECK(aclDestroyTensor(acl_output_tensor));
         ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
@@ -2873,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              aclTensor* acl_cos_repeat_tensor,
                              aclTensor* acl_sin_repeat_tensor,
-                             float theta_scale, bool is_neox) {
+                             float theta_scale, float freq_scale,
+                             bool is_neox) {
     // int sin/cos cache, cache has different repeat method depond on
     // @param.is_neox
 
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src1 = dst->src[1];  // position
+    ggml_tensor* src2 = dst->src[2];  // freq_factors
 
     // arange, [0,1,...,ne0/2]
     int64_t arange_length = src0->ne[0] / 2;
@@ -2907,11 +3000,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
                                                arange_length * sizeof(float_t));
     void* theta_scale_buffer = theta_scale_allocator.get();
-    aclTensor* acl_theta_scale_tensor = aclnn_ones(
+    aclTensor* acl_theta_scale_tensor = aclnn_values(
         ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
         GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
     aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
 
+    // freq_scale
+    if (freq_scale != 1) {
+        aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
+    }
+
+    // freq_factors
+    if (src2) {
+        aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
+            src2->data, ggml_cann_type_mapping(src2->type),
+            ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
+        aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
+                         nullptr, true);
+    }
+
     // position
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     int64_t position_length = src1->ne[0];
@@ -2940,6 +3047,16 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
               acl_theta_tensor);
 
+    // // power[] * position[] * freq_scale / freq_factors[]
+    // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
+    //                                            theta_length *
+    //                                            sizeof(float_t));
+    // aclTensor* acl_theat_final_tensor = aclnn_zero(
+    //     ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
+    //     theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
+    // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
+    //                       acl_freq_factors_tensor, freq_scale);
+
     // permute: [0,1,2,3]->[0,2,1,3]
     int64_t permute_ne[] = {arange_length, 1, position_length, 1};
     size_t permute_nb[GGML_MAX_DIMS];
@@ -3038,8 +3155,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
     // TODO: attn_factor != 1
     GGML_ASSERT(attn_factor == 1);
     // TODO: n_dims <= ne0
@@ -3047,8 +3162,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(n_dims % 2 == 0);
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
-    // TODO: freq_scale != 1
-    GGML_ASSERT(freq_scale == 1);
     // TODO: type == GGML_TYPE_F16
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
@@ -3081,7 +3194,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
-                     theta_scale, is_neox);
+                     theta_scale, freq_scale, is_neox);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -3096,7 +3209,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
     ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
-        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_dst, &workspaceSize, &executor));
     if (workspaceSize > 0) {
         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
         workspaceAddr = workspace_allocator.get();
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index c7a3419c7..bcb54e444 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1738,13 +1738,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
-            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
             float * ext_factor = (float*)((int32_t*)op->op_params + 7);
             float * attn_factor = (float*)((int32_t*)op->op_params + 8);
-            // TODO: with freq_factors
-            if (op->src[2] != NULL) {
-                return false;
-            }
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
@@ -1753,10 +1748,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
-            // TODO: freq_scale != 1
-            if (*freq_scale != 1) {
-                return false;
-            }
             // TODO: attn_factor != 1
             if (*attn_factor != 1) {
                 return false;

From 605fa66c509f9f117bd654cf0b9b3ea08bb86e80 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Thu, 28 Nov 2024 15:25:24 +0800
Subject: [PATCH 335/396] CANN: Fix SOC_TYPE compile bug (#10519)

* CANN: Fix the bug build fail on Ascend310P under two cases:
1) Manual specify SOC_TYPE
2) Under some unusual compile environment

* Update the cann backend News content: Support F16 and F32 data type model for Ascend 310P NPU.

* fix CANN  compile fail bug: the assert in ascend kernel function doesn't supportted on some CANN version
---
 docs/backend/CANN.md                            |  3 +++
 ggml/src/ggml-cann/CMakeLists.txt               |  7 ++++---
 ggml/src/ggml-cann/kernels/CMakeLists.txt       |  2 +-
 ggml/src/ggml-cann/kernels/dup.cpp              |  1 -
 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp     | 16 ++++++++++++----
 .../src/ggml-cann/kernels/quantize_f16_q8_0.cpp | 10 ++++++++++
 .../src/ggml-cann/kernels/quantize_f32_q8_0.cpp | 10 ++++++++++
 .../kernels/quantize_float_to_q4_0.cpp          | 17 +++++++++++++++++
 8 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
index 6bdd9d2da..496e05807 100644
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
 ## News
 
+- 2024.11
+  - Support F16 and F32 data type model for Ascend 310P NPU.
 - 2024.8
   - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
 - 2024.7
@@ -43,6 +45,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 | Ascend NPU                    | Status  |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2                 | Support |
+| Atlas 300I Duo                | Support |
 
 *Notes:*
 
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
index 901327185..05cf06bfa 100644
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
     detect_ascend_soc_type(SOC_VERSION)
     set(SOC_TYPE "${SOC_VERSION}")
     message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
-else()
-    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 endif()
 
-# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
 
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt
index 6a4e17cce..d687220c3 100644
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
 ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp
index 99f03e058..c7ba38d10 100644
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -20,7 +20,6 @@ class DupByRows {
         // Input has four dims.
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
-        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
index 377211096..4fbe72208 100644
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -2,6 +2,15 @@
 
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support 4bit get row
+    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support 4bit get row.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 
@@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
-#ifdef ASCEND_310P
-        // TODO: 310P support quantification
-#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-#endif
+
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);
 
@@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
             indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
index 8423b3f02..504b43afa 100644
--- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@@ -1,6 +1,14 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P
+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f16->8bit quantization.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 #define QK8_0 32
@@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
index b7c575093..05b0bc1df 100644
--- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@@ -1,6 +1,14 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support f32->8bit quantization
+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f32->8bit quantization.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 #define QK8_0 32
@@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
index 9c8c86b66..1188937b7 100644
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -1,6 +1,21 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support float->4bit quantization
+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f32->4bit quantization.\n");
+    }
+
+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f16->4bit quantization.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 #define Group_Size 32
@@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P

From c6bc73951ed52466392b1abda98c28ecbe522c7f Mon Sep 17 00:00:00 2001
From: Ruixin Huang <18860020911@163.com>
Date: Thu, 28 Nov 2024 15:27:11 +0800
Subject: [PATCH 336/396] CANN: Update cann.md to display correctly in CLion
 (#10538)

---
 docs/backend/CANN.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
index 496e05807..23f10175a 100644
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -42,6 +42,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 ### Ascend NPU
 
 **Verified devices**
+
 | Ascend NPU                    | Status  |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2                 | Support |

From 2025fa67e94358deda4740a74fe9803916cb2f60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20L=C3=B3pez?= <slp@redhat.com>
Date: Thu, 28 Nov 2024 12:51:38 +0100
Subject: [PATCH 337/396] kompute : improve backend to pass test_backend_ops
 (#10542)

* kompute: op_unary: reject unsupported parameters

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: softmax: implement ALiBi support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: rope: implement neox and phi3 support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_q4_k permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_[q4_0|q4_1|q8_0] permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_f16 permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

* kompute: op_mul_mat_q6_k permutted support

Signed-off-by: Sergio Lopez <slp@redhat.com>

---------

Signed-off-by: Sergio Lopez <slp@redhat.com>
---
 ggml/src/ggml-kompute/CMakeLists.txt          |  12 +-
 ggml/src/ggml-kompute/ggml-kompute.cpp        | 176 ++++++++++++------
 .../ggml-kompute/kompute-shaders/common.comp  |   1 +
 .../kompute-shaders/op_mul_mat_f16.comp       |   6 +-
 .../kompute-shaders/op_mul_mat_q4_k.comp      |  19 +-
 .../kompute-shaders/op_mul_mat_q6_k.comp      |  24 ++-
 .../kompute-shaders/op_mul_mv_q_n.comp        |  14 +-
 .../kompute-shaders/op_mul_mv_q_n_pre.comp    |   8 +-
 .../kompute-shaders/op_rope_f16.comp          |  73 --------
 .../kompute-shaders/op_rope_f32.comp          |  73 --------
 .../kompute-shaders/op_rope_neox_f16.comp     |  52 ++++++
 .../kompute-shaders/op_rope_neox_f32.comp     |  52 ++++++
 .../kompute-shaders/op_rope_norm_f16.comp     |  52 ++++++
 .../kompute-shaders/op_rope_norm_f32.comp     |  52 ++++++
 .../kompute-shaders/op_softmax.comp           |  20 +-
 .../kompute-shaders/rope_common.comp          |   2 +
 16 files changed, 403 insertions(+), 233 deletions(-)
 delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
 delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
 create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp

diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt
index dc623926c..c9109d5e8 100644
--- a/ggml/src/ggml-kompute/CMakeLists.txt
+++ b/ggml/src/ggml-kompute/CMakeLists.txt
@@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         kompute-shaders/op_getrows_q4_0.comp
         kompute-shaders/op_getrows_q4_1.comp
         kompute-shaders/op_getrows_q6_k.comp
-        kompute-shaders/op_rope_f16.comp
-        kompute-shaders/op_rope_f32.comp
+        kompute-shaders/op_rope_norm_f16.comp
+        kompute-shaders/op_rope_norm_f32.comp
+        kompute-shaders/op_rope_neox_f16.comp
+        kompute-shaders/op_rope_neox_f32.comp
         kompute-shaders/op_cpy_f16_f16.comp
         kompute-shaders/op_cpy_f16_f32.comp
         kompute-shaders/op_cpy_f32_f16.comp
@@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         shaderop_getrows_q4_0.h
         shaderop_getrows_q4_1.h
         shaderop_getrows_q6_k.h
-        shaderop_rope_f16.h
-        shaderop_rope_f32.h
+        shaderop_rope_norm_f16.h
+        shaderop_rope_norm_f32.h
+        shaderop_rope_neox_f16.h
+        shaderop_rope_neox_f32.h
         shaderop_cpy_f16_f16.h
         shaderop_cpy_f16_f32.h
         shaderop_cpy_f32_f16.h
diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
index 24566404d..28ceecfc4 100644
--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -28,8 +28,10 @@
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
 #include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope_f16.h"
-#include "shaderop_rope_f32.h"
+#include "shaderop_rope_norm_f16.h"
+#include "shaderop_rope_norm_f32.h"
+#include "shaderop_rope_neox_f16.h"
+#include "shaderop_rope_neox_f32.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
 #include "shaderop_cpy_f32_f16.h"
@@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
         vk::DescriptorPoolSize(
           vk::DescriptorType::eStorageBuffer,
-          3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+          4 * size // Descriptor count is number of possible tensors to pass into an algorithm
           )
     };
 
@@ -788,7 +790,8 @@ static void ggml_vk_soft_max(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
-    float scale
+    float scale, float max_bias, float m0, float m1,
+    uint32_t n_head_log2
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
         kp::shader_data::op_softmax_comp_spv_len);
@@ -796,12 +799,14 @@ static void ggml_vk_soft_max(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        float scale;
+        float scale, max_bias, m0, m1;
+        uint32_t n_head_log2;
         int32_t mask;
     } pushConsts {
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        scale,
+        scale, max_bias, m0, m1,
+        n_head_log2,
         bool(inB)
     };
 
@@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02,
-    uint32_t nb00, uint32_t nb01, uint32_t nb02,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
-    uint32_t nb10, uint32_t nb11, uint32_t nb12,
+    uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
     int32_t ne0, int32_t ne1,
     uint32_t r2, uint32_t r3
 ) {
@@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        uint32_t nb00, nb01, nb02;
+        uint32_t nb00, nb01, nb02, nb03;
         int32_t ne10, ne11, ne12;
-        uint32_t nb10, nb11, nb12;
+        uint32_t nb10, nb11, nb12, nb13;
         int32_t ne0, ne1;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        nb00, nb01, nb02,
+        nb00, nb01, nb02, nb03,
         ne10, ne11, ne12,
-        nb10, nb11, nb12,
+        nb10, nb11, nb12, nb13,
         ne0, ne1,
         r2, r3
     };
@@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
     int32_t ne00, int32_t ne01, int32_t ne02,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
     int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
     uint32_t r2, uint32_t r3
 ) {
     struct PushConstants {
@@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
         int32_t ne00, ne01, ne02;
         int32_t ne10, ne12;
         int32_t ne0, ne1;
+        uint32_t nb01, nb02, nb03;
+        uint32_t nb11, nb12, nb13;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
         ne10, ne12,
         ne0, ne1,
+        nb01, nb02, nb03,
+        nb11, nb12, nb13,
         r2, r3
     };
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
@@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10,
-    int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0,
-    int32_t ne1, int32_t r2, int32_t r3
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
         kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
-        0, 0, 0,
-        ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
         kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
         inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        const uint32_t local_x = 2;
+        const uint32_t local_y = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1217,10 +1244,11 @@ static void ggml_vk_rope(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& inC,
     const std::shared_ptr<kp::Tensor>& out,
-    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
     ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
-    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
+    float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
     int32_t ne01, int32_t ne02, int32_t ne03,
     uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne0,
@@ -1228,11 +1256,17 @@ static void ggml_vk_rope(
 ) {
     GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
 
-    static const auto spirv_f16 = getSpirvShader(
-        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
+    static const auto spirv_norm_f16 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
     );
-    static const auto spirv_f32 = getSpirvShader(
-        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
+    static const auto spirv_norm_f32 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
+    );
+    static const auto spirv_neox_f16 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
+    );
+    static const auto spirv_neox_f32 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
     );
 
     int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@@ -1247,32 +1281,40 @@ static void ggml_vk_rope(
     GGML_ASSERT(nb0  % type_size == 0);
 
     struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
+        uint32_t inAOff, inBOff, inCOff, outOff;
         int32_t n_dims, mode, n_ctx_orig;
-        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+        float freq_base, freq_scale;
+        bool has_freq_factors;
+        float ext_factor, attn_factor, beta_fast, beta_slow;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
-        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
+        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
         n_dims, mode, n_ctx_orig,
-        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+        freq_base, freq_scale,
+        has_freq_factors,
+        ext_factor, attn_factor, beta_fast, beta_slow,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
     };
 
-    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
+    auto & inC_ = inC ? inC : inA;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_f16 = src0t == GGML_TYPE_F16;
+
+    auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
+        auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
         s_algo = komputeManager()->algorithm<float, PushConstants>(
-            name, s_kompute_context->pool.get(), {inA, inB, out},
-            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
+            name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
             {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
         );
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
-        s_algo->setTensors({inA, inB, out});
+        s_algo->setTensors({inA, inB, inC_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
 }
 
 static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    int64_t n = ggml_nelements(op);
     switch (op->op) {
         case GGML_OP_UNARY:
+            if (n % 4 != 0) return false;
             switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU:
+                    if (n % 8 != 0) return false;
+                    // fall through
+                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_SILU:
                     return ggml_is_contiguous(op->src[0]);
                 default:
@@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
 
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q6_K:
                     return op->ne[3] == 1;
+                case GGML_TYPE_Q6_K:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
@@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
             uint32_t off_src0 = 0;
             uint32_t off_src1 = 0;
+            uint32_t off_src2 = 0;
             uint32_t off_dst  = 0;
             const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
             const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
             const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst,  &off_dst)  : nullTensor;
 
             switch (dst->op) {
@@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5021")
                         GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
 
-#pragma message("TODO: add ALiBi support")
-#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/7192")
-                        GGML_ASSERT(max_bias == 0.0f);
+                        const int64_t nrows_x = ggml_nrows(src0);
+                        const int64_t nrows_y = src0->ne[1];
 
-                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
+                        const uint32_t n_head      = nrows_x/nrows_y;
+                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
@@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                             case GGML_TYPE_F16:
                                 ggml_vk_mul_mat_f16(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
+                                    ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                    ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
                                     ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q8_0:
                                 ggml_vk_mul_mat_q8_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_0:
                                 ggml_vk_mul_mat_q4_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_1:
                                 ggml_vk_mul_mat_q4_1(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_K:
                                 ggml_vk_mul_mat_q4_k(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q6_K:
                                 ggml_vk_mul_mat_q6_k(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             default: {
@@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     } break;
                 case GGML_OP_ROPE:
                     {
-#pragma message("TODO: implement phi3 frequency factors support")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
-                        GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
                         GGML_ASSERT(ne10 == ne02);
                         GGML_ASSERT(src0t == dstt);
                         // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
                         const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
+                        const bool has_freq_factors = dst->src[2] != nullptr;
+
                         float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                         memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
                         memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
@@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                         memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
                         ggml_vk_rope(
-                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
-                            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+                            seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
+                            freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
                             ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                         );
                     } break;
diff --git a/ggml/src/ggml-kompute/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp
index 2aaddf704..dbe4cf804 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/common.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/common.comp
@@ -3,6 +3,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 #extension GL_EXT_control_flow_attributes: enable
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
index 8f0a9031f..0ab1b2fc2 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
@@ -20,12 +20,14 @@ layout (push_constant) uniform parameter {
     uint nb00;
     uint nb01;
     uint nb02;
+    uint nb03;
     int ne10;
     int ne11;
     int ne12;
     uint nb10;
     uint nb11;
     uint nb12;
+    uint nb13;
     int ne0;
     int ne1;
     uint r2;
@@ -42,7 +44,7 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
+    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
 
     const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
 
@@ -52,7 +54,7 @@ void main() {
             break;
         }
 
-        const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
+        const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
         float sumf = 0;
         for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
index fc8e45aa9..a5752a3a0 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
@@ -24,8 +24,14 @@ layout (push_constant) uniform parameter {
     int ne01;
     int ne02;
     int ne12;
-    int r2;
-    int r3;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;
 
 void main() {
@@ -50,10 +56,11 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint offset1 =        r1*pcs.nb11 + (i12       )*pcs.nb12 + (i13       )*pcs.nb13;
 
-    const uint xblk = ib_row + offset0 + pcs.inAOff;
-    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
+    const uint xblk = offset0 + pcs.inAOff;
+    const uint y = (offset1 / 4) + pcs.inBOff;
 
     float yl[16];
     float yh[16];
@@ -74,7 +81,7 @@ void main() {
         }
 
         for (int row = 0; row < N_DST; row++) {
-            uint row_idx = row * nb;
+            uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
 
             uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
             uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
index c9baebdf4..d331d1a70 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
@@ -21,7 +21,16 @@ layout (push_constant) uniform parameter {
     int ne0;
     int ne1;
     int ne01;
-    int gqa;
+    int ne02;
+    int ne12;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;
 
 void main() {
@@ -34,12 +43,15 @@ void main() {
 
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
-    const uint r2 = gl_WorkGroupID.z;
+    const uint im = gl_WorkGroupID.z;
 
     const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
-    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
-    const uint x = row * nb + offset0; // Based from inA without base offset
-    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
     float sumf = 0;
 
@@ -89,6 +101,6 @@ void main() {
 
     const float tot = subgroupAdd(sumf);
     if (subgroupElect()) {
-        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+        out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
     }
 }
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
index 440b5ab2c..a6517cc1f 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
@@ -14,10 +14,15 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    // pointers to src0 rows
+    uint ax[N_ROWS];
+    for (int row = 0; row < N_ROWS; ++row) {
+        const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
 
-    const uint x = offset0; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+        ax[row] = offset0 + pcs.inAOff;
+    }
+
+    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
     float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
 
@@ -32,8 +37,7 @@ void main() {
 
     for (uint ib = ix; ib < nb; ib += 16) {
         for (int row = 0; row < N_ROWS; row++) {
-            const uint block_index = x + ib + row * nb;
-            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+            sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
         }
 
         yb += BLOCKS_IN_QUANT * 16;
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
index 7912b09ac..a9a2f2218 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
@@ -1,5 +1,5 @@
 layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
+layout(local_size_y = 8) in;
 layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -17,6 +17,12 @@ layout (push_constant) uniform parameter {
     int  ne12;
     int  ne0;
     int  ne1;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
     uint r2;
     uint r3;
 } pcs;
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
deleted file mode 100644
index 0ecfb2eab..000000000
--- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
-void main() {
-    const uint i3 = gl_WorkGroupID.z;
-    const uint i2 = gl_WorkGroupID.y;
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-    const int p = inB[pcs.inBOff + i2];
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+1]);
-
-            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+pcs.n_dims/2]);
-
-            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
deleted file mode 100644
index cec0fd9a5..000000000
--- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-void main() {
-    const uint i3 = gl_WorkGroupID.z;
-    const uint i2 = gl_WorkGroupID.y;
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-    const int p = inB[pcs.inBOff + i2];
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+1];
-
-            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+pcs.n_dims/2];
-
-            out_[dst_data] = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
new file mode 100644
index 000000000..63659cbfe
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+pcs.n_dims/2]);
+
+            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
new file mode 100644
index 000000000..4df56204d
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+pcs.n_dims/2];
+
+            out_[dst_data]              = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
new file mode 100644
index 000000000..a3c0eda8b
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+1]);
+
+            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp
new file mode 100644
index 000000000..b7963ae72
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+1];
+
+            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
index 7bc9176ca..4165295bf 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
@@ -18,6 +18,10 @@ layout(push_constant) uniform PushConstants {
     int ne01;
     int ne02;
     float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint n_head_log2;
     int mask;
 } pcs;
 
@@ -34,17 +38,29 @@ void main() {
     const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
+    float slope = 1.0f;
+
+    // ALiBi
+    if (pcs.max_bias > 0.0f) {
+        int64_t h = i02;
+
+        float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
+        int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
+
+        slope = pow(base, float(exp));
+    }
+
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
+        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
     }
     float max_ = subgroupMax(localMax);
 
     // parallel sum
     float localSum = 0.0f;
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
+        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }
diff --git a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
index df4702896..0fca640dc 100644
--- a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
@@ -8,12 +8,14 @@ layout(local_size_x = 1) in;
 layout (push_constant) uniform parameter {
     uint inAOff;
     uint inBOff;
+    uint inCOff;
     uint outOff;
     int n_dims;
     int mode;
     int n_ctx_orig;
     float freq_base;
     float freq_scale;
+    bool has_freq_factors;
     float ext_factor;
     float attn_factor;
     float beta_fast;

From c202cef1686182a78f8f4e253ab8d0c0ffe2fcc8 Mon Sep 17 00:00:00 2001
From: Shupei Fan <dymarkfan@outlook.com>
Date: Thu, 28 Nov 2024 20:52:03 +0800
Subject: [PATCH 338/396] ggml-cpu: support IQ4_NL_4_4 by runtime repack
 (#10541)

* ggml-cpu: support IQ4_NL_4_4 by runtime repack

* ggml-cpu: add __ARM_FEATURE_DOTPROD guard
---
 ggml/include/ggml-cpu.h              |   1 +
 ggml/include/ggml.h                  |   3 +
 ggml/src/ggml-common.h               |   6 +
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 321 +++++++++++++++++++++++++--
 ggml/src/ggml-cpu/ggml-cpu-aarch64.h |   2 +
 ggml/src/ggml-cpu/ggml-cpu.c         |  27 ++-
 ggml/src/ggml-cpu/ggml-cpu.cpp       |   2 +-
 ggml/src/ggml.c                      |   9 +
 8 files changed, 352 insertions(+), 19 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index a5358d047..e14ea9ea5 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -91,6 +91,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_neon       (void);
     GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
     GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
     GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
     GGML_BACKEND_API int ggml_cpu_has_sve        (void);
     GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9843b09fb..65cb92c44 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -389,6 +389,9 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0   = 34,
         GGML_TYPE_TQ2_0   = 35,
+        GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_8 = 37,
+        // GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_COUNT,
     };
 
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 050161393..27253a6c2 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+typedef struct {
+    ggml_half d[4];        // deltas for 4 iq4_nl blocks
+    uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 96a16dfba..ced378879 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -187,6 +187,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
 }
 #endif
 
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
 static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -528,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -996,6 +998,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float * res_ptr = s;
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+            float32x4_t sumf = vdupq_n_f32(0);
+            for (int l = 0; l < nb; l++) {
+                uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+                uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+                uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+                uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+                int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+                int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+                int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+                int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+                int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+                int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+                int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+                int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+                int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+                int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+                int32x4_t sumi = vdupq_n_s32(0);
+                sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+                sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+                sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+                sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+                sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+                sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+                sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+                sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+                float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+                float32x4_t d = a_d * b_d;
+
+                sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
+            }
+
+            vst1q_f32(res_ptr + x * 4, sumf);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -1017,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -3386,6 +3484,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+                float32x4_t sumf[4];
+                for (int m = 0; m < 4; m++) {
+                    sumf[m] = vdupq_n_f32(0);
+                }
+
+                for (int l = 0; l < nb; l++) {
+                    float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+                    float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+
+                    int32x4_t sumi_0 = vdupq_n_s32(0);
+                    int32x4_t sumi_1 = vdupq_n_s32(0);
+                    int32x4_t sumi_2 = vdupq_n_s32(0);
+                    int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                    for (int k = 0; k < 4; k++) {
+                        int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+                        int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
+                        uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+                        int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+                        int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
+                    }
+
+                    sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                    sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                    sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                    sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+                }
+
+                for (int m = 0; m < 4; m++) {
+                    vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+                }
+            }
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 // FIXME: this code is duplicated from ggml-aarch64.c
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
@@ -3518,6 +3727,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
     GGML_UNUSED(data_size);
 }
 
+static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 2 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nl dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 // Prepare for optimized kernels if applicable
 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
     if (cur->type == repack_type) {
@@ -3525,20 +3798,30 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_
         return;
     }
 
-    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-
-    switch (repack_type) {
-        case GGML_TYPE_Q4_0_8_8:
-            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_8:
-            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_4:
-            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-            break;
-        default:
-            GGML_ABORT("Unsupported type");
+    if (cur->type == GGML_TYPE_Q4_0) {
+        switch (repack_type) {
+            case GGML_TYPE_Q4_0_8_8:
+                repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_8:
+                repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_4:
+                repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        switch (repack_type) {
+            case GGML_TYPE_IQ4_NL_4_4:
+                repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else {
+        GGML_ABORT("Unsupported type");
     }
 }
 
@@ -3551,9 +3834,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             return GGML_TYPE_Q4_0_4_8;
         }
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             return GGML_TYPE_Q4_0_4_4;
         }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            return GGML_TYPE_IQ4_NL_4_4;
+        }
     }
 
     return cur->type;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
index 53b30c1dd..3d9db6a19 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 void           ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
 enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index c6ede19d9..fea867440 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
+    int has_dotprod;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
 #endif
 
 
@@ -446,6 +447,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .from_float               = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 4,
+        .gemv                     = ggml_gemv_iq4_nl_4x4_q8_0,
+        .gemm                     = ggml_gemm_iq4_nl_4x4_q8_0,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2439,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) {
     uint32_t hwcap2 = getauxval(AT_HWCAP2);
 
     ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
     ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
     ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
 
@@ -2453,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_neon = oldp;
 
+    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_dotprod = oldp;
+
     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
@@ -9133,6 +9149,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_IQ4_NL_4_4:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -13880,6 +13897,14 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_dotprod(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+    return ggml_arm_arch_features.has_dotprod;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.has_sve;
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index febed433a..44d99089a 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -457,7 +457,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     const struct ggml_tensor * src1 = op->src[1];
 
     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
             return false;
         }
     }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a2318cb1..1a9a7efaf 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -831,6 +831,15 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .type_name                = "iq4_nl_4x4",
+        .blck_size                = QK4_NL,
+        .blck_size_interleave     = 4,
+        .type_size                = sizeof(block_iq4_nl),
+        .is_quantized             = true,
+        .to_float                 = NULL,
+        .from_float_ref           = NULL,
+    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {

From eea986f215e1dc490654d012ccf2ab62fe8f606d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 14:56:23 +0200
Subject: [PATCH 339/396] cmake : fix ARM feature detection (#10543)

ggml-ci
---
 ggml/src/ggml-cpu/CMakeLists.txt | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index ddc05ecef..4dbc1f75b 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -82,17 +82,23 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             add_compile_definitions(__ARM_FEATURE_DOTPROD)
+
+            message(STATUS "ARM feature DOTPROD enabled")
         endif ()
 
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
 
         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
             add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+
+            message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
             add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+            message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
         endif ()
 
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
@@ -113,17 +119,23 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
                 if (GGML_COMPILER_SUPPORT_DOTPROD)
                     set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
                     add_compile_definitions(__ARM_FEATURE_DOTPROD)
+
+                    message(STATUS "ARM feature DOTPROD enabled")
                 endif ()
 
                 set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
 
                 set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+                set(CMAKE_REQUIRED_FLAGS     "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+
                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
                 if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
                     set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
                     add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+
+                    message(STATUS "ARM feature MATMUL_INT8 enabled")
                 endif ()
+
                 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
 
                 list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")

From 76b27d29c22af03172cf211a8a31025c7c828a57 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 14:56:37 +0200
Subject: [PATCH 340/396] ggml : fix row condition for i8mm kernels (#10561)

ggml-ci
---
 ggml/src/ggml-cpu/ggml-cpu-quants.c |  6 ++++--
 ggml/src/ggml-cpu/ggml-cpu.c        | 17 +++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index f0e276b69..11e8df253 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -1813,11 +1813,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                 l1, r1)), l2, r2)), l3, r3))), scale);
         }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
 
-        vst1_f32(s,      vget_low_f32(sumv2));
+        vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
+
         return;
     }
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index fea867440..1c88e5d81 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -7576,14 +7576,6 @@ UseGgmlGemm2:;
     // This is the size of the rest of the dimensions of the result
     const int64_t nr1 = ne1 * ne2 * ne3;
 
-    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
-    // TODO: currently the mmla kernels support only even numbered rows/cols.
-    // this check can be removed once they are extended to support odd numbered rows/cols too
-    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        num_rows_per_vec_dot = 1;
-    }
-
     // Now select a reasonable chunk size.
     int chunk_size = 16;
 
@@ -7646,6 +7638,15 @@ UseGgmlGemm2:;
         const int64_t ir1_start = dr1 * ith1;
         const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
 
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int64_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // TODO: currently the mmla kernels support only even numbered rows/cols.
+        // this check can be removed once they are extended to support odd numbered rows/cols too
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+
         ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
 
         if (nth >= nchunk0 * nchunk1) {

From e90688edd004fdb7063f463bd18408ba9ae008dd Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Thu, 28 Nov 2024 15:58:54 +0100
Subject: [PATCH 341/396] ci : fix tag name in cuda and hip releases (#10566)

---
 .github/workflows/build.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e02b5c620..48953dafa 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -904,6 +904,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
 
       - name: Install Cuda Toolkit 11.7
         if: ${{ matrix.cuda == '11.7' }}
@@ -1139,6 +1141,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
 
       - name: Install
         id: depends

From 7281cf13addfae9b64bb2be87e3b5b1914505d63 Mon Sep 17 00:00:00 2001
From: Random Fly <renfei8@live.cn>
Date: Thu, 28 Nov 2024 23:03:11 +0800
Subject: [PATCH 342/396] docs: fix outdated usage of llama-simple (#10565)

---
 docs/android.md           | 4 ++--
 examples/simple/README.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/android.md b/docs/android.md
index 320b62240..47530c6c1 100644
--- a/docs/android.md
+++ b/docs/android.md
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
 Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
 
 ```
-$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
 ```
 
-Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
 
 To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
 
diff --git a/examples/simple/README.md b/examples/simple/README.md
index 0ff342535..937008b24 100644
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -3,7 +3,7 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
 
 ```bash
-./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
 
 ...
 

From 890719311b6535e572f15965c6d7ec4ac2537f60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 28 Nov 2024 18:15:25 +0100
Subject: [PATCH 343/396] common: fix warning message when no GPU found
 (#10564)

---
 common/arg.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a6b7a1394..32d9a964c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1370,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2104,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));

From 6c595676899013102fdb0aa4b06a49954300c94a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 28 Nov 2024 19:17:49 +0100
Subject: [PATCH 344/396] server : (tests) don't use thread for capturing
 stdout/stderr, bump openai client library (#10568)

* server : (tests) don't use thread for capturing stdout/stderr

* test: bump openai to 1.55.2

* bump openai to 1.55.3
---
 examples/server/tests/requirements.txt |  2 +-
 examples/server/tests/utils.py         | 19 ++-----------------
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 935a79114..074b9d47b 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
-openai~=1.30.3
+openai~=1.55.3
 prometheus-client~=0.20.0
 requests~=2.32.3
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index e31743c50..a831f113f 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -8,7 +8,6 @@ import os
 import re
 import json
 import sys
-import threading
 import requests
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -161,26 +160,12 @@ class ServerProcess:
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"},
         )
         server_instances.add(self)
 
-        def server_log(in_stream, out_stream):
-            for line in iter(in_stream.readline, b""):
-                print(line.decode("utf-8"), end="", file=out_stream)
-
-        thread_stdout = threading.Thread(
-            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
-        )
-        thread_stdout.start()
-
-        thread_stderr = threading.Thread(
-            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
-        )
-        thread_stderr.start()
-
         print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
 
         # wait for server to start

From 4c0a95b1074907ce7efe6f5bb6ae3351c01429ab Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 20:45:07 +0200
Subject: [PATCH 345/396] llama : add missing model types

---
 src/llama.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e0..22b951ba2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2341,6 +2341,7 @@ enum e_model {
     MODEL_16B,
     MODEL_20B,
     MODEL_30B,
+    MODEL_32B,
     MODEL_34B,
     MODEL_35B,
     MODEL_40B,
@@ -5330,6 +5331,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_16B:           return "16B";
         case MODEL_20B:           return "20B";
         case MODEL_30B:           return "30B";
+        case MODEL_32B:           return "32B";
         case MODEL_34B:           return "34B";
         case MODEL_35B:           return "35B";
         case MODEL_40B:           return "40B";
@@ -5690,7 +5692,10 @@ static void llm_load_hparams(
                     case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                     case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+                    case 48: model.type = e_model::MODEL_14B; break;
+                    case 64: model.type = e_model::MODEL_32B; break;
                     case 80: model.type = e_model::MODEL_70B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }

From dc22344088a7ee81a1e4f096459b03a72f24ccdc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 Nov 2024 20:46:40 +0200
Subject: [PATCH 346/396] ggml : remove redundant copyright notice + update
 authors

---
 AUTHORS                              | 186 ++++++++++++++++++++++++++-
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c |   4 -
 2 files changed, 185 insertions(+), 5 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 1bd36158a..2eb60806a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,4 @@
-# date: Wed Jun 26 19:36:34 EEST 2024
+# date: Thu Nov 28 20:46:15 EET 2024
 # this file is auto-generated by scripts/gen-authors.sh
 
 0cc4m <picard12@live.de>
@@ -7,6 +7,7 @@
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
+65a <10104049+65a@users.noreply.github.com>
 AN Long <aisk@users.noreply.github.com>
 AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
@@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshan.biswas@gmail.com>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Al Mochkin <14274697+amochkin@users.noreply.github.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
+Alberto Cabrera Pérez <alberto.cabrera@intel.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
+Alex O'Connell <35843486+acon96@users.noreply.github.com>
 Alex Petenchea <alex.petenchea@gmail.com>
 Alex Renda <alexrenda@users.noreply.github.com>
+Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
 Alex von Gluck IV <kallisti5@unixzen.com>
 Alexey Parfenov <zxed@alkatrazstudio.net>
 Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
+Andreas (Andi) Kunar <andreask@msn.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
+Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
+Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+Antonis Makropoulos <benuix@gmail.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
+Armen Kaleshian <kriation@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
@@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bert Wagner <github@bertwagner.com>
 Bingan <70050083+binganao@users.noreply.github.com>
+Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
+Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
+Brian Cunnie <brian.cunnie@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
 Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
@@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
 Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
+CarryFun <76023481+CarryFun@users.noreply.github.com>
+Carsten Kragelund Jørgensen <carsten@kragelund.me>
+CarterLi999 <664681047@qq.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
+Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
+Charles Xu <63788048+chaxu01@users.noreply.github.com>
+Charles Xu <charles.xu@arm.com>
+Chen Xi <xi2.chen@intel.com>
+Chen Xi <xixichen08@foxmail.com>
 Cheng Shao <terrorjack@type.dance>
+Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
+Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+Conrad Kramer <conrad@conradkramer.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
+Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
+Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
+Dan Johansson <dan.johansson@arm.com>
 Dane Madsen <dane_madsen@hotmail.com>
 DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
 Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+DavidKorczynski <david@adalogics.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
+Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
+Derrick T. Woolworth <dwoolworth@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
+Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
+Diego Devesa <slarengh@gmail.com>
+Diogo Teles Sant'Anna <diogoteles@google.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Dou Xinpeng <15529241576@163.com>
+Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
 Douglas Hanley <thesecretaryofwar@gmail.com>
 Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
+Echo Nolan <echo@echonolan.net>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
@@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
+Eric Curtin <ecurtin@redhat.com>
+Eric Curtin <ericcurtin17@gmail.com>
 Eric Sommerlade <es0m@users.noreply.github.com>
 Eric Zhang <34133756+EZForever@users.noreply.github.com>
 Erik Garrison <erik.garrison@gmail.com>
 Erik Scholz <Green-Sky@users.noreply.github.com>
+Esko Toivonen <eskot98@gmail.com>
 Ettore Di Giacinto <mudler@users.noreply.github.com>
 Evan Jones <evan.q.jones@gmail.com>
 Evan Miller <emmiller@gmail.com>
@@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
 Fabian <cmdrf@users.noreply.github.com>
 Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
 Faez Shakil <faez.shakil@gmail.com>
+Faisal Zaghloul <faisal.zaghloul@gmail.com>
+Faisal Zaghloul <quic_fzaghlou@quicinc.com>
+Fan Shupei <dymarkfan@outlook.com>
 FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
 Fattire <528174+fat-tire@users.noreply.github.com>
 Felix <stenbackfelix@gmail.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
+FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
+Frankie Robertson <frankier@users.noreply.github.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
+Gabe Goodhart <ghart@us.ibm.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
+Gilad S. <7817232+giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
@@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
+Huang Qi <huangqi3@xiaomi.com>
 Huawei Lin <huaweilin.cs@gmail.com>
 Hugo Roussel <hugo.rous@gmail.com>
+Huifeng Ou <79071290+ho2103@users.noreply.github.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
+Icecream95 <the.real.icecream95@gmail.com>
 Ido S <ido.pluto@gmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
@@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
 Isaac McFadyen <isaac@imcf.me>
 IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ivan <nekotekina@gmail.com>
+Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
+Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
@@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
 Jared Van Bortel <cebtenzzre@gmail.com>
 Jared Van Bortel <jared@nomic.ai>
 Jason McCartney <jmac@theroot.org>
+Jason Stillerman <jason.t.stillerman@gmail.com>
 Jean-Christophe Hoelt <hoelt@fovea.cc>
 Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
+Jeff Bolz <jbolz@nvidia.com>
+Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jeroen Mostert <jeroen.mostert@cm.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
@@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
 Joan Fontanals <joan.fontanals.martinez@jina.ai>
+João Dinis Ferreira <hello@joaof.eu>
+Joe Eli McIlvain <joe.eli.mac@gmail.com>
+Joe Todd <joe.todd@codeplay.com>
 Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
@@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
+Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Junil Kim <logyourself@gmail.com>
 Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
@@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
 Kasumi <90275229+kasumi-1@users.noreply.github.com>
 Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
+Keke Han <hankeke303@163.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
 Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
+Kevin Wang <kevmo314@gmail.com>
 Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
@@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
 Linwei Wang <wanix1988@gmail.com>
+Liu Jia <109258120+Septa2112@users.noreply.github.com>
+Liu Jia <jia3.liu@intel.com>
 LoganDark <github@logandark.mozmail.com>
+Loïc Carrère <loic.carrere@gmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
+M-A <maruel@gmail.com>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+Ma Mingfei <mingfei.ma@intel.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
+Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
 Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
+Mark Zhuang <zhuangqiubin@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
 Markus Tavenrath <mtavenrath@users.noreply.github.com>
 Martin Delille <martin@delille.org>
@@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
+Mathijs Henquet <mathijs.henquet@gmail.com>
 Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
+Matt Stephenson <mstephenson6@users.noreply.github.com>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Matteo Mortari <matteo.mortari@gmail.com>
 Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
@@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
 Meng, Hengyu <hengyu.meng@intel.com>
+Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
+Michael Francis <edude03@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
@@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
+Michał Tuszyński <srgtuszy@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
+Minsoo Cheong <icycle0409@snu.ac.kr>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
+MistApproach <98988043+MistApproach@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
+Molly Sophia <mollysophia379@gmail.com>
+MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
 Nathan Epstein <nate2@umbc.edu>
+Natsu <chino@hotococoa.moe>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
 Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
+Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
+Nicholai Tukanov <nicholaitukanov@gmail.com>
+Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
+NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
+OSecret <135510162+OLSecret@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+PAB <pierreantoine.bannier@gmail.com>
+Pablo Duboue <pablo.duboue@gmail.com>
+Pascal Patry <ppatry@mtacitlabs.com>
 Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
+Pavel Zloi <github.com@drteam.rocks>
 Pavol Rusnak <pavol@rusnak.io>
+Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
 Pedro Cuenca <pedro@huggingface.co>
 Peter Sugihara <peter@campsh.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
@@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
 Phillip Kravtsov <phillip@kravtsov.net>
 Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
 Pierrick Hymbert <pierrick.hymbert@gmail.com>
+Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
+Plamen Minev <pacominev@gmail.com>
+Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
 Przemysław Pawełczyk <przemoc@gmail.com>
 Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
 Qingyou Meng <meng.qingyou@gmail.com>
 Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+R0CKSTAR <xiaodong.ye@mthreads.com>
+R0CKSTAR <yeahdongcn@gmail.com>
 RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
@@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
 Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
+Random Fly <renfei8@live.cn>
 Reinforce-II <fate@eastal.com>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
+Rich Dougherty <rich@rd.nz>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
 Rick G <26732651+TheFlipbook@users.noreply.github.com>
@@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
 Robyn <robyngraf@users.noreply.github.com>
 Roger Meier <r.meier@siemens.com>
 Roland <14355895+rbur0425@users.noreply.github.com>
+Romain Biessy <romain.biessy@codeplay.com>
 Romain D <90720+Artefact2@users.noreply.github.com>
 Romain Neutron <romain@neutron.io>
 Roman Parykin <donderom@gmail.com>
 Ron Evans <ron@hybridgroup.com>
 Ron Jailall <rojailal@gmail.com>
+Roni <sulpher@gmx.net>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
 Rowan Hart <rowanbhart@gmail.com>
+Ruchira Hasaranga <ruchira66@gmail.com>
+Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
+RunningLeon <maningsheng@sensetime.com>
+RunningLeon <mnsheng@yeah.net>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SRHMorris <69468379+SRHMorris@users.noreply.github.com>
+SXX <sxx1136965276@gmail.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
+Salvatore Mesoraca <s.mesoraca16@gmail.com>
 Sam Spilsbury <smspillaz@gmail.com>
 Sami Farin <3876865+Safari77@users.noreply.github.com>
 Samuel Maynard <samwmaynard@gmail.com>
@@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
 SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
+Sergio López <slp@redhat.com>
 Sergio López <slp@sinrega.org>
 Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shane A <shanea@allenai.org>
 Shangning Xu <32517059+xushangning@users.noreply.github.com>
+Shankar <gshankar.87@gmail.com>
+Shanshan Shen <467638484@qq.com>
 Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
 Shuichi Tsutsumi <shuichi0526@gmail.com>
+Shupei Fan <dymarkfan@outlook.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
 Sky Yan <skyan83@gmail.com>
 Slaren <2141330+slaren@users.noreply.github.com>
 Slava Primenko <primenko.s@gmail.com>
+Small Grass Forest <zixuanxcl@gmail.com>
 SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
 Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
@@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
 Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
+Steve Bonds <sbonds@gmail.com>
 Steve Grubb <ausearch.1@gmail.com>
 Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
+StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
 SuperUserNameMan <yoann@terminajones.com>
+Sutou Kouhei <kou@cozmixng.org>
 Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
 Taikono-Himazin <kazu@po.harenet.ne.jp>
 Tameem <113388789+AhmadTameem@users.noreply.github.com>
@@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
 Thérence <13496987+Royalphax@users.noreply.github.com>
 Thibault Terrasson <thibault.terrasson@gmail.com>
 Thomas Klausner <wiz@gatalith.at>
+Thorsten Sommer <SommerEngineering@users.noreply.github.com>
 Tim Miller <drasticactions@users.noreply.github.com>
+Tim Wang <overocean@gmail.com>
 Timmy Knight <r2d2fish@gmail.com>
 Timothy Cronin <40186632+4imothy@users.noreply.github.com>
 Ting Lou <ting.lou@gmail.com>
@@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tony Wasserka <4840017+neobrain@users.noreply.github.com>
 Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
+Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
+Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
 Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Vali Malinoiu <0x4139@gmail.com>
 Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
+Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
+Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
 Vladimir Malyutin <first-leon@yandex.ru>
 Vladimir Zorin <vladimir@deviant.guru>
+VoidIsVoid <343750470@qq.com>
 Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
 WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
 Weird Constructor <weirdconstructor@gmail.com>
@@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
 Xiao-Yong Jin <jinxiaoyong@gmail.com>
 XiaotaoChen <chenxiaotao1234@gmail.com>
 Xiaoyi Chen <cxychina@gmail.com>
+Xie Yanbo <xieyanbo@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
 Xuan Son Nguyen <thichthat@gmail.com>
+Yaiko <elyaiko@hotmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
 Yaroslav <yaroslav.yashin@me.com>
 Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
+Yoshi Suhara <y.suhara@gmail.com>
+Yoshi Suhara <ysuhara@nvidia.com>
+Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
 Yui <dev@sleepyyui.com>
+Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
 Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
 Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
 ZHAOKAI WANG <sanxianwei@163.com>
@@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
 Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
+Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
+Zhiyuan Li <lizhiyuan@uniartisan.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
 alwqx <kenan3015@gmail.com>
 amd-lalithnc <lalithnc@amd.com>
+amritahs-ibm <amritahs@linux.vnet.ibm.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
@@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
 arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
+ardfork <134447697+ardfork@users.noreply.github.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
 automaticcat <daogiatuank54@gmail.com>
+awatuna <23447591+awatuna@users.noreply.github.com>
+b4b4o <zwbao@foxmail.com>
 bandoti <141645996+bandoti@users.noreply.github.com>
 beiller <beiller@gmail.com>
 bhubbb <79117352+bhubbb@users.noreply.github.com>
 bmwl <brian.marshall@tolko.com>
 bobqianic <129547291+bobqianic@users.noreply.github.com>
+brucepro <git@brucepro.net>
 bryanSwk <93190252+bryanSwk@users.noreply.github.com>
 bsilvereagle <bsilvereagle@users.noreply.github.com>
 bssrdf <merlintiger@hotmail.com>
@@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
+daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
+daminho <37615795+daminho@users.noreply.github.com>
 david raistrick <keen99@users.noreply.github.com>
 ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+devojony <61173062+devojony@users.noreply.github.com>
+ditsuke <ditsuke@protonmail.com>
 divinity76 <divinity76@gmail.com>
 dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
@@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
 fairydreaming <166155368+fairydreaming@users.noreply.github.com>
+fengerhu1 <2748250768@qq.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
 goerch <jhr.walter@t-online.de>
 grahameth <96447521+grahameth@users.noreply.github.com>
+gtygo <gtydoit@gmail.com>
 gwjr <502526+gwjr@users.noreply.github.com>
 h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
+haopeng <657407891@qq.com>
+hipudding <huafengchun@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
 hopkins385 <98618192+hopkins385@users.noreply.github.com>
@@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
 hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
+icppWorld <124377669+icppWorld@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
 intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
 jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
+jdomke <28772296+jdomke@users.noreply.github.com>
 jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
 joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
 kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
 kunnis <kunnis@users.noreply.github.com>
 kuronekosaiko <EvanChanJ@163.com>
+kustaaya <58045274+kustaaya@users.noreply.github.com>
 kuvaus <22169537+kuvaus@users.noreply.github.com>
 kwin1412 <42286931+kwin1412@users.noreply.github.com>
 l3utterfly <gc.pthzfoldr@gmail.com>
+laik <laik.lj@me.com>
 ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
+leo-pony <nengjunma@outlook.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
 liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
 loonerin <132926317+loonerin@users.noreply.github.com>
+ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
 luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
 maor-ps <154728172+maor-ps@users.noreply.github.com>
+matiaslin <45382001+matiaslin@users.noreply.github.com>
+matteo <matteogeniaccio@yahoo.it>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
 mj-shifu <77107165+mj-shifu@users.noreply.github.com>
 mmyjona <jonathan.gonse@gmail.com>
 momonga <115213907+mmnga@users.noreply.github.com>
+momonga <146910567+mmngays@users.noreply.github.com>
 moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
 mzcu <milos.cubrilo@gmail.com>
 nanahi <130121847+na-na-hi@users.noreply.github.com>
@@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
+pculliton <phillipculliton@gmail.com>
 pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
+piDack <104877312+piDack@users.noreply.github.com>
 pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
@@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
 sandyiscool <sandyiscool@gmail.com>
 sasha0552 <admin@sasha0552.org>
 semidark <me@semidark.net>
+serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
 singularity <12184989+singularity-s0@users.noreply.github.com>
@@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
+standby24x7 <standby24x7@gmail.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
 strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 swittk <switt1995@gmail.com>
 takov751 <40316768+takov751@users.noreply.github.com>
 tarcey <cey.tarik@gmail.com>
+tc-mb <157115220+tc-mb@users.noreply.github.com>
 texmex76 <40733439+texmex76@users.noreply.github.com>
 thement <40525767+thement@users.noreply.github.com>
+thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
 tjohnman <tjohnman@users.noreply.github.com>
+toyer <2042519524@qq.com>
 tslmy <tslmy@users.noreply.github.com>
 ubik2 <ubik2@users.noreply.github.com>
 uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
+uvos <devnull@uvos.xyz>
 valiray <133289098+valiray@users.noreply.github.com>
+vb <vaibhavs10@gmail.com>
 vik <vikhyatk@gmail.com>
 viric <viric@viric.name>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
+wangshuai09 <391746016@qq.com>
 wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
 whoreson <139810751+whoreson@users.noreply.github.com>
 woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
 woodx <124784234+woodx9@users.noreply.github.com>
+wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
+xctan <axunlei@gmail.com>
 xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
 yuiseki <yuiseki@gmail.com>
+yuri@FreeBSD <yurivict@users.noreply.github.com>
 zakkor <edward.partenie@gmail.com>
 zhangkaihuo <zhangkaihuo@gmail.com>
+zhentaoyu <zhentao.yu@intel.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
 zhouwg <zhouwg2000@gmail.com>
 zrm <trustiosity.zrm@gmail.com>
 Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
+杨朱 · Kiki <baofa.fan@daocloud.io>
 源文雨 <41315874+fumiama@users.noreply.github.com>
+蕭澧邦 <45505768+shou692199@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index ced378879..69d3d327d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -1,7 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"
 

From 678d7994f4da0af3d29046be99950ac999ee9762 Mon Sep 17 00:00:00 2001
From: Ting Lou <louting@189.cn>
Date: Fri, 29 Nov 2024 08:09:46 +0800
Subject: [PATCH 347/396] llava: return false instead of exit (#10546)

---
 examples/llava/clip.cpp  | 15 +++++++++++----
 examples/llava/llava.cpp | 28 +++++++++++++++++++---------
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index aae49c965..7ba4cea58 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -40,10 +40,17 @@
 #include <cinttypes>
 #include <limits>
 
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...)
+#   define LOG_WRN(...)
+#   define LOG_ERR(...)
+#   define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 //#define CLIP_DEBUG_FUNCTIONS
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index be6988540..4ca53a0b8 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -11,13 +11,17 @@
 #include <limits>
 #include <vector>
 
-#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...)
+#   define LOG_WRN(...)
+#   define LOG_ERR(...)
+#   define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     errno = 0;
     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
     if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
+        LOG_ERR("read error: %s", strerror(errno));
+        free(buffer);
+        fclose(file);
+        return false;
     }
     if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
+        LOG_ERR("unexpectedly reached end of file");
+        free(buffer);
+        fclose(file);
+        return false;
     }
     fclose(file); // Close the file
 

From f095a649ec390e04dfab1b04e646ae8549dafaef Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Fri, 29 Nov 2024 00:18:02 -0600
Subject: [PATCH 348/396] vulkan: get the first command buffer submitted sooner
 (#10499)

This is an incremental improvement over #9118 to get work to the GPU a bit
sooner. The first part is to start with a smaller number of nodes before
the first submit, and ramp it up to the current 100 nodes/submit. The
second part is to reduce the dryrun overhead for all the nodes that just
need to request descriptor space.

With these changes I get around 1-2% speedup on RTX 4070 combined with my
old Haswell-era CPU.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 60 ++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index a833007fb..849c11923 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5672,6 +5672,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         } else {
             compute_ctx = ctx->compute_ctx.lock();
         }
+    } else {
+        switch (node->op) {
+        case GGML_OP_REPEAT:
+        case GGML_OP_ACC:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+        case GGML_OP_PAD:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DUP:
+        case GGML_OP_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_UNARY:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_IM2COL:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_LEAKY_RELU:
+            {
+                // These operations all go through ggml_vk_op_f32, so short-circuit and
+                // do the only thing needed for the dryrun.
+                vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
+                ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+                return false;
+            }
+        default:
+            break;
+        }
     }
 
     switch (node->op) {
@@ -6401,16 +6443,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch
 
-    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
-    constexpr int submit_count = 100;
+    // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
+    // Start with a smaller count to get work submitted right away, and increase it after each submit.
+    int nodes_per_submit = 20;
     int submitted_nodes = 0;
+    int submit_count = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (first_node_in_batch) {
             submit_node_idx = i;
         }
 
-        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
-
+        bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
 
         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
 
@@ -6427,6 +6470,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         if (submit) {
             first_node_in_batch = true;
             submitted_nodes = 0;
+            switch (submit_count) {
+            case 0:
+                nodes_per_submit = 50;
+                break;
+            default:
+                nodes_per_submit = 100;
+                break;
+            }
+            submit_count++;
         }
     }
 

From 938f6087421889a3af7d0786c64406ced2be81b8 Mon Sep 17 00:00:00 2001
From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Date: Fri, 29 Nov 2024 14:46:55 +0800
Subject: [PATCH 349/396] CANN: RoPE operator optimization (#10563)

* [cann] RoPE operator optimization

* [CANN]Code Formatting

---------

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 241 ++++++++++++++++++++++++++++---
 ggml/src/ggml-cann/ggml-cann.cpp |  13 +-
 2 files changed, 222 insertions(+), 32 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index d707efc5d..b2d857e1e 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2965,7 +2965,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              aclTensor* acl_cos_repeat_tensor,
                              aclTensor* acl_sin_repeat_tensor,
                              float theta_scale, float freq_scale,
-                             bool is_neox) {
+                             float attn_factor, bool is_neox) {
     // int sin/cos cache, cache has different repeat method depond on
     // @param.is_neox
 
@@ -3017,6 +3017,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
             ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
         aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
                          nullptr, true);
+        ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
     }
 
     // position
@@ -3047,16 +3048,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
               acl_theta_tensor);
 
-    // // power[] * position[] * freq_scale / freq_factors[]
-    // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
-    //                                            theta_length *
-    //                                            sizeof(float_t));
-    // aclTensor* acl_theat_final_tensor = aclnn_zero(
-    //     ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
-    //     theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
-    // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
-    //                       acl_freq_factors_tensor, freq_scale);
-
     // permute: [0,1,2,3]->[0,2,1,3]
     int64_t permute_ne[] = {arange_length, 1, position_length, 1};
     size_t permute_nb[GGML_MAX_DIMS];
@@ -3092,6 +3083,12 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         GGML_MAX_DIMS, ACL_FORMAT_ND);
     aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
 
+    // attn_factor
+    if (attn_factor != 1) {
+        aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
+        aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
+    }
+
     // repeat
     if (is_neox) {
         int64_t repeatsArray[] = {1, 1, 1, 2};
@@ -3155,15 +3152,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    // TODO: attn_factor != 1
-    GGML_ASSERT(attn_factor == 1);
     // TODO: n_dims <= ne0
     GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
-    // TODO: type == GGML_TYPE_F16
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
@@ -3194,7 +3187,217 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
-                     theta_scale, freq_scale, is_neox);
+                     theta_scale, freq_scale, attn_factor, is_neox);
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+#ifdef ASCEND_310P
+    // Special ROPE operation for 310P
+
+    // roll input
+    void* input_roll_buffer;
+    aclTensor* acl_minus_one_tensor;
+    void* minus_one_scale_buffer = nullptr;
+    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
+    ggml_cann_pool_alloc minus_one_scale_allocator(
+        ctx.pool(), sizeof(float_t) * src0->ne[0]);
+    if (!is_neox) {
+        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
+        input_roll_buffer = roll_allocator.get();
+        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
+                                    src0->ne[2], src0->ne[3]};
+        size_t input_roll_nb[GGML_MAX_DIMS];
+        input_roll_nb[0] = ggml_type_size(src0->type);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
+        }
+        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+            input_roll_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+            GGML_MAX_DIMS);
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
+            src0->data, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+            GGML_MAX_DIMS);
+
+        int64_t shifts[] = {1};
+        int64_t dims[] = {3};
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+
+        // init [-1, 1, -1, 1, ...]
+        minus_one_scale_buffer = minus_one_scale_allocator.get();
+
+        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+        size_t minus_one_nb[GGML_MAX_DIMS];
+        minus_one_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+        }
+        acl_minus_one_tensor = aclnn_values(
+            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+        int64_t dim = 3;
+        int64_t* index = new int64_t[src0->ne[0]];
+        for (int i = 0; i < src0->ne[0]; i++) {
+            index[i] = i / 2 * 2;
+        }
+        int64_t index_num = src0->ne[0];
+        float value = -1;
+        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
+                                index_num, value);
+    } else {
+        // roll input: [q0,q1,q2,...] ->
+        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
+        input_roll_buffer = roll_allocator.get();
+        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+            input_roll_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
+
+        int64_t shifts[] = {src0->ne[0] / 2};
+        int64_t dims[] = {3};
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+        // init [-1, -1, -1, 1, 1，1，...]
+        minus_one_scale_buffer = minus_one_scale_allocator.get();
+        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+        size_t minus_one_nb[GGML_MAX_DIMS];
+        minus_one_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+        }
+        acl_minus_one_tensor = aclnn_values(
+            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+        // -1 * first half
+        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
+        size_t first_half_nb[GGML_MAX_DIMS];
+        first_half_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
+        }
+        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
+            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
+            first_half_nb, GGML_MAX_DIMS);
+        bool inplace = true;
+        float scale = -1;
+        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
+        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+    }
+
+    // TODO: n_dims < ne0
+    GGML_ASSERT(n_dims == src0->ne[0]);
+
+    // input * scale
+    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
+                                                  ggml_nbytes(src0));
+    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
+    size_t input_nb[GGML_MAX_DIMS];
+    input_nb[0] = ggml_type_size(src0->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    }
+    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
+        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
+        input_roll_buffer, ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+
+    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
+              acl_input_roll_mul_scale_tensor);
+
+    // output
+    void* output_fp32_buffer;
+    if (src0->type == GGML_TYPE_F32) {
+        aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
+        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
+                          acl_sin_reshape_tensor);
+        aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
+        // TODO: ne0 != n_dims in mode2
+    } else if (src0->type == GGML_TYPE_F16) {
+        size_t input_fp32_nb[GGML_MAX_DIMS];
+        input_fp32_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
+        }
+        ggml_cann_pool_alloc fp32_allocator1(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        void* input_fp32_buffer1 = fp32_allocator1.get();
+        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
+            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+        ggml_cann_pool_alloc fp32_allocator2(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        void* input_fp32_buffer2 = fp32_allocator2.get();
+        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
+            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+
+        ggml_cann_pool_alloc fp32_allocator(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        output_fp32_buffer = fp32_allocator.get();
+        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
+            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+        aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
+        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
+                  input_fp32_tensor2);
+        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
+                  output_fp32_tensor);
+        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
+
+        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
+        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
+        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_src));
+    }
+    return;
+#endif
+
+    // src0 == GGML_TYPE_F16
+    // TODO: optimization this `if` code
+    if (src0->type == GGML_TYPE_F16) {
+        ggml_cann_pool_alloc sin_final_allocator(
+            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+        ggml_cann_pool_alloc cos_final_allocator(
+            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+        void* sin_final_buffer = sin_final_allocator.get();
+        void* cos_final_buffer = cos_final_allocator.get();
+
+        int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+        size_t sin_final_nb[GGML_MAX_DIMS];
+        sin_final_nb[0] = ggml_type_size(src0->type);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
+        }
+        aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
+            sin_final_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+            GGML_MAX_DIMS);
+        aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
+            cos_final_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+            GGML_MAX_DIMS);
+
+        aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
+                   ggml_cann_type_mapping(src0->type));
+        aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
+                   ggml_cann_type_mapping(src0->type));
+        ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+        acl_sin_reshape_tensor = acl_sin_final_tensor;
+        acl_cos_reshape_tensor = acl_cos_final_tensor;
+    }
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -3206,10 +3409,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         acl_mode = 1;
     }
 
-    aclTensor* acl_x = ggml_cann_create_tensor(src0);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
     ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
-        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
         acl_dst, &workspaceSize, &executor));
     if (workspaceSize > 0) {
         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
@@ -3219,7 +3420,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
                                            executor, ctx.stream()));
 
-    ACL_CHECK(aclDestroyTensor(acl_x));
+    ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index bcb54e444..04e25b8ab 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1739,7 +1739,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
             float * ext_factor = (float*)((int32_t*)op->op_params + 7);
-            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
@@ -1748,17 +1747,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
-            // TODO: attn_factor != 1
-            if (*attn_factor != 1) {
-                return false;
-            }
-            //TODO: type == GGML_TYPE_F16
-            switch (op->src[0]->type) {
-                case GGML_TYPE_F32:
-                    return true;
-                default:
-                    return false;
-            }
+            return true;
         }
         case GGML_OP_UPSCALE: {
             // aclnnUpsampleNearest2dGetWorkspaceSize not support

From 266b8519ee6d21e7ba2bf56f5629e20a181fee8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?=
 <alberto.cabrera@codeplay.com>
Date: Fri, 29 Nov 2024 09:49:43 +0000
Subject: [PATCH 350/396] sycl : Reroute permuted mul_mats through oneMKL
 (#10408)

This PR fixes the failing MUL_MAT tests for the sycl backend.
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index b6392ed8d..aabcdc224 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3447,8 +3447,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 
     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
+        // TODO: Refactor and cleanup of mul mat dispatching.
+        if (src0->ne[3] == 1 && src1->ne[3] == 1) {
+            // KQ single-batch
+            // mmv p021 was specific for these dimensions
+            ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
+        } else {
+            // The kernel from the if path is faster for that specific case, but does not support all mul mats.
+            ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
+        }
     } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);

From 0f77aae5608f16780a49926b67be6d56ec4b09bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?=
 <alberto.cabrera@codeplay.com>
Date: Fri, 29 Nov 2024 12:38:45 +0000
Subject: [PATCH 351/396] sycl : offload of get_rows set to 0 (#10432)

---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index aabcdc224..808f74fa0 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4493,7 +4493,7 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_
 static int64_t get_op_batch_size(const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_GET_ROWS:
-            return op->ne[1]; // this will increse the speed of prefill in test
+            return 0;
         case GGML_OP_MUL_MAT:
             return op->ne[1];
         case GGML_OP_MUL_MAT_ID:

From 4b3242bbea172ac0980378496fbc676d44c4f459 Mon Sep 17 00:00:00 2001
From: Shupei Fan <dymarkfan@outlook.com>
Date: Fri, 29 Nov 2024 21:49:02 +0800
Subject: [PATCH 352/396] ggml-cpu: fix typo in gemv/gemm iq4_nl_4_4 (#10580)

---
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 69d3d327d..14a1f00eb 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -1020,7 +1020,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
         float * res_ptr = s;
 
         for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
 
             float32x4_t sumf = vdupq_n_f32(0);
             for (int l = 0; l < nb; l++) {
@@ -3507,7 +3507,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
         for (int y = 0; y < nr / 4; y++) {
             const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
             for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
 
                 float32x4_t sumf[4];
                 for (int m = 0; m < 4; m++) {

From f0678c5ff4cb8873d6ff48801475ff270db656fa Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 Nov 2024 16:25:39 +0200
Subject: [PATCH 353/396] ggml : fix I8MM Q4_1 scaling factor conversion
 (#10562)

ggml-ci
---
 ggml/src/ggml-cpu/ggml-cpu-quants.c | 57 +++++++++++++++++------------
 ggml/src/ggml-cpu/ggml-cpu.c        |  4 +-
 tests/test-backend-ops.cpp          |  2 +
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index 11e8df253..634c5fa11 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -1791,11 +1791,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
-            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
-
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
             float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -1811,7 +1812,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
 
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+                                                l1, r1)), l2, r2)), l3, r3))), scale);
         }
 
         float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
@@ -2347,10 +2348,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const block_q8_1 * restrict b_y0 = &vy0[i];
             const block_q8_1 * restrict b_y1 = &vy1[i];
 
-            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                    GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
-                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+            float32_t summs_t[4] = {
+                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
+            };
             summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
 
             const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -2371,10 +2374,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
             // mmla into int32x4_t
-            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
-                                   GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
-                                   GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
-                                   GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
             float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -2389,15 +2394,17 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
             int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+                                                l1, r1)), l2, r2)), l3, r3))), scale);
         }
 
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
         sumv2 = vaddq_f32(sumv2, summs0);
 
         vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
+
         return;
     }
 #endif
@@ -3374,10 +3381,12 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
-            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                   GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
             float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3393,13 +3402,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
 
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                       l1, r1)), l2, r2)), l3, r3))), scale);
+                                                l1, r1)), l2, r2)), l3, r3))), scale);
         }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
 
-        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
+
         return;
     }
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 1c88e5d81..e0cefc20b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -7641,8 +7641,8 @@ UseGgmlGemm2:;
         // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
         int64_t num_rows_per_vec_dot = vec_dot_num_rows;
 
-        // TODO: currently the mmla kernels support only even numbered rows/cols.
-        // this check can be removed once they are extended to support odd numbered rows/cols too
+        // these checks are needed to avoid crossing dim1 boundaries
+        // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
         if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
             num_rows_per_vec_dot = 1;
         }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index da66ed856..4d9df1a64 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3334,7 +3334,9 @@ static const ggml_type all_types[] = {
 
 static const ggml_type base_types[] = {
     GGML_TYPE_F32, GGML_TYPE_F16,
+    GGML_TYPE_Q8_0, // for I8MM tests
     GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1, // for I8MM tests
     GGML_TYPE_Q4_K,
     GGML_TYPE_IQ2_XXS
 };

From a3a3048e7a0f9464d0d625a29257d8bce5da5090 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 29 Nov 2024 17:45:08 +0100
Subject: [PATCH 354/396] cleanup UI link list (#10577)

* cleanup UI link list

* sort list alphabetically

* add missing licenses
---
 README.md | 58 ++++++++++++++++++++++++-------------------------------
 1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index 414c5b1c0..8aa3d1e59 100644
--- a/README.md
+++ b/README.md
@@ -140,44 +140,36 @@ Typically finetunes of the base models below are supported as well.
 
 **UI:**
 
-Unless otherwise noted these projects are open-source with permissive licensing:
-
-- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
-- [iohub/collama](https://github.com/iohub/coLLaMA)
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
-- [nat/openplayground](https://github.com/nat/openplayground)
-- [Faraday](https://faraday.dev/) (proprietary)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [LARS](https://github.com/abgulati/LARS) (AGPL)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
-- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
-- [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
-- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
-- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
-- [ollama/ollama](https://github.com/ollama/ollama)
-- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
-- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
-- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
-- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
-- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
-- [RAGNA Desktop](https://ragna.app/) (proprietary)
-- [RecurseChat](https://recurse.chat/) (proprietary)
-- [semperai/amica](https://github.com/semperai/amica)
-- [withcatai/catai](https://github.com/withcatai/catai)
-- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
-- [Msty](https://msty.app) (proprietary)
-- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
-- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
-- [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [MindMac](https://mindmac.app) (proprietary)
-- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
-- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
-- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
-- [AIKit](https://github.com/sozercan/aikit) (MIT)
-- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
-- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
-- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
-- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
+- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
+- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
+- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
+- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
+- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
+- [semperai/amica](https://github.com/semperai/amica) (MIT)
+- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 

From 3a8e9af402f7893423bdab444aa16c5d9a2d429a Mon Sep 17 00:00:00 2001
From: Robert Collins <roberto.tomas.cuentas@gmail.com>
Date: Fri, 29 Nov 2024 12:21:37 -0500
Subject: [PATCH 355/396] imatrix : support combine-only (#10492)

* imatrix-combine-only idea

* ensured that behavior consistent with log
---
 examples/imatrix/imatrix.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 70ff47768..45206f4a7 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -637,10 +637,19 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
-    if (!compute_imatrix(ctx, params)) {
-        return 1;
+    if (params.prompt.empty()) {
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+    } else {
+        if (!compute_imatrix(ctx, params)) {
+            return 1;
+        }
     }
 
+
     g_collector.save_imatrix();
 
     LOG("\n");

From b782e5c7d453b3f1fa8dc6c34cde7e2fa946af93 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 29 Nov 2024 21:48:56 +0100
Subject: [PATCH 356/396] server : add more test cases (#10569)

* server : add split model test

* add test speculative

* add invalid cases
---
 examples/server/tests/unit/test_basic.py      |  14 +++
 .../server/tests/unit/test_chat_completion.py |  19 ++++
 examples/server/tests/unit/test_infill.py     |  22 ++++
 examples/server/tests/unit/test_rerank.py     |  17 +++
 .../server/tests/unit/test_speculative.py     | 103 ++++++++++++++++++
 examples/server/tests/utils.py                |  12 +-
 6 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 examples/server/tests/unit/test_speculative.py

diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py
index 84db5ca1c..d82d54a5a 100644
--- a/examples/server/tests/unit/test_basic.py
+++ b/examples/server/tests/unit/test_basic.py
@@ -32,3 +32,17 @@ def test_server_models():
     assert res.status_code == 200
     assert len(res.body["data"]) == 1
     assert res.body["data"][0]["id"] == server.model_alias
+
+def test_load_split_model():
+    global server
+    server.model_hf_repo = "ggml-org/models"
+    server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
+    server.model_alias = "tinyllama-split"
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 16,
+        "prompt": "Hello",
+        "temperature": 0.0,
+    })
+    assert res.status_code == 200
+    assert match_regex("(little|girl)+", res.body["content"])
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index d7aeb288d..1048d6fca 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -127,3 +127,22 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
         assert res.status_code != 200
         assert "error" in res.body
 
+
+@pytest.mark.parametrize("messages", [
+    None,
+    "string",
+    [123],
+    [{}],
+    [{"role": 123}],
+    [{"role": "system", "content": 123}],
+    # [{"content": "hello"}], # TODO: should not be a valid case
+    [{"role": "system", "content": "test"}, {}],
+])
+def test_invalid_chat_completion_req(messages):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "messages": messages,
+    })
+    assert res.status_code == 400 or res.status_code == 500
+    assert "error" in res.body
diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index 38ce6c429..6a6d40a1c 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -8,6 +8,7 @@ def create_server():
     global server
     server = ServerPreset.tinyllama_infill()
 
+
 def test_infill_without_input_extra():
     global server
     server.start()
@@ -19,6 +20,7 @@ def test_infill_without_input_extra():
     assert res.status_code == 200
     assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
 
+
 def test_infill_with_input_extra():
     global server
     server.start()
@@ -33,3 +35,23 @@ def test_infill_with_input_extra():
     })
     assert res.status_code == 200
     assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
+
+
+@pytest.mark.parametrize("input_extra", [
+    {},
+    {"filename": "ok"},
+    {"filename": 123},
+    {"filename": 123, "text": "abc"},
+    {"filename": 123, "text": 456},
+])
+def test_invalid_input_extra_req(input_extra):
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_extra": [input_extra],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 400
+    assert "error" in res.body
diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py
index 3a49fd3ac..189bc4c96 100644
--- a/examples/server/tests/unit/test_rerank.py
+++ b/examples/server/tests/unit/test_rerank.py
@@ -36,3 +36,20 @@ def test_rerank():
     assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
     assert most_relevant["index"] == 2
     assert least_relevant["index"] == 3
+
+
+@pytest.mark.parametrize("documents", [
+    [],
+    None,
+    123,
+    [1, 2, 3],
+])
+def test_invalid_rerank_req(documents):
+    global server
+    server.start()
+    res = server.make_request("POST", "/rerank", data={
+        "query": "Machine learning is",
+        "documents": documents,
+    })
+    assert res.status_code == 400
+    assert "error" in res.body
diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py
new file mode 100644
index 000000000..982d6abb4
--- /dev/null
+++ b/examples/server/tests/unit/test_speculative.py
@@ -0,0 +1,103 @@
+import pytest
+from utils import *
+
+# We use a F16 MOE gguf as main model, and q4_0 as draft model
+
+server = ServerPreset.stories15m_moe()
+
+MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf"
+
+def create_server():
+    global server
+    server = ServerPreset.stories15m_moe()
+    # download draft model file if needed
+    file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
+    model_draft_file = f'../../../{file_name}'
+    if not os.path.exists(model_draft_file):
+        print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
+        with open(model_draft_file, 'wb') as f:
+            f.write(requests.get(MODEL_DRAFT_FILE_URL).content)
+        print(f"Done downloading draft model file")
+    # set default values
+    server.model_draft = model_draft_file
+    server.draft_min = 4
+    server.draft_max = 8
+
+
+@pytest.fixture(scope="module", autouse=True)
+def fixture_create_server():
+    return create_server()
+
+
+def test_with_and_without_draft():
+    global server
+    server.model_draft = None  # disable draft model
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "temperature": 0.0,
+        "top_k": 1,
+    })
+    assert res.status_code == 200
+    content_no_draft = res.body["content"]
+    server.stop()
+
+    # create new server with draft model
+    create_server()
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "temperature": 0.0,
+        "top_k": 1,
+    })
+    assert res.status_code == 200
+    content_draft = res.body["content"]
+
+    assert content_no_draft == content_draft
+
+
+def test_different_draft_min_draft_max():
+    global server
+    test_values = [
+        (1, 2),
+        (1, 4),
+        (4, 8),
+        (4, 12),
+        (8, 16),
+    ]
+    last_content = None
+    for draft_min, draft_max in test_values:
+        server.stop()
+        server.draft_min = draft_min
+        server.draft_max = draft_max
+        server.start()
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "temperature": 0.0,
+            "top_k": 1,
+        })
+        assert res.status_code == 200
+        if last_content is not None:
+            assert last_content == res.body["content"]
+        last_content = res.body["content"]
+
+
+@pytest.mark.parametrize("n_slots,n_requests", [
+    (1, 2),
+    (2, 2),
+])
+def test_multi_requests_parallel(n_slots: int, n_requests: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    tasks = []
+    for _ in range(n_requests):
+        tasks.append((server.make_request, ("POST", "/completion", {
+            "prompt": "I believe the meaning of life is",
+            "temperature": 0.0,
+            "top_k": 1,
+        })))
+    results = parallel_function_calls(tasks)
+    for res in results:
+        assert res.status_code == 200
+        assert match_regex("(wise|kind|owl|answer)+", res.body["content"])
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index a831f113f..e17a05ff6 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -46,6 +46,7 @@ class ServerProcess:
     model_alias: str | None = None
     model_url: str | None = None
     model_file: str | None = None
+    model_draft: str | None = None
     n_threads: int | None = None
     n_gpu_layer: int | None = None
     n_batch: int | None = None
@@ -68,6 +69,8 @@ class ServerProcess:
     response_format: str | None = None
     lora_files: List[str] | None = None
     disable_ctx_shift: int | None = False
+    draft_min: int | None = None
+    draft_max: int | None = None
 
     # session variables
     process: subprocess.Popen | None = None
@@ -102,6 +105,8 @@ class ServerProcess:
             server_args.extend(["--model", self.model_file])
         if self.model_url:
             server_args.extend(["--model-url", self.model_url])
+        if self.model_draft:
+            server_args.extend(["--model-draft", self.model_draft])
         if self.model_hf_repo:
             server_args.extend(["--hf-repo", self.model_hf_repo])
         if self.model_hf_file:
@@ -147,6 +152,10 @@ class ServerProcess:
             server_args.extend(["--no-context-shift"])
         if self.api_key:
             server_args.extend(["--api-key", self.api_key])
+        if self.draft_max:
+            server_args.extend(["--draft-max", self.draft_max])
+        if self.draft_min:
+            server_args.extend(["--draft-min", self.draft_min])
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"bench: starting server with: {' '.join(args)}")
@@ -185,7 +194,8 @@ class ServerProcess:
         raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
 
     def stop(self) -> None:
-        server_instances.remove(self)
+        if self in server_instances:
+            server_instances.remove(self)
         if self.process:
             print(f"Stopping server with pid={self.process.pid}")
             self.process.kill()

From 7cc2d2c88908fc92b97b28acafb82f7d6e425b85 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Fri, 29 Nov 2024 21:54:58 +0100
Subject: [PATCH 357/396] ggml : move AMX to the CPU backend (#10570)

* ggml : move AMX to the CPU backend

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 .clang-tidy                                   |   2 +
 .github/workflows/build.yml                   |   5 +
 Makefile                                      |   9 +-
 Package.swift                                 |   4 +-
 common/CMakeLists.txt                         |   2 +-
 common/common.cpp                             |  10 +
 examples/batched-bench/CMakeLists.txt         |   2 +-
 examples/batched/CMakeLists.txt               |   2 +-
 .../convert-llama2c-to-ggml/CMakeLists.txt    |   2 +-
 examples/cvector-generator/CMakeLists.txt     |   2 +-
 examples/embedding/CMakeLists.txt             |   2 +-
 examples/eval-callback/CMakeLists.txt         |   2 +-
 examples/export-lora/CMakeLists.txt           |   2 +-
 examples/gbnf-validator/CMakeLists.txt        |   2 +-
 examples/gen-docs/CMakeLists.txt              |   2 +-
 examples/gguf-hash/CMakeLists.txt             |   2 +-
 examples/gguf-split/CMakeLists.txt            |   2 +-
 examples/gguf/CMakeLists.txt                  |   2 +-
 examples/gritlm/CMakeLists.txt                |   2 +-
 examples/imatrix/CMakeLists.txt               |   2 +-
 examples/infill/CMakeLists.txt                |   2 +-
 examples/llama-bench/CMakeLists.txt           |   2 +-
 examples/llava/CMakeLists.txt                 |   6 +-
 examples/lookahead/CMakeLists.txt             |   2 +-
 examples/lookup/CMakeLists.txt                |   8 +-
 examples/main-cmake-pkg/CMakeLists.txt        |   2 +-
 examples/main/CMakeLists.txt                  |   2 +-
 examples/parallel/CMakeLists.txt              |   2 +-
 examples/passkey/CMakeLists.txt               |   2 +-
 examples/perplexity/CMakeLists.txt            |   2 +-
 examples/quantize-stats/CMakeLists.txt        |   2 +-
 examples/quantize/CMakeLists.txt              |   2 +-
 examples/retrieval/CMakeLists.txt             |   2 +-
 examples/run/CMakeLists.txt                   |   2 +-
 examples/save-load-state/CMakeLists.txt       |   2 +-
 examples/server/CMakeLists.txt                |   2 +-
 examples/simple-chat/CMakeLists.txt           |   2 +-
 examples/simple/CMakeLists.txt                |   2 +-
 examples/speculative-simple/CMakeLists.txt    |   2 +-
 examples/speculative/CMakeLists.txt           |   2 +-
 examples/tokenize/CMakeLists.txt              |   2 +-
 ggml/CMakeLists.txt                           |   1 -
 ggml/include/ggml-amx.h                       |  25 -
 ggml/src/CMakeLists.txt                       |  16 +-
 ggml/src/ggml-amx/CMakeLists.txt              | 105 ----
 ggml/src/ggml-amx/ggml-amx.cpp                | 449 ------------------
 ggml/src/ggml-backend-reg.cpp                 |   7 -
 ggml/src/ggml-backend.cpp                     |   3 +-
 ggml/src/ggml-cpu/CMakeLists.txt              |  92 ++--
 ggml/src/ggml-cpu/amx/amx.cpp                 | 196 ++++++++
 ggml/src/ggml-cpu/amx/amx.h                   |  20 +
 ggml/src/{ggml-amx => ggml-cpu/amx}/common.h  |  23 +-
 ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.cpp   |  96 ++--
 ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.h     |   3 +-
 ggml/src/ggml-cpu/ggml-cpu-impl.h             |  15 +
 ggml/src/ggml-cpu/ggml-cpu.c                  |  84 ++--
 ggml/src/ggml-cpu/ggml-cpu.cpp                |  34 +-
 ggml/src/ggml-cpu/llamafile/sgemm.cpp         |   3 +-
 ggml/src/ggml-impl.h                          |  10 +-
 .../ggml-vulkan/vulkan-shaders/CMakeLists.txt |   2 +-
 pocs/vdot/CMakeLists.txt                      |   4 +-
 src/CMakeLists.txt                            |   2 +-
 src/unicode.cpp                               |  11 +
 tests/test-sampling.cpp                       |   2 +-
 64 files changed, 514 insertions(+), 801 deletions(-)
 delete mode 100644 ggml/include/ggml-amx.h
 delete mode 100644 ggml/src/ggml-amx/CMakeLists.txt
 delete mode 100644 ggml/src/ggml-amx/ggml-amx.cpp
 create mode 100644 ggml/src/ggml-cpu/amx/amx.cpp
 create mode 100644 ggml/src/ggml-cpu/amx/amx.h
 rename ggml/src/{ggml-amx => ggml-cpu/amx}/common.h (77%)
 rename ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.cpp (98%)
 rename ggml/src/{ggml-amx => ggml-cpu/amx}/mmq.h (72%)

diff --git a/.clang-tidy b/.clang-tidy
index 952c0cca8..310c3d182 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -17,8 +17,10 @@ Checks: >
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
+    -misc-use-anonymous-namespace,
 FormatStyle: none
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 48953dafa..e2291bd34 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1121,6 +1121,11 @@ jobs:
         run: |
           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
 
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ github.job }}
+
       - name: Build
         id: cmake_build
         run: |
diff --git a/Makefile b/Makefile
index 25214ec05..83adcef28 100644
--- a/Makefile
+++ b/Makefile
@@ -254,8 +254,8 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS  = -std=c++11 -fPIC
-MK_NVCCFLAGS = -std=c++11
+MK_CXXFLAGS  = -std=c++17 -fPIC
+MK_NVCCFLAGS = -std=c++17
 
 ifdef LLAMA_NO_CCACHE
 GGML_NO_CCACHE := 1
@@ -575,9 +575,12 @@ endif
 
 ifndef GGML_NO_AMX
 	MK_CPPFLAGS += -DGGML_USE_AMX
-	OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
+	OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
 endif
 
+# only necessary for the CPU backend files
+MK_CPPFLAGS += -Iggml/src/ggml-cpu
+
 ifdef GGML_RPC
 	MK_CPPFLAGS  += -DGGML_USE_RPC
 	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
diff --git a/Package.swift b/Package.swift
index d9e8a4e2d..1e75aa7e2 100644
--- a/Package.swift
+++ b/Package.swift
@@ -28,13 +28,16 @@ var cSettings: [CSetting] =  [
     .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
     .unsafeFlags(["-fno-objc-arc"]),
     .headerSearchPath("ggml/src"),
+    .headerSearchPath("ggml/src/ggml-cpu"),
     // NOTE: NEW_LAPACK will required iOS version 16.4+
     // We should consider add this in the future when we drop support for iOS 14
     // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
     // .define("ACCELERATE_NEW_LAPACK"),
     // .define("ACCELERATE_LAPACK_ILP64")
+    .define("GGML_USE_CPU"),
 ]
 
+
 #if canImport(Darwin)
 sources.append("ggml/src/ggml-common.h")
 sources.append("ggml/src/ggml-metal/ggml-metal.m")
@@ -44,7 +47,6 @@ cSettings.append(
     contentsOf: [
         .define("GGML_USE_ACCELERATE"),
         .define("GGML_USE_METAL"),
-        .define("GGML_USE_CPU")
     ]
 )
 #endif
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 223174884..89862fe11 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -88,5 +88,5 @@ if (LLAMA_CURL)
 endif ()
 
 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features   (${TARGET} PUBLIC cxx_std_11)
+target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/common.cpp b/common/common.cpp
index 2b2f00098..6143516d2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {
 
     std::u32string filename_utf32;
     try {
+#if defined(__clang__)
+        // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
         filename_utf32 = converter.from_bytes(filename);
 
         // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt
index 959acaeee..68ad707f3 100644
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt
index 77e33343b..0d439f498 100644
--- a/examples/batched/CMakeLists.txt
+++ b/examples/batched/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt
index a6790e617..44e5f722a 100644
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt
index 0a559d60c..49ad9561c 100644
--- a/examples/cvector-generator/CMakeLists.txt
+++ b/examples/cvector-generator/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index 8256e789a..809040307 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
index 5d1048aad..95915ed91 100644
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -2,7 +2,7 @@ set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TEST_TARGET test-eval-callback)
 add_test(NAME ${TEST_TARGET}
diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt
index 1cef6e716..310455787 100644
--- a/examples/export-lora/CMakeLists.txt
+++ b/examples/export-lora/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt
index 4edd6ec73..d2cb524c0 100644
--- a/examples/gbnf-validator/CMakeLists.txt
+++ b/examples/gbnf-validator/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
 add_executable(${TARGET} gbnf-validator.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt
index c94cda776..25de0af35 100644
--- a/examples/gen-docs/CMakeLists.txt
+++ b/examples/gen-docs/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
 add_executable(${TARGET} gen-docs.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
index 7a494ce32..15c5c68c6 100644
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@@ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
 target_link_libraries(${TARGET} PRIVATE sha256)
 
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt
index f63887da7..c407e2f0a 100644
--- a/examples/gguf-split/CMakeLists.txt
+++ b/examples/gguf-split/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt
index a9569b411..fb04eb83f 100644
--- a/examples/gguf/CMakeLists.txt
+++ b/examples/gguf/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt
index 86dfddca3..fa1b4dc70 100644
--- a/examples/gritlm/CMakeLists.txt
+++ b/examples/gritlm/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt
index d4c8265bd..412696c47 100644
--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt
index 9b1aa3b63..fb26628d8 100644
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt
index 5bdbea4e2..17e3b9b87 100644
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index bbf5fec58..5d32f377f 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
 target_include_directories(llava PUBLIC ../..)
 target_include_directories(llava PUBLIC ../../common)
 
-target_compile_features(llava PRIVATE cxx_std_11)
+target_compile_features(llava PRIVATE cxx_std_17)
 
 add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
 if (BUILD_SHARED_LIBS)
@@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-minicpmv-cli)
 add_executable(${TARGET} minicpmv-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt
index f0ae5cd89..346861314 100644
--- a/examples/lookahead/CMakeLists.txt
+++ b/examples/lookahead/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-lookahead)
 add_executable(${TARGET} lookahead.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt
index ef19fe25e..fba78ceda 100644
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -2,22 +2,22 @@ set(TARGET llama-lookup)
 add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-create)
 add_executable(${TARGET} lookup-create.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-merge)
 add_executable(${TARGET} lookup-merge.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-stats)
 add_executable(${TARGET} lookup-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt
index 3b38db292..5563f4de0 100644
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
 target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index 5f6efaa9a..af3d9150f 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt
index c13557bac..847e916de 100644
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-parallel)
 add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt
index dc467a5d3..9bc5110c2 100644
--- a/examples/passkey/CMakeLists.txt
+++ b/examples/passkey/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-passkey)
 add_executable(${TARGET} passkey.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index be0f2fd02..3e6864093 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index bb986a716..9a3a0d3cd 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 62680cda4..47e5cbe30 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt
index 66610f311..512a602ec 100644
--- a/examples/retrieval/CMakeLists.txt
+++ b/examples/retrieval/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-retrieval)
 add_executable(${TARGET} retrieval.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt
index 084f1e92d..52add51ef 100644
--- a/examples/run/CMakeLists.txt
+++ b/examples/run/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-run)
 add_executable(${TARGET} run.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index 0fb5e359b..0f50e50de 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 93e876f5a..e82f91533 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -50,4 +50,4 @@ if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt
index 87723533b..567f7fbbb 100644
--- a/examples/simple-chat/CMakeLists.txt
+++ b/examples/simple-chat/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-simple-chat)
 add_executable(${TARGET} simple-chat.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index b63afbb8b..104ecabfd 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt
index 7a3a141c2..aeaea74fc 100644
--- a/examples/speculative-simple/CMakeLists.txt
+++ b/examples/speculative-simple/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple)
 add_executable(${TARGET} speculative-simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt
index aa208e7aa..c84196bd9 100644
--- a/examples/speculative/CMakeLists.txt
+++ b/examples/speculative/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt
index b704dcae1..1690b53e5 100644
--- a/examples/tokenize/CMakeLists.txt
+++ b/examples/tokenize/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 70b5cfdf7..789fa3b0c 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -161,7 +161,6 @@ set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
-option(GGML_AMX                             "ggml: use AMX"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h
deleted file mode 100644
index 042d6d919..000000000
--- a/ggml/include/ggml-amx.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-// buffer_type API
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
-
-GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
-
-GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 9022aa3ae..19289f32b 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -261,21 +261,15 @@ function(ggml_add_backend backend)
     if (${backend_id})
         string(TOLOWER "ggml-${backend}" backend_target)
         add_subdirectory(${backend_target})
-        # check again in case the backend disabled itself
-        # note that this should NOT be the normal behavior, in case of errors the backend should fail the build
-        # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
-        if (${backend_id})
-            message(STATUS "Including ${backend} backend")
-            if (NOT GGML_BACKEND_DL)
-                string(TOUPPER "GGML_USE_${backend}" backend_use)
-                target_compile_definitions(ggml PUBLIC ${backend_use})
-            endif()
+        message(STATUS "Including ${backend} backend")
+        if (NOT GGML_BACKEND_DL)
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
         endif()
     endif()
 endfunction()
 
 ggml_add_backend(CPU)
-ggml_add_backend(AMX)
 ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
@@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
-    target_compile_features   (${target} PRIVATE c_std_11) # don't bump
+    target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
 endforeach()
 
 target_link_libraries(ggml-base PRIVATE Threads::Threads)
diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt
deleted file mode 100644
index cf3ade6f0..000000000
--- a/ggml/src/ggml-amx/CMakeLists.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
-        CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
-    message(STATUS "Using AMX")
-
-    file(GLOB   GGML_HEADERS_AMX "*.h")
-    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
-
-    file(GLOB   GGML_SOURCES_AMX "*.cpp")
-
-    ggml_add_backend_library(ggml-amx
-                             ${GGML_HEADERS_AMX}
-                             ${GGML_SOURCES_AMX}
-                            )
-
-    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
-    # TODO: integrate AMX backend into the CPU backend
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
-            include(../ggml-cpu/cmake/FindSIMD.cmake)
-        endif ()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-            if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
-            endif()
-            if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
-            endif()
-            if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
-            endif()
-            if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
-            endif()
-        elseif (GGML_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (GGML_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
-        endif()
-    else()
-        if (GGML_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (GGML_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (GGML_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (GGML_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (GGML_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512dq)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (GGML_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (GGML_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-        if (GGML_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
-        if (GGML_AMX_TILE)
-            list(APPEND ARCH_FLAGS -mamx-tile)
-        endif()
-        if (GGML_AMX_INT8)
-            list(APPEND ARCH_FLAGS -mamx-int8)
-        endif()
-        if (GGML_AMX_BF16)
-            list(APPEND ARCH_FLAGS -mamx-bf16)
-        endif()
-    endif()
-
-    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
-else()
-    set(GGML_AMX OFF PARENT_SCOPE)
-    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
-endif()
diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp
deleted file mode 100644
index 6bfb3da27..000000000
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-#include "ggml-amx.h"
-#include "ggml-amx/common.h"
-#include "ggml-amx/mmq.h"
-#include "ggml-backend-impl.h"
-#include "ggml-impl.h"
-
-#if defined(__gnu_linux__)
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#if defined(__AMX_INT8__)
-
-// AMX buffer interface
-static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-}
-
-static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)(buffer->context);
-}
-
-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    memset((char *)tensor->data + offset, value, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    if (qtype_has_amx_kernels(tensor->type)) {
-        ggml_backend_amx_convert_weight(tensor, data, offset, size);
-    } else {
-        memcpy((char *)tensor->data + offset, data, size);
-    }
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        if (qtype_has_amx_kernels(src->type)) {
-            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
-        } else {
-            memcpy(dst->data, src->data, ggml_nbytes(src));
-        }
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_amx_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "AMX";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
-    if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
-    return ggml_backend_amx_get_alloc_size(tensor);
-
-    GGML_UNUSED(buft);
-}
-
-static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return false;
-
-    GGML_UNUSED(buft);
-}
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
-            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_type_amx;
-}
-
-// backend interface
-
-static const char * ggml_backend_amx_name(ggml_backend_t backend) {
-    return "AMX";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_amx_free(ggml_backend_t backend) {
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
-    delete ctx;
-    delete backend;
-}
-
-static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        switch (node->op) {
-        case GGML_OP_MUL_MAT:
-            ggml_backend_amx_mul_mat(ctx, node);
-            break;
-
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            break;
-
-        default:
-            fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-            GGML_ASSERT(false);
-        }
-    }
-
-    return GGML_STATUS_SUCCESS;
-
-    GGML_UNUSED(backend);
-}
-
-static struct ggml_backend_i ggml_backend_amx_i = {
-    /* .get_name                = */ ggml_backend_amx_name,
-    /* .free                    = */ ggml_backend_amx_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_amx_graph_compute,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_amx_guid() {
-    static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
-    return &guid;
-}
-
-#define ARCH_GET_XCOMP_PERM     0x1022
-#define ARCH_REQ_XCOMP_PERM     0x1023
-#define XFEATURE_XTILECFG       17
-#define XFEATURE_XTILEDATA      18
-
-static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
-    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
-        fprintf(stderr, "AMX is not ready to be used!\n");
-        return false;
-    }
-    return true;
-#elif defined(_WIN32)
-    return true;
-#endif
-}
-
-ggml_backend_t ggml_backend_amx_init() {
-
-    // invoke a Linux system call to request access to AMX features
-    ggml_amx_init();
-
-    // backend context
-    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
-
-    // ggml amx backend
-    ggml_backend_t backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_amx_guid(),
-        /* .interface = */ ggml_backend_amx_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    return backend;
-}
-
-bool ggml_backend_is_amx(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
-}
-
-void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_amx(backend_amx));
-
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
-    ctx->n_threads = n_threads;
-}
-
-// device interface
-
-static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
-    return "AMX";
-
-    GGML_UNUSED(dev);
-}
-
-static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
-    return "Intel Advanced Matrix Extensions";
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    GGML_UNUSED(dev);
-}
-
-static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_amx_device_get_name(dev);
-    props->description = ggml_backend_amx_device_get_description(dev);
-    props->type        = ggml_backend_amx_device_get_type(dev);
-    ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
-    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
-    props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ false,
-        /* .events                = */ false,
-    };
-}
-
-static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
-    return ggml_backend_amx_init();
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(params);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(dev);
-}
-
-static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-
-    // handle only 2d gemm for now
-    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-    };
-
-    switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            return true;
-
-        case GGML_OP_MUL_MAT: {
-            const struct ggml_tensor * src0 = op->src[0];
-            const struct ggml_tensor * src1 = op->src[1];
-
-            const enum ggml_type type = src0->type;
-            const int64_t ne0 = op->ne[0];
-
-            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
-            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
-            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
-
-            bool can_use_amx =
-                is_contiguous_2d(src0) &&       // src0 must be contiguous
-                is_contiguous_2d(src1) &&       // src1 must be contiguous
-                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
-                has_amx_kernels &&              // with amx kernel impls
-                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
-
-            return can_use_amx;
-        }
-        default:
-            return false;
-    }
-
-    GGML_UNUSED(dev);
-}
-
-static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
-
-    GGML_UNUSED(dev);
-}
-
-static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
-    /* .get_name             = */ ggml_backend_amx_device_get_name,
-    /* .get_description      = */ ggml_backend_amx_device_get_description,
-    /* .get_memory           = */ ggml_backend_amx_device_get_memory,
-    /* .get_type             = */ ggml_backend_amx_device_get_type,
-    /* .get_props            = */ ggml_backend_amx_device_get_props,
-    /* .init_backend         = */ ggml_backend_amx_device_init,
-    /* .get_buffer_type      = */ ggml_backend_amx_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ NULL,
-    /* .supports_op          = */ ggml_backend_amx_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_amx_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// backend reg interface
-
-static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
-    return "AMX";
-
-    GGML_UNUSED(reg);
-}
-
-static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
-}
-
-static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    static ggml_backend_device ggml_backend_amx_device = {
-        /* .iface   = */ ggml_backend_amx_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ nullptr,
-    };
-
-    return &ggml_backend_amx_device;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(index);
-}
-
-static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
-        return (void *)ggml_backend_amx_set_n_threads;
-    }
-    return NULL;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(name);
-}
-
-static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
-    /* .get_name         = */ ggml_backend_amx_reg_get_name,
-    /* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_amx_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_amx_get_proc_address,
-};
-
-ggml_backend_reg_t ggml_backend_amx_reg(void) {
-    static struct ggml_backend_reg ggml_backend_amx_reg = {
-        /* .api_version = */ GGML_BACKEND_API_VERSION,
-        /* .iface       = */ ggml_backend_amx_reg_i,
-        /* .context     = */ NULL,
-    };
-
-    return &ggml_backend_amx_reg;
-}
-
-#else // if defined(__AMX_INT8__)
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
-    return nullptr;
-}
-
-bool ggml_backend_is_amx(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return false;
-}
-
-ggml_backend_t ggml_backend_amx_init(void) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-    return nullptr;
-}
-
-void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-
-    GGML_UNUSED(backend_amx);
-    GGML_UNUSED(n_threads);
-}
-
-ggml_backend_reg_t ggml_backend_amx_reg(void) {
-    return nullptr;
-}
-
-#endif
-
-GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index a0e0e2c58..3182b84f5 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -49,10 +49,6 @@
 #include "ggml-rpc.h"
 #endif
 
-#ifdef GGML_USE_AMX
-#  include "ggml-amx.h"
-#endif
-
 #ifdef GGML_USE_CANN
 #include "ggml-cann.h"
 #endif
@@ -92,9 +88,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
-#ifdef GGML_USE_AMX
-        register_backend(ggml_backend_amx_reg());
-#endif
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 45da0c27d..fdb4b986f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
-        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
     }
 
     // graph input
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 4dbc1f75b..fe2222084 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -1,12 +1,20 @@
-ggml_add_backend_library(ggml-cpu
-                         ggml-cpu.c
-                         ggml-cpu.cpp
-                         ggml-cpu-aarch64.c
-                         ggml-cpu-aarch64.h
-                         ggml-cpu-quants.c
-                         ggml-cpu-quants.h
-                        )
+ggml_add_backend_library(ggml-cpu)
 
+list (APPEND GGML_CPU_SOURCES
+    ggml-cpu.c
+    ggml-cpu.cpp
+    ggml-cpu-aarch64.c
+    ggml-cpu-aarch64.h
+    ggml-cpu-quants.c
+    ggml-cpu-quants.h
+    amx/amx.cpp
+    amx/amx.h
+    amx/mmq.cpp
+    amx/mmq.h
+    ggml-cpu-impl.h
+    )
+
+target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
 target_include_directories(ggml-cpu PRIVATE .)
 
 if (APPLE AND GGML_ACCELERATE)
@@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
     if (ACCELERATE_FRAMEWORK)
         message(STATUS "Accelerate framework found")
 
-        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
 
         target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
     else()
@@ -29,15 +37,9 @@ if (GGML_OPENMP)
     if (OpenMP_FOUND)
         message(STATUS "OpenMP found")
 
-        add_compile_definitions(GGML_USE_OPENMP)
+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
 
         target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-
-        # FIXME: should be replaced with a compiler id check
-        #if (GGML_MUSA)
-        #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
-        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
-        #endif()
     else()
         message(WARNING "OpenMP not found")
     endif()
@@ -46,11 +48,11 @@ endif()
 if (GGML_LLAMAFILE)
     message(STATUS "Using llamafile")
 
-    add_compile_definitions(GGML_USE_LLAMAFILE)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
 
-    target_sources(ggml-cpu PRIVATE
-                    llamafile/sgemm.cpp
-                    llamafile/sgemm.h)
+    list(APPEND GGML_CPU_SOURCES
+                llamafile/sgemm.cpp
+                llamafile/sgemm.h)
 endif()
 
 if (GGML_CPU_HBM)
@@ -58,7 +60,7 @@ if (GGML_CPU_HBM)
 
     message(STATUS "Using memkind for CPU HBM")
 
-    add_compile_definitions(GGML_USE_CPU_HBM)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
 
     target_link_libraries(ggml-cpu PUBLIC memkind)
 endif()
@@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
     message(STATUS "ARM detected")
 
     if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
+        list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
+        list(APPEND ARCH_DEFINITIONS __ARM_NEON)
+        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
 
         set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
         string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
 
             message(STATUS "ARM feature DOTPROD enabled")
         endif ()
@@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
 
         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
 
             message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 
             message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
         endif ()
@@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
                 if (GGML_COMPILER_SUPPORT_DOTPROD)
                     set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
-                    add_compile_definitions(__ARM_FEATURE_DOTPROD)
+                    list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
 
                     message(STATUS "ARM feature DOTPROD enabled")
                 endif ()
@@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
                 if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
                     set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
-                    add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+                    list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
 
                     message(STATUS "ARM feature MATMUL_INT8 enabled")
                 endif ()
@@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     if (MSVC)
         # instruction set detection for MSVC only
         if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
             include(cmake/FindSIMD.cmake)
         endif ()
         if (GGML_AVX512)
@@ -185,37 +186,31 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             # macros corresponding to the extensions.
             # Do it manually.
             if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512vbmi)
                 endif()
             endif()
             if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512vnni)
                 endif()
             endif()
             if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512bf16)
                 endif()
             endif()
             if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
             endif()
             if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
             endif()
             if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
             endif()
         elseif (GGML_AVX2)
             list(APPEND ARCH_FLAGS /arch:AVX2)
@@ -276,7 +271,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
     else()
         list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        # TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     message(STATUS "loongarch64 detected")
@@ -299,11 +294,12 @@ endif()
 
 if (GGML_CPU_AARCH64)
     message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
-    add_compile_definitions(GGML_USE_CPU_AARCH64)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
 endif()
 
-target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}")
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
 
 if (EMSCRIPTEN)
     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
new file mode 100644
index 000000000..09c0df0f5
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -0,0 +1,196 @@
+#include "amx.h"
+#include "common.h"
+#include "mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+// AMX buffer interface
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+}
+
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)(buffer->context);
+}
+
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    if (qtype_has_amx_kernels(tensor->type)) {
+        ggml_backend_amx_convert_weight(tensor, data, offset, size);
+    } else {
+        memcpy((char *)tensor->data + offset, data, size);
+    }
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        if (qtype_has_amx_kernels(src->type)) {
+            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
+        } else {
+            memcpy(dst->data, src->data, ggml_nbytes(src));
+        }
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "AMX";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+    return ggml_backend_amx_get_alloc_size(tensor);
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+
+static bool ggml_amx_init() {
+#if defined(__gnu_linux__)
+    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
+        fprintf(stderr, "AMX is not ready to be used!\n");
+        return false;
+    }
+    return true;
+#elif defined(_WIN32)
+    return true;
+#endif
+}
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    if (!ggml_amx_init()) {
+        return NULL;
+    }
+
+    return &ggml_backend_buffer_type_amx;
+}
+
+bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
+}
+
+bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
+    // handle only 2d gemm for now
+    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
+        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+    };
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT: {
+            const struct ggml_tensor * src0 = op->src[0];
+            const struct ggml_tensor * src1 = op->src[1];
+
+            const enum ggml_type type = src0->type;
+            const int64_t ne0 = op->ne[0];
+
+            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
+            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
+            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
+
+            bool can_use_amx =
+                is_contiguous_2d(src0) &&       // src0 must be contiguous
+                is_contiguous_2d(src1) &&       // src1 must be contiguous
+                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
+                has_amx_kernels &&              // with amx kernel impls
+                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
+
+            return can_use_amx;
+        }
+        default:
+            return false;
+    }
+}
+
+#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h
new file mode 100644
index 000000000..c43546273
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/amx.h
@@ -0,0 +1,20 @@
+#include "ggml-backend.h"
+#include "ggml-cpu-impl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
+bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-cpu/amx/common.h
similarity index 77%
rename from ggml/src/ggml-amx/common.h
rename to ggml/src/ggml-cpu/amx/common.h
index 5db8ce30d..0b0657289 100644
--- a/ggml/src/ggml-amx/common.h
+++ b/ggml/src/ggml-cpu/amx/common.h
@@ -1,8 +1,7 @@
 #pragma once
 
 #include "ggml.h"
-// hack until AMX is moved into the CPU backend
-#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
+#include "ggml-cpu-impl.h"
 
 #include <algorithm>
 #include <memory>
@@ -74,16 +73,24 @@ inline void parallel_for(int nth, int n, const func_t& f) {
 #endif
 }
 
+template <typename func_t>
+inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
+    int tbegin, tend;
+    balance211(n, params->nth, params->ith, tbegin, tend);
+    f(tbegin, tend);
+    ggml_barrier(params->threadpool); // TODO: might not always be needed
+}
+
 // quantized types that have AMX support
 inline bool qtype_has_amx_kernels(const enum ggml_type type) {
     // TODO: fix padding for vnni format
     return (type == GGML_TYPE_Q4_0) ||
-        (type == GGML_TYPE_Q4_1);
-        //(type == GGML_TYPE_Q8_0) ||
-        //(type == GGML_TYPE_Q4_K) ||
-        //(type == GGML_TYPE_Q5_K) ||
-        //(type == GGML_TYPE_Q6_K) ||
-        //(type == GGML_TYPE_IQ4_XS);
+        (type == GGML_TYPE_Q4_1) ||
+        (type == GGML_TYPE_Q8_0) ||
+        (type == GGML_TYPE_Q4_K) ||
+        (type == GGML_TYPE_Q5_K) ||
+        (type == GGML_TYPE_Q6_K) ||
+        (type == GGML_TYPE_IQ4_XS);
 }
 
 // ggml backend context
diff --git a/ggml/src/ggml-amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp
similarity index 98%
rename from ggml/src/ggml-amx/mmq.cpp
rename to ggml/src/ggml-cpu/amx/mmq.cpp
index 529bee25b..6447e73d0 100644
--- a/ggml/src/ggml-amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -4,8 +4,11 @@
 #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
 #endif
 
+#include "amx.h"
 #include "mmq.h"
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu-quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
 #include <type_traits>
@@ -33,7 +36,7 @@
 #define ALWAYS_INLINE inline
 #endif
 
-#if defined(__AMX_INT8__)
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 
 namespace {
 
@@ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k);
 
 template <>
 inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
-    // FIXME: using unoptimized reference impl until moved to CPU backend
-    quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
+    quantize_row_q8_0(x, (block_q8_0 *)vy, k);
 }
 
 template <>
 inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
-    quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
+    quantize_row_q8_1(x, (block_q8_1 *)vy, k);
 }
 
 template <>
@@ -950,7 +952,7 @@ template<typename TB, typename packed_B_t = packed_B_type<TB>>
 void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
     GGML_UNUSED(tile);
     GGML_UNUSED(packed_B);
-};
+}
 
 template <>
 void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
@@ -2327,9 +2329,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) {
 
 // pack weight to vnni format
 void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-
-    size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor);
-    GGML_ASSERT(alloc_size == size);
+    GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now
 
     const enum ggml_type TYPE = tensor->type;
 
@@ -2348,6 +2348,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
     });
 }
 
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
+    struct ggml_tensor * src0 = dst->src[0];
+
+    const enum ggml_type TYPE = src0->type;
+
+    const bool is_floating_type = TYPE == GGML_TYPE_F16;
+    if (is_floating_type) {
+        return 0;
+    }
+
+    const int M = dst->ne[1];
+    const int K = src0->ne[0];
+
+    size_t desired_wsize = 0;
+
+    GGML_DISPATCH_QTYPES(TYPE, [&] {
+        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+        desired_wsize = M * row_size_A;
+    });
+
+    return desired_wsize;
+}
+
 // NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
 //
 // src0: weight in shape of {N, K}, quantized
@@ -2356,14 +2379,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
 //
 // the function performs: dst = src1 @ src0.T
 //
-void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
+void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
     struct ggml_tensor * src0 = dst->src[0];
     struct ggml_tensor * src1 = dst->src[1];
 
     const enum ggml_type TYPE = src0->type;
 
-    const int n_threads = ctx->n_threads;
-
     // f16 only has avx512 kernels for now,
     // amx kernels will be added once 6th gen xeon is released.
     const bool is_floating_type = TYPE == GGML_TYPE_F16;
@@ -2379,7 +2400,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
         const int MB = div_up(M, BLOCK_M);
         const int NB = div_up(N, BLOCK_N);
 
-        parallel_for(n_threads, MB * NB, [&](int begin, int end) {
+        parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
             GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
                 for (int i = begin; i < end; ++i) {
                     int mb = i / NB;
@@ -2412,27 +2433,29 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
     }
 
     // pointer to work space, used convert A from float to quantized type
-    void * wdata = nullptr;
+    void * wdata = params->wdata;
 
     //TODO: performance improvement: merge quant A
-    GGML_DISPATCH_QTYPES(TYPE, [&] {
-        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
-        const size_t desired_wsize = M * row_size_A;
-        if (ctx->work_size < desired_wsize) {
-            ctx->work_data.reset(new char[desired_wsize]);
-            ctx->work_size = desired_wsize;
-        }
-        wdata = ctx->work_data.get();
+    if (params->ith == 0) {
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+            const size_t desired_wsize = M * row_size_A;
+            if (params->wsize < desired_wsize) {
+                GGML_ABORT("insufficient work space size");
+            }
 
-        // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
-        // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
-        GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
+            // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
+            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
+            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
 
-        const float * A_data = static_cast<const float *>(src1->data);
-        for (int m = 0; m < M; ++m) {
-            from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
-        }
-    });
+            const float * A_data = static_cast<const float *>(src1->data);
+            for (int m = 0; m < M; ++m) {
+                from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
+            }
+        });
+    }
+
+    ggml_barrier(params->threadpool);
 
     if (M == 1) {
         // MB = 1 and handle 8 tiles in each block
@@ -2440,7 +2463,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
         constexpr int BLOCK_N = TILE_N * kTilesN;
         const int NB = div_up(N, BLOCK_N);
 
-        parallel_for(n_threads, NB, [&](int begin, int end) {
+        parallel_for_ggml(params, NB, [&](int begin, int end) {
             GGML_DISPATCH_QTYPES(TYPE, [&] {
                 const int KB = K / blck_size;
                 const int TILE_SIZE = get_tile_size<type>();
@@ -2470,7 +2493,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
     const int MB = div_up(M, BLOCK_M);
     const int NB = div_up(N, BLOCK_N);
 
-    parallel_for(n_threads, MB * NB, [&](int begin, int end) {
+    parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
         // init tile config for each thread
         ggml_tile_config_init();
 
@@ -2498,13 +2521,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
     });
 }
 
-#else // if defined(__AMX_INT8__)
-
-void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(dst);
-}
-
-#endif // if defined(__AMX_INT8__)
+#endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
diff --git a/ggml/src/ggml-amx/mmq.h b/ggml/src/ggml-cpu/amx/mmq.h
similarity index 72%
rename from ggml/src/ggml-amx/mmq.h
rename to ggml/src/ggml-cpu/amx/mmq.h
index cf0920620..f37366093 100644
--- a/ggml/src/ggml-amx/mmq.h
+++ b/ggml/src/ggml-cpu/amx/mmq.h
@@ -1,6 +1,5 @@
 #pragma once
 #include "common.h"
-#include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
 
 void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
 
-void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 27a530b22..d71076ad1 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -15,6 +15,18 @@
 extern "C" {
 #endif
 
+struct ggml_compute_params {
+    // ith = thread index, nth = number of threads
+    int ith, nth;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+
 #if defined(_MSC_VER)
 
 #define m512bh(p) p
@@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
 }
 #endif
 
+// TODO: move to ggml-threading
+void ggml_barrier(struct ggml_threadpool * tp);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index e0cefc20b..23ae2e10c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,6 +10,7 @@
 #include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
+#include "amx/amx.h"
 #include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -624,7 +625,7 @@ do {                                                                  \
     for (int i = 0; i < offset; ++i) {                                \
         x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
     }                                                                 \
-    res = _mm512_reduce_add_ps(x[0]);                                 \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
 } while (0)
 
 // TODO: is this optimal ?
@@ -674,7 +675,7 @@ do {                                                              \
     for (int i = 0; i < offset; ++i) {                            \
         x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
-    res = _mm512_reduce_add_ps(x[0]);                             \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
 } while (0)
 
 #define GGML_F16_VEC                GGML_F32Cx16
@@ -685,8 +686,8 @@ do {                                                              \
 #define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
 #define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
 #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
 
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
 #elif defined(__AVX__)
 
 #define GGML_SIMD
@@ -1178,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i)t0, 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0);        \
+#define GGML_F32x4_REDUCE(res, x)                                                     \
+{                                                                                     \
+    int offset = GGML_F32_ARR >> 1;                                                   \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
+    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
@@ -1367,31 +1368,15 @@ struct ggml_compute_state {
     int ith;
 };
 
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_threadpool * threadpool;
-};
-
 //
 // fundamental operations
 //
 
 inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
 inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
@@ -2286,7 +2271,7 @@ struct ggml_state {
 
 static struct ggml_state g_state = {0};
 
-static void ggml_barrier(struct ggml_threadpool * tp) {
+void ggml_barrier(struct ggml_threadpool * tp) {
     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
         return;
@@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
         type = (enum ggml_type)(intptr_t)src0->extra;
     }
 
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        ggml_backend_amx_mul_mat(params, dst);
+        return;
+    }
+#endif
+
     enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type;
     ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float;
     ggml_from_float_to_mat_t const from_float_to_mat    = type_traits_cpu[vec_dot_type].from_float_to_mat;
@@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
                 } break;
             case GGML_OP_MUL_MAT:
                 {
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+                    if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
+                        cur = ggml_backend_amx_desired_wsize(node);
+                    }
+#endif
                     const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
 
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = MAX(cur, cur2);
                     }
                 } break;
             case GGML_OP_MUL_MAT_ID:
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 44d99089a..77e5d87a8 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-cpu-aarch64.h"
 #include "ggml-impl.h"
+#include "amx/amx.h"
 #include <cctype>
 #include <string>
 #include <vector>
@@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
     static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
 
-#ifdef GGML_USE_CPU_HBM
-        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+        if (ggml_backend_amx_buffer_type()) {
+            bufts.push_back(ggml_backend_amx_buffer_type());
+        }
 #endif
 
 #ifdef GGML_USE_CPU_AARCH64
-        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        if (ggml_backend_cpu_aarch64_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        }
 #endif
 
         bufts.push_back(NULL);
@@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
 
+    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
+        return true;
+    }
+
     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
         if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
             return false;
         }
     }
 
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        return ggml_backend_amx_device_supports_op(op);
+    }
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+#endif
+
     for (int i = 1; i < GGML_MAX_SRC; i++) {
         if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
             return false;
@@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 }
 
 static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
+    bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    supported = supported || ggml_backend_amx_buft_is_amx(buft);
+#endif
+
+    return supported;
 
     GGML_UNUSED(dev);
 }
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index b2ce2e664..da4146ec4 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -50,8 +50,7 @@
 
 #include "sgemm.h"
 #include "ggml-impl.h"
-// hack until moved into the CPU backend
-#include "../ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
 #ifdef _MSC_VER
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index f39b7a88c..78e3af8f2 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -30,11 +30,13 @@
 extern "C" {
 #endif
 
-#undef MIN
-#undef MAX
+#ifndef MIN
+#    define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
 
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#ifndef MAX
+#    define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
 
 // required for mmap as gguf only guarantees 32-byte alignment
 #define TENSOR_ALIGNMENT 32
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
index 10075db33..51c78b7d2 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt
index d5405ad29..6235aec1f 100644
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@@ -1,9 +1,9 @@
 set(TARGET llama-vdot)
 add_executable(${TARGET} vdot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-q8dot)
 add_executable(${TARGET} q8dot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2f581b921..f3b3908b1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -25,7 +25,7 @@ add_library(llama
             )
 
 target_include_directories(llama PUBLIC . ../include)
-target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+target_compile_features   (llama PUBLIC cxx_std_17) # don't bump
 
 target_link_libraries(llama PUBLIC ggml)
 
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 50b35bbbc..3d4592635 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#if defined(__clang__)
+    // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
     return conv.from_bytes(s);
 }
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index be370044d..e5c9e75e4 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -284,7 +284,7 @@ static void test_perf() {
 
     data.reserve(n_vocab);
     for (int i = 0; i < n_vocab; i++) {
-        const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f);
+        const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
         data.emplace_back(llama_token_data{i, logit, 0.0f});
     }
 

From 0533e7fb3842a523f64dc533bd7bd7147ec2c63a Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Sat, 30 Nov 2024 07:00:02 +0000
Subject: [PATCH 358/396] vulkan: Dynamic subgroup size support for Q6_K
 mat_vec (#10536)

* subgroup 64 version with subgroup add. 15% faster

scalable version

tested for subgroup sizes 16-128

* check for subgroup multiple of 16 and greater than 16

* subgroup sizes are always a power of 2 (https://github.com/KhronosGroup/GLSL/issues/45)

* force 16 sequential threads per block

* make 16 subgroup size a constant
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 13 ++++----
 .../vulkan-shaders/mul_mat_vec_q6_k.comp      | 30 +++++++++----------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 849c11923..df6a659f4 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1231,6 +1231,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     std::cerr << "ggml_vulkan: Compiling shaders";
 
+    // some shaders require the subgroup size to be 16 or larger
+    const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
+
     // mulmat
     std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
                           l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
@@ -1240,11 +1243,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     m_warptile = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    s_warptile = { std::max(device->subgroup_size, 16u),  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
+    s_warptile = { subgroup_size_16,  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
 
     l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     m_warptile_mmq = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    s_warptile_mmq = { std::max(device->subgroup_size, 16u),  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
+    s_warptile_mmq = { subgroup_size_16,  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
 
     l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
     m_mmq_wg_denoms = m_wg_denoms = { 64,  64, 1 };
@@ -1431,7 +1434,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -1445,7 +1448,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -1459,7 +1462,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
 
     // dequant shaders
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
index 0b392d68d..760aff854 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -4,9 +4,11 @@
 
 #include "mul_mat_vec_base.comp"
 
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
-shared FLOAT_TYPE tmp[32];
+layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+
+shared FLOAT_TYPE tmp[BLOCK_SIZE];
 
 void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
@@ -21,21 +23,19 @@ void main() {
     const uint num_blocks_per_row = p.ncols / QUANT_K;
     const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
 
-    const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const uint ix  = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
+    // 16 threads are used to process each block
+    const uint it_size = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid%16;  // 0...16
+    const uint ix  = tid/16;
 
-    const uint step = 16/K_QUANTS_PER_ITERATION;            // 16 or 8
+    const uint step = 8;
 
-    const uint v_im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const uint v_in = tid - step*v_im;                      // 0...15 or 0...7
+    const uint v_im = itid/step;                            // 0 or 1. 0 computes 0..., 1 computes 128...
+    const uint v_in = itid - step*v_im;                     // 0...15 or 0...7
 
-#if K_QUANTS_PER_ITERATION == 1
-    const uint l0 = v_in;                                   // 0...15
-    const uint is = 0;
-#else
     const uint l0 = 4 * v_in;                               // 0, 4, 8, ..., 28
     const uint is = v_in / 4;
-#endif
 
     const uint ql_offset = 64*v_im + l0;
     const uint qh_offset = 32*v_im + l0;
@@ -44,7 +44,7 @@ void main() {
 
     FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
 
-    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
         const uint y_idx   = i * QUANT_K + y_offset;
 
         const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
@@ -95,10 +95,10 @@ void main() {
     }
 
     tmp[gl_LocalInvocationID.x] = temp;
-
     // sum up partial sums and write back result
+
     barrier();
-    [[unroll]] for (uint s = 16; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }

From abadba05be52cccf6c0da49534e37f6062ce8ded Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 Nov 2024 09:47:07 +0200
Subject: [PATCH 359/396] readme : refresh (#10587)

* readme : refresh

* readme : move section [no ci]

* readme : clarify [no ci]

* readme : fixes [no ci]

* readme : more fixes [no ci]

* readme : simplify [no ci]

* readme : clarify GGUF
---
 README.md | 362 +++++++++++++++++++-----------------------------------
 1 file changed, 123 insertions(+), 239 deletions(-)

diff --git a/README.md b/README.md
index 8aa3d1e59..c5321f26c 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Description
 
 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-variety of hardware - locally and in the cloud.
+range of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
@@ -36,14 +36,17 @@ variety of hardware - locally and in the cloud.
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
-Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
-improved significantly thanks to many contributions. It is the main playground for developing new features for the
-[ggml](https://github.com/ggerganov/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
-**Supported models:**
+<details>
+<summary>Models</summary>
 
 Typically finetunes of the base models below are supported as well.
 
+Instructions for adding support for new models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)
+
+**Text-only:**
+
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [x] LLaMA 3 🦙🦙🦙
@@ -97,9 +100,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 
-(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
-
-**Multimodal models:**
+**Multimodal:**
 
 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@@ -111,7 +112,10 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
 
-**Bindings:**
+</details>
+
+<details>
+<summary>Bindings</summary>
 
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
@@ -138,7 +142,12 @@ Typically finetunes of the base models below are supported as well.
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
-**UI:**
+</details>
+
+<details>
+<summary>UIs</summary>
+
+*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
@@ -148,6 +157,7 @@ Typically finetunes of the base models below are supported as well.
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
 - [LARS](https://github.com/abgulati/LARS) (AGPL)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
@@ -171,241 +181,35 @@ Typically finetunes of the base models below are supported as well.
 - [semperai/amica](https://github.com/semperai/amica) (MIT)
 - [withcatai/catai](https://github.com/withcatai/catai) (MIT)
 
-*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
+</details>
 
-**Tools:**
+<details>
+<summary>Tools</summary>
 
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
-- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
 
-**Infrastructure:**
+</details>
+
+<details>
+<summary>Infrastructure</summary>
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 
-**Games:**
+</details>
+
+<details>
+<summary>Games</summary>
+
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
 
-## Demo
-
-<details>
-<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
-
-```
-$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
-I llama.cpp build info:
-I UNAME_S:  Darwin
-I UNAME_P:  arm
-I UNAME_M:  arm64
-I CFLAGS:   -I.            -O3 -std=c11   -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
-I LDFLAGS:   -framework Accelerate
-I CC:       Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-I CXX:      Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-
-make: Nothing to be done for `default'.
-main: build = 1041 (cf658ad)
-main: seed  = 1692823051
-llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
-llama_model_loader: - type  f32:   81 tensors
-llama_model_loader: - type q4_0:  281 tensors
-llama_model_loader: - type q6_K:    1 tensors
-llm_load_print_meta: format         = GGUF V1 (latest)
-llm_load_print_meta: arch           = llama
-llm_load_print_meta: vocab type     = SPM
-llm_load_print_meta: n_vocab        = 32000
-llm_load_print_meta: n_merges       = 0
-llm_load_print_meta: n_ctx_train    = 4096
-llm_load_print_meta: n_ctx          = 512
-llm_load_print_meta: n_embd         = 5120
-llm_load_print_meta: n_head         = 40
-llm_load_print_meta: n_head_kv      = 40
-llm_load_print_meta: n_layer        = 40
-llm_load_print_meta: n_rot          = 128
-llm_load_print_meta: n_gqa          = 1
-llm_load_print_meta: f_norm_eps     = 1.0e-05
-llm_load_print_meta: f_norm_rms_eps = 1.0e-05
-llm_load_print_meta: n_ff           = 13824
-llm_load_print_meta: freq_base      = 10000.0
-llm_load_print_meta: freq_scale     = 1
-llm_load_print_meta: model type     = 13B
-llm_load_print_meta: model ftype    = mostly Q4_0
-llm_load_print_meta: model size     = 13.02 B
-llm_load_print_meta: general.name   = LLaMA v2
-llm_load_print_meta: BOS token = 1 '<s>'
-llm_load_print_meta: EOS token = 2 '</s>'
-llm_load_print_meta: UNK token = 0 '<unk>'
-llm_load_print_meta: LF token  = 13 '<0x0A>'
-llm_load_tensors: ggml ctx size =    0.11 MB
-llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)
-...................................................................................................
-llama_new_context_with_model: kv self size  =  400.00 MB
-llama_new_context_with_model: compute buffer total size =   75.41 MB
-
-system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
-sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
-
-
- Building a website can be done in 10 simple steps:
-Step 1: Find the right website platform.
-Step 2: Choose your domain name and hosting plan.
-Step 3: Design your website layout.
-Step 4: Write your website content and add images.
-Step 5: Install security features to protect your site from hackers or spammers
-Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
-Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
-Step 8: Start marketing and promoting the website via social media channels or paid ads
-Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
-Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
-How does a Website Work?
-A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
-The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
-How to
-llama_print_timings:        load time =   576.45 ms
-llama_print_timings:      sample time =   283.10 ms /   400 runs   (    0.71 ms per token,  1412.91 tokens per second)
-llama_print_timings: prompt eval time =   599.83 ms /    19 tokens (   31.57 ms per token,    31.68 tokens per second)
-llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms per token,    16.28 tokens per second)
-llama_print_timings:       total time = 25431.49 ms
-```
-
 </details>
 
-<details>
-<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
-
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
-
-https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
-
-</details>
-
-## Usage
-
-Here are the end-to-end binary build and model conversion steps for most supported models.
-
-### Basic usage
-
-Firstly, you need to get the binary. There are different methods that you can follow:
-- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
-- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
-- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
-- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
-
-You can run a basic completion using this command:
-
-```bash
-llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
-
-# Output:
-# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-```
-
-See [this page](./examples/main/README.md) for a full list of parameters.
-
-### Conversation mode
-
-If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
-
-```bash
-llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
-
-# Output:
-# > hi, who are you?
-# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
-#
-# > what is 1+1?
-# Easy peasy! The answer to 1+1 is... 2!
-```
-
-By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-
-```bash
-./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
-```
-
-You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
-
-```bash
-./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
-```
-
-### Web server
-
-[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
-
-Example usage:
-
-```bash
-./llama-server -m your_model.gguf --port 8080
-
-# Basic web UI can be accessed via browser: http://localhost:8080
-# Chat completion endpoint: http://localhost:8080/v1/chat/completions
-```
-
-### Interactive mode
-
-> [!NOTE]
-> If you prefer basic usage, please consider using conversation mode instead of interactive mode
-
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
-
-Here is an example of a few-shot interaction, invoked with the command
-
-```bash
-# default arguments using a 7B model
-./examples/chat.sh
-
-# advanced chat with a 13B model
-./examples/chat-13B.sh
-
-# custom arguments using a 13B model
-./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
-```
-
-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
-
-![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
-
-### Persistent Interaction
-
-The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
-
-```bash
-# Start a new chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Resume that chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Start a different chat with the same prompt/model
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
-
-# Different prompt cache for different prompt/model
-PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
-    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
-```
-
-### Constrained output with grammars
-
-`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
-
-```bash
-./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-```
-
-The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
-
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
-
-## Build
-
-Please refer to [Build llama.cpp locally](./docs/build.md)
-
 ## Supported backends
 
 | Backend | Target devices |
@@ -420,23 +224,104 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [Vulkan](./docs/build.md#vulkan) | GPU |
 | [CANN](./docs/build.md#cann) | Ascend NPU |
 
-## Tools
+## Building and usage
 
-### Prepare and Quantize
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
 
-> [!NOTE]
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+- Clone this repository and build locally, see [how to build](./docs/build.md)
+- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](./docs/install.md)
+- Use a Docker image, see [documentation for Docker](./docs/docker.md)
+- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
 
-To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+### Obtaining and quantizing models
 
-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
-It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
 
-To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
+- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
+- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
+
+After downloading a model, use the CLI tools to run it locally - see below.
+
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+
+The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
+
+- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
+
+To learn more about model quantization, [read this documentation](./examples/quantize/README.md)
+
+### Using the `llama-cli` tool
+
+Run a basic text completion:
+
+```bash
+llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+
+# Output:
+# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+```
+
+See [this page](./examples/main/README.md) for a full list of parameters.
+
+### Conversation mode
+
+Run `llama-cli` in conversation/chat mode by passing the `-cnv` parameter:
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
+
+# Output:
+# > hi, who are you?
+# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+#
+# > what is 1+1?
+# Easy peasy! The answer to 1+1 is... 2!
+```
+
+By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+```
+
+You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+```
+
+### Constrained output with grammars
+
+`llama.cpp` can constrain the output of the model via custom grammars. For example, you can force the model to output only JSON:
+
+```bash
+llama-cli -m your_model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
+For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
+
+### Web server (`llama-server`)
+
+The [llama-server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
+
+Example usage:
+
+```bash
+llama-server -m your_model.gguf --port 8080
+
+# Basic web UI can be accessed via browser: http://localhost:8080
+# Chat completion endpoint: http://localhost:8080/v1/chat/completions
+```
 
 ### Perplexity (measuring model quality)
 
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+Use the `llama-perplexity` tool to measure perplexity over a given prompt (lower perplexity is better).
 For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
 
 To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
@@ -456,7 +341,6 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 
 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
-- [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)
 
 **Development documentation**

From 3e0ba0e604b1ac5b2cbca9a3f38f91f2be4ef1cd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 Nov 2024 10:09:21 +0200
Subject: [PATCH 360/396] readme : remove old badge

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index c5321f26c..37c6666aa 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@
 
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
-[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 

From 0c39f44d70d058940fe2afe50cfc789e3e44d756 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= <adrien@gallouet.fr>
Date: Sat, 30 Nov 2024 18:13:18 +0100
Subject: [PATCH 361/396] ggml-cpu: replace AArch64 NEON assembly with
 intrinsics in ggml_gemv_q4_0_4x4_q8_0() (#10567)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
---
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 94 +++++++++++-----------------
 1 file changed, 37 insertions(+), 57 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 14a1f00eb..61a92cfd9 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -525,67 +525,47 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
 
-        __asm__ __volatile__(
-            "movi v31.16b, #0x4\n"
-            "movi v30.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x22, %x[a_ptr], #0x2\n"
-            "movi v29.16b, #0x0\n"
-            "mov x21, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q28, [%x[b_ptr], #0x0]\n"
-            "ldr q27, [x22, #0x0]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x22, #0x2\n"
-            "ldr q25, [x22, #0x10]\n"
-            "ldr q24, [%x[b_ptr], #0x10]\n"
-            "sub x21, x21, #0x1\n"
-            "add x22, x22, #0x22\n"
-            "ldr q23, [%x[b_ptr], #0x20]\n"
-            "ldr q22, [%x[b_ptr], #0x30]\n"
-            "ld1r { v21.8h }, [x20]\n"
-            "ldr q20, [%x[b_ptr], #-0x8]\n"
-            "sshl v16.16b, v28.16b, v31.16b\n"
-            "and v28.16b, v28.16b, v30.16b\n"
-            "sshl v19.16b, v24.16b, v31.16b\n"
-            "and v24.16b, v24.16b, v30.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "sshl v18.16b, v23.16b, v31.16b\n"
-            "and v23.16b, v23.16b, v30.16b\n"
-            ".inst 0x4f9be21a  // sdot v26.4s, v16.16b, v27.4b[0]\n"
-            "sshl v17.16b, v22.16b, v31.16b\n"
-            "and v22.16b, v22.16b, v30.16b\n"
-            "fcvtl v21.4s, v21.4h\n"
-            "fcvtl v16.4s, v20.4h\n"
-            ".inst 0x4f99e39a  // sdot v26.4s, v28.16b, v25.4b[0]\n"
-            "fmul v16.4s, v16.4s, v21.4s\n"
-            ".inst 0x4fbbe27a  // sdot v26.4s, v19.16b, v27.4b[1]\n"
-            ".inst 0x4fb9e31a  // sdot v26.4s, v24.16b, v25.4b[1]\n"
-            ".inst 0x4f9bea5a  // sdot v26.4s, v18.16b, v27.4b[2]\n"
-            ".inst 0x4f99eafa  // sdot v26.4s, v23.16b, v25.4b[2]\n"
-            ".inst 0x4fbbea3a  // sdot v26.4s, v17.16b, v27.4b[3]\n"
-            ".inst 0x4fb9eada  // sdot v26.4s, v22.16b, v25.4b[3]\n"
-            "scvtf v26.4s, v26.4s, #0x4\n"
-            "fmla v29.4s, v26.4s, v16.4s\n"
-            "cbnz x21, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q29, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
-            );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+
+                int8x16_t a0 = vld1q_s8(a_ptr->qs);
+                int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+
+                int32x4_t ret = vdupq_n_s32(0);
+
+                ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
+                ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
+                ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
+                ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
+
+                ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
+                ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
+                ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
+                ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                                vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
 

From 43957ef203b4c9ceaee42c176b3ef44ea4359c85 Mon Sep 17 00:00:00 2001
From: Wang Qin <37098874+wangqin0@users.noreply.github.com>
Date: Sat, 30 Nov 2024 19:19:44 -0800
Subject: [PATCH 362/396] build: update Makefile comments for C++ version
 change (#10598)

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 83adcef28..e30821624 100644
--- a/Makefile
+++ b/Makefile
@@ -251,7 +251,7 @@ endif
 # Compile flags
 #
 
-# keep standard at C11 and C++11
+# keep standard at C11 and C++17
 MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++17 -fPIC

From 6acce3971098772a8aacb10fe8550b4119110581 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 1 Dec 2024 11:25:17 +0200
Subject: [PATCH 363/396] readme : update the usage section with examples
 (#10596)

* readme : update the usage section with examples

* readme : more examples
---
 README.md | 274 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 201 insertions(+), 73 deletions(-)

diff --git a/README.md b/README.md
index 37c6666aa..6fdd8d9ee 100644
--- a/README.md
+++ b/README.md
@@ -42,9 +42,9 @@ The `llama.cpp` project is the main playground for developing new features for t
 
 Typically finetunes of the base models below are supported as well.
 
-Instructions for adding support for new models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)
+Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
 
-**Text-only:**
+#### Text-only
 
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
@@ -99,7 +99,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](./docs/deve
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 
-**Multimodal:**
+#### Multimodal
 
 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@@ -213,27 +213,27 @@ Instructions for adding support for new models: [HOWTO-add-model.md](./docs/deve
 
 | Backend | Target devices |
 | --- | --- |
-| [Metal](./docs/build.md#metal-build) | Apple Silicon |
-| [BLAS](./docs/build.md#blas-build) | All |
-| [BLIS](./docs/backend/BLIS.md) | All |
-| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
-| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
-| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
-| [Vulkan](./docs/build.md#vulkan) | GPU |
-| [CANN](./docs/build.md#cann) | Ascend NPU |
+| [Metal](docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](docs/build.md#blas-build) | All |
+| [BLIS](docs/backend/BLIS.md) | All |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [CUDA](docs/build.md#cuda) | Nvidia GPU |
+| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
+| [Vulkan](docs/build.md#vulkan) | GPU |
+| [CANN](docs/build.md#cann) | Ascend NPU |
 
-## Building and usage
+## Building the project
 
 The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
 The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
 
-- Clone this repository and build locally, see [how to build](./docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](./docs/install.md)
-- Use a Docker image, see [documentation for Docker](./docs/docker.md)
+- Clone this repository and build locally, see [how to build](docs/build.md)
+- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
+- Use a Docker image, see [documentation for Docker](docs/docker.md)
 - Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
 
-### Obtaining and quantizing models
+## Obtaining and quantizing models
 
 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
 
@@ -251,79 +251,204 @@ The Hugging Face platform provides a variety of online tools for converting, qua
 - Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
 - Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
 
-To learn more about model quantization, [read this documentation](./examples/quantize/README.md)
+To learn more about model quantization, [read this documentation](examples/quantize/README.md)
 
-### Using the `llama-cli` tool
+## [`llama-cli`](examples/main)
 
-Run a basic text completion:
+#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
 
-```bash
-llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+- <details open>
+    <summary>Run simple text completion</summary>
 
-# Output:
-# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-```
+    ```bash
+    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128
 
-See [this page](./examples/main/README.md) for a full list of parameters.
+    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+    ```
 
-### Conversation mode
+    </details>
 
-Run `llama-cli` in conversation/chat mode by passing the `-cnv` parameter:
+- <details>
+    <summary>Run in conversation mode</summary>
 
-```bash
-llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
+    ```bash
+    llama-cli -m model.gguf -p "You are a helpful assistant" -cnv
 
-# Output:
-# > hi, who are you?
-# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
-#
-# > what is 1+1?
-# Easy peasy! The answer to 1+1 is... 2!
-```
+    # > hi, who are you?
+    # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+    #
+    # > what is 1+1?
+    # Easy peasy! The answer to 1+1 is... 2!
+    ```
 
-By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+    </details>
 
-```bash
-llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
-```
+- <details>
+    <summary>Run with custom chat template</summary>
 
-You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
+    ```bash
+    # use the "chatml" template
+    llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
 
-```bash
-llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
-```
+    # use a custom template
+    llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+    ```
 
-### Constrained output with grammars
+    [Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 
-`llama.cpp` can constrain the output of the model via custom grammars. For example, you can force the model to output only JSON:
+    </details>
 
-```bash
-llama-cli -m your_model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-```
+- <details>
+    <summary>Constrain the output with a custom grammar</summary>
 
-The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+    ```bash
+    llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
 
-For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
+    # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
+    ```
 
-### Web server (`llama-server`)
+    The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
 
-The [llama-server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
+    For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
 
-Example usage:
+    </details>
 
-```bash
-llama-server -m your_model.gguf --port 8080
 
-# Basic web UI can be accessed via browser: http://localhost:8080
-# Chat completion endpoint: http://localhost:8080/v1/chat/completions
-```
+## [`llama-server`](examples/server)
 
-### Perplexity (measuring model quality)
+#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
 
-Use the `llama-perplexity` tool to measure perplexity over a given prompt (lower perplexity is better).
-For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+- <details open>
+    <summary>Start a local HTTP server with default configuration on port 8080</summary>
+
+    ```bash
+    llama-server -m model.gguf --port 8080
+
+    # Basic web UI can be accessed via browser: http://localhost:8080
+    # Chat completion endpoint: http://localhost:8080/v1/chat/completions
+    ```
+
+    </details>
+
+- <details>
+    <summary>Support multiple-users and parallel decoding</summary>
+
+    ```bash
+    # up to 4 concurrent requests, each with 4096 max context
+    llama-server -m model.gguf -c 16384 -np 4
+    ```
+
+    </details>
+
+- <details>
+    <summary>Enable speculative decoding</summary>
+
+    ```bash
+    # the draft.gguf model should be a small variant of the target model.gguf
+    llama-server -m model.gguf -md draft.gguf
+    ```
+
+    </details>
+
+- <details>
+    <summary>Serve an embedding model</summary>
+
+    ```bash
+    # use the /embedding endpoint
+    llama-server -m model.gguf --embedding --pooling cls -ub 8192
+    ```
+
+    </details>
+
+- <details>
+    <summary>Serve a reranking model</summary>
+
+    ```bash
+    # use the /reranking endpoint
+    llama-server -m model.gguf --reranking
+    ```
+
+    </details>
+
+- <details>
+    <summary>Constrain all outputs with a grammar</summary>
+
+    ```bash
+    # custom grammar
+    llama-server -m model.gguf --grammar-file grammar.gbnf
+
+    # JSON
+    llama-server -m model.gguf --grammar-file grammars/json.gbnf
+    ```
+
+    </details>
+
+
+## [`llama-perplexity`](examples/perplexity)
+
+#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
+
+- <details open>
+    <summary>Measure the perplexity over a text file</summary>
+
+    ```bash
+    llama-perplexity -m model.gguf -f file.txt
+
+    # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
+    # Final estimate: PPL = 5.4007 +/- 0.67339
+    ```
+
+    </details>
+
+- <details>
+    <summary>Measure KL divergence</summary>
+
+    ```bash
+    # TODO
+    ```
+
+    </details>
+
+[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
+[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
+
+## [`llama-bench`](example/bench)
+
+#### Benchmark the performance of the inference for various parameters.
+
+- <details open>
+    <summary>Run default benchmark</summary>
+
+    ```bash
+    llama-bench -m model.gguf
+
+    # Output:
+    # | model               |       size |     params | backend    | threads |          test |                  t/s |
+    # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
+    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         pp512 |      5765.41 ± 20.55 |
+    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         tg128 |        197.71 ± 0.81 |
+    #
+    # build: 3e0ba0e60 (4229)
+    ```
+
+    </details>
+
+
+## [`llama-simple`](examples/simple)
+
+#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
+
+- <details>
+    <summary>Basic text completion</summary>
+
+    ```bash
+    llama-simple -m model.gguf
+
+    # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
+    ```
+
+    </details>
 
-To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
 
 ## Contributing
 
@@ -338,19 +463,19 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 
 ## Other documentation
 
-- [main (cli)](./examples/main/README.md)
-- [server](./examples/server/README.md)
-- [GBNF grammars](./grammars/README.md)
+- [main (cli)](examples/main/README.md)
+- [server](examples/server/README.md)
+- [GBNF grammars](grammars/README.md)
 
-**Development documentation**
+#### Development documentation
 
-- [How to build](./docs/build.md)
-- [Running on Docker](./docs/docker.md)
-- [Build on Android](./docs/android.md)
-- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
+- [How to build](docs/build.md)
+- [Running on Docker](docs/docker.md)
+- [Build on Android](docs/android.md)
+- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
-**Seminal papers and background on the models**
+#### Seminal papers and background on the models
 
 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:
@@ -361,3 +486,6 @@ If your issue is with model generation quality, then please at least scan the fo
 - GPT-3.5 / InstructGPT / ChatGPT:
     - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
     - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+
+#### References
+

From 86dc11c5bcf34db2749d8bd8d4fa07a542c94f84 Mon Sep 17 00:00:00 2001
From: alek3y <44779186+alek3y@users.noreply.github.com>
Date: Sun, 1 Dec 2024 12:33:12 +0100
Subject: [PATCH 364/396] server : bind to any port when specified (#10590)

---
 examples/server/server.cpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9c86407c2..1c765f0ea 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3347,8 +3347,18 @@ int main(int argc, char ** argv) {
         llama_backend_free();
     };
 
-    // bind HTTP listen port, run the HTTP server in a thread
-    if (!svr->bind_to_port(params.hostname, params.port)) {
+    // bind HTTP listen port
+    bool was_bound = false;
+    if (params.port == 0) {
+        int bound_port = svr->bind_to_any_port(params.hostname);
+        if ((was_bound = (bound_port >= 0))) {
+            params.port = bound_port;
+        }
+    } else {
+        was_bound = svr->bind_to_port(params.hostname, params.port);
+    }
+
+    if (!was_bound) {
         //LOG_ERROR("couldn't bind HTTP server socket", {
         //    {"hostname", params.hostname},
         //    {"port", params.port},
@@ -3357,6 +3367,8 @@ int main(int argc, char ** argv) {
         clean_up();
         return 1;
     }
+
+    // run the HTTP server in a thread
     std::thread t([&]() { svr->listen_after_bind(); });
     svr->wait_until_ready();
 

From 3420909dffa50e70660524797a1e715a717684d2 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Sun, 1 Dec 2024 16:12:41 +0100
Subject: [PATCH 365/396] ggml : automatic selection of best CPU backend
 (#10606)

* ggml : automatic selection of best CPU backend

* amx : minor opt

* add GGML_AVX_VNNI to enable avx-vnni, fix checks
---
 .devops/llama-server.Dockerfile      |  20 +-
 CMakeLists.txt                       |   4 -
 Package.swift                        |   2 +-
 ggml/CMakeLists.txt                  |   1 +
 ggml/src/ggml-backend-impl.h         |  58 ++++--
 ggml/src/ggml-backend-reg.cpp        | 270 ++++++++++++++++--------
 ggml/src/ggml-cpu/CMakeLists.txt     |  13 ++
 ggml/src/ggml-cpu/amx/common.h       |   1 -
 ggml/src/ggml-cpu/amx/mmq.cpp        |  74 ++++---
 ggml/src/ggml-cpu/cpu-feats-x86.cpp  | 298 +++++++++++++++++++++++++++
 ggml/src/ggml-cpu/ggml-cpu-aarch64.c |   2 +-
 scripts/build-cpu.sh                 |  12 ++
 12 files changed, 599 insertions(+), 156 deletions(-)
 create mode 100644 ggml/src/ggml-cpu/cpu-feats-x86.cpp
 create mode 100755 scripts/build-cpu.sh

diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile
index 02accc85e..7110dda9e 100644
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -3,22 +3,34 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-ENV LLAMA_CURL=1
 
-RUN make -j$(nproc) llama-server
+RUN \
+    # Build multiple versions of the CPU backend
+    scripts/build-cpu.sh avx         -DGGML_AVX=ON -DGGML_AVX2=OFF && \
+    scripts/build-cpu.sh avx2        -DGGML_AVX=ON -DGGML_AVX2=ON && \
+    scripts/build-cpu.sh avx512      -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
+    scripts/build-cpu.sh amx         -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
+    # Build llama-server
+    cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build --target llama-server -j $(nproc) && \
+    # Copy the built libraries to /app/lib
+    mkdir -p /app/lib && \
+    mv libggml-cpu* /app/lib/ && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/lib/ /
 
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d389dccb..f84fff9e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
-if (NOT DEFINED GGML_AMX)
-    set(GGML_AMX ON)
-endif()
-
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
diff --git a/Package.swift b/Package.swift
index 1e75aa7e2..d32b74a63 100644
--- a/Package.swift
+++ b/Package.swift
@@ -88,5 +88,5 @@ let package = Package(
             linkerSettings: linkerSettings
         )
     ],
-    cxxLanguageStandard: .cxx11
+    cxxLanguageStandard: .cxx17
 )
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 789fa3b0c..06d371e09 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -96,6 +96,7 @@ option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 
 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
+option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"         OFF)
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
 option(GGML_AVX512      "ggml: enable AVX512"           OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index dff7749b4..36d72e95f 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -211,27 +211,45 @@ extern "C" {
     GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
 
     // Add backend dynamic loading support to the backend
-    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
 
-    #ifdef GGML_BACKEND_DL
-        #ifdef __cplusplus
-        #    define GGML_BACKEND_DL_IMPL(reg_fn)                                 \
-                extern "C" {                                                     \
-                    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
-                }                                                                \
-                ggml_backend_reg_t ggml_backend_init(void) {                     \
-                    return reg_fn();                                             \
-                }
-        #else
-        #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
-                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
-                ggml_backend_reg_t ggml_backend_init(void) {                 \
-                    return reg_fn();                                         \
-                }
-        #endif
-    #else
-    #    define GGML_BACKEND_DL_IMPL(reg_fn)
-    #endif
+    // Initialize the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+    // Optional: obtain a score for the backend based on the system configuration
+    // Higher scores are preferred, 0 means the backend is not supported in the current system
+    typedef int                (*ggml_backend_score_t)(void);
+
+#ifdef GGML_BACKEND_DL
+#    ifdef __cplusplus
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+            extern "C" {                                                 \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+            }                                                            \
+            ggml_backend_reg_t ggml_backend_init(void) {                 \
+                return reg_fn();                                         \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)       \
+            extern "C" {                                   \
+            GGML_BACKEND_API int ggml_backend_score(void); \
+            }                                              \
+            int ggml_backend_score(void) {                 \
+                return score_fn();                         \
+            }
+#    else
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                              \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \
+            ggml_backend_reg_t                  ggml_backend_init(void) { \
+                return reg_fn();                                          \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)        \
+            GGML_BACKEND_API int ggml_backend_score(void);  \
+            int                  ggml_backend_score(void) { \
+                return score_fn();                          \
+            }
+#    endif
+#else
+#    define GGML_BACKEND_DL_IMPL(reg_fn)
+#    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
+#endif
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 3182b84f5..2c4bf11b0 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -2,8 +2,13 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include <algorithm>
+#include <codecvt>
 #include <cstring>
+#include <filesystem>
+#include <locale>
+#include <memory>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #ifdef _WIN32
@@ -57,9 +62,71 @@
 #include "ggml-kompute.h"
 #endif
 
+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+struct dl_handle_deleter {
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle);
+    }
+};
+
+static dl_handle * dl_load_library(const std::wstring & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.c_str());
+
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+static dl_handle * dl_load_library(const std::string & path) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return dl_load_library(converter.from_bytes(path));
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    void * p = (void *) GetProcAddress(handle, name);
+
+    SetErrorMode(old_mode);
+
+    return p;
+}
+
+#else
+
+using dl_handle = void;
+
+struct dl_handle_deleter {
+    void operator()(void * handle) {
+        dlclose(handle);
+    }
+};
+
+static void * dl_load_library(const std::string & path) {
+    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+
+    return handle;
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    return dlsym(handle, name);
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
 struct ggml_backend_reg_entry {
     ggml_backend_reg_t reg;
-    void * handle;
+    dl_handle_ptr handle;
 };
 
 struct ggml_backend_registry {
@@ -97,13 +164,16 @@ struct ggml_backend_registry {
     }
 
     ~ggml_backend_registry() {
-        while (!backends.empty()) {
-            // use silent since the log system may have been destroyed at this point
-            unload_backend(backends.back().reg, true);
+        // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
+        // since backend threads may still be running and accessing resources from the dynamic library
+        for (auto & entry : backends) {
+            if (entry.handle) {
+                entry.handle.release(); // NOLINT
+            }
         }
     }
 
-    void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
+    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
         if (!reg) {
             return;
         }
@@ -112,7 +182,7 @@ struct ggml_backend_registry {
         GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
-        backends.push_back({ reg, handle });
+        backends.push_back({ reg, std::move(handle) });
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
         }
@@ -126,79 +196,53 @@ struct ggml_backend_registry {
     }
 
     ggml_backend_reg_t load_backend(const char * path, bool silent) {
-#ifdef _WIN32
-        // suppress error dialogs for missing DLLs
-        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
-        HMODULE handle = LoadLibraryA(path);
-
+        dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
             }
-            SetErrorMode(old_mode);
             return nullptr;
         }
 
-        ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
-
-        SetErrorMode(old_mode);
-
-        if (!backend_init) {
+        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+        if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
             }
-            FreeLibrary(handle);
             return nullptr;
         }
-#else
-        void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
 
-        if (!handle) {
+        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
+        if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
             }
             return nullptr;
         }
 
-        auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
-
-        if (!backend_init) {
-            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
-            }
-            dlclose(handle);
-            return nullptr;
-        }
-#endif
-        ggml_backend_reg_t reg = backend_init();
-
+        ggml_backend_reg_t reg = backend_init_fn();
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                                   __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
-#ifdef _WIN32
-            FreeLibrary(handle);
-#else
-            dlclose(handle);
-#endif
             return nullptr;
         }
 
         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
-        register_backend(reg, handle);
+
+        register_backend(reg, std::move(handle));
+
         return reg;
     }
 
     void unload_backend(ggml_backend_reg_t reg, bool silent) {
         auto it = std::find_if(backends.begin(), backends.end(),
-                                [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
+                               [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
 
         if (it == backends.end()) {
             if (!silent) {
@@ -217,15 +261,6 @@ struct ggml_backend_registry {
                             [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
             devices.end());
 
-        // unload library
-        if (it->handle) {
-#ifdef _WIN32
-            FreeLibrary((HMODULE) it->handle);
-#else
-            dlclose(it->handle);
-#endif
-        }
-
         // remove backend
         backends.erase(it);
     }
@@ -341,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-void ggml_backend_load_all() {
-    std::vector<std::string> search_prefix;
-
-    // add the executable directory to the search path
-    // FIXME: this is convenient for development, but it should probably be disabled in production
-
+static std::string get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -364,7 +394,7 @@ void ggml_backend_load_all() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    search_prefix.push_back(base_path + "/");
+    return base_path + "/";
 #elif defined(__linux__)
     std::string base_path = ".";
     std::vector<char> path(1024);
@@ -386,38 +416,104 @@ void ggml_backend_load_all() {
         path.resize(path.size() * 2);
     }
 
-    search_prefix.push_back(base_path + "/");
+    return base_path + "/";
+#elif defined(_WIN32)
+    std::vector<char> path(MAX_PATH);
+    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    if (len == 0) {
+        return "";
+    }
+    std::string base_path(path.data(), len);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('\\');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + "\\";
 #endif
+}
 
-    auto & reg = get_reg();
-
-    auto try_load = [&](const std::string & name) {
-        std::string os_name;
+static std::string backend_filename_prefix() {
 #ifdef _WIN32
-        os_name = "ggml-" + name + ".dll";
+    return "ggml-";
 #else
-        os_name = "libggml-" + name + ".so";
+    return "libggml-";
 #endif
-        if (reg.load_backend(os_name.c_str(), true)) {
-            return;
+}
+
+static std::string backend_filename_suffix() {
+#ifdef _WIN32
+    return ".dll";
+#else
+    return ".so";
+#endif
+}
+
+static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
+    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
+     // TODO: search system paths
+    std::vector<std::string> search_paths = { "./", get_executable_path() };
+    std::string file_prefix = backend_filename_prefix() + name + "-";
+
+    int best_score = 0;
+    std::string best_path;
+
+    namespace fs = std::filesystem;
+    for (const auto & search_path : search_paths) {
+        if (!fs::exists(search_path)) {
+            continue;
         }
-        for (const auto & prefix : search_prefix) {
-            if (reg.load_backend((prefix + os_name).c_str(), true)) {
-                return;
+        for (const auto & entry : fs::directory_iterator(search_path)) {
+            if (entry.is_regular_file()) {
+                std::string filename = entry.path().filename().string();
+                std::string ext = entry.path().extension().string();
+                if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
+                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    if (!handle && !silent) {
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                    }
+                    if (handle) {
+                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+                        if (score_fn) {
+                            int s = score_fn();
+#ifndef NDEBUG
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+#endif
+                            if (s > best_score) {
+                                best_score = s;
+                                best_path = entry.path().string();
+                            }
+                        }
+                    }
+                }
             }
         }
-    };
+    }
 
-    try_load("amx");
-    try_load("blas");
-    try_load("cann");
-    try_load("cuda");
-    try_load("hip");
-    try_load("kompute");
-    try_load("metal");
-    try_load("rpc");
-    try_load("sycl");
-    try_load("vulkan");
-    try_load("musa");
-    try_load("cpu");
+    if (best_score == 0) {
+        // try to load the base backend
+        for (const auto & search_path : search_paths) {
+            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            if (fs::exists(path)) {
+                return get_reg().load_backend(path.c_str(), silent);
+            }
+        }
+        return nullptr;
+    }
+
+    return get_reg().load_backend(best_path.c_str(), silent);
+}
+
+void ggml_backend_load_all() {
+    ggml_backend_load_best("blas", true);
+    ggml_backend_load_best("cann", true);
+    ggml_backend_load_best("cuda", true);
+    ggml_backend_load_best("hip", true);
+    ggml_backend_load_best("kompute", true);
+    ggml_backend_load_best("metal", true);
+    ggml_backend_load_best("rpc", true);
+    ggml_backend_load_best("sycl", true);
+    ggml_backend_load_best("vulkan", true);
+    ggml_backend_load_best("musa", true);
+    ggml_backend_load_best("cpu", true);
 }
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index fe2222084..5df63884c 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         elseif (GGML_AVX)
             list(APPEND ARCH_FLAGS /arch:AVX)
         endif()
+        if (GGML_AVX_VNNI)
+            list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
+            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                list(APPEND ARCH_FLAGS -mavxvnni)
+            endif()
+        endif()
     else()
         if (GGML_NATIVE)
             list(APPEND ARCH_FLAGS -march=native)
@@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (GGML_AVX2)
             list(APPEND ARCH_FLAGS -mavx2)
         endif()
+        if (GGML_AVX_VNNI)
+            list(APPEND ARCH_FLAGS -mavxvnni)
+        endif()
         if (GGML_AVX512)
             list(APPEND ARCH_FLAGS -mavx512f)
             list(APPEND ARCH_FLAGS -mavx512dq)
@@ -301,6 +310,10 @@ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
 set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}")
 set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
 
+# the feature detection code must be compiled without any architecture flags
+target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
+# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
+
 if (EMSCRIPTEN)
     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
diff --git a/ggml/src/ggml-cpu/amx/common.h b/ggml/src/ggml-cpu/amx/common.h
index 0b0657289..40074c3fc 100644
--- a/ggml/src/ggml-cpu/amx/common.h
+++ b/ggml/src/ggml-cpu/amx/common.h
@@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const f
     int tbegin, tend;
     balance211(n, params->nth, params->ith, tbegin, tend);
     f(tbegin, tend);
-    ggml_barrier(params->threadpool); // TODO: might not always be needed
 }
 
 // quantized types that have AMX support
diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp
index 6447e73d0..0ec3aa86d 100644
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
         __m512 vb[COLS];
         __m512 vc[ROWS * COLS];
 
-        auto loadc = [&](int idx) {
+        auto loadc = [&](auto idx) {
             vc[idx] = _mm512_setzero_ps();
         };
         Unroll<ROWS * COLS>{}(loadc);
 
-        auto compute = [&](int idx, int k) {
-            // TODO: use `constexpr` here to get rid of interger div
-            // when upgraded to C++17
-            const int row = idx / COLS;
-            const int col = idx % COLS;
+        auto compute = [&](auto idx, auto k) {
+            constexpr int row = idx / COLS;
+            constexpr int col = idx % COLS;
 
-            if (col == 0) {
+            if constexpr (col == 0) {
                 va = _mm512_loadu_ps(A + row * K + k);
             }
-            if (row == 0) {
+            if constexpr (row == 0) {
                 vb[col] =  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
             }
             vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
@@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
             Unroll<ROWS * COLS>{}(compute, k);
         }
 
-        auto storec = [&](int idx) {
-            const int row = idx / COLS;
-            const int col = idx % COLS;
+        auto storec = [&](auto idx) {
+            constexpr int row = idx / COLS;
+            constexpr int col = idx % COLS;
             C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
         };
         Unroll<ROWS * COLS>{}(storec);
@@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
         const __m512i off = _mm512_set1_epi8(8);
         const __m512i lowMask = _mm512_set1_epi8(0xF);
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
 
-        auto compute = [&](int col, int i) {
+        auto compute = [&](auto col, auto i) {
             // load a and compute compensation
-            if (col == 0) {
+            if constexpr (col == 0) {
                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
                 vcomp = _mm512_setzero_si512();
                 for (int k = 0; k < 8; ++k) {
@@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
         }
 
         //store to C
-        auto storec = [&](int col) {
+        auto storec = [&](auto col) {
             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
         };
         Unroll<COLS>{}(storec);
@@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
 
         const __m512i lowMask = _mm512_set1_epi8(0xF);
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
 
-        auto compute = [&](int col, int i) {
+        auto compute = [&](auto col, auto i) {
             // load a
-            if (col == 0) {
+            if constexpr (col == 0) {
                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
                 for (int k = 0; k < 8; ++k) {
                     va[k] = _mm512_set1_epi32(a_ptr[k]);
@@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
         }
 
         //store to C
-        auto storec = [&](int col) {
+        auto storec = [&](auto col) {
             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
         };
         Unroll<COLS>{}(storec);
@@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
         //
         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
 
-        auto compute = [&](int col, int i) {
+        auto compute = [&](auto col, auto i) {
             // load a and add offset 128
-            if (col == 0) {
+            if constexpr (col == 0) {
                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
                 for (int k = 0; k < 8; ++k) {
                     va[k] = _mm512_set1_epi32(a_ptr[k]);
@@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
         }
 
         //store to C
-        auto storec = [&](int col) {
+        auto storec = [&](auto col) {
             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
         };
         Unroll<COLS>{}(storec);
@@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
 
         const __m512i lowMask = _mm512_set1_epi8(0xF);
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
@@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
         //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
         //     from {16,  8} to {4, 32}
         //
-        auto compute = [&](int col, int i) {
+        auto compute = [&](auto col, auto i) {
             // load a
-            if (col == 0) {
+            if constexpr (col == 0) {
                 for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
                     va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
                 }
@@ -1704,7 +1702,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
         }
 
         //store to C
-        auto storec = [&](int col) {
+        auto storec = [&](auto col) {
             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
         };
         Unroll<COLS>{}(storec);
@@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
 
         const __m512i lowMask = _mm512_set1_epi8(0xF);
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
 
         // Q5_K and Q4_K shares the same vnni formats, refer to notes above.
-        auto compute = [&](int col, int i) {
+        auto compute = [&](auto col, auto i) {
             // load a
-            if (col == 0) {
+            if constexpr (col == 0) {
                 for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
                     va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
                 }
@@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
         }
 
         //store to C
-        auto storec = [&](int col) {
+        auto storec = [&](auto col) {
             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
         };
         Unroll<COLS>{}(storec);
@@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLO
         const __m512i m32s = _mm512_set1_epi32(32);
         const __m512i lowMask = _mm512_set1_epi8(0xF);
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
 
-        auto compute = [&](int col, int i) {
-            if (col == 0) {
+        auto compute = [&](auto col, auto i) {
+            if constexpr (col == 0) {
                 // load a
                 va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0));
                 va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64));
@@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
         const __m512i values256 = _mm512_add_epi8(values128, off);
 
-        auto loadc = [&](int col) {
+        auto loadc = [&](auto col) {
             vc[col] = _mm512_setzero_ps();
         };
         Unroll<COLS>{}(loadc);
 
-        auto compute = [&](int col, int i) {
-            if (col == 0) {
+        auto compute = [&](auto col, auto i) {
+            if constexpr (col == 0) {
                 // load a
                 va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0));
                 va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64));
@@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
         }
 
         //store to C
-        auto storec = [&](int col) {
+        auto storec = [&](auto col) {
             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
         };
         Unroll<COLS>{}(storec);
diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
new file mode 100644
index 000000000..514701ffe
--- /dev/null
+++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@@ -0,0 +1,298 @@
+#include "ggml-cpu.h"
+#include "ggml-backend-impl.h"
+
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstring>
+#include <vector>
+#include <bitset>
+#include <array>
+#include <string>
+
+struct cpuid_x86 {
+    bool SSE3(void) { return f_1_ecx[0]; }
+    bool PCLMULQDQ(void) { return f_1_ecx[1]; }
+    bool MONITOR(void) { return f_1_ecx[3]; }
+    bool SSSE3(void) { return f_1_ecx[9]; }
+    bool FMA(void) { return f_1_ecx[12]; }
+    bool CMPXCHG16B(void) { return f_1_ecx[13]; }
+    bool SSE41(void) { return f_1_ecx[19]; }
+    bool SSE42(void) { return f_1_ecx[20]; }
+    bool MOVBE(void) { return f_1_ecx[22]; }
+    bool POPCNT(void) { return f_1_ecx[23]; }
+    bool AES(void) { return f_1_ecx[25]; }
+    bool XSAVE(void) { return f_1_ecx[26]; }
+    bool OSXSAVE(void) { return f_1_ecx[27]; }
+    bool AVX(void) { return f_1_ecx[28]; }
+    bool F16C(void) { return f_1_ecx[29]; }
+    bool RDRAND(void) { return f_1_ecx[30]; }
+
+    bool MSR(void) { return f_1_edx[5]; }
+    bool CX8(void) { return f_1_edx[8]; }
+    bool SEP(void) { return f_1_edx[11]; }
+    bool CMOV(void) { return f_1_edx[15]; }
+    bool CLFSH(void) { return f_1_edx[19]; }
+    bool MMX(void) { return f_1_edx[23]; }
+    bool FXSR(void) { return f_1_edx[24]; }
+    bool SSE(void) { return f_1_edx[25]; }
+    bool SSE2(void) { return f_1_edx[26]; }
+
+    bool FSGSBASE(void) { return f_7_ebx[0]; }
+    bool BMI1(void) { return f_7_ebx[3]; }
+    bool HLE(void) { return is_intel && f_7_ebx[4]; }
+    bool AVX2(void) { return f_7_ebx[5]; }
+    bool BMI2(void) { return f_7_ebx[8]; }
+    bool ERMS(void) { return f_7_ebx[9]; }
+    bool INVPCID(void) { return f_7_ebx[10]; }
+    bool RTM(void) { return is_intel && f_7_ebx[11]; }
+    bool AVX512F(void) { return f_7_ebx[16]; }
+    bool RDSEED(void) { return f_7_ebx[18]; }
+    bool ADX(void) { return f_7_ebx[19]; }
+    bool AVX512PF(void) { return f_7_ebx[26]; }
+    bool AVX512ER(void) { return f_7_ebx[27]; }
+    bool AVX512CD(void) { return f_7_ebx[28]; }
+    bool SHA(void) { return f_7_ebx[29]; }
+
+    bool PREFETCHWT1(void) { return f_7_ecx[0]; }
+
+    bool LAHF(void) { return f_81_ecx[0]; }
+    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
+    bool ABM(void) { return is_amd && f_81_ecx[5]; }
+    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
+    bool XOP(void) { return is_amd && f_81_ecx[11]; }
+    bool TBM(void) { return is_amd && f_81_ecx[21]; }
+
+    bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
+    bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
+    bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
+    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
+    bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
+
+    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
+    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
+    bool AVX512_FP16(void) { return f_7_edx[23]; }
+    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
+    bool AVX_VNNI(void) { return f_7_1_eax[4]; }
+
+    bool AMX_TILE(void) { return f_7_edx[24]; }
+    bool AMX_INT8(void) { return f_7_edx[25]; }
+    bool AMX_FP16(void) { return f_7_1_eax[21]; }
+    bool AMX_BF16(void) { return f_7_edx[22]; }
+
+#ifdef _MSC_VER
+    static void cpuid(int cpu_info[4], int eax) {
+        __cpuid(cpu_info, eax);
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __cpuidex(cpu_info, eax, ecx);
+    }
+#else
+    static void cpuid(int cpu_info[4], int eax) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(0));
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(ecx));
+    }
+#endif
+
+    cpuid_x86() {
+        std::array<int, 4> cpui;
+        std::vector<std::array<int, 4>> data;
+
+        // calling __cpuid with 0x0 as the function_id argument
+        // gets the number of the highest valid function ID.
+        cpuid(cpui.data(), 0);
+        int n_ids = cpui[0];
+
+        for (int i = 0; i <= n_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            data.push_back(cpui);
+        }
+
+        // capture vendor string
+        char vendor[0x20] = {};
+        *reinterpret_cast<int *>(vendor)     = data[0][1];
+        *reinterpret_cast<int *>(vendor + 4) = data[0][3];
+        *reinterpret_cast<int *>(vendor + 8) = data[0][2];
+        this->vendor = vendor;
+        if (this->vendor == "GenuineIntel") {
+            is_intel = true;
+        } else if (this->vendor == "AuthenticAMD") {
+            is_amd = true;
+        }
+
+        // load bitset with flags for function 0x00000001
+        if (n_ids >= 1) {
+            f_1_ecx = data[1][2];
+            f_1_edx = data[1][3];
+        }
+
+        // load bitset with flags for function 0x00000007
+        if (n_ids >= 7) {
+            f_7_ebx = data[7][1];
+            f_7_ecx = data[7][2];
+            f_7_edx = data[7][3];
+            cpuidex(cpui.data(), 7, 1);
+            f_7_1_eax = cpui[0];
+        }
+
+        // calling __cpuid with 0x80000000 as the function_id argument
+        // gets the number of the highest valid extended ID.
+        cpuid(cpui.data(), 0x80000000);
+        unsigned int n_ex_ids = cpui[0];
+
+        std::vector<std::array<int, 4>> ext_data;
+        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            ext_data.push_back(cpui);
+        }
+
+        // load bitset with flags for function 0x80000001
+        if (n_ex_ids >= 0x80000001) {
+            f_81_ecx = ext_data[1][2];
+            f_81_edx = ext_data[1][3];
+        }
+
+        // interpret CPU brand string if reported
+        char brand[0x40] = {};
+        if (n_ex_ids >= 0x80000004) {
+            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
+            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
+            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
+            this->brand = brand;
+        }
+    }
+
+    bool is_intel = false;
+    bool is_amd = false;
+    std::string vendor;
+    std::string brand;
+    std::bitset<32> f_1_ecx;
+    std::bitset<32> f_1_edx;
+    std::bitset<32> f_7_ebx;
+    std::bitset<32> f_7_ecx;
+    std::bitset<32> f_7_edx;
+    std::bitset<32> f_7_1_eax;
+    std::bitset<32> f_81_ecx;
+    std::bitset<32> f_81_edx;
+};
+
+#if 0
+void test_x86_is() {
+    cpuid_x86 is;
+    printf("CPU Vendor: %s\n", is.vendor.c_str());
+    printf("Brand: %s\n", is.brand.c_str());
+    printf("is_intel: %d\n", is.is_intel);
+    printf("is_amd: %d\n", is.is_amd);
+    printf("sse3: %d\n", is.SSE3());
+    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
+    printf("ssse3: %d\n", is.SSSE3());
+    printf("fma: %d\n", is.FMA());
+    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
+    printf("sse41: %d\n", is.SSE41());
+    printf("sse42: %d\n", is.SSE42());
+    printf("movbe: %d\n", is.MOVBE());
+    printf("popcnt: %d\n", is.POPCNT());
+    printf("aes: %d\n", is.AES());
+    printf("xsave: %d\n", is.XSAVE());
+    printf("osxsave: %d\n", is.OSXSAVE());
+    printf("avx: %d\n", is.AVX());
+    printf("f16c: %d\n", is.F16C());
+    printf("rdrand: %d\n", is.RDRAND());
+    printf("msr: %d\n", is.MSR());
+    printf("cx8: %d\n", is.CX8());
+    printf("sep: %d\n", is.SEP());
+    printf("cmov: %d\n", is.CMOV());
+    printf("clflush: %d\n", is.CLFSH());
+    printf("mmx: %d\n", is.MMX());
+    printf("fxsr: %d\n", is.FXSR());
+    printf("sse: %d\n", is.SSE());
+    printf("sse2: %d\n", is.SSE2());
+    printf("fsgsbase: %d\n", is.FSGSBASE());
+    printf("bmi1: %d\n", is.BMI1());
+    printf("hle: %d\n", is.HLE());
+    printf("avx2: %d\n", is.AVX2());
+    printf("bmi2: %d\n", is.BMI2());
+    printf("erms: %d\n", is.ERMS());
+    printf("invpcid: %d\n", is.INVPCID());
+    printf("rtm: %d\n", is.RTM());
+    printf("avx512f: %d\n", is.AVX512F());
+    printf("rdseed: %d\n", is.RDSEED());
+    printf("adx: %d\n", is.ADX());
+    printf("avx512pf: %d\n", is.AVX512PF());
+    printf("avx512er: %d\n", is.AVX512ER());
+    printf("avx512cd: %d\n", is.AVX512CD());
+    printf("sha: %d\n", is.SHA());
+    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
+    printf("lahf: %d\n", is.LAHF());
+    printf("lzcnt: %d\n", is.LZCNT());
+    printf("abm: %d\n", is.ABM());
+    printf("sse4a: %d\n", is.SSE4a());
+    printf("xop: %d\n", is.XOP());
+    printf("tbm: %d\n", is.TBM());
+    printf("syscall: %d\n", is.SYSCALL());
+    printf("mmxext: %d\n", is.MMXEXT());
+    printf("rdtscp: %d\n", is.RDTSCP());
+    printf("3dnowext: %d\n", is._3DNOWEXT());
+    printf("3dnow: %d\n", is._3DNOW());
+    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
+    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
+    printf("avx512_fp16: %d\n", is.AVX512_FP16());
+    printf("avx512_bf16: %d\n", is.AVX512_BF16());
+    printf("amx_tile: %d\n", is.AMX_TILE());
+    printf("amx_int8: %d\n", is.AMX_INT8());
+    printf("amx_fp16: %d\n", is.AMX_FP16());
+    printf("amx_bf16: %d\n", is.AMX_BF16());
+}
+#endif
+
+static int ggml_backend_cpu_x86_score() {
+    // FIXME: this does not check for OS support
+
+    cpuid_x86 is;
+    // if the CPU backend was built with any features not supported by the current CPU, it cannot be used
+    if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
+    if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
+    if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
+    if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
+    if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
+    if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
+    if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
+    if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
+    if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
+    if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
+    if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
+    if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
+
+    // calculate a backend score based on the supported features
+    // more important features have a higher weight
+    int score = 0;
+    score +=  ggml_cpu_has_fma        () * 1;
+    score +=  ggml_cpu_has_f16c       () * 1<<1;
+    score +=  ggml_cpu_has_ssse3      () * 1<<2;
+    score +=  ggml_cpu_has_sse3       () * 1<<3;
+    score +=  ggml_cpu_has_avx_vnni   () * 1<<4;
+    score +=  ggml_cpu_has_avx        () * 1<<5;
+    score +=  ggml_cpu_has_avx2       () * 1<<6;
+    score +=  ggml_cpu_has_avx512     () * 1<<7;
+    // score +=  ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
+    score +=  ggml_cpu_has_avx512_bf16() * 1<<9;
+    score +=  ggml_cpu_has_avx512_vnni() * 1<<10;
+    score +=  ggml_cpu_has_amx_int8   () * 1<<11;
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
+
+#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 61a92cfd9..11152385e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
 }
 
 static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__)
     const __m512i zero = _mm512_setzero_si512();
     return _mm512_dpbusd_epi32(zero, ax, sy);
 #else
diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh
new file mode 100755
index 000000000..4b2ad816e
--- /dev/null
+++ b/scripts/build-cpu.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+name="$1"
+args="${@:2}"
+
+echo "Building $name with args: $args"
+
+rm -fr build-cpu-$1
+cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args
+cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc)
+cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so
+rm -fr build-cpu-$1

From 5c7a5aa0c32eb19ce03e178560797db5875d7692 Mon Sep 17 00:00:00 2001
From: Wang Qin <37098874+wangqin0@users.noreply.github.com>
Date: Sun, 1 Dec 2024 10:11:42 -0800
Subject: [PATCH 366/396] ci: add error handling for Python venv creation in
 run.sh (#10608)

---
 ci/run.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/run.sh b/ci/run.sh
index 20610e560..abf08a4ff 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -815,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
     # Create a fresh python3 venv and enter it
-    python3 -m venv "$MNT/venv"
+    if ! python3 -m venv "$MNT/venv"; then
+        echo "Error: Failed to create Python virtual environment at $MNT/venv."
+        exit 1
+    fi
     source "$MNT/venv/bin/activate"
 
     pip install -r ${SRC}/requirements.txt --disable-pip-version-check

From 5e1ed95583ca552a98d8528b73e1ff81249c2bf9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 1 Dec 2024 21:37:54 +0200
Subject: [PATCH 367/396] grammars : add English-only grammar (#10612)

---
 grammars/english.gbnf | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 grammars/english.gbnf

diff --git a/grammars/english.gbnf b/grammars/english.gbnf
new file mode 100644
index 000000000..2e53686c8
--- /dev/null
+++ b/grammars/english.gbnf
@@ -0,0 +1,6 @@
+# note: this might be incomplete, mostly an example
+root        ::= en-char+ ([ \t\n] en-char+)*
+en-char     ::= letter | digit | punctuation
+letter      ::= [a-zA-Z]
+digit       ::= [0-9]
+punctuation ::= [!"#$%&'()*+,-./:;<=>?@[\\\]^_`{|}~]

From 917786f43d0f29b7c77a0c56767c0fa4df68b1c5 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Sun, 1 Dec 2024 22:09:49 +0000
Subject: [PATCH 368/396] Add `mistral-v1`, `mistral-v3`, `mistral-v3-tekken`
 and `mistral-v7` chat template types (#10572)

* Templates: `mistral-v1`, `mistral-v2`, `mistral-v3`, `mistral-v3-tekken`

* Changed system message logic and added tests for all 4

* Invalid `system_message` instead of `content` fixed

* Removed tab-indented lines

* Added template code and test for `mistral-v7`

* Added all tests. Fixed bug with `tmpl == "llama2"` test.

* Replaced tabs with spaces.

* Removed `'mistral-v2'` option as no (open) models ever used it

* Removed all references to 'v2' template from comments

* Update llama.cpp

Fixed `trim_assistant_message` bug
---
 src/llama.cpp                | 108 ++++++++++++++++++++++++-----------
 tests/test-chat-template.cpp |  32 ++++++++++-
 2 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 22b951ba2..6e9ba9727 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21862,41 +21862,85 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
-        // llama2 template and its variants
-        // [variant] support system message
-        bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
-        // [variant] space before + after response
-        bool space_around_response = tmpl_contains("' ' + eos_token");
-        // [variant] add BOS inside history
-        bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
-        // [variant] trim spaces from the input message
-        bool strip_message = tmpl_contains("content.strip()");
-        // construct the prompt
-        bool is_inside_turn = true; // skip BOS at the beginning
-        ss << "[INST] ";
-        for (auto message : chat) {
-            std::string content = strip_message ? trim(message->content) : message->content;
-            std::string role(message->role);
-            if (!is_inside_turn) {
-                is_inside_turn = true;
-                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
-            }
-            if (role == "system") {
-                if (support_system_message) {
-                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
-                } else {
-                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
-                    ss << content << "\n";
+    } else if (tmpl == "llama2" || tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
+        if (tmpl == "mistral-v7" || tmpl_contains("[SYSTEM_PROMPT]")) {
+            // Official mistral 'v7' template
+            // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+            for (auto message : chat) {
+                std::string role(message->role);
+                std::string content(message->content);
+                if (role == "system") {
+                    ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                } else if (role == "user") {
+                    ss << "[INST] " << content << "[/INST]";
+                }
+                else {
+                    ss << " " << content << "</s>";
                 }
-            } else if (role == "user") {
-                ss << content << " [/INST]";
-            } else {
-                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
-                is_inside_turn = false;
             }
+        } else if (tmpl == "mistral-v1" || tmpl == "mistral-v3" || tmpl == "mistral-v3-tekken"
+                   || tmpl_contains("' [INST] ' + system_message") // catches official 'v1' template
+                   || tmpl_contains("[AVAILABLE_TOOLS]")) {        // catches official 'v3' and 'v3-tekken' templates
+            // Official mistral 'v1', 'v3' and 'v3-tekken' templates
+            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+            std::string leading_space = (tmpl == "mistral-v1" || tmpl_contains(" [INST]") ? " " : "");
+            std::string trailing_space = (tmpl == "mistral-v3-tekken" || tmpl_contains("\"[INST]\"") ? "" : " ");
+            bool trim_assistant_message = tmpl_contains("|trim + eos_token");
+            bool is_inside_turn = false;
+            for (auto message : chat) {
+                if (!is_inside_turn) {
+                    ss << leading_space << "[INST]" << trailing_space;
+                    is_inside_turn = true;
+                }
+                std::string role(message->role);
+                std::string content(message->content);
+                if (role == "system") {
+                    ss << content << "\n\n";
+                } else if (role == "user") {
+                    ss << content << leading_space << "[/INST]";
+                } else {
+                    ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+                    is_inside_turn = false;
+                }
+            }
+        } else {
+            // llama2 template and its variants
+            // [variant] support system message
+            // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+            bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "llama2";
+            // [variant] space before + after response
+            bool space_around_response = tmpl_contains("' ' + eos_token");
+            // [variant] add BOS inside history
+            bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
+            // [variant] trim spaces from the input message
+            bool strip_message = tmpl_contains("content.strip()");
+            // construct the prompt
+            bool is_inside_turn = true; // skip BOS at the beginning
+            ss << "[INST] ";
+            for (auto message : chat) {
+                std::string content = strip_message ? trim(message->content) : message->content;
+                std::string role(message->role);
+                if (!is_inside_turn) {
+                    is_inside_turn = true;
+                    ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+                }
+                if (role == "system") {
+                    if (support_system_message) {
+                        ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                    } else {
+                        // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                        ss << content << "\n";
+                    }
+                } else if (role == "user") {
+                    ss << content << " [/INST]";
+                } else {
+                    ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                    is_inside_turn = false;
+                }
+            }
+            // llama2 templates seem to not care about "add_generation_prompt
         }
-        // llama2 templates seem to not care about "add_generation_prompt"
     } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
         // Phi 3
         for (auto message : chat) {
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 03e897e66..dd8f7d5f0 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -21,7 +21,7 @@ int main(void) {
     std::vector<std::string> templates = {
         // teknium/OpenHermes-2.5-Mistral-7B
         "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
-        // mistralai/Mistral-7B-Instruct-v0.2
+        // mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt)
         "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
         // TheBloke/FusionNet_34Bx2_MoE-AWQ
         "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
@@ -67,11 +67,19 @@ int main(void) {
         "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
         // ibm-granite/granite-3.0-8b-instruct
         "{%- if tools %}\n    {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n    {%- for tool in tools %}\n    {{- tool | tojson(indent=4) }}\n    {%- if not loop.last %}\n        {{- '\n\n' }}\n    {%- endif %}\n    {%- endfor %}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'user' %}\n    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>'  + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'tool_response' %}\n    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}",
+        // mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt)
+        "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n",
+        // Mistral-Large-Instruct-2407 (mistralai 'v3' template)
+        "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS] [\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n        {{- \"[TOOL_CALLS] [\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- \" \" + message[\"content\"]|trim + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
+        // Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template)
+        "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS][\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n        {{- \"[TOOL_CALLS][\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- message[\"content\"] + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
+        // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
+        "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
         "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
-        // mistralai/Mistral-7B-Instruct-v0.2
+        // mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt)
         "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
         // TheBloke/FusionNet_34Bx2_MoE-AWQ
         "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
@@ -113,6 +121,14 @@ int main(void) {
         u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
         // ibm-granite/granite-3.0-8b-instruct
         "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>\n",
+        // mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt)
+        " [INST] You are a helpful assistant\n\nHello [/INST] Hi there</s> [INST] Who are you [/INST]    I am an assistant   </s> [INST] Another question [/INST]",
+        // Mistral-Large-Instruct-2407 (mistralai 'v3' template; modified to have system prompt at start)
+        "[INST] You are a helpful assistant\n\nHello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] Another question[/INST]",
+        // Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template; modified to have system prompt at start)
+        "[INST]You are a helpful assistant\n\nHello[/INST]Hi there</s>[INST]Who are you[/INST]   I am an assistant   </s>[INST]Another question[/INST]",
+        // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
+        "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST]    I am an assistant   </s>[INST] Another question[/INST]",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
@@ -154,7 +170,12 @@ int main(void) {
         return output;
     };
     assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
-    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("mistral-v1") == " [INST] You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v3") == "[INST] You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v3-tekken") == "[INST]You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v7") == "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT]");
+    assert(fmt_sys("llama2") == "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\n");
+    assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
     assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
     assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
 
@@ -173,7 +194,12 @@ int main(void) {
         return output;
     };
     assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
+    assert(fmt_single("mistral-v1") == " [INST] How are you [/INST]");
+    assert(fmt_single("mistral-v3") == "[INST] How are you[/INST]");
+    assert(fmt_single("mistral-v3-tekken") == "[INST]How are you[/INST]");
+    assert(fmt_single("mistral-v7") == "[INST] How are you[/INST]");
     assert(fmt_single("llama2") == "[INST] How are you [/INST]");
+    assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates
     assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
     assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
 

From 4cb003dd8d1f37523120a21e4b1a50a2adcb8c84 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 2 Dec 2024 08:53:27 +0200
Subject: [PATCH 369/396] contrib : refresh (#10593)

* contrib : refresh

* contrib : expand [no ci]

* contrib : expand test-backend-ops instructions

* contrib : add CODEOWNERS

* prs : update template to not have checkbox [no ci]
---
 .github/pull_request_template.md | 8 +-------
 CODEOWNERS                       | 3 +++
 CONTRIBUTING.md                  | 6 ++++--
 3 files changed, 8 insertions(+), 9 deletions(-)
 create mode 100644 CODEOWNERS

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 997c6d9d0..d9f5bdc23 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,7 +1 @@
-
-
-- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
-- Self-reported review complexity:
-  - [ ] Low
-  - [ ] Medium
-  - [ ] High
+*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 000000000..88ab6de4f
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,3 @@
+# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
+
+ci/ @ggerganov
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4c882c254..5a85ec5d2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,9 +1,10 @@
 # Pull requests (for contributors)
 
 - Test your changes:
-  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
-- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
+  - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
+  - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
+  - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 
@@ -12,6 +13,7 @@
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
 
 # Coding guidelines
 

From 991f8aabeec89d801300bb179e52013fb0eb0584 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan.biswas@gmail.com>
Date: Mon, 2 Dec 2024 12:34:11 +0530
Subject: [PATCH 370/396] SYCL: Fix and switch to GGML_LOG system instead of
 fprintf (#10579)

* Switched to GGML_LOG

* Fix missing semicolon
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 90 ++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 39 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 808f74fa0..1310981e5 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
     info.device_count = dpct::dev_mgr::instance().device_count();
     if (info.device_count == 0) {
-        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
+        GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
         return info;
     }
 
@@ -55,16 +55,16 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_SYCL_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ:   yes\n", __func__);
+    GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ:   yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ:   no\n", __func__);
+    GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ:   no\n", __func__);
 #endif
 #if defined(SYCL_USE_XMX)
-    fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+    GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+    GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
+    GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
 
     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
@@ -110,7 +110,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 
     auto global_mem_size = prop.get_global_mem_size()/1000000;
 
-    fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
             name.c_str(), version.c_str(), prop.get_max_compute_units(),
             prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
             global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
@@ -120,19 +120,30 @@ void ggml_backend_sycl_print_sycl_devices() {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
     int device_count = dpct::dev_mgr::instance().device_count();
     std::map<std::string, size_t> DeviceNums;
-    fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|  |                   |                                       |       |Max    |        |Max  |Global |                     |\n");
-    fprintf(stderr, "|  |                   |                                       |       |compute|Max work|sub  |mem    |                     |\n");
-    fprintf(stderr, "|ID|        Device Type|                                   Name|Version|units  |group   |group|size   |       Driver version|\n");
-    fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
+    GGML_LOG_INFO("Found %d SYCL devices:\n", device_count);
+
+    GGML_LOG_INFO(
+        "|  |                   |                                       |      "
+        " |Max    |        |Max  |Global |                     |\n");
+    GGML_LOG_INFO(
+        "|  |                   |                                       |      "
+        " |compute|Max work|sub  |mem    |                     |\n");
+    GGML_LOG_INFO(
+        "|ID|        Device Type|                                   "
+        "Name|Version|units  |group   |group|size   |       Driver version|\n");
+    GGML_LOG_INFO(
+        "|--|-------------------|---------------------------------------|------"
+        "-|-------|--------|-----|-------|---------------------|\n");
+
     for (int id = 0; id < device_count; ++id) {
-        sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
-        std::string backend_type = get_device_backend_and_type(device);
-        int type_id=DeviceNums[backend_type]++;
-        std::stringstream device_type;
-        device_type << "[" <<  backend_type << ":" << std::to_string(type_id) << "]";
-        print_device_detail(id, device, device_type.str());
+      sycl::device device = dpct::dev_mgr::instance().get_device(id);
+      sycl::backend backend = device.get_backend();
+      std::string backend_type = get_device_backend_and_type(device);
+      int type_id = DeviceNums[backend_type]++;
+      std::stringstream device_type;
+      device_type << "[" << backend_type << ":" << std::to_string(type_id)
+                  << "]";
+      print_device_detail(id, device, device_type.str());
     }
 }
 
@@ -154,15 +165,14 @@ static void ggml_check_sycl() try {
     static bool initialized = false;
 
     if (!initialized) {
-        fprintf(stderr, "[SYCL] call ggml_check_sycl\n");
+        GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n");
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-
-        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
+        GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 
 #if defined(GGML_SYCL_F16)
-        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
+        GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__);
 #else
-        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
+        GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__);
 #endif
 
 /* NOT REMOVE, keep it for next optimize for XMX.
@@ -180,9 +190,10 @@ static void ggml_check_sycl() try {
             return;
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        ggml_backend_sycl_print_sycl_devices();
+
         initialized = true;
         g_sycl_loaded = true;
+        ggml_backend_sycl_print_sycl_devices();
     }
 }
 catch (sycl::exception const &exc) {
@@ -205,7 +216,7 @@ inline void check_allow_gpu_index(const int device_index) {
         __func__,
         device_index,
         ggml_sycl_info().device_count - 1);
-    fprintf(stderr, "%s\n", error_buf);
+    GGML_LOG_ERROR("%s\n", error_buf);
     assert(false);
   }
 }
@@ -475,8 +486,8 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
     if (!dev_ptr) {
-        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
-        return nullptr;
+      GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+      return nullptr;
     }
     ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
@@ -752,7 +763,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                         size, *stream)));
         if (!buf) {
             char err_buf[1024];
-            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+            snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
             throw std::runtime_error(err_buf);
         }
         // set padding to 0 to avoid possible NaN values
@@ -1142,17 +1153,18 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
             CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                                 look_ahead_size, *qptr)));
         if (!ptr) {
-            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+            GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
             return nullptr;
         }
 
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 
-    #ifdef DEBUG_SYCL_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
                 (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
-    #endif
+#endif
+
         // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
         return ptr;
     }
@@ -1166,7 +1178,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
+        GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
         SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
         pool_size -= size;
     }
@@ -2437,7 +2449,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
             break;
         default:
             // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ABORT("fatal error");
             break;
     }
@@ -3750,7 +3762,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
         ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ABORT("fatal error");
     }
@@ -3825,7 +3837,7 @@ void ggml_sycl_set_main_device(const int main_device) try {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(main_device))));
-        fprintf(stderr, "Using device %d (%s) as main device\n",
+        GGML_LOG_INFO("Using device %d (%s) as main device\n",
                 main_device, prop.get_name());
     }
 }
@@ -4172,7 +4184,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
 #endif
         bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
         if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
     }
@@ -4672,7 +4684,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
 
     ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return nullptr;
     };
 

From 64ed2091b24b2f9747148fdf49a34ed5938762c3 Mon Sep 17 00:00:00 2001
From: haopeng <657407891@qq.com>
Date: Mon, 2 Dec 2024 21:45:54 +0800
Subject: [PATCH 371/396] server: Add "tokens per second" information in the
 backend (#10548)

* add cmake rvv support

* add timings

* remove space

* update readme

* fix

* fix code

* remove empty line

* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 common/common.h                                 |  1 +
 examples/server/README.md                       |  2 ++
 examples/server/server.cpp                      | 16 +++++++++++++++-
 .../server/tests/unit/test_chat_completion.py   | 17 +++++++++++++++++
 examples/server/utils.hpp                       |  9 +++++++++
 5 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/common/common.h b/common/common.h
index 9b1508a15..0373fd3ea 100644
--- a/common/common.h
+++ b/common/common.h
@@ -133,6 +133,7 @@ struct common_params_sampling {
     bool    penalize_nl        = false; // consider newlines as a repeatable token
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
+    bool    timing_per_token   = false;
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
 
diff --git a/examples/server/README.md b/examples/server/README.md
index 877768c8b..45ffb547f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -416,6 +416,8 @@ node index.js
 
     `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
+    `timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`
+
 **Response format**
 
 - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1c765f0ea..8eca14b86 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -177,6 +177,8 @@ struct server_slot {
     bool stopped_word   = false;
     bool stopped_limit  = false;
 
+    bool timings_per_token = false;
+
     bool oaicompat = false;
 
     std::string oaicompat_model;
@@ -882,6 +884,8 @@ struct server_context {
             slot.oaicompat_model = "";
         }
 
+        slot.timings_per_token       = json_value(data, "timings_per_token",  false);
+
         slot.params.stream           = json_value(data, "stream",             false);
         slot.params.cache_prompt     = json_value(data, "cache_prompt",       true);
         slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
@@ -1279,6 +1283,7 @@ struct server_context {
             {"speculative.n_max",         slot.params.speculative.n_max},
             {"speculative.n_min",         slot.params.speculative.n_min},
             {"speculative.p_min",         slot.params.speculative.p_min},
+            {"timings_per_token",         slot.timings_per_token},
         };
     }
 
@@ -1336,6 +1341,10 @@ struct server_context {
             res.data["model"] = slot.oaicompat_model;
         }
 
+        if (slot.timings_per_token) {
+            res.data["timings"] = slot.get_formated_timings();
+        }
+
         queue_results.send(res);
     }
 
@@ -2274,12 +2283,17 @@ struct server_context {
                 common_sampler_accept(slot.smpl, id, true);
 
                 slot.n_decoded += 1;
+
+                const int64_t t_current = ggml_time_us();
+
                 if (slot.n_decoded == 1) {
-                    slot.t_start_generation = ggml_time_us();
+                    slot.t_start_generation = t_current;
                     slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                     metrics.on_prompt_eval(slot);
                 }
 
+                slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
+
                 completion_token_output result;
                 result.tok = id;
 
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index 1048d6fca..8a439f9ef 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -146,3 +146,20 @@ def test_invalid_chat_completion_req(messages):
     })
     assert res.status_code == 400 or res.status_code == 500
     assert "error" in res.body
+
+
+def test_chat_completion_with_timings_per_token():
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "max_tokens": 10,
+        "messages": [{"role": "user", "content": "test"}],
+        "stream": True,
+        "timings_per_token": True,
+    })
+    for data in res:
+        assert "timings" in data
+        assert "prompt_per_second" in data["timings"]
+        assert "predicted_per_second" in data["timings"]
+        assert "predicted_n" in data["timings"]
+        assert data["timings"]["predicted_n"] <= 10
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 1665e9dc3..e4451532c 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
         res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
     }
 
+    if (result.contains("timings")) {
+        res.push_back({"timings", json_value(result, "timings", json::object())});
+    }
+
     return res;
 }
 
@@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
         {"model",   modelname},
         {"object",  "chat.completion.chunk"}
     };
+
+    if (result.contains("timings")) {
+        ret.push_back({"timings", json_value(result, "timings", json::object())});
+    }
+
     if (!finish_reason.empty()) {
         int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
         int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);

From 8648c521010620c2daccfa1d26015c668ba2c717 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 2 Dec 2024 21:22:53 +0200
Subject: [PATCH 372/396] make : deprecate (#10514)

* make : deprecate

ggml-ci

* ci : disable Makefile builds

ggml-ci

* docs : remove make references [no ci]

* ci : disable swift build

ggml-ci

* docs : remove obsolete make references, scripts, examples

ggml-ci

* basic fix for compare-commits.sh

* update build.md

* more build.md updates

* more build.md updates

* more build.md updates

* Update Makefile

Co-authored-by: Diego Devesa <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 .github/workflows/build.yml                | 163 ++------
 Makefile                                   |   4 +
 docs/backend/BLIS.md                       |   7 -
 docs/build.md                              | 245 +++++-------
 examples/base-translate.sh                 |  61 ---
 examples/convert-llama2c-to-ggml/README.md |   5 +-
 examples/imatrix/README.md                 |   2 -
 examples/server/README.md                  |  15 -
 scripts/compare-commits.sh                 |  18 +-
 scripts/pod-llama.sh                       | 212 -----------
 scripts/server-llm.sh                      | 418 ---------------------
 11 files changed, 139 insertions(+), 1011 deletions(-)
 delete mode 100755 examples/base-translate.sh
 delete mode 100644 scripts/pod-llama.sh
 delete mode 100644 scripts/server-llm.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e2291bd34..f3326a5fb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -160,66 +160,6 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
           name: llama-bin-macos-x64.zip
 
-  ubuntu-focal-make:
-    runs-on: ubuntu-20.04
-    env:
-      LLAMA_NODE_AVAILABLE: true
-      LLAMA_PYTHON_AVAILABLE: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Build
-        id: make_build
-        env:
-            LLAMA_FATAL_WARNINGS: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
-      - name: Test
-        id: make_test
-        run: |
-          CC=gcc-8 make tests -j $(nproc)
-          make test -j $(nproc)
-
-  ubuntu-focal-make-curl:
-    runs-on: ubuntu-20.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
-
-      - name: Build
-        id: make_build
-        env:
-          LLAMA_FATAL_WARNINGS: 1
-          LLAMA_CURL: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
@@ -517,36 +457,6 @@ jobs:
           cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
           cmake --build . --config Release -j $(nproc)
 
-  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
-  macOS-latest-make:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: make_build
-        env:
-            LLAMA_FATAL_WARNINGS: 1
-        run: |
-          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: make_test
-        run: |
-          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          GGML_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
-
   # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   #       how to debug it.
   #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
@@ -642,33 +552,35 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
-
-      - name: Build Swift Example
-        id: make_build_swift_example
-        run: |
-            make swift
+# TODO: tmp disabled. see for possible re-enable:
+#       https://github.com/ggerganov/llama.cpp/pull/10525
+#  macOS-latest-swift:
+#    runs-on: macos-latest
+#
+#    strategy:
+#      matrix:
+#        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        id: depends
+#        continue-on-error: true
+#        run: |
+#          brew update
+#
+#      - name: xcodebuild for swift package
+#        id: xcodebuild
+#        run: |
+#          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+#
+#      - name: Build Swift Example
+#        id: make_build_swift_example
+#        run: |
+#            make swift
 
   windows-msys2:
     runs-on: windows-latest
@@ -695,21 +607,6 @@ jobs:
             mingw-w64-${{matrix.env}}-cmake
             mingw-w64-${{matrix.env}}-openblas
 
-      - name: Build using make
-        shell: msys2 {0}
-        run: |
-            make -j $(nproc)
-
-      - name: Clean after building using make
-        shell: msys2 {0}
-        run: |
-            make clean
-
-      - name: Build using make w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            make GGML_OPENBLAS=1 -j $(nproc)
-
       - name: Build using CMake
         shell: msys2 {0}
         run: |
@@ -1257,9 +1154,7 @@ jobs:
     runs-on: ubuntu-latest
 
     needs:
-      - ubuntu-focal-make
       - ubuntu-latest-cmake
-      - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
       - windows-2019-cmake-cuda
diff --git a/Makefile b/Makefile
index e30821624..f5dc916f6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,7 @@
+ifndef LLAMA_MAKEFILE
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+endif
+
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
diff --git a/docs/backend/BLIS.md b/docs/backend/BLIS.md
index 35d06bd0f..904548577 100644
--- a/docs/backend/BLIS.md
+++ b/docs/backend/BLIS.md
@@ -27,13 +27,6 @@ We recommend using openmp since it's easier to modify the cores being used.
 
 ### llama.cpp compilation
 
-Makefile:
-
-```bash
-make GGML_BLIS=1 -j
-# make GGML_BLIS=1 llama-benchmark-matmult
-```
-
 CMake:
 
 ```bash
diff --git a/docs/build.md b/docs/build.md
index 72b810437..5465629e2 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -7,124 +7,63 @@ git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 ```
 
-In order to build llama.cpp you have four different options.
+The following sections describe how to build with different backends and options.
 
-- Using `make`:
-  - On Linux or MacOS:
+## CPU Build
 
-      ```bash
-      make
-      ```
+Build llama.cpp using `CMake`:
 
-  - On Windows (x86/x64 only, arm64 requires cmake):
+```bash
+cmake -B build
+cmake --build build --config Release
+```
 
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Extract `w64devkit` on your pc.
-    3. Run `w64devkit.exe`.
-    4. Use the `cd` command to reach the `llama.cpp` folder.
-    5. From here you can run:
-        ```bash
-        make
-        ```
+**Notes**:
 
-  - Notes:
-    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
+- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
+- For debug builds, there are two cases:
 
-- Using `CMake`:
+    1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
 
-  ```bash
-  cmake -B build
-  cmake --build build --config Release
-  ```
+    ```bash
+    cmake -B build -DCMAKE_BUILD_TYPE=Debug
+    cmake --build build
+    ```
 
-  **Notes**:
+    2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
 
-    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, there are two cases:
+    ```bash
+    cmake -B build -G "Xcode"
+    cmake --build build --config Debug
+    ```
 
-      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+    For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
 
-      ```bash
-      cmake -B build -DCMAKE_BUILD_TYPE=Debug
-      cmake --build build
-      ```
-
-      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
-
-      ```bash
-      cmake -B build -G "Xcode"
-      cmake --build build --config Debug
-      ```
-    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
-        - Tab Workload: Desktop-development with C++
-        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
-      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
-      - For Windows on ARM (arm64, WoA) build with:
-        ```bash
-        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
-        cmake --build build-arm64-windows-llvm-release
-        ```
-        Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
-
--   Using `gmake` (FreeBSD):
-
-    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
-    2. Add your user to **video** group
-    3. Install compilation dependencies.
-
-        ```bash
-        sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
-
-        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
-        ```
-
-## Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
+- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+    - Tab Workload: Desktop-development with C++
+    - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+    - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+    - For Windows on ARM (arm64, WoA) build with:
+    ```bash
+    cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+    cmake --build build-arm64-windows-llvm-release
+    ```
+    Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
 
 ## BLAS Build
 
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:
 
-### Accelerate Framework:
+### Accelerate Framework
 
 This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
 
-### OpenBLAS:
+### OpenBLAS
 
 This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
 
-- Using `make`:
-  - On Linux:
-    ```bash
-    make GGML_OPENBLAS=1
-    ```
-
-  - On Windows:
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
-    3. Extract `w64devkit` on your pc.
-    4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
-    5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
-    6. Run `w64devkit.exe`.
-    7. Use the `cd` command to reach the `llama.cpp` folder.
-    8. From here you can run:
-
-        ```bash
-        make GGML_OPENBLAS=1
-        ```
-
 - Using `CMake` on Linux:
 
     ```bash
@@ -136,14 +75,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
 
 Check [BLIS.md](./backend/BLIS.md) for more information.
 
-### SYCL
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
-llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
-For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
 ### Intel oneMKL
 
 Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
@@ -161,16 +92,29 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
 
 Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
 
-### CUDA
+### Other BLAS libraries
 
-This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.
 
-For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
+## Metal Build
+
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
+
+When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
+
+## SYCL
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+## CUDA
+
+This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
 
-- Using `make`:
-  ```bash
-  make GGML_CUDA=1
-  ```
 - Using `CMake`:
 
   ```bash
@@ -192,14 +136,10 @@ The following compilation options are also available to tweak performance:
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
 | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
 
-### MUSA
+## MUSA
 
 This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
 
-- Using `make`:
-  ```bash
-  make GGML_MUSA=1
-  ```
 - Using `CMake`:
 
   ```bash
@@ -213,16 +153,12 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab
 
 Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
 
-### hipBLAS
+## HIP
 
-This provides BLAS acceleration on HIP-supported AMD GPUs.
+This provides GPU acceleration on HIP-supported AMD GPUs.
 Make sure to have ROCm installed.
 You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
 
-- Using `make`:
-  ```bash
-  make GGML_HIP=1
-  ```
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
@@ -247,11 +183,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
       && cmake --build build -- -j 16
   ```
 
-- Using `make` (example for target gfx1030, build with 16 CPU threads):
-  ```bash
-  make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
-  ```
-
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
   ```bash
   set PATH=%HIP_PATH%\bin;%PATH%
@@ -265,11 +196,11 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
-### Vulkan
+## Vulkan
 
 **Windows**
 
-#### w64devkit
+### w64devkit
 
 Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
 
@@ -289,9 +220,14 @@ Libs: -lvulkan-1
 EOF
 
 ```
-Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
 
-#### Git Bash MINGW64
+Switch into the `llama.cpp` directory and build using CMake.
+```sh
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+### Git Bash MINGW64
 
 Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
 
@@ -310,20 +246,21 @@ cmake --build build --config Release
 
 Now you can load the model in conversation mode using `Vulkan`
 
-```
-build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```sh
+build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
 ```
 
-#### MSYS2
+### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
-  ```sh
-  pacman -S git \
-      mingw-w64-ucrt-x86_64-gcc \
-      mingw-w64-ucrt-x86_64-cmake \
-      mingw-w64-ucrt-x86_64-vulkan-devel \
-      mingw-w64-ucrt-x86_64-shaderc
-  ```
-Switch into `llama.cpp` directory and build using CMake.
+```sh
+pacman -S git \
+    mingw-w64-ucrt-x86_64-gcc \
+    mingw-w64-ucrt-x86_64-cmake \
+    mingw-w64-ucrt-x86_64-vulkan-devel \
+    mingw-w64-ucrt-x86_64-shaderc
+```
+
+Switch into the `llama.cpp` directory and build using CMake.
 ```sh
 cmake -B build -DGGML_VULKAN=ON
 cmake --build build --config Release
@@ -372,7 +309,7 @@ cmake --build build --config Release
 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
 ```
 
-### CANN
+## CANN
 This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
 
 For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
@@ -387,22 +324,26 @@ cmake --build build --config release
 
 You can test with:
 
-`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
-
-If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
 ```bash
-llm_load_tensors:       CANN buffer size = 13313.00 MiB
+./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
+```
+
+If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
+```bash
+llm_load_tensors:       CANN model buffer size = 13313.00 MiB
 llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB
 ```
 
 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
 
-### Android
+## Android
 
 To read documentation for how to build on Android, [click here](./android.md)
 
-### Arm CPU optimized mulmat kernels
+## Notes about GPU-accelerated backends
 
-Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
 
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.
+
+Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.
diff --git a/examples/base-translate.sh b/examples/base-translate.sh
deleted file mode 100755
index 103a52f55..000000000
--- a/examples/base-translate.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md
index 5774ac83c..46a42da69 100644
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -2,11 +2,8 @@
 
 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
 
-To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
 
-`$ make -j`
-
-After successful compilation, following usage options are available:
 ```
 usage: ./llama-convert-llama2c-to-ggml [options]
 
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index bb5faec94..9c056986b 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -25,8 +25,6 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 
 ```bash
-GGML_CUDA=1 make -j
-
 # generate importance matrix (imatrix.dat)
 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 
diff --git a/examples/server/README.md b/examples/server/README.md
index 45ffb547f..aa99d06f9 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -188,12 +188,6 @@ services:
 
 `llama-server` is built alongside everything else from the root of the project
 
-- Using `make`:
-
-  ```bash
-  make llama-server
-  ```
-
 - Using `CMake`:
 
   ```bash
@@ -207,15 +201,6 @@ services:
 
 `llama-server` can also be built with SSL support using OpenSSL 3
 
-- Using `make`:
-
-  ```bash
-  # NOTE: For non-system openssl, use the following:
-  #   CXXFLAGS="-I /path/to/openssl/include"
-  #   LDFLAGS="-L /path/to/openssl/lib"
-  make LLAMA_SERVER_SSL=true llama-server
-  ```
-
 - Using `CMake`:
 
   ```bash
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index 8b9b1ad39..143d98729 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -16,15 +16,21 @@ bench_args="${@:3}"
 rm -f llama-bench.sqlite > /dev/null
 
 # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
+if [ -n "$GGML_CUDA" ]; then
+    cmake_opts="-DGGML_CUDA=ON"
+fi
+
+function run {
+    rm -fr build > /dev/null
+    cmake -B build -S . $cmake_opts > /dev/null
+    cmake --build build -t llama-bench > /dev/null
+    build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+}
 
 git checkout $1 > /dev/null
-make clean > /dev/null
-make -j$(nproc) $make_opts llama-bench > /dev/null
-./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+run
 
 git checkout $2 > /dev/null
-make clean > /dev/null
-make -j$(nproc) $make_opts llama-bench > /dev/null
-./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+run
 
 ./scripts/compare-llama-bench.py -b $1 -c $2
diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh
deleted file mode 100644
index 6e56e1ed0..000000000
--- a/scripts/pod-llama.sh
+++ /dev/null
@@ -1,212 +0,0 @@
-#!/bin/bash
-#
-# Use this script only on fresh pods (runpod.io)!
-# Otherwise, it can break your environment!
-#
-
-if [ -z "$1" ]; then
-    echo "Usage: $0 <data>"
-    echo "  0: no models"
-    echo "  1: tinyllama-1b"
-    echo "  2: codellama-7b"
-    echo "  3: codellama-13b"
-    echo "  4: codellama-34b"
-    echo "  5: codellama-7b-instruct"
-    echo "  6: codellama-13b-instruct"
-    echo "  7: codellama-34b-instruct"
-
-    exit 1
-fi
-
-set -x
-
-# setup deps
-apt-get update
-apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
-git-lfs install
-
-if [ ! -d "/workspace" ]; then
-    ln -sfn $(pwd) /workspace
-fi
-
-# download data
-cd /workspace
-
-# this is useful to git clone repos without doubling the disk size due to .git
-git clone https://github.com/iboB/git-lfs-download
-ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
-
-# llama.cpp
-cd /workspace
-git clone https://github.com/ggerganov/llama.cpp
-
-cd llama.cpp
-
-GGML_CUDA=1 make -j
-
-ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3  ./models/tinyllama-1b
-ln -sfn /workspace/CodeLlama-7b-hf           ./models/codellama-7b
-ln -sfn /workspace/CodeLlama-13b-hf          ./models/codellama-13b
-ln -sfn /workspace/CodeLlama-34b-hf          ./models/codellama-34b
-ln -sfn /workspace/CodeLlama-7b-Instruct-hf  ./models/codellama-7b-instruct
-ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
-ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
-
-pip install -r requirements.txt
-
-# cmake
-cd /workspace/llama.cpp
-
-mkdir build-cublas
-cd build-cublas
-
-cmake -DGGML_CUDA=1 ../
-make -j
-
-if [ "$1" -eq "0" ]; then
-    exit 0
-fi
-
-# more models
-if [ "$1" -eq "1" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/tinyllama-1b  --outfile ./models/tinyllama-1b/ggml-model-f16.gguf  --outtype f16
-
-    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "2" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf  --without *safetensors*
-    rm -v ./CodeLlama-7b-hf/*safetensors*
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/codellama-7b  --outfile ./models/codellama-7b/ggml-model-f16.gguf  --outtype f16
-
-    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "3" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
-    rm -v ./CodeLlama-13b-hf/*safetensors*
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
-
-    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "4" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
-    rm -v ./CodeLlama-34b-hf/*safetensors*
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
-
-    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "5" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf  --without *safetensors*
-    rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/codellama-7b-instruct  --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf  --outtype f16
-
-    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "6" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
-    rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
-
-    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "7" ]; then
-    cd /workspace
-
-    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
-    rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
-
-    cd /workspace/llama.cpp
-
-    python3 examples/convert_legacy_llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
-
-    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
-    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
-    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "1" ]; then
-    # perf + perplexity
-    cd /workspace/llama.cpp/build-cublas
-
-    make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
-
-    ../scripts/get-wikitext-2.sh
-    unzip wikitext-2-raw-v1.zip
-
-    make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
-
-    # batched
-    cd /workspace/llama.cpp
-
-    GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
-
-    # batched-bench
-    cd /workspace/llama.cpp
-
-    GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
-
-    # parallel
-    cd /workspace/llama.cpp
-
-    GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
-
-fi
-
-# speculative
-#if [ "$1" -eq "7" ]; then
-#    cd /workspace/llama.cpp
-#
-#    GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
-#fi
-
-# more benches
-#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
-#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh
deleted file mode 100644
index 802592a3e..000000000
--- a/scripts/server-llm.sh
+++ /dev/null
@@ -1,418 +0,0 @@
-#!/bin/bash
-#
-# Helper script for deploying llama.cpp server with a single Bash command
-#
-# - Works on Linux and macOS
-# - Supports: CPU, CUDA, Metal
-# - Can run all GGUF models from HuggingFace
-# - Can serve requests in parallel
-# - Always builds latest llama.cpp from GitHub
-#
-# Limitations
-#
-# - Chat templates are poorly supported (base models recommended)
-# - Might be unstable!
-#
-# Usage:
-#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]
-#
-#   --port:            port number, default is 8888
-#   --repo:            path to a repo containing GGUF model files
-#   --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user-input
-#   --backend:         cpu, cuda, metal, depends on the OS
-#   --gpu-id:          gpu id, default is 0
-#   --n-parallel:      number of parallel requests, default is 8
-#   --n-kv:            KV cache size, default is 4096
-#   --verbose:         verbose output
-#   --non-interactive: run without asking a permission to run
-#
-# Example:
-#
-#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
-#
-
-set -e
-
-# required utils: curl, git, make
-if ! command -v curl &> /dev/null; then
-    printf "[-] curl not found\n"
-    exit 1
-fi
-if ! command -v git &> /dev/null; then
-    printf "[-] git not found\n"
-    exit 1
-fi
-if ! command -v make &> /dev/null; then
-    printf "[-] make not found\n"
-    exit 1
-fi
-
-# parse arguments
-is_interactive=1
-port=8888
-repo=""
-wtype=""
-backend="cpu"
-
-# if macOS, use metal backend by default
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
-fi
-
-gpu_id=0
-n_parallel=8
-n_kv=4096
-verbose=0
-
-function print_usage {
-    printf "Usage:\n"
-    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n"
-    printf "  --port:             port number, default is 8888\n"
-    printf "  --repo:             path to a repo containing GGUF model files\n"
-    printf "  --wtype:            weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend:          cpu, cuda, metal, depends on the OS\n"
-    printf "  --gpu-id:           gpu id, default is 0\n"
-    printf "  --n-parallel:       number of parallel requests, default is 8\n"
-    printf "  --n-kv:             KV cache size, default is 4096\n"
-    printf "  --verbose:          verbose output\n\n"
-    printf "  --non-interactive:  run without asking a permission to run\n"
-    printf "Example:\n\n"
-    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
-}
-
-while [[ $# -gt 0 ]]; do
-    key="$1"
-    case $key in
-        --non-interactive)
-            is_interactive=0
-            shift
-            ;;
-        --port)
-            port="$2"
-            shift
-            shift
-            ;;
-        --repo)
-            repo="$2"
-            shift
-            shift
-            ;;
-        --wtype)
-            wtype="$2"
-            shift
-            shift
-            ;;
-        --backend)
-            backend="$2"
-            shift
-            shift
-            ;;
-        --gpu-id)
-            gpu_id="$2"
-            shift
-            shift
-            ;;
-        --n-parallel)
-            n_parallel="$2"
-            shift
-            shift
-            ;;
-        --n-kv)
-            n_kv="$2"
-            shift
-            shift
-            ;;
-        --verbose)
-            verbose=1
-            shift
-            ;;
-        --help)
-            print_usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown argument: $key"
-            print_usage
-            exit 1
-            ;;
-    esac
-done
-
-# available weights types
-wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
-
-wfiles=()
-for wt in "${wtypes[@]}"; do
-    wfiles+=("")
-done
-
-# map wtype input to index
-if [[ ! -z "$wtype" ]]; then
-    iw=-1
-    is=0
-    for wt in "${wtypes[@]}"; do
-        # uppercase
-        uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
-        if [[ "$uwt" == "$wtype" ]]; then
-            iw=$is
-            break
-        fi
-        is=$((is+1))
-    done
-
-    if [[ $iw -eq -1 ]]; then
-        printf "[-] Invalid weight type: %s\n" "$wtype"
-        exit 1
-    fi
-
-    wtype="$iw"
-fi
-
-# sample repos
-repos=(
-    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
-    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
-    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
-    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
-    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
-)
-if [ $is_interactive -eq 1 ]; then
-    printf "\n"
-    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
-    printf "    Based on the options that follow, the script might download a model file\n"
-    printf "    from the internet, which can be a few GBs in size. The script will also\n"
-    printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
-    printf "\n"
-    printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
-    printf "    model using llama.cpp for demonstration purposes.\n"
-    printf "\n"
-    printf "    Please note:\n"
-    printf "\n"
-    printf "    - All new data will be stored in the current folder\n"
-    printf "    - The server will be listening on all network interfaces\n"
-    printf "    - The server will run with default settings which are not always optimal\n"
-    printf "    - Do not judge the quality of a model based on the results from this script\n"
-    printf "    - Do not use this script to benchmark llama.cpp\n"
-    printf "    - Do not use this script in production\n"
-    printf "    - This script is only for demonstration purposes\n"
-    printf "\n"
-    printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
-    printf "\n"
-    printf "    Press Enter to continue ...\n\n"
-
-    read
-fi
-
-if [[ -z "$repo" ]]; then
-    printf "[+] No repo provided from the command line\n"
-    printf "    Please select a number from the list below or enter an URL:\n\n"
-
-    is=0
-    for r in "${repos[@]}"; do
-        printf "    %2d) %s\n" $is "$r"
-        is=$((is+1))
-    done
-
-    # ask for repo until index of sample repo is provided or an URL
-    while [[ -z "$repo" ]]; do
-        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
-        read -p "[+] Select repo: " repo
-
-        # check if the input is a number
-        if [[ "$repo" =~ ^[0-9]+$ ]]; then
-            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
-                repo="${repos[$repo]}"
-            else
-                printf "[-] Invalid repo index: %s\n" "$repo"
-                repo=""
-            fi
-        elif [[ "$repo" =~ ^https?:// ]]; then
-            repo="$repo"
-        else
-            printf "[-] Invalid repo URL: %s\n" "$repo"
-            repo=""
-        fi
-    done
-fi
-
-# remove suffix
-repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
-
-printf "[+] Checking for GGUF model files in %s\n" "$repo"
-
-# find GGUF files in the source
-# TODO: better logic
-model_tree="${repo%/}/tree/main"
-model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
-
-# list all files in the provided git repo
-printf "[+] Model files:\n\n"
-for file in $model_files; do
-    # determine iw by grepping the filename with wtypes
-    iw=-1
-    is=0
-    for wt in "${wtypes[@]}"; do
-        # uppercase
-        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
-        if [[ "$ufile" =~ "$wt" ]]; then
-            iw=$is
-            break
-        fi
-        is=$((is+1))
-    done
-
-    if [[ $iw -eq -1 ]]; then
-        continue
-    fi
-
-    wfiles[$iw]="$file"
-
-    have=" "
-    if [[ -f "$file" ]]; then
-        have="*"
-    fi
-
-    printf "    %2d) %s %s\n" $iw "$have" "$file"
-done
-
-wfile="${wfiles[$wtype]}"
-
-# ask for weights type until provided and available
-while [[ -z "$wfile" ]]; do
-    printf "\n"
-    read -p "[+] Select weight type: " wtype
-    wfile="${wfiles[$wtype]}"
-
-    if [[ -z "$wfile" ]]; then
-        printf "[-] Invalid weight type: %s\n" "$wtype"
-        wtype=""
-    fi
-done
-
-printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
-
-url="${repo%/}/resolve/main/$wfile"
-
-# check file if the model has been downloaded before
-chk="$wfile.chk"
-
-# check if we should download the file
-# - if $wfile does not exist
-# - if $wfile exists but $chk does not exist
-# - if $wfile exists and $chk exists but $wfile is newer than $chk
-# TODO: better logic using git lfs info
-
-do_download=0
-
-if [[ ! -f "$wfile" ]]; then
-    do_download=1
-elif [[ ! -f "$chk" ]]; then
-    do_download=1
-elif [[ "$wfile" -nt "$chk" ]]; then
-    do_download=1
-fi
-
-if [[ $do_download -eq 1 ]]; then
-    printf "[+] Downloading weights from %s\n" "$url"
-
-    # download the weights file
-    curl -o "$wfile" -# -L "$url"
-
-    # create a check file if successful
-    if [[ $? -eq 0 ]]; then
-        printf "[+] Creating check file %s\n" "$chk"
-        touch "$chk"
-    fi
-else
-    printf "[+] Using cached weights %s\n" "$wfile"
-fi
-
-# get latest llama.cpp and build
-
-printf "[+] Downloading latest llama.cpp\n"
-
-llama_cpp_dir="__llama_cpp_port_${port}__"
-
-if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
-    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
-    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[-] Please remove it and try again\n"
-    exit 1
-elif [[ -d "$llama_cpp_dir" ]]; then
-    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[+] Using cached llama.cpp\n"
-
-    cd "$llama_cpp_dir"
-    git reset --hard
-    git fetch
-    git checkout origin/master
-
-    cd ..
-else
-    printf "[+] Cloning llama.cpp\n"
-
-    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
-fi
-
-# mark that that the directory is made by this script
-touch "$llama_cpp_dir/__ggml_script__"
-
-if [[ $verbose -eq 1 ]]; then
-    set -x
-fi
-
-# build
-cd "$llama_cpp_dir"
-
-make clean
-
-log="--silent"
-if [[ $verbose -eq 1 ]]; then
-    log=""
-fi
-
-if [[ "$backend" == "cuda" ]]; then
-    printf "[+] Building with CUDA backend\n"
-    GGML_CUDA=1 make -j llama-server $log
-elif [[ "$backend" == "cpu" ]]; then
-    printf "[+] Building with CPU backend\n"
-    make -j llama-server $log
-elif [[ "$backend" == "metal" ]]; then
-    printf "[+] Building with Metal backend\n"
-    make -j llama-server $log
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-# run the server
-
-printf "[+] Running server\n"
-
-args=""
-if [[ "$backend" == "cuda" ]]; then
-    export CUDA_VISIBLE_DEVICES=$gpu_id
-    args="-ngl 999"
-elif [[ "$backend" == "cpu" ]]; then
-    args="-ngl 0"
-elif [[ "$backend" == "metal" ]]; then
-    args="-ngl 999"
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-if [[ $verbose -eq 1 ]]; then
-    args="$args --verbose"
-fi
-
-./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
-
-exit 0

From 642330ac7cea11dc1f9d3df2b8f3dbd10b5e3f3e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Mon, 2 Dec 2024 22:10:19 +0100
Subject: [PATCH 373/396] llama : add enum for built-in chat templates (#10623)

* llama : add enum for supported chat templates

* use "built-in" instead of "supported"

* arg: print list of built-in templates

* fix test

* update server README
---
 common/arg.cpp               |  20 +-
 examples/server/README.md    |  11 +-
 include/llama.h              |   3 +
 src/llama.cpp                | 363 +++++++++++++++++++++++++----------
 tests/test-chat-template.cpp |  20 +-
 5 files changed, 310 insertions(+), 107 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 32d9a964c..078c75384 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -348,6 +348,18 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
     return true;
 }
 
+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+    int32_t res = llama_chat_builtin_templates(nullptr, 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::ostringstream msg;
+    for (auto & tmpl : supported_tmpl) {
+        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // load dynamic backends
     ggml_backend_load_all();
@@ -1814,9 +1826,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
-        "set custom jinja chat template (default: template taken from model's metadata)\n"
-        "if suffix/prefix are specified, template will be disabled\n"
-        "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
+        string_format(
+            "set custom jinja chat template (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
         [](common_params & params, const std::string & value) {
             if (!common_chat_verify_template(value)) {
                 throw std::runtime_error(string_format(
diff --git a/examples/server/README.md b/examples/server/README.md
index aa99d06f9..3f0d45e5b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -69,6 +69,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing(env: LLAMA_ARG_MLOCK) |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)(env: LLAMA_ARG_NO_MMAP) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems- distribute: spread execution evenly over all nodes- isolate: only spawn threads on CPUs on the node that execution started on- numactl: use the CPU map provided by numactlif run without this previously, it is recommended to drop the system page cache before using thissee https://github.com/ggerganov/llama.cpp/issues/1437(env: LLAMA_ARG_NUMA) |
+| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)use --list-devices to see a list of available devices(env: LLAMA_ARG_DEVICE) |
+| `--list-devices` | print list of available devices and exit |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:- none: use one GPU only- layer (default): split layers and KV across GPUs- row: split rows across GPUs(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1(env: LLAMA_ARG_TENSOR_SPLIT) |
@@ -158,9 +160,16 @@ The project is under active development, and we are [looking for feedback and co
 | `--props` | enable changing global properties via POST /props (default: disabled)(env: LLAMA_ARG_ENDPOINT_PROPS) |
 | `--no-slots` | disables slots monitoring endpoint(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)if suffix/prefix are specified, template will be disabledonly commonly used templates are accepted:https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)if suffix/prefix are specified, template will be disabledlist of built-in templates:chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
+| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)use --list-devices to see a list of available devices |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
 
 
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
diff --git a/include/llama.h b/include/llama.h
index ab5e376e6..439e0ff0c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -990,6 +990,9 @@ extern "C" {
                                   char * buf,
                                int32_t   length);
 
+    // Get list of built-in chat templates
+    int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
     //
     // Sampling API
     //
diff --git a/src/llama.cpp b/src/llama.cpp
index 6e9ba9727..6a6f4c2a5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1549,6 +1549,67 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
+enum llm_chat_template {
+    LLM_CHAT_TEMPLATE_CHATML,
+    LLM_CHAT_TEMPLATE_LLAMA_2,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+    LLM_CHAT_TEMPLATE_MISTRAL_V1,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_ZEPHYR,
+    LLM_CHAT_TEMPLATE_MONARCH,
+    LLM_CHAT_TEMPLATE_GEMMA,
+    LLM_CHAT_TEMPLATE_ORION,
+    LLM_CHAT_TEMPLATE_OPENCHAT,
+    LLM_CHAT_TEMPLATE_VICUNA,
+    LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+    LLM_CHAT_TEMPLATE_DEEPSEEK,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+    LLM_CHAT_TEMPLATE_COMMAND_R,
+    LLM_CHAT_TEMPLATE_LLAMA_3,
+    LLM_CHAT_TEMPLATE_CHATGML_3,
+    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_MINICPM,
+    LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_RWKV_WORLD,
+    LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_UNKNOWN,
+};
+
+static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+    { "chatml",            LLM_CHAT_TEMPLATE_CHATML            },
+    { "llama2",            LLM_CHAT_TEMPLATE_LLAMA_2           },
+    { "llama2-sys",        LLM_CHAT_TEMPLATE_LLAMA_2_SYS       },
+    { "llama2-sys-bos",    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS   },
+    { "llama2-sys-strip",  LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+    { "mistral-v1",        LLM_CHAT_TEMPLATE_MISTRAL_V1        },
+    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
+    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
+    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
+    { "zephyr",            LLM_CHAT_TEMPLATE_ZEPHYR            },
+    { "monarch",           LLM_CHAT_TEMPLATE_MONARCH           },
+    { "gemma",             LLM_CHAT_TEMPLATE_GEMMA             },
+    { "orion",             LLM_CHAT_TEMPLATE_ORION             },
+    { "openchat",          LLM_CHAT_TEMPLATE_OPENCHAT          },
+    { "vicuna",            LLM_CHAT_TEMPLATE_VICUNA            },
+    { "vicuna-orca",       LLM_CHAT_TEMPLATE_VICUNA_ORCA       },
+    { "deepseek",          LLM_CHAT_TEMPLATE_DEEPSEEK          },
+    { "deepseek2",         LLM_CHAT_TEMPLATE_DEEPSEEK_2        },
+    { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R         },
+    { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3           },
+    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3         },
+    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGML_4         },
+    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
+    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
+    { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
+    { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
+};
+
 static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
@@ -21843,18 +21904,109 @@ int32_t llama_detokenize(
 // chat templates
 //
 
+static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
+    if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
+        return LLM_CHAT_TEMPLATES.at(tmpl);
+    }
+    auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
+        return tmpl.find(haystack) != std::string::npos;
+    };
+    if (tmpl_contains("<|im_start|>")) {
+        return LLM_CHAT_TEMPLATE_CHATML;
+    } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
+        if (tmpl_contains("[SYSTEM_PROMPT]")) {
+            return LLM_CHAT_TEMPLATE_MISTRAL_V7;
+        } else if (
+            // catches official 'v1' template
+            tmpl_contains("' [INST] ' + system_message")
+            // catches official 'v3' and 'v3-tekken' templates
+            || tmpl_contains("[AVAILABLE_TOOLS]")
+        ) {
+            // Official mistral 'v1', 'v3' and 'v3-tekken' templates
+            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+            if (tmpl_contains(" [INST]")) {
+                return LLM_CHAT_TEMPLATE_MISTRAL_V1;
+            } else if (tmpl_contains("\"[INST]\"")) {
+                return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
+            }
+            return LLM_CHAT_TEMPLATE_MISTRAL_V3;
+        } else {
+            // llama2 template and its variants
+            // [variant] support system message
+            // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+            bool support_system_message = tmpl_contains("<<SYS>>");
+            bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
+            bool strip_message = tmpl_contains("content.strip()");
+            if (strip_message) {
+                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+            } else if (add_bos_inside_history) {
+                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+            } else if (support_system_message) {
+                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
+            } else {
+                return LLM_CHAT_TEMPLATE_LLAMA_2;
+            }
+        }
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
+        return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
+        return LLM_CHAT_TEMPLATE_ZEPHYR;
+    } else if (tmpl_contains("bos_token + message['role']")) {
+        return LLM_CHAT_TEMPLATE_MONARCH;
+    } else if (tmpl_contains("<start_of_turn>")) {
+        return LLM_CHAT_TEMPLATE_GEMMA;
+    } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
+        // OrionStarAI/Orion-14B-Chat
+        return LLM_CHAT_TEMPLATE_ORION;
+    } else if (tmpl_contains("GPT4 Correct ")) {
+        // openchat/openchat-3.5-0106
+        return LLM_CHAT_TEMPLATE_OPENCHAT;
+    } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
+        // eachadea/vicuna-13b-1.1 (and Orca variant)
+        if (tmpl_contains("SYSTEM: ")) {
+            return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
+        }
+        return LLM_CHAT_TEMPLATE_VICUNA;
+    } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
+        // deepseek-ai/deepseek-coder-33b-instruct
+        return LLM_CHAT_TEMPLATE_DEEPSEEK;
+    } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
+        // CohereForAI/c4ai-command-r-plus
+        return LLM_CHAT_TEMPLATE_COMMAND_R;
+    } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA_3;
+    } else if (tmpl_contains("[gMASK]sop")) {
+        // chatglm3-6b
+        return LLM_CHAT_TEMPLATE_CHATGML_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGML_4;
+    } else if (tmpl_contains(LU8("<用户>"))) {
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        return LLM_CHAT_TEMPLATE_MINICPM;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+        return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
+    } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
+        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+        // EXAONE-3.0-7.8B-Instruct
+        return LLM_CHAT_TEMPLATE_EXAONE_3;
+    } else if (tmpl_contains("rwkv-world")) {
+        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
+    } else if (tmpl_contains("<|start_of_role|>")) {
+        return LLM_CHAT_TEMPLATE_GRANITE;
+    }
+    return LLM_CHAT_TEMPLATE_UNKNOWN;
+}
+
 // Simple version of "llama_apply_chat_template" that only works with strings
 // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
 static int32_t llama_chat_apply_template_internal(
-    const std::string & tmpl,
+    const llm_chat_template tmpl,
     const std::vector<const llama_chat_message *> & chat,
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
-        return tmpl.find(haystack) != std::string::npos;
-    };
-    if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
+    if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -21862,86 +22014,84 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
-        if (tmpl == "mistral-v7" || tmpl_contains("[SYSTEM_PROMPT]")) {
-            // Official mistral 'v7' template
-            // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
-            for (auto message : chat) {
-                std::string role(message->role);
-                std::string content(message->content);
-                if (role == "system") {
-                    ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
-                } else if (role == "user") {
-                    ss << "[INST] " << content << "[/INST]";
-                }
-                else {
-                    ss << " " << content << "</s>";
-                }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+        // Official mistral 'v7' template
+        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        for (auto message : chat) {
+            std::string role(message->role);
+            std::string content(message->content);
+            if (role == "system") {
+                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+            } else if (role == "user") {
+                ss << "[INST] " << content << "[/INST]";
             }
-        } else if (tmpl == "mistral-v1" || tmpl == "mistral-v3" || tmpl == "mistral-v3-tekken"
-                   || tmpl_contains("' [INST] ' + system_message") // catches official 'v1' template
-                   || tmpl_contains("[AVAILABLE_TOOLS]")) {        // catches official 'v3' and 'v3-tekken' templates
-            // Official mistral 'v1', 'v3' and 'v3-tekken' templates
-            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
-            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
-            std::string leading_space = (tmpl == "mistral-v1" || tmpl_contains(" [INST]") ? " " : "");
-            std::string trailing_space = (tmpl == "mistral-v3-tekken" || tmpl_contains("\"[INST]\"") ? "" : " ");
-            bool trim_assistant_message = tmpl_contains("|trim + eos_token");
-            bool is_inside_turn = false;
-            for (auto message : chat) {
-                if (!is_inside_turn) {
-                    ss << leading_space << "[INST]" << trailing_space;
-                    is_inside_turn = true;
-                }
-                std::string role(message->role);
-                std::string content(message->content);
-                if (role == "system") {
-                    ss << content << "\n\n";
-                } else if (role == "user") {
-                    ss << content << leading_space << "[/INST]";
-                } else {
-                    ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
-                    is_inside_turn = false;
-                }
+            else {
+                ss << " " << content << "</s>";
             }
-        } else {
-            // llama2 template and its variants
-            // [variant] support system message
-            // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
-            bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "llama2";
-            // [variant] space before + after response
-            bool space_around_response = tmpl_contains("' ' + eos_token");
-            // [variant] add BOS inside history
-            bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
-            // [variant] trim spaces from the input message
-            bool strip_message = tmpl_contains("content.strip()");
-            // construct the prompt
-            bool is_inside_turn = true; // skip BOS at the beginning
-            ss << "[INST] ";
-            for (auto message : chat) {
-                std::string content = strip_message ? trim(message->content) : message->content;
-                std::string role(message->role);
-                if (!is_inside_turn) {
-                    is_inside_turn = true;
-                    ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
-                }
-                if (role == "system") {
-                    if (support_system_message) {
-                        ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
-                    } else {
-                        // if the model does not support system message, we still include it in the first message, but without <<SYS>>
-                        ss << content << "\n";
-                    }
-                } else if (role == "user") {
-                    ss << content << " [/INST]";
-                } else {
-                    ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
-                    is_inside_turn = false;
-                }
-            }
-            // llama2 templates seem to not care about "add_generation_prompt
         }
-    } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
+            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
+            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
+        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+        std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
+        std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
+        bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
+        bool is_inside_turn = false;
+        for (auto message : chat) {
+            if (!is_inside_turn) {
+                ss << leading_space << "[INST]" << trailing_space;
+                is_inside_turn = true;
+            }
+            std::string role(message->role);
+            std::string content(message->content);
+            if (role == "system") {
+                ss << content << "\n\n";
+            } else if (role == "user") {
+                ss << content << leading_space << "[/INST]";
+            } else {
+                ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+                is_inside_turn = false;
+            }
+        }
+    } else if (
+            tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
+        // llama2 template and its variants
+        // [variant] support system message
+        // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+        bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << content << "</s>";
+                is_inside_turn = false;
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
         // Phi 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -21950,7 +22100,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21958,7 +22108,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21967,7 +22117,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -21989,7 +22139,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<start_of_turn>model\n";
         }
-    } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
         // OrionStarAI/Orion-14B-Chat
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -22009,7 +22159,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "</s>";
             }
         }
-    } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
         // openchat/openchat-3.5-0106,
         for (auto message : chat) {
             std::string role(message->role);
@@ -22023,13 +22173,13 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "GPT4 Correct Assistant:";
         }
-    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
         // eachadea/vicuna-13b-1.1 (and Orca variant)
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "system") {
                 // Orca-Vicuna variant uses a system prefix
-                if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
+                if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
                     ss << "SYSTEM: " << message->content << "\n";
                 } else {
                     ss << message->content << "\n\n";
@@ -22043,7 +22193,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "ASSISTANT:";
         }
-    } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
         // deepseek-ai/deepseek-coder-33b-instruct
         for (auto message : chat) {
             std::string role(message->role);
@@ -22058,7 +22208,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
-    } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
         // CohereForAI/c4ai-command-r-plus
         for (auto message : chat) {
             std::string role(message->role);
@@ -22073,7 +22223,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
-    } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
         // Llama 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -22082,7 +22232,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -22092,7 +22242,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -22101,7 +22251,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
             std::string role(message->role);
@@ -22113,7 +22263,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << trim(message->content);
             }
         }
-    } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
         // DeepSeek-V2
         for (auto message : chat) {
             std::string role(message->role);
@@ -22128,7 +22278,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "Assistant:";
         }
-    } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
         // EXAONE-3.0-7.8B-Instruct
         for (auto message : chat) {
@@ -22144,7 +22294,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
-    } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
         // this template requires the model to have "\n\n" as EOT token
         for (auto message : chat) {
             std::string role(message->role);
@@ -22154,7 +22304,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "\n\n";
             }
         }
-    } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
         // IBM Granite template
         for (const auto & message : chat) {
             std::string role(message->role);
@@ -22206,7 +22356,11 @@ int32_t llama_chat_apply_template(
     }
 
     std::string formatted_chat;
-    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
+    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+        return -1;
+    }
+    int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
     if (res < 0) {
         return res;
     }
@@ -22216,6 +22370,15 @@ int32_t llama_chat_apply_template(
     return res;
 }
 
+int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+    auto it = LLM_CHAT_TEMPLATES.begin();
+    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+        output[i] = it->first.c_str();
+        std::advance(it, 1);
+    }
+    return (int32_t) LLM_CHAT_TEMPLATES.size();
+}
+
 //
 // sampling
 //
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index dd8f7d5f0..aa140b569 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -82,9 +82,9 @@ int main(void) {
         // mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt)
         "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
         // TheBloke/FusionNet_34Bx2_MoE-AWQ
-        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
+        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s><s>[INST] Who are you [/INST]   I am an assistant   </s><s>[INST] Another question [/INST]",
         // bofenghuang/vigogne-2-70b-chat
-        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s>[INST] Who are you [/INST]I am an assistant</s>[INST] Another question [/INST]",
         // mlabonne/AlphaMonarch-7B
         "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
         // google/gemma-7b-it
@@ -133,6 +133,17 @@ int main(void) {
     std::vector<char> formatted_chat(1024);
     int32_t res;
 
+    // list all supported templates
+    std::vector<const char *> supported_tmpl;
+    res = llama_chat_builtin_templates(nullptr, 0);
+    assert(res > 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    printf("Built-in chat templates:\n");
+    for (auto tmpl : supported_tmpl) {
+        printf("  %s\n", tmpl);
+    }
+
     // test invalid chat template
     res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
     assert(res < 0);
@@ -174,7 +185,8 @@ int main(void) {
     assert(fmt_sys("mistral-v3") == "[INST] You are a helpful assistant\n\n");
     assert(fmt_sys("mistral-v3-tekken") == "[INST]You are a helpful assistant\n\n");
     assert(fmt_sys("mistral-v7") == "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT]");
-    assert(fmt_sys("llama2") == "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\n");
+    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("llama2-sys") == "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\n");
     assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
     assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
     assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
@@ -203,5 +215,7 @@ int main(void) {
     assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
     assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
 
+    printf("Test chat templates: OK\n");
+
     return 0;
 }

From 70b98fadbc8c07a0144f3b50a4d7ab7e2aeff878 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 3 Dec 2024 11:20:00 +0200
Subject: [PATCH 374/396] server : fix default draft model parameters (#10586)

* server : force F16 KV cache for the draft model

ggml-ci

* server : fix draft params

ggml-ci

* server : various params fixes

ggml-ci
---
 examples/server/server.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8eca14b86..2ec13d7d2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -696,8 +696,9 @@ struct server_context {
 
             params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+            params_dft.n_parallel   = 1;
 
             common_init_result llama_init_dft = common_init_from_params(params_dft);
 
@@ -717,8 +718,14 @@ struct server_context {
                 return false;
             }
 
-            cparams_dft = common_context_params_to_llama(params_base);
-            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+
+            cparams_dft = common_context_params_to_llama(params_dft);
+            cparams_dft.n_batch = n_ctx_dft;
+
+            // force F16 KV cache for the draft model for extra performance
+            cparams_dft.type_k = GGML_TYPE_F16;
+            cparams_dft.type_v = GGML_TYPE_F16;
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
@@ -2322,6 +2329,10 @@ struct server_context {
                     continue;
                 }
 
+                if (slot.state != SLOT_STATE_GENERATING) {
+                    continue;
+                }
+
                 llama_token id = slot.sampled;
 
                 struct common_speculative_params params_spec;

From 844e2e1feec887c803845a3b8762f3b15979b095 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 3 Dec 2024 11:20:35 +0200
Subject: [PATCH 375/396] github : minify link [no ci]

---
 .github/pull_request_template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index d9f5bdc23..0b8aba061 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1 +1 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
+*Make sure to read the [contributing guidelines](CONTRIBUTING.md) before submitting a PR*

From 515d4e53727924e48774f45ecb15bdacbf851e13 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 3 Dec 2024 11:21:43 +0200
Subject: [PATCH 376/396] github : minify link [no ci] (revert)

this doesn't work as expected
---
 .github/pull_request_template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 0b8aba061..d9f5bdc23 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1 +1 @@
-*Make sure to read the [contributing guidelines](CONTRIBUTING.md) before submitting a PR*
+*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*

From 0115df2f65ac7c64dd0e5915c72ecc4a9343a130 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 3 Dec 2024 11:52:33 +0200
Subject: [PATCH 377/396] metal : small-batch mat-mul kernels (#10581)

* metal : small-batch mat-mul kernels

ggml-ci

* metal : add rest of types

ggml-ci

* metal : final adjustments

ggml-ci

* metal : add comments

ggml-ci
---
 ggml/src/ggml-metal/ggml-metal-impl.h |  24 ++
 ggml/src/ggml-metal/ggml-metal.m      | 274 +++++++++++++++--
 ggml/src/ggml-metal/ggml-metal.metal  | 427 +++++++++++++++++++++++++-
 tests/test-backend-ops.cpp            |  13 +
 4 files changed, 709 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index 53c135496..e19586719 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -192,6 +192,30 @@ typedef struct {
     int16_t  r3;
 } ggml_metal_kargs_mul_mv;
 
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int16_t  r2;
+    int16_t  r3;
+    int16_t  nsg;
+    int16_t  nxpsg;
+    int16_t  r1ptg;
+} ggml_metal_kargs_mul_mv_ext;
+
 typedef struct {
     int32_t  nei0;
     int32_t  nei1;
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index ae6b25ede..c247b50c9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -175,6 +175,46 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,
@@ -699,6 +739,46 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,               mul_mv_q5_0_f32,                has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,               mul_mv_q5_1_f32,                has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,               mul_mv_q8_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,       mul_mv_ext_f16_f32_r1_2,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,       mul_mv_ext_f16_f32_r1_3,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,       mul_mv_ext_f16_f32_r1_4,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5,       mul_mv_ext_f16_f32_r1_5,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2,      mul_mv_ext_q4_0_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3,      mul_mv_ext_q4_0_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4,      mul_mv_ext_q4_0_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5,      mul_mv_ext_q4_0_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2,      mul_mv_ext_q4_1_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3,      mul_mv_ext_q4_1_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4,      mul_mv_ext_q4_1_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5,      mul_mv_ext_q4_1_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2,      mul_mv_ext_q5_0_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3,      mul_mv_ext_q5_0_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4,      mul_mv_ext_q5_0_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5,      mul_mv_ext_q5_0_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2,      mul_mv_ext_q5_1_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3,      mul_mv_ext_q5_1_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4,      mul_mv_ext_q5_1_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5,      mul_mv_ext_q5_1_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2,      mul_mv_ext_q8_0_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,      mul_mv_ext_q8_0_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,      mul_mv_ext_q8_0_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,      mul_mv_ext_q8_0_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,      mul_mv_ext_q4_K_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,      mul_mv_ext_q4_K_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,      mul_mv_ext_q4_K_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5,      mul_mv_ext_q4_K_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2,      mul_mv_ext_q5_K_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3,      mul_mv_ext_q5_K_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4,      mul_mv_ext_q5_K_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5,      mul_mv_ext_q5_K_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2,      mul_mv_ext_q6_K_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3,      mul_mv_ext_q6_K_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4,      mul_mv_ext_q6_K_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5,      mul_mv_ext_q6_K_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2,    mul_mv_ext_iq4_nl_f32_r1_2,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3,    mul_mv_ext_iq4_nl_f32_r1_3,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4,    mul_mv_ext_iq4_nl_f32_r1_4,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5,    mul_mv_ext_iq4_nl_f32_r1_5,     has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,               mul_mv_q2_K_f32,                has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,               mul_mv_q3_K_f32,                has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,               mul_mv_q4_K_f32,                has_simdgroup_reduction);
@@ -1928,30 +2008,180 @@ static void ggml_metal_encode_node(
 
                 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                 // to the matrix-vector kernel
-                int ne11_mm_min = 4;
+                const int ne11_mm_min = 4;
 
-#if 0
-                // the numbers below are measured on M2 Ultra for 7B and 13B models
-                // these numbers do not translate to other devices or model sizes
-                // TODO: need to find a better approach
-                        if ([device.name isEqualToString:@"Apple M2 Ultra"]) {
-                            switch (src0t) {
-                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q4_0:
-                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                case GGML_TYPE_Q5_0:                          // not tested yet
-                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                default:             ne11_mm_min = 1;  break;
-                            }
-                        }
-#endif
+                // first try to use small-batch mat-mv kernels
+                // these should be efficient for BS [2, ~8]
+                if (src1t == GGML_TYPE_F32 && (ne00%256 == 0) &&
+                    (
+                     (
+                      (
+                       src0t == GGML_TYPE_F16  || // TODO: helper function
+                       src0t == GGML_TYPE_Q4_0 ||
+                       src0t == GGML_TYPE_Q4_1 ||
+                       src0t == GGML_TYPE_Q5_0 ||
+                       src0t == GGML_TYPE_Q5_1 ||
+                       src0t == GGML_TYPE_Q8_0 ||
+                       src0t == GGML_TYPE_IQ4_NL ||
+                       false) && (ne11 >= 2 && ne11 <= 8)
+                     ) ||
+                     (
+                      (
+                       src0t == GGML_TYPE_Q4_K ||
+                       src0t == GGML_TYPE_Q5_K ||
+                       src0t == GGML_TYPE_Q6_K ||
+                       false) && (ne11 >= 4 && ne11 <= 8)
+                     )
+                    )
+                   ) {
+                    // TODO: determine the optimal parameters based on grid utilization
+                    //       I still don't know why we should not always use the maximum available threads:
+                    //
+                    //       nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
+                    //
+                    //       my current hypothesis is that the work grid is not evenly divisible for different nsg
+                    //       values and there can be some tail effects when nsg is high. need to confirm this
+                    //
+                    const int nsg    = 2;                 // num simdgroups per threadgroup
+                    const int nxpsg  = ne11 < 3 ? 16 : 8; // num threads along row per simdgroup
+                    const int nypsg  = 32/nxpsg;          // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
+                    const int r0ptg  = nypsg*nsg;         // num src0 rows per threadgroup
+                          int r1ptg  = 4;                 // num src1 rows per threadgroup
 
+                    // note: not sure how optimal are those across all different hardware. there might be someting cleverer
+                    switch (ne11) {
+                        case 2:
+                            r1ptg = 2; break;
+                        case 3:
+                        case 6:
+                            r1ptg = 3; break;
+                        case 4:
+                        case 7:
+                        case 8:
+                            r1ptg = 4; break;
+                        case 5:
+                            r1ptg = 5; break;
+                    };
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    switch (src0->type) {
+                        case GGML_TYPE_F16:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        default: GGML_ABORT("not implemented");
+                    }
+
+                    ggml_metal_kargs_mul_mv_ext args = {
+                        /*.ne00  =*/ ne00,
+                        /*.ne01  =*/ ne01,
+                        /*.ne02  =*/ ne02,
+                        /*.nb00  =*/ nb00,
+                        /*.nb01  =*/ nb01,
+                        /*.nb02  =*/ nb02,
+                        /*.nb03  =*/ nb03,
+                        /*.ne10  =*/ ne10,
+                        /*.ne11  =*/ ne11,
+                        /*.ne12  =*/ ne12,
+                        /*.nb10  =*/ nb10,
+                        /*.nb11  =*/ nb11,
+                        /*.nb12  =*/ nb12,
+                        /*.nb13  =*/ nb13,
+                        /*.ne0   =*/ ne0,
+                        /*.ne1   =*/ ne1,
+                        /*.r2    =*/ r2,
+                        /*.r3    =*/ r3,
+                        /*.nsg   =*/ nsg,
+                        /*.nxpsg =*/ nxpsg,
+                        /*.r1ptg =*/ r1ptg,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+
+                    //printf("ne01 = %lld nr0ptg = %d\n", ne01, nr0ptg);
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + r0ptg - 1)/r0ptg, (ne11 + r1ptg - 1)/r1ptg, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                } else
                 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                 if ([device supportsFamily:MTLGPUFamilyApple7] &&
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index eaca38864..7567f3262 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -47,6 +47,11 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
     reg = (type4x4)(*src);
 }
 
+template <typename type4>
+void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) {
+    reg = (type4)(*(src + il));
+}
+
 #if defined(GGML_METAL_USE_BF16)
 template <typename type4x4>
 void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
@@ -55,7 +60,7 @@ void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & re
 #endif
 
 template <typename type4x4>
-void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
+void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) {
     device const uint16_t * qs = ((device const uint16_t *)xb + 1);
     const float d1 = il ? (xb->d / 16.h) : xb->d;
     const float d2 = d1 / 256.f;
@@ -73,8 +78,23 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
     reg = (type4x4) reg_f;
 }
 
+template <typename type4>
+void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
+    const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.h * xb->d;
+    const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i = 0; i < 2; i++) {
+        reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + md;
+        reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + md;
+    }
+}
+
 template <typename type4x4>
-void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
+void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) {
     device const uint16_t * qs = ((device const uint16_t *)xb + 2);
     const float d1 = il ? (xb->d / 16.h) : xb->d;
     const float d2 = d1 / 256.f;
@@ -92,8 +112,23 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
     reg = (type4x4) reg_f;
 }
 
+template <typename type4>
+void dequantize_q4_1_t4(device const block_q4_1 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
+    const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb->m;
+    const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i = 0; i < 2; i++) {
+        reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + m;
+        reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + m;
+    }
+}
+
 template <typename type4x4>
-void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
+void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) {
     device const uint16_t * qs = ((device const uint16_t *)xb + 3);
     const float d = xb->d;
     const float md = -16.h * xb->d;
@@ -124,8 +159,38 @@ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg
     reg = (type4x4) reg_f;
 }
 
+template <typename type4>
+void dequantize_q5_0_t4(device const block_q5_0 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
+    const float d = xb->d;
+    const float md = -16.h * xb->d;
+    const ushort mask = (il/4) ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = (il/4) ? 4 : 0;
+
+    const int gh_mv = (il/4) ? 12 : 0;
+    const int gh_bk = (il/4) ?  0 : 4;
+
+    for (int ii = 0; ii < 2; ii++) {
+        int i = 2*(il%4) + ii;
+
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[2*ii + 0] = d * x0 + md;
+        reg[2*ii + 1] = d * x1 + md;
+    }
+}
+
 template <typename type4x4>
-void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
+void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) {
     device const uint16_t * qs = ((device const uint16_t *)xb + 4);
     const float d = xb->d;
     const float m = xb->m;
@@ -156,10 +221,40 @@ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg
     reg = (type4x4) reg_f;
 }
 
+template <typename type4>
+void dequantize_q5_1_t4(device const block_q5_1 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
+    const float d = xb->d;
+    const float m = xb->m;
+    const ushort mask = (il/4) ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = (il/4) ? 4 : 0;
+
+    const int gh_mv = (il/4) ? 12 : 0;
+    const int gh_bk = (il/4) ?  0 : 4;
+
+    for (int ii = 0; ii < 2; ii++) {
+        int i = 2*(il%4) + ii;
+
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[2*ii + 0] = d * x0 + m;
+        reg[2*ii + 1] = d * x1 + m;
+    }
+}
+
 template <typename type4x4>
 void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
     device const int8_t * qs = ((device const int8_t *)xb->qs);
-    const half d = xb->d;
+    const float d = xb->d;
 
     float4x4 reg_f;
 
@@ -170,6 +265,16 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg
     reg = (type4x4) reg_f;
 }
 
+template <typename type4>
+void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & reg) {
+    device const int8_t * qs = ((device const int8_t *)xb->qs);
+    const float d = xb->d;
+
+    for (int i = 0; i < 4; i++) {
+        reg[i] = (qs[4*(il%4) + i + 16*(il/4)] * d);
+    }
+}
+
 template <typename type4x4>
 void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
     const float d = xb->d;
@@ -224,7 +329,7 @@ static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q
 }
 
 template <typename type4x4>
-void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
+void dequantize_q4_K(device const block_q4_K * xb, short il, thread type4x4 & reg) {
     device const uchar * q = xb->qs;
 
     short is = (il/4) * 2;
@@ -236,7 +341,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     const float dl = d * sc[0];
     const float ml = min * sc[1];
 
-    const ushort mask = il<2 ? 0x0F : 0xF0;
+    const ushort mask = il < 2 ? 0x0F : 0xF0;
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
     }
@@ -469,6 +574,19 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
     }
 }
 
+template <typename type4>
+void dequantize_iq4_nl_t4(device const block_iq4_nl * xb, short il, thread type4 & reg) {
+    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
+    const float d = xb->d;
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    aux32 = ((q4[2*(il%4)] | (q4[2*(il%4)+1] << 16)) >> 4*(il/4)) & 0x0f0f0f0f;
+    reg[0] = d * kvalues_iq4nl_f[q8[0]];
+    reg[1] = d * kvalues_iq4nl_f[q8[1]];
+    reg[2] = d * kvalues_iq4nl_f[q8[2]];
+    reg[3] = d * kvalues_iq4nl_f[q8[3]];
+}
+
 template <typename type4x4>
 void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
     // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
@@ -1752,6 +1870,301 @@ kernel void kernel_mul_mv_q8_0_f32(
     kernel_mul_mv_q8_0_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
+// mat-vec kernel processing in chunks of float4
+// chpb - chunks per quantization block
+template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4)(device const q_t *, short, thread float4 &) >
+void kernel_mul_mv_ext_q4_f32_impl(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short chpt = 4; // chunks per thread
+
+  //const short nxpsg = (32);
+    const short nypsg = (32/nxpsg);
+
+    const short tx = tiisg%nxpsg;
+    const short ty = tiisg/nxpsg;
+
+    const int i01 = tgpig.x*(nypsg*args.nsg) + nypsg*sgitg + ty;
+    const int i11 = tgpig.y*r1ptg;
+    const int i1m = tgpig.z;
+
+    const int i12 = i1m%args.ne12;
+    const int i13 = i1m/args.ne12;
+
+    const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 = i11*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
+
+    device const float4 * y4[r1ptg];
+
+    for (int ir1 = 0; ir1 < r1ptg; ++ir1) {
+        y4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4 *) src1;
+    }
+
+    float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
+
+    short cch = tx%chpb; // current chunk index
+
+    for (int ich = tx; 4*ich < args.ne00; ich += chpt*nxpsg) {
+        float4 lx[chpt];
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+            deq_t4(xq, cch, lx[ch]);
+
+            cch += nxpsg;
+            if (cch >= chpb) {
+                xq  += cch/chpb;
+                cch %= chpb;
+            }
+        }
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+#pragma unroll(r1ptg)
+            for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+                sumf[ir1] += dot(lx[ch], y4[ir1][ch*nxpsg]);
+
+            }
+        }
+
+#pragma unroll(r1ptg)
+        for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+            y4[ir1] += chpt*nxpsg;
+        }
+    }
+
+    // reduce only the threads in each row
+    for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+        if (nxpsg >= 32) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
+        }
+        if (nxpsg >= 16) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  8);
+        }
+        if (nxpsg >= 8) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  4);
+        }
+        if (nxpsg >= 4) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  2);
+        }
+        if (nxpsg >= 2) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  1);
+        }
+
+        //sumf[ir1] = simd_sum(sumf[ir1]);
+    }
+
+    if (tx == 0) {
+        for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) {
+            device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0;
+
+            if (i01 < args.ne01) {
+                dst_f32[i01] = sumf[ir1];
+            }
+        }
+    }
+}
+
+// mat-vec kernel processing in chunks of float4x4
+template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &) >
+void kernel_mul_mv_ext_q4x4_f32_impl(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short chpt = 1;
+
+  //const short nxpsg = (32);
+    const short nypsg = (32/nxpsg);
+
+    const short tx = tiisg%nxpsg;
+    const short ty = tiisg/nxpsg;
+
+    const int i01 = tgpig.x*(nypsg*args.nsg) + nypsg*sgitg + ty;
+    const int i11 = tgpig.y*r1ptg;
+    const int i1m = tgpig.z;
+
+    const int i12 = i1m%args.ne12;
+    const int i13 = i1m/args.ne12;
+
+    const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 = i11*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
+
+    device const float4x4 * y4x4[r1ptg];
+
+    for (int ir1 = 0; ir1 < r1ptg; ++ir1) {
+        y4x4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4x4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4x4 *) src1;
+    }
+
+    float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
+
+    short cch = tx%chpb;
+
+    for (int ich = tx; 16*ich < args.ne00; ich += chpt*nxpsg) {
+        float4x4 lx[chpt];
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+            deq_t4x4(xq, cch, lx[ch]);
+
+            cch += nxpsg;
+            if (cch >= chpb) {
+                xq  += cch/chpb;
+                cch %= chpb;
+            }
+        }
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+#pragma unroll(r1ptg)
+            for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+                sumf[ir1] +=
+                    dot(lx[ch][0], y4x4[ir1][ch*nxpsg][0]) +
+                    dot(lx[ch][1], y4x4[ir1][ch*nxpsg][1]) +
+                    dot(lx[ch][2], y4x4[ir1][ch*nxpsg][2]) +
+                    dot(lx[ch][3], y4x4[ir1][ch*nxpsg][3]);
+
+            }
+        }
+
+#pragma unroll(r1ptg)
+        for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+            y4x4[ir1] += chpt*nxpsg;
+        }
+    }
+
+    for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+        if (nxpsg >= 32) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
+        }
+        if (nxpsg >= 16) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  8);
+        }
+        if (nxpsg >= 8) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  4);
+        }
+        if (nxpsg >= 4) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  2);
+        }
+        if (nxpsg >= 2) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  1);
+        }
+
+        //sumf[ir1] = simd_sum(sumf[ir1]);
+    }
+
+    if (tx == 0) {
+        for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) {
+            device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0;
+
+            if (i01 < args.ne01) {
+                dst_f32[i01] = sumf[ir1];
+            }
+        }
+    }
+}
+
+// dispatchers needed for compile-time nxpsg
+// epb - elements per quantization block
+template<short r1ptg, typename q_t, short epb, void (*deq_t4)(device const q_t *, short, thread float4 &)>
+kernel void kernel_mul_mv_ext_q4_f32_disp(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    switch (args.nxpsg) {
+        case 4:  kernel_mul_mv_ext_q4_f32_impl<4,  r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 8:  kernel_mul_mv_ext_q4_f32_impl<8,  r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 16: kernel_mul_mv_ext_q4_f32_impl<16, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 32: kernel_mul_mv_ext_q4_f32_impl<32, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+    }
+}
+
+template<short r1ptg, typename q_t, short epb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &)>
+kernel void kernel_mul_mv_ext_q4x4_f32_disp(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    switch (args.nxpsg) {
+        case 4:  kernel_mul_mv_ext_q4x4_f32_impl<4,  r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 8:  kernel_mul_mv_ext_q4x4_f32_impl<8,  r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 16: kernel_mul_mv_ext_q4x4_f32_impl<16, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 32: kernel_mul_mv_ext_q4x4_f32_impl<32, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+    }
+}
+
+typedef decltype(kernel_mul_mv_ext_q4_f32_disp  <2, block_q8_0, 32,  dequantize_q8_0_t4>) mul_mv_ext_q4_f32_t;
+typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>)    mul_mv_ext_q4x4_f32_t;
+
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_2")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, half4,        4,  dequantize_f16_t4>;
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, half4,        4,  dequantize_f16_t4>;
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4,        4,  dequantize_f16_t4>;
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4,        4,  dequantize_f16_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0,   32, dequantize_q4_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0,   32, dequantize_q4_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0,   32, dequantize_q4_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_0,   32, dequantize_q4_0_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_1,   32, dequantize_q4_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_1,   32, dequantize_q4_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_1,   32, dequantize_q4_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_1,   32, dequantize_q4_1_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_0,   32, dequantize_q5_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_0,   32, dequantize_q5_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_0,   32, dequantize_q5_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_0,   32, dequantize_q5_0_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_1,   32, dequantize_q5_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_1,   32, dequantize_q5_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_1,   32, dequantize_q5_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_1,   32, dequantize_q5_1_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q8_0,   32, dequantize_q8_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q8_0,   32, dequantize_q8_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q8_0,   32, dequantize_q8_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q8_0,   32, dequantize_q8_0_t4>;
+
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>;
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q4_K, 256, dequantize_q4_K>;
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q4_K, 256, dequantize_q4_K>;
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q4_K, 256, dequantize_q4_K>;
+
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_K, 256, dequantize_q5_K>;
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q5_K, 256, dequantize_q5_K>;
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_K, 256, dequantize_q5_K>;
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_K, 256, dequantize_q5_K>;
+
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_K, 256, dequantize_q6_K>;
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_K, 256, dequantize_q6_K>;
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
+
 #define N_MV_T_T 4
 
 template<typename T0, typename T04, typename T1, typename T14, typename args_t>
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 4d9df1a64..c786da4c3 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3572,6 +3572,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
 
+    for (int i = 1; i < 9; ++i) {
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16,    GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_1,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_0,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_1,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_K,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_K,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q6_K,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ4_NL, GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+    }
+
 #if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {

From 82bca2257b3cec72676abb26011f1b99fcdab29d Mon Sep 17 00:00:00 2001
From: Nikolaos Pothitos <pothitos@di.uoa.gr>
Date: Tue, 3 Dec 2024 12:50:08 +0200
Subject: [PATCH 378/396] readme : add option, update default value, fix
 formatting (#10271)

* readme : document --no-display-prompt

* readme : update default prompt context size

* readme : remove unnecessary indentation

Indenting a line with four spaces makes Markdown treat that section as
plain text.

* readme : indent commands under bullets

* readme : indent commands in lettered list
---
 docs/build.md             |  16 +-
 examples/infill/README.md |   2 +-
 examples/main/README.md   |   5 +-
 examples/server/README.md | 304 +++++++++++++++++++-------------------
 4 files changed, 164 insertions(+), 163 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index 5465629e2..97e340ab6 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -26,17 +26,17 @@ cmake --build build --config Release
 
     1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
 
-    ```bash
-    cmake -B build -DCMAKE_BUILD_TYPE=Debug
-    cmake --build build
-    ```
+       ```bash
+       cmake -B build -DCMAKE_BUILD_TYPE=Debug
+       cmake --build build
+       ```
 
     2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
 
-    ```bash
-    cmake -B build -G "Xcode"
-    cmake --build build --config Debug
-    ```
+       ```bash
+       cmake -B build -G "Xcode"
+       cmake --build build --config Debug
+       ```
 
     For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
 
diff --git a/examples/infill/README.md b/examples/infill/README.md
index 810a0c5e7..df4d976f2 100644
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -14,7 +14,7 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
--   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
 -   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 ## Input Prompts
diff --git a/examples/main/README.md b/examples/main/README.md
index 145216938..7787f7b11 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -66,7 +66,7 @@ In this section, we cover the most commonly used options for running the `llama-
 -   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
--   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
 -   `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
 -   `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
 -   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -131,7 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th
 
 ### Context Size
 
-- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference.
 
 ### Extended Context Size
 
@@ -348,6 +348,7 @@ These options provide extra functionality and customization when running the LLa
 
 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
+-   `--no-display-prompt`: Don't print prompt at generation.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 -   `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable  or in an OS-specific local cache.
diff --git a/examples/server/README.md b/examples/server/README.md
index 3f0d45e5b..45b1e99fc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -311,104 +311,104 @@ node index.js
 
 ### POST `/completion`: Given a `prompt`, it returns the predicted completion.
 
-    *Options:*
+*Options:*
 
-    `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
+`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
 
-      - The prompt is a string or an array with the first element given as a string
-      - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
+  - The prompt is a string or an array with the first element given as a string
+  - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
 
-    These input shapes and data type are allowed for `prompt`:
+These input shapes and data type are allowed for `prompt`:
 
-      - Single string: `"string"`
-      - Single sequence of tokens: `[12, 34, 56]`
-      - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
+  - Single string: `"string"`
+  - Single sequence of tokens: `[12, 34, 56]`
+  - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
 
-    Multiple prompts are also supported. In this case, the completion result will be an array.
+Multiple prompts are also supported. In this case, the completion result will be an array.
 
-      - Only strings: `["string1", "string2"]`
-      - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
-      - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
+  - Only strings: `["string1", "string2"]`
+  - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
+  - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
 
-    `temperature`: Adjust the randomness of the generated text. Default: `0.8`
+`temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
-    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
+`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
 
-    `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
+`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
 
-    `top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`
+`top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`
 
-    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
+`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
 
-    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
+`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
 
-    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
+`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
 
-    `n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
+`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
 
-    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
-    By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
+`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
+By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
-    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
 
-    `stop`: Specify a JSON array of stopping strings.
-    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
+`stop`: Specify a JSON array of stopping strings.
+These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
 
-    `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
+`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
 
-    `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
+`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
 
-    `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
+`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
 
-    `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
+`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
 
-    `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
+`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
 
-    `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
+`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
 
-    `dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
+`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
 
-    `dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
+`dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
 
-    `dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
+`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
 
-    `dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
+`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
 
-    `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
+`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
 
-    `xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
+`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
 
-    `xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
+`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
 
-    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
+`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
-    `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
+`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
 
-    `mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`
+`mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`
 
-    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar
+`grammar`: Set grammar for grammar-based sampling.  Default: no grammar
 
-    `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
+`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
 
-    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.
+`seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.
 
-    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
+`ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
 
-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
+`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
 
-    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
+`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
 
-    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
+`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
 
-    `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
+`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
 
-    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
-    `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
+`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
 
-    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
+`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
 
-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
+`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
     `timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`
 
@@ -453,13 +453,13 @@ Notice that each `probs` is an array of length `n_probs`.
 
 ### POST `/tokenize`: Tokenize a given text
 
-    *Options:*
+*Options:*
 
-    `content`: (Required) The text to tokenize.
+`content`: (Required) The text to tokenize.
 
-    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
 
-    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
+`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
 
 **Response:**
 
@@ -496,52 +496,52 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
 
 ### POST `/detokenize`: Convert tokens to text
 
-    *Options:*
+*Options:*
 
-    `tokens`: Set the tokens to detokenize.
+`tokens`: Set the tokens to detokenize.
 
 ### POST `/embedding`: Generate embedding of a given text
 
 The same as [the embedding example](../embedding) does.
 
-    *Options:*
+*Options:*
 
-    `content`: Set the text to process.
+`content`: Set the text to process.
 
-    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
 ### POST `/reranking`: Rerank documents according to a given query
 
 Similar to https://jina.ai/reranker/ but might change in the future.
 Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
 
-    *Options:*
+*Options:*
 
-    `query`: The query against which the documents will be ranked.
+`query`: The query against which the documents will be ranked.
 
-    `documents`: An array strings representing the documents to be ranked.
+`documents`: An array strings representing the documents to be ranked.
 
-    *Aliases:*
-      - `/rerank`
-      - `/v1/rerank`
-      - `/v1/reranking`
+*Aliases:*
+  - `/rerank`
+  - `/v1/rerank`
+  - `/v1/reranking`
 
-    *Examples:*
+*Examples:*
 
-    ```shell
-    curl http://127.0.0.1:8012/v1/rerank \
-        -H "Content-Type: application/json" \
-        -d '{
-            "model": "some-model",
-                "query": "What is panda?",
-                "top_n": 3,
-                "documents": [
-                    "hi",
-                "it is a bear",
-                "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
-                ]
-        }' | jq
-    ```
+```shell
+curl http://127.0.0.1:8012/v1/rerank \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "some-model",
+            "query": "What is panda?",
+            "top_n": 3,
+            "documents": [
+                "hi",
+            "it is a bear",
+            "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+            ]
+    }' | jq
+```
 
 ### POST `/infill`: For code infilling.
 
@@ -607,89 +607,89 @@ To use this endpoint with POST method, you need to start server with `--props`
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
 
-    *Options:*
+*Options:*
 
-    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
 
-    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
+The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
 
-    *Examples:*
+*Examples:*
 
-    You can use either Python `openai` library with appropriate checkpoints:
+You can use either Python `openai` library with appropriate checkpoints:
 
-    ```python
-    import openai
+```python
+import openai
 
-    client = openai.OpenAI(
-        base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
-        api_key = "sk-no-key-required"
-    )
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+    api_key = "sk-no-key-required"
+)
 
-    completion = client.chat.completions.create(
-    model="gpt-3.5-turbo",
-    messages=[
-        {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
-        {"role": "user", "content": "Write a limerick about python exceptions"}
-    ]
-    )
+completion = client.chat.completions.create(
+model="gpt-3.5-turbo",
+messages=[
+    {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+    {"role": "user", "content": "Write a limerick about python exceptions"}
+]
+)
 
-    print(completion.choices[0].message)
-    ```
+print(completion.choices[0].message)
+```
 
-    ... or raw HTTP requests:
+... or raw HTTP requests:
 
-    ```shell
-    curl http://localhost:8080/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer no-key" \
-    -d '{
-    "model": "gpt-3.5-turbo",
-    "messages": [
-    {
-        "role": "system",
-        "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
-    },
-    {
-        "role": "user",
-        "content": "Write a limerick about python exceptions"
-    }
-    ]
-    }'
-    ```
+```shell
+curl http://localhost:8080/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer no-key" \
+-d '{
+"model": "gpt-3.5-turbo",
+"messages": [
+{
+    "role": "system",
+    "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
+},
+{
+    "role": "user",
+    "content": "Write a limerick about python exceptions"
+}
+]
+}'
+```
 
 ### POST `/v1/embeddings`: OpenAI-compatible embeddings API
 
-    *Options:*
+*Options:*
 
-    See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
+See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
 
-    *Examples:*
+*Examples:*
 
-  - input as string
+- input as string
 
-    ```shell
-    curl http://localhost:8080/v1/embeddings \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer no-key" \
-    -d '{
-            "input": "hello",
-            "model":"GPT-4",
-            "encoding_format": "float"
-    }'
-    ```
+  ```shell
+  curl http://localhost:8080/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer no-key" \
+  -d '{
+          "input": "hello",
+          "model":"GPT-4",
+          "encoding_format": "float"
+  }'
+  ```
 
-  - `input` as string array
+- `input` as string array
 
-    ```shell
-    curl http://localhost:8080/v1/embeddings \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer no-key" \
-    -d '{
-            "input": ["hello", "world"],
-            "model":"GPT-4",
-            "encoding_format": "float"
-    }'
-    ```
+  ```shell
+  curl http://localhost:8080/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer no-key" \
+  -d '{
+          "input": ["hello", "world"],
+          "model":"GPT-4",
+          "encoding_format": "float"
+  }'
+  ```
 
 ### GET `/slots`: Returns the current slots processing state
 
@@ -775,9 +775,9 @@ Available metrics:
 
 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
 
-    *Options:*
+*Options:*
 
-    `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
+`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
 
 **Response format**
 
@@ -795,9 +795,9 @@ Available metrics:
 
 ### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
 
-    *Options:*
+*Options:*
 
-    `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
 
 **Response format**
 

From 3b4f2e33e2cbfca621e623c4b92b88da57a8c2f4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 3 Dec 2024 12:54:30 +0100
Subject: [PATCH 379/396] llama : add missing LLAMA_API for
 llama_chat_builtin_templates (#10636)

---
 include/llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index 439e0ff0c..e85f459fc 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -991,7 +991,7 @@ extern "C" {
                                int32_t   length);
 
     // Get list of built-in chat templates
-    int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
 
     //
     // Sampling API

From 667d70d1704dfa6977505f5d01d4638669b90dce Mon Sep 17 00:00:00 2001
From: PAB <pierreantoine.bannier@gmail.com>
Date: Thu, 28 Nov 2024 09:25:06 +0100
Subject: [PATCH 380/396] metal : add `GGML_OP_CONV_TRANSPOSE_1D` kernels
 (ggml/1026)

* wip

* wip implementation f32

* kernel conv transpose 1d f32 working

* initial commit
---
 ggml/src/ggml-metal/ggml-metal.m     | 48 ++++++++++++++++++
 ggml/src/ggml-metal/ggml-metal.metal | 73 ++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index c247b50c9..d374b65a4 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -306,6 +306,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
     GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,
+    GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32,
+    GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
@@ -870,6 +872,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,                im2col_ext_f32,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32,     conv_transpose_1d_f32_f32,      true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,     conv_transpose_1d_f16_f32,      true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,        timestep_embedding_f32,         true);
@@ -1069,6 +1073,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_REPEAT:
         case GGML_OP_SCALE:
         case GGML_OP_CLAMP:
+        case GGML_OP_CONV_TRANSPOSE_1D:
             return true;
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
@@ -3138,6 +3143,49 @@ static void ggml_metal_encode_node(
                     [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
                 }
             } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+                GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
+                GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+                const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+
+                const int32_t IC = src1->ne[1];
+                const int32_t IL = src1->ne[0];
+
+                const int32_t K  = src0->ne[0];
+
+                const int32_t OL = dst->ne[0];
+                const int32_t OC = dst->ne[1];
+
+                id<MTLComputePipelineState> pipeline;
+
+                switch (src0->type) {
+                    case GGML_TYPE_F32: {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32].pipeline;
+                    } break;
+                    case GGML_TYPE_F16: {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32].pipeline;
+                    } break;
+                    default: GGML_ABORT("fatal error");
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0         atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1         atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst          atIndex:2];
+                [encoder setBytes:&IC      length:sizeof( int32_t)  atIndex:3];
+                [encoder setBytes:&IL      length:sizeof( int32_t)  atIndex:4];
+                [encoder setBytes:&K       length:sizeof( int32_t)  atIndex:5];
+                [encoder setBytes:&s0      length:sizeof( int32_t)  atIndex:6];
+                [encoder setBytes:&nb0     length:sizeof(uint64_t)  atIndex:7];
+                [encoder setBytes:&nb1     length:sizeof(uint64_t)  atIndex:8];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(OL, OC, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
         case GGML_OP_UPSCALE:
             {
                 GGML_ASSERT(src0->type == GGML_TYPE_F32);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 7567f3262..8cb9a3414 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2671,6 +2671,79 @@ kernel void kernel_im2col_ext(
 template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
 template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
 
+typedef void (conv_transpose_1d_t)(
+        device const float * src0,
+        device const float * src1,
+        device        char * dst,
+        constant   int32_t & IC,
+        constant   int32_t & IL,
+        constant   int32_t & K,
+        constant   int32_t & s0,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        uint3    tgpg[[threadgroups_per_grid]]);
+
+template <typename T>
+kernel void kernel_conv_transpose_1d(
+        device const     T * src0,
+        device const float * src1,
+        device        char * dst,
+        constant   int32_t & IC,
+        constant   int32_t & IL,
+        constant   int32_t & K,
+        constant   int32_t & s0,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        uint3   tgpg[[threadgroups_per_grid]]) {
+
+    float v = 0.0f;
+
+    for (int64_t c = 0; c < IC; c++) {
+        const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1];
+        const int32_t input_offset = c * IL;
+
+        for (int64_t i = 0; i < IL; i++) {
+            if (tgpig[0] >= i * s0 && tgpig[0] < i * s0 + K) {
+                v += src0[kernel_offset + tgpig[0] - i * s0] * src1[input_offset + i];
+            }
+        }
+    }
+
+    device float * dst_ptr = (device float *) (dst + tgpig[0] * nb0 + tgpig[1] * nb1);
+
+    dst_ptr[0] = v;
+}
+
+template [[host_name("kernel_conv_transpose_1d_f32_f32")]]
+kernel void kernel_conv_transpose_1d<float>(
+    device const float * src0,
+    device const float * src1,
+    device        char * dst,
+    constant   int32_t & IC,
+    constant   int32_t & IL,
+    constant   int32_t & K,
+    constant   int32_t & s0,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    uint3    tgpg[[threadgroups_per_grid]]);
+
+template [[host_name("kernel_conv_transpose_1d_f16_f32")]]
+kernel void kernel_conv_transpose_1d<half>(
+    device const half  * src0,
+    device const float * src1,
+    device        char * dst,
+    constant   int32_t & IC,
+    constant   int32_t & IL,
+    constant   int32_t & K,
+    constant   int32_t & s0,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    uint3    tgpg[[threadgroups_per_grid]]);
+
 kernel void kernel_upscale_f32(
     device  const char * src0,
     device        char * dst,

From efb6ae963031709fc331e6e48cc4606ac8f9c3a7 Mon Sep 17 00:00:00 2001
From: PAB <pierreantoine.bannier@gmail.com>
Date: Mon, 2 Dec 2024 19:27:24 +0100
Subject: [PATCH 381/396] feat: add `GGML_UNARY_OP_ARGMAX` Metal kernel
 (ggml/1019)

* implemented argmax kernel

* tpig -> tgpig

* change to strides

* contiguous assertions

* kernel working and tested

* argmax simd parallel implementation

* added 2 new tests for argmax in test-backend-ops

* cosmit

* added 3 tests cases for perf eval

* add test_argmax in make_test_cases_perf

* Update test-backend-ops.cpp

Co-authored-by: Diego Devesa <slarengh@gmail.com>

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
---
 ggml/src/ggml-metal/ggml-metal.m     | 28 ++++++++++++++
 ggml/src/ggml-metal/ggml-metal.metal | 57 ++++++++++++++++++++++++++++
 tests/test-backend-ops.cpp           | 13 ++++---
 3 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index d374b65a4..093ae9000 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -392,6 +392,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
     GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
     GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
+    GGML_METAL_KERNEL_TYPE_ARGMAX,
 
     GGML_METAL_KERNEL_TYPE_COUNT
 };
@@ -956,6 +957,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                           sin,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                           cos,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                      sum_rows,                       true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                        argmax,                         true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,               pool_2d_avg_f32,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,               pool_2d_max_f32,                true);
     }
@@ -1086,6 +1088,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return has_simdgroup_reduction;
         case GGML_OP_RMS_NORM:
             return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
+        case GGML_OP_ARGMAX:
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
             return true;
@@ -3845,6 +3848,31 @@ static void ggml_metal_encode_node(
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
             } break;
+            case GGML_OP_ARGMAX:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+                GGML_ASSERT(nb00 == ggml_type_size(src0->type));
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                int nth = 32; // SIMD width
+                while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+                    nth *= 2;
+                }
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGMAX].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float)   atIndex:0];
+                [encoder setThreadgroupMemoryLength:32*sizeof(int32_t) atIndex:1];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
        default:
             {
                 GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8cb9a3414..5caa0846a 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1366,6 +1366,63 @@ kernel void kernel_ssm_scan_f32(
     }
 }
 
+kernel void kernel_argmax(
+        device   const void * x,
+        device      int32_t * dst,
+        constant    int64_t & ncols,
+        constant   uint64_t & nb01,
+        threadgroup   float * shared_maxval [[threadgroup(0)]],
+        threadgroup int32_t * shared_argmax [[threadgroup(1)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01);
+
+    float   lmax = -INFINITY;
+    int32_t larg = -1;
+
+    for (int i00 = tpitg; i00 < ncols; i00 += ntg) {
+        if (x_row[i00] > lmax) {
+            lmax = x_row[i00];
+            larg = i00;
+        }
+    }
+
+    // find the argmax value in the block
+    float max_val = simd_max(lmax);
+    int32_t arg_val = simd_max(select(-1, larg, lmax == max_val));
+
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            shared_maxval[tiisg] = -INFINITY;
+            shared_argmax[tiisg] = -1;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            shared_maxval[sgitg] = max_val;
+            shared_argmax[sgitg] = arg_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = shared_maxval[tiisg];
+        arg_val = shared_argmax[tiisg];
+
+        float max_val_reduced   = simd_max(max_val);
+        int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced));
+
+        dst[tgpig] = arg_val_reduced;
+
+        return;
+    }
+
+    dst[tgpig] = arg_val;
+}
+
 kernel void kernel_norm(
         constant ggml_metal_kargs_norm & args,
         device const char * src0,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index c786da4c3..87c92dadd 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3460,14 +3460,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    test_cases.emplace_back(new test_argmax());
-    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1}));
-    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1}));
-    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
-    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1}));
-
     test_cases.emplace_back(new test_count_equal());
 
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32,    1, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100,  10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438,  3, 1, 1}));
+
     for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));

From e9e661bd59364e5d4fce035834b6cadcadf8c2ef Mon Sep 17 00:00:00 2001
From: mahorozte <41834471+mahorozte@users.noreply.github.com>
Date: Tue, 3 Dec 2024 21:11:43 +0800
Subject: [PATCH 382/396] CUDA: remove unnecessary warp reduce in FA
 (ggml/1032)

* kqmax_new_j in every thread within warp is same after operate at line 199,this reduce can be omit

* same problem in vec32

---------

Co-authored-by: ZhaoXiaoYu <zhao.xiaoyu@zte.com.cn>
---
 ggml/src/ggml-cuda/fattn-vec-f16.cuh | 1 -
 ggml/src/ggml-cuda/fattn-vec-f32.cuh | 1 -
 2 files changed, 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index 5ec3b91ae..34a2992c7 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -220,7 +220,6 @@ static __global__ void flash_attn_vec_ext_f16(
         for (int j = 0; j < ncols; ++j) {
             half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
 
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
             if (threadIdx.x == 0) {
                 kqmax_shared[j][threadIdx.y] = kqmax_new_j;
             }
diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
index 3d93f4a8a..a28fc8b7f 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -206,7 +206,6 @@ static __global__ void flash_attn_vec_ext_f32(
         for (int j = 0; j < ncols; ++j) {
             float kqmax_new_j = kqmax_new_arr[j];
 
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
             if (threadIdx.x == 0) {
                 kqmax_shared[j][threadIdx.y] = kqmax_new_j;
             }

From c5054718575d37f43cc4e6f61ea7c4014ad2c0cf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 3 Dec 2024 19:40:25 +0200
Subject: [PATCH 383/396] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index fd71a0a46..27769c93b 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-c598cbe30621251e80acbcf3b601589a37c17f4d
+b903ffe79daf18c0aaacbebe44a7b93a6b8d0982

From 1cd3df46bd49a0c1c78da8b68c956448a73b7476 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 3 Dec 2024 19:42:30 +0200
Subject: [PATCH 384/396] scripts : remove amx sync

ggml-ci
---
 scripts/sync-ggml-am.sh | 3 ---
 scripts/sync-ggml.sh    | 1 -
 2 files changed, 4 deletions(-)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index d0815cf89..8cf25b77f 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -73,7 +73,6 @@ while read c; do
         src/ggml*.h \
         src/ggml*.c \
         src/ggml*.cpp \
-        src/ggml-amx/* \
         src/ggml-blas/* \
         src/ggml-cann/* \
         src/ggml-cpu/* \
@@ -124,7 +123,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml*.c          -> ggml/src/ggml*.c
     # src/ggml*.cpp        -> ggml/src/ggml*.cpp
     # src/ggml*.h          -> ggml/src/ggml*.h
-    # src/ggml-amx/*       -> ggml/src/ggml-amx/*
     # src/ggml-blas/*      -> ggml/src/ggml-blas/*
     # src/ggml-cann/*      -> ggml/src/ggml-cann/*
     # src/ggml-cpu/*       -> ggml/src/ggml-cpu/*
@@ -151,7 +149,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
-        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 000270afb..f81615bb6 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -7,7 +7,6 @@ cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 cp -rpv ../ggml/src/ggml*.c        ./ggml/src/
 cp -rpv ../ggml/src/ggml*.cpp      ./ggml/src/
 cp -rpv ../ggml/src/ggml*.h        ./ggml/src/
-cp -rpv ../ggml/src/ggml-amx/*     ./ggml/src/ggml-amx/
 cp -rpv ../ggml/src/ggml-blas/*    ./ggml/src/ggml-blas/
 cp -rpv ../ggml/src/ggml-cann/*    ./ggml/src/ggml-cann/
 cp -rpv ../ggml/src/ggml-cpu/*     ./ggml/src/ggml-cpu/

From 91c36c269bca75b2d08119c653512cd20b4ea2ba Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 3 Dec 2024 19:38:44 +0100
Subject: [PATCH 385/396] server : (web ui) Various improvements, now use vite
 as bundler (#10599)

* hide buttons in dropdown menu

* use npm as deps manager and vite as bundler

* fix build

* fix build (2)

* fix responsive on mobile

* fix more problems on mobile

* sync build

* (test) add CI step for verifying build

* fix ci

* force rebuild .hpp files

* cmake: clean up generated files pre build
---
 .github/workflows/server.yml                  |    26 +-
 .gitignore                                    |     4 +
 Makefile                                      |    19 +-
 examples/server/CMakeLists.txt                |    14 +-
 examples/server/README.md                     |    31 +
 examples/server/deps.sh                       |    25 -
 examples/server/public/deps_daisyui.min.css   |    13 -
 examples/server/public/deps_markdown-it.js    |  8442 -------
 examples/server/public/deps_tailwindcss.js    |    82 -
 .../server/public/deps_vue.esm-browser.js     | 18160 ----------------
 examples/server/public/index.html             |   879 +-
 examples/server/server.cpp                    |    38 +-
 examples/server/webui/index.html              |   268 +
 examples/server/webui/package-lock.json       |  2783 +++
 examples/server/webui/package.json            |    23 +
 examples/server/webui/postcss.config.js       |     6 +
 .../{public => webui/src}/completion.js       |     0
 examples/server/webui/src/main.js             |   456 +
 examples/server/webui/src/styles.css          |    26 +
 examples/server/webui/tailwind.config.js      |    16 +
 examples/server/webui/vite.config.js          |    36 +
 21 files changed, 3941 insertions(+), 27406 deletions(-)
 delete mode 100755 examples/server/deps.sh
 delete mode 100644 examples/server/public/deps_daisyui.min.css
 delete mode 100644 examples/server/public/deps_markdown-it.js
 delete mode 100644 examples/server/public/deps_tailwindcss.js
 delete mode 100644 examples/server/public/deps_vue.esm-browser.js
 create mode 100644 examples/server/webui/index.html
 create mode 100644 examples/server/webui/package-lock.json
 create mode 100644 examples/server/webui/package.json
 create mode 100644 examples/server/webui/postcss.config.js
 rename examples/server/{public => webui/src}/completion.js (100%)
 create mode 100644 examples/server/webui/src/main.js
 create mode 100644 examples/server/webui/src/styles.css
 create mode 100644 examples/server/webui/tailwind.config.js
 create mode 100644 examples/server/webui/vite.config.js

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 2e8e3348f..9e66fb68c 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -76,20 +76,26 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      - name: Verify server deps
-        id: verify_server_deps
+      # Setup nodejs (to be used for verifying bundled index.html)
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Verify bundled index.html
+        id: verify_server_index_html
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server
-          git ls-files --others --modified
+          cd examples/server/webui
           git status
-          ./deps.sh
+          npm ci
+          npm run build
           git status
-          not_ignored_files="$(git ls-files --others --modified)"
-          echo "Modified files: ${not_ignored_files}"
-          if [ -n "${not_ignored_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
-            echo "${not_ignored_files}"
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Repository is dirty or server/webui is not built as expected"
+            echo "Hint: You may need to follow Web UI build guide in server/README.md"
+            echo "${modified_files}"
             exit 1
           fi
 
diff --git a/.gitignore b/.gitignore
index 307c065f7..1df7cf4a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,6 +104,10 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh
 
+# Server Web UI temporary files
+node_modules
+examples/server/webui/dist
+
 # Python
 
 /.venv
diff --git a/Makefile b/Makefile
index f5dc916f6..d76c4ad53 100644
--- a/Makefile
+++ b/Makefile
@@ -1145,8 +1145,15 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
 # Include dependency files
 -include $(DEP_FILES)
 
+# Clean generated server assets
+clean-server-assets:
+	find examples/server -type f -name "*.js.hpp"   -delete
+	find examples/server -type f -name "*.mjs.hpp"  -delete
+	find examples/server -type f -name "*.css.hpp"  -delete
+	find examples/server -type f -name "*.html.hpp" -delete
+
 # Clean rule
-clean:
+clean: clean-server-assets
 	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -rvf *.a *.dll *.so *.dot
 	find ggml src common tests examples pocs -type f -name "*.o" -delete
@@ -1354,20 +1361,14 @@ llama-server: \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
 	examples/server/index.html.hpp \
-	examples/server/completion.js.hpp \
 	examples/server/loading.html.hpp \
-	examples/server/deps_daisyui.min.css.hpp \
-	examples/server/deps_markdown-it.js.hpp \
-	examples/server/deps_tailwindcss.js.hpp \
-	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
-	common/stb_image.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% Makefile
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 		echo "unsigned char $${NAME}[] = {" && \
 		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1542,7 +1543,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server
+.PHONY: FORCE main quantize perplexity embedding server
 
 # Define the object file target
 examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index e82f91533..0035859a6 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -16,12 +16,7 @@ set(TARGET_SRCS
 )
 set(PUBLIC_ASSETS
     index.html
-    completion.js
     loading.html
-    deps_daisyui.min.css
-    deps_markdown-it.js
-    deps_tailwindcss.js
-    deps_vue.esm-browser.js
 )
 
 foreach(asset ${PUBLIC_ASSETS})
@@ -33,11 +28,20 @@ foreach(asset ${PUBLIC_ASSETS})
         OUTPUT "${output}"
         COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
     )
+    set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
 endforeach()
 
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 
+# clean up generated files in pre-build step
+foreach(asset ${PUBLIC_ASSETS})
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+    add_custom_command(TARGET ${TARGET} PRE_BUILD
+        COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
+    )
+endforeach()
+
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 
 if (LLAMA_SERVER_SSL)
diff --git a/examples/server/README.md b/examples/server/README.md
index 45b1e99fc..b2dd7b65a 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -217,6 +217,37 @@ services:
   cmake --build build --config Release -t llama-server
   ```
 
+## Web UI
+
+The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
+
+The web UI is developed using:
+- `vue` framework for frontend development
+- `tailwindcss` and `daisyui` for styling
+- `vite` for build tooling
+
+A pre-built version is available as a single HTML file under `/public` directory.
+
+To build or to run the dev server (with hot reload):
+
+```sh
+# make sure you have nodejs installed
+cd examples/server/webui
+npm i
+
+# to run the dev server
+npm run dev
+
+# to build the public/index.html
+npm run build
+```
+
+NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console:
+
+```js
+localStorage.setItem('base', 'http://localhost:8080')
+```
+
 ## Quick Start
 
 To get started right away, run the following command, making sure to use the correct path for the model you have:
diff --git a/examples/server/deps.sh b/examples/server/deps.sh
deleted file mode 100755
index 1ff80d056..000000000
--- a/examples/server/deps.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Download and update deps for binary
-
-# get the directory of this script file
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-PUBLIC=$DIR/public
-
-echo "download js bundle files"
-
-# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
-
-curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
-echo >> $PUBLIC/deps_tailwindcss.js # add newline
-
-curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
-curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
-echo >> $PUBLIC/deps_daisyui.min.css # add newline
-
-curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
-echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
-
-curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
-echo >> $PUBLIC/deps_markdown-it.js # add newline
-
-ls -lah $PUBLIC
diff --git a/examples/server/public/deps_daisyui.min.css b/examples/server/public/deps_daisyui.min.css
deleted file mode 100644
index bc8529651..000000000
--- a/examples/server/public/deps_daisyui.min.css
+++ /dev/null
@@ -1,13 +0,0 @@
-.alert{display:grid;width:100%;grid-auto-flow:row;align-content:flex-start;align-items:center;justify-items:center;gap:1rem;text-align:center}@media (min-width:640px){.alert{grid-auto-flow:column;grid-template-columns:auto minmax(auto,1fr);justify-items:start;text-align:start}}.artboard{width:100%}.avatar{position:relative;display:inline-flex}.avatar>div{display:block;aspect-ratio:1/1;overflow:hidden}.avatar img{height:100%;width:100%;object-fit:cover}.avatar.placeholder>div{display:flex;align-items:center;justify-content:center}.badge{display:inline-flex;align-items:center;justify-content:center;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);height:1.25rem;font-size:.875rem;line-height:1.25rem;width:fit-content;padding-left:.563rem;padding-right:.563rem}.btm-nav{position:fixed;bottom:0;left:0;right:0;display:flex;width:100%;flex-direction:row;align-items:center;justify-content:space-around;padding-bottom:env(safe-area-inset-bottom)}.btm-nav>*{position:relative;display:flex;height:100%;flex-basis:100%;cursor:pointer;flex-direction:column;align-items:center;justify-content:center;gap:.25rem}.breadcrumbs{max-width:100%;overflow-x:auto}.breadcrumbs>ol,.breadcrumbs>ul{display:flex;align-items:center;white-space:nowrap;min-height:min-content}.breadcrumbs>ol>li,.breadcrumbs>ul>li{display:flex;align-items:center}.breadcrumbs>ol>li>a,.breadcrumbs>ul>li>a{display:flex;cursor:pointer;align-items:center}@media(hover:hover){.breadcrumbs>ol>li>a:hover,.breadcrumbs>ul>li>a:hover{text-decoration-line:underline}}.btn{display:inline-flex;height:3rem;min-height:3rem;flex-shrink:0;cursor:pointer;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;border-radius:var(--rounded-btn,.5rem);border-color:transparent;padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1em}.btn-disabled,.btn:disabled,.btn[disabled]{pointer-events:none}.btn-square{height:3rem;width:3rem;padding:0}.btn-circle{height:3rem;width:3rem;border-radius:9999px;padding:0}:where(.btn:is(input[type=checkbox])),:where(.btn:is(input[type=radio])){width:auto;appearance:none}.btn:is(input[type=checkbox]):after,.btn:is(input[type=radio]):after{--tw-content:attr(aria-label);content:var(--tw-content)}.card{position:relative;display:flex;flex-direction:column}.card:focus{outline:2px solid transparent;outline-offset:2px}.card-body{display:flex;flex:1 1 auto;flex-direction:column}.card-body :where(p){flex-grow:1}.card-actions{display:flex;flex-wrap:wrap;align-items:flex-start;gap:.5rem}.card figure{display:flex;align-items:center;justify-content:center}.card.image-full{display:grid}.card.image-full:before{position:relative;content:""}.card.image-full:before,.card.image-full>*{grid-column-start:1;grid-row-start:1}.card.image-full>figure img{height:100%;object-fit:cover}.card.image-full>.card-body{position:relative}.carousel{display:inline-flex;overflow-x:scroll;scroll-snap-type:x mandatory;scroll-behavior:smooth}.carousel-vertical{flex-direction:column;overflow-y:scroll;scroll-snap-type:y mandatory}.carousel-item{box-sizing:content-box;display:flex;flex:none;scroll-snap-align:start}.carousel-start .carousel-item{scroll-snap-align:start}.carousel-center .carousel-item{scroll-snap-align:center}.carousel-end .carousel-item{scroll-snap-align:end}.chat{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));column-gap:.75rem;padding-top:.25rem;padding-bottom:.25rem}.chat-image{grid-row:span 2/span 2;align-self:flex-end}.chat-header{grid-row-start:1;font-size:.875rem;line-height:1.25rem}.chat-footer{grid-row-start:3;font-size:.875rem;line-height:1.25rem}.chat-bubble{position:relative;display:block;width:fit-content;padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;max-width:90%}.chat-bubble:before{position:absolute;bottom:0;height:.75rem;width:.75rem;background-color:inherit;content:"";mask-size:contain;mask-repeat:no-repeat;mask-position:center}.chat-start{place-items:start;grid-template-columns:auto 1fr}.chat-start .chat-header{grid-column-start:2}.chat-start .chat-footer{grid-column-start:2}.chat-start .chat-image{grid-column-start:1}.chat-start .chat-bubble{grid-column-start:2}.chat-start .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}[dir=rtl] .chat-start .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}.chat-end{place-items:end;grid-template-columns:1fr auto}.chat-end .chat-header{grid-column-start:1}.chat-end .chat-footer{grid-column-start:1}.chat-end .chat-image{grid-column-start:2}.chat-end .chat-bubble{grid-column-start:1}.chat-end .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}[dir=rtl] .chat-end .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}.checkbox{flex-shrink:0}.collapse:not(td):not(tr):not(colgroup){visibility:visible}.collapse{position:relative;display:grid;overflow:hidden;grid-template-rows:auto 0fr;transition:grid-template-rows .2s}.collapse-content,.collapse-title,.collapse>input[type=checkbox],.collapse>input[type=radio]{grid-column-start:1;grid-row-start:1}.collapse>input[type=checkbox],.collapse>input[type=radio]{appearance:none;opacity:0}.collapse-content{visibility:hidden;grid-column-start:1;grid-row-start:2;min-height:0;transition:visibility .2s}.collapse-open,.collapse:focus:not(.collapse-close),.collapse[open]{grid-template-rows:auto 1fr}.collapse:not(.collapse-close):has(>input[type=checkbox]:checked),.collapse:not(.collapse-close):has(>input[type=radio]:checked){grid-template-rows:auto 1fr}.collapse-open>.collapse-content,.collapse:focus:not(.collapse-close)>.collapse-content,.collapse:not(.collapse-close)>input[type=checkbox]:checked~.collapse-content,.collapse:not(.collapse-close)>input[type=radio]:checked~.collapse-content,.collapse[open]>.collapse-content{visibility:visible;min-height:fit-content}:root .countdown{line-height:1em}.countdown{display:inline-flex}.countdown>*{height:1em;display:inline-block;overflow-y:hidden}.countdown>:before{position:relative;content:"00\A 01\A 02\A 03\A 04\A 05\A 06\A 07\A 08\A 09\A 10\A 11\A 12\A 13\A 14\A 15\A 16\A 17\A 18\A 19\A 20\A 21\A 22\A 23\A 24\A 25\A 26\A 27\A 28\A 29\A 30\A 31\A 32\A 33\A 34\A 35\A 36\A 37\A 38\A 39\A 40\A 41\A 42\A 43\A 44\A 45\A 46\A 47\A 48\A 49\A 50\A 51\A 52\A 53\A 54\A 55\A 56\A 57\A 58\A 59\A 60\A 61\A 62\A 63\A 64\A 65\A 66\A 67\A 68\A 69\A 70\A 71\A 72\A 73\A 74\A 75\A 76\A 77\A 78\A 79\A 80\A 81\A 82\A 83\A 84\A 85\A 86\A 87\A 88\A 89\A 90\A 91\A 92\A 93\A 94\A 95\A 96\A 97\A 98\A 99\A";white-space:pre;top:calc(var(--value) * -1em)}.diff{position:relative;display:grid;width:100%;overflow:hidden;container-type:inline-size;grid-template-columns:auto 1fr}.diff-resizer{position:relative;top:50%;z-index:1;height:3rem;width:25rem;min-width:1rem;max-width:calc(100cqi - 1rem);resize:horizontal;overflow:hidden;opacity:0;transform-origin:100% 100%;scale:4;translate:1.5rem -1.5rem;clip-path:inset(calc(100% - .75rem) 0 0 calc(100% - .75rem))}.diff-item-1,.diff-item-2,.diff-resizer{position:relative;grid-column-start:1;grid-row-start:1}.diff-item-1:after{pointer-events:none;position:absolute;bottom:0;right:1px;top:50%;z-index:1;height:2rem;width:2rem;--tw-content:'';content:var(--tw-content);translate:50% -50%}.diff-item-2{overflow:hidden}.diff-item-1>*,.diff-item-2>*{pointer-events:none;position:absolute;bottom:0;left:0;top:0;height:100%;width:100cqi;max-width:none;object-fit:cover;object-position:center}.divider{display:flex;flex-direction:row;align-items:center;align-self:stretch}.divider:after,.divider:before{height:.125rem;width:100%;flex-grow:1;--tw-content:'';content:var(--tw-content)}.divider-start:before{display:none}.divider-end:after{display:none}.drawer{position:relative;display:grid;grid-auto-columns:max-content auto}.drawer-content{grid-column-start:2;grid-row-start:1;min-width:0}.drawer-side{pointer-events:none;position:fixed;inset-inline-start:0;top:0;grid-column-start:1;grid-row-start:1;display:grid;width:100%;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(1,minmax(0,1fr));align-items:flex-start;justify-items:start;overflow-x:hidden;overflow-y:hidden;overscroll-behavior:contain;height:100vh;height:100dvh}.drawer-side>.drawer-overlay{position:sticky;top:0;place-self:stretch}.drawer-side>*{grid-column-start:1;grid-row-start:1}.drawer-side>:not(.drawer-overlay){transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);will-change:transform;transform:translateX(-100%)}[dir=rtl] .drawer-side>:not(.drawer-overlay){transform:translateX(100%)}.drawer-toggle{position:fixed;height:0;width:0;appearance:none;opacity:0}.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible;overflow-y:auto}.drawer-toggle:checked~.drawer-side>:not(.drawer-overlay){transform:translateX(0)}.drawer-end{grid-auto-columns:auto max-content}.drawer-end>.drawer-toggle~.drawer-content{grid-column-start:1}.drawer-end>.drawer-toggle~.drawer-side{grid-column-start:2;justify-items:end}.drawer-end>.drawer-toggle~.drawer-side>:not(.drawer-overlay){transform:translateX(100%)}[dir=rtl] .drawer-end>.drawer-toggle~.drawer-side>:not(.drawer-overlay){transform:translateX(-100%)}.drawer-end>.drawer-toggle:checked~.drawer-side>:not(.drawer-overlay){transform:translateX(0)}.dropdown{position:relative;display:inline-block}.dropdown>:not(summary):focus{outline:2px solid transparent;outline-offset:2px}.dropdown .dropdown-content{position:absolute}.dropdown:is(:not(details)) .dropdown-content{visibility:hidden;opacity:0}.dropdown-end .dropdown-content{inset-inline-end:0}.dropdown-left .dropdown-content{bottom:auto;inset-inline-end:100%;top:0}.dropdown-right .dropdown-content{bottom:auto;inset-inline-start:100%;top:0}.dropdown-bottom .dropdown-content{bottom:auto;top:100%}.dropdown-top .dropdown-content{bottom:100%;top:auto}.dropdown-end.dropdown-right .dropdown-content{bottom:0;top:auto}.dropdown-end.dropdown-left .dropdown-content{bottom:0;top:auto}.dropdown.dropdown-open .dropdown-content,.dropdown:focus-within .dropdown-content,.dropdown:not(.dropdown-hover):focus .dropdown-content{visibility:visible;opacity:1}@media (hover:hover){.dropdown.dropdown-hover:hover .dropdown-content{visibility:visible;opacity:1}}.dropdown:is(details) summary::-webkit-details-marker{display:none}.file-input{height:3rem;flex-shrink:1;padding-inline-end:1rem;font-size:.875rem;line-height:1.25rem;line-height:2}.file-input::file-selector-button{margin-inline-end:1rem;display:inline-flex;height:100%;flex-shrink:0;cursor:pointer;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1.25rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);line-height:1em}.footer{display:grid;width:100%;grid-auto-flow:row;place-items:start}.footer>*{display:grid;place-items:start}.footer-center{place-items:center;text-align:center}.footer-center>*{place-items:center}@media (min-width:48rem){.footer{grid-auto-flow:column}.footer-center{grid-auto-flow:row dense}}.form-control{display:flex;flex-direction:column}.label{display:flex;user-select:none;align-items:center;justify-content:space-between}.hero{display:grid;width:100%;place-items:center;background-size:cover;background-position:center}.hero>*{grid-column-start:1;grid-row-start:1}.hero-overlay{grid-column-start:1;grid-row-start:1;height:100%;width:100%}.hero-content{z-index:0;display:flex;align-items:center;justify-content:center}.indicator{position:relative;display:inline-flex;width:max-content}.indicator :where(.indicator-item){z-index:1;position:absolute;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));white-space:nowrap}.input{flex-shrink:1;appearance:none;height:3rem;padding-left:1rem;padding-right:1rem;font-size:.875rem;line-height:1.25rem;line-height:2}.input-md[type=number]::-webkit-inner-spin-button,.input[type=number]::-webkit-inner-spin-button{margin-top:-1rem;margin-bottom:-1rem;margin-inline-end:-1rem}.input-xs[type=number]::-webkit-inner-spin-button{margin-top:-.25rem;margin-bottom:-.25rem;margin-inline-end:0}.input-sm[type=number]::-webkit-inner-spin-button{margin-top:0;margin-bottom:0;margin-inline-end:0}.input-lg[type=number]::-webkit-inner-spin-button{margin-top:-1.5rem;margin-bottom:-1.5rem;margin-inline-end:-1.5rem}.join{display:inline-flex;align-items:stretch}.join :where(.join-item){border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:not(:first-child):not(:last-child),.join :not(:first-child):not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:first-child:not(:last-child),.join :first-child:not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0}.join .dropdown .join-item:first-child:not(:last-child),.join :first-child:not(:last-child) .dropdown .join-item{border-start-end-radius:inherit;border-end-end-radius:inherit}.join :where(.join-item:first-child:not(:last-child)),.join :where(:first-child:not(:last-child).join-item){border-end-start-radius:inherit;border-start-start-radius:inherit}.join .join-item:last-child:not(:first-child),.join :last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0}.join :where(.join-item:last-child:not(:first-child)),.join :where(:last-child:not(:first-child).join-item){border-start-end-radius:inherit;border-end-end-radius:inherit}@supports not selector(:has(*)){:where(.join*){border-radius:inherit}}@supports selector(:has(*)){:where(.join:has(.join-item)){border-radius:inherit}}.kbd{display:inline-flex;align-items:center;justify-content:center}.link{cursor:pointer;text-decoration-line:underline}.link-hover{text-decoration-line:none}@media(hover:hover){.link-hover:hover{text-decoration-line:underline}}.mask{mask-size:contain;mask-repeat:no-repeat;mask-position:center}.mask-half-1{mask-size:200%;mask-position:left}.mask-half-1:where([dir=rtl],[dir=rtl]*){mask-position:right}.mask-half-2{mask-size:200%;mask-position:right}.mask-half-2:where([dir=rtl],[dir=rtl]*){mask-position:left}.menu{display:flex;flex-direction:column;flex-wrap:wrap;font-size:.875rem;line-height:1.25rem}.menu :where(liul){position:relative;white-space:nowrap}.menu :where(li:not(.menu-title)>:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){display:grid;grid-auto-flow:column;align-content:flex-start;align-items:center;gap:.5rem;grid-auto-columns:minmax(auto,max-content) auto max-content;user-select:none}.menu li.disabled{cursor:not-allowed;user-select:none}.menu :where(li>.menu-dropdown:not(.menu-dropdown-show)){display:none}:where(.menuli){position:relative;display:flex;flex-shrink:0;flex-direction:column;flex-wrap:wrap;align-items:stretch}:where(.menuli) .badge{justify-self:end}.mockup-code{position:relative;overflow:hidden;overflow-x:auto}.mockup-code pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.mockup-window{position:relative;overflow:hidden;overflow-x:auto}.mockup-window pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.mockup-browser{position:relative;overflow:hidden;overflow-x:auto}.mockup-browser pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.modal{pointer-events:none;position:fixed;inset:0;margin:0;display:grid;height:100%;max-height:none;width:100%;max-width:none;justify-items:center;padding:0;opacity:0;overscroll-behavior:contain;z-index:999}.modal-scroll{overscroll-behavior:auto}:where(.modal){align-items:center}.modal-box{max-height:calc(100vh - 5em)}.modal-open,.modal-toggle:checked+.modal,.modal:target,.modal[open]{pointer-events:auto;visibility:visible;opacity:1}.modal-action{display:flex}.modal-toggle{position:fixed;height:0;width:0;appearance:none;opacity:0}:root:has(:is(.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open])){overflow:hidden;scrollbar-gutter:stable}.navbar{display:flex;align-items:center}:where(.navbar>:not(script,style)){display:inline-flex;align-items:center}.navbar-start{width:50%;justify-content:flex-start}.navbar-center{flex-shrink:0}.navbar-end{width:50%;justify-content:flex-end}.progress{position:relative;width:100%;appearance:none;overflow:hidden}.radial-progress{position:relative;display:inline-grid;height:var(--size);width:var(--size);place-content:center;border-radius:9999px;background-color:transparent;vertical-align:middle;box-sizing:content-box}.radial-progress::-moz-progress-bar{appearance:none;background-color:transparent}.radial-progress::-webkit-progress-value{appearance:none;background-color:transparent}.radial-progress::-webkit-progress-bar{appearance:none;background-color:transparent}.radial-progress:after,.radial-progress:before{position:absolute;border-radius:9999px;content:""}.radial-progress:before{inset:0;background:radial-gradient(farthest-side,currentColor 98%,#0000) top/var(--thickness) var(--thickness) no-repeat,conic-gradient(currentColor calc(var(--value) * 1%),#0000 0);-webkit-mask:radial-gradient(farthest-side,#0000 calc(99% - var(--thickness)),#000 calc(100% - var(--thickness)));mask:radial-gradient(farthest-side,#0000 calc(99% - var(--thickness)),#000 calc(100% - var(--thickness)))}.radial-progress:after{inset:calc(50% - var(--thickness)/ 2);transform:rotate(calc(var(--value) * 3.6deg - 90deg)) translate(calc(var(--size)/ 2 - 50%))}.radio{flex-shrink:0}.range{height:1.5rem;width:100%;cursor:pointer}.range:focus{outline:0}.rating{position:relative;display:inline-flex}.rating :where(input){cursor:pointer;border-radius:0}.select{display:inline-flex;cursor:pointer;user-select:none;appearance:none;height:3rem;min-height:3rem;padding-inline-start:1rem;padding-inline-end:2.5rem;font-size:.875rem;line-height:1.25rem;line-height:2}.select[multiple]{height:auto}.stack{display:inline-grid}.stack>*{grid-column-start:1;grid-row-start:1;transform:translateY(10%) scale(.9);z-index:1}.stack>:nth-child(2){transform:translateY(5%) scale(.95);z-index:2}.stack>:nth-child(1){transform:translateY(0) scale(1);z-index:3}.stats{display:inline-grid}:where(.stats){grid-auto-flow:column}.stat{display:inline-grid;width:100%;grid-template-columns:repeat(1,1fr)}.stat-figure{grid-column-start:2;grid-row:span 3/span 3;grid-row-start:1;place-self:center;justify-self:end}.stat-title{grid-column-start:1;white-space:nowrap}.stat-value{grid-column-start:1;white-space:nowrap}.stat-desc{grid-column-start:1;white-space:nowrap}.stat-actions{grid-column-start:1;white-space:nowrap}.steps{display:inline-grid;grid-auto-flow:column;overflow:hidden;overflow-x:auto;counter-reset:step;grid-auto-columns:1fr}.steps .step{display:grid;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(2,minmax(0,1fr));place-items:center;text-align:center}.swap{position:relative;display:inline-grid;user-select:none;place-content:center}.swap>*{grid-column-start:1;grid-row-start:1}.swap input{appearance:none}.swap .swap-indeterminate,.swap .swap-on,.swap input:indeterminate~.swap-on{opacity:0}.swap input:checked~.swap-off,.swap input:indeterminate~.swap-off,.swap-active .swap-off{opacity:0}.swap input:checked~.swap-on,.swap input:indeterminate~.swap-indeterminate,.swap-active .swap-on{opacity:1}.tabs{display:grid;align-items:flex-end}.tabs-lifted:has(.tab-content[class*=" rounded-"]) .tab:first-child:not(:is(.tab-active,[aria-selected=true])),.tabs-lifted:has(.tab-content[class^=rounded-]) .tab:first-child:not(:is(.tab-active,[aria-selected=true])){border-bottom-color:transparent}.tab{position:relative;grid-row-start:1;display:inline-flex;height:2rem;cursor:pointer;user-select:none;appearance:none;flex-wrap:wrap;align-items:center;justify-content:center;text-align:center;font-size:.875rem;line-height:1.25rem;line-height:2;--tab-padding:1rem}.tab:is(input[type=radio]){width:auto;border-bottom-right-radius:0;border-bottom-left-radius:0}.tab:is(input[type=radio]):after{--tw-content:attr(aria-label);content:var(--tw-content)}.tab:not(input):empty{cursor:default;grid-column-start:span 9999}.tab-content{grid-column-start:1;grid-column-end:span 9999;grid-row-start:2;margin-top:calc(var(--tab-border) * -1);display:none;border-color:transparent;border-width:var(--tab-border,0)}:checked+.tab-content:nth-child(2),:is(.tab-active,[aria-selected=true])+.tab-content:nth-child(2){border-start-start-radius:0}:is(.tab-active,[aria-selected=true])+.tab-content,input.tab:checked+.tab-content{display:block}.table{position:relative;width:100%}.table :where(.table-pin-rowstheadtr){position:sticky;top:0;z-index:1;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table :where(.table-pin-rowstfoottr){position:sticky;bottom:0;z-index:1;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table :where(.table-pin-colstrth){position:sticky;left:0;right:0;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table-zebra tbody tr:nth-child(even) :where(.table-pin-colstrth){--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}.textarea{min-height:3rem;flex-shrink:1;padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;font-size:.875rem;line-height:1.25rem;line-height:2}.timeline{position:relative;display:flex}:where(.timeline>li){position:relative;display:grid;flex-shrink:0;align-items:center;grid-template-rows:var(--timeline-row-start,minmax(0,1fr)) auto var(--timeline-row-end,minmax(0,1fr));grid-template-columns:var(--timeline-col-start,minmax(0,1fr)) auto var(--timeline-col-end,minmax(0,1fr))}.timeline>li>hr{width:100%;border-width:0}:where(.timeline>li>hr):first-child{grid-column-start:1;grid-row-start:2}:where(.timeline>li>hr):last-child{grid-column-start:3;grid-column-end:none;grid-row-start:2;grid-row-end:auto}.timeline-start{grid-column-start:1;grid-column-end:4;grid-row-start:1;grid-row-end:2;margin:.25rem;align-self:flex-end;justify-self:center}.timeline-middle{grid-column-start:2;grid-row-start:2}.timeline-end{grid-column-start:1;grid-column-end:4;grid-row-start:3;grid-row-end:4;margin:.25rem;align-self:flex-start;justify-self:center}.toast{position:fixed;display:flex;min-width:fit-content;flex-direction:column;white-space:nowrap}.toggle{flex-shrink:0}.alert{border-radius:var(--rounded-box,1rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));padding:1rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-b2,oklch(var(--b2)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1));background-color:var(--alert-bg)}.alert-info{border-color:var(--fallback-in,oklch(var(--in)/.2));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-in,oklch(var(--in)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-success{border-color:var(--fallback-su,oklch(var(--su)/.2));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-su,oklch(var(--su)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-warning{border-color:var(--fallback-wa,oklch(var(--wa)/.2));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)));--alert-bg:var(--fallback-wa,oklch(var(--wa)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-error{border-color:var(--fallback-er,oklch(var(--er)/.2));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-er,oklch(var(--er)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.avatar-group{display:flex;overflow:hidden}.avatar-group :where(.avatar){overflow:hidden;border-radius:9999px;border-width:4px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)))}.badge{border-radius:var(--rounded-badge,1.9rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.badge-neutral{--tw-border-opacity:1;border-color:var(--fallback-n,oklch(var(--n)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.badge-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.badge-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.badge-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.badge-info{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.badge-success{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.badge-warning{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.badge-error{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.badge-ghost{--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.badge-outline{border-color:currentColor;--tw-border-opacity:0.5;background-color:transparent;color:currentColor}.badge-outline.badge-neutral{--tw-text-opacity:1;color:var(--fallback-n,oklch(var(--n)/var(--tw-text-opacity)))}.badge-outline.badge-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}.badge-outline.badge-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}.badge-outline.badge-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}.badge-outline.badge-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}.badge-outline.badge-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}.badge-outline.badge-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}.badge-outline.badge-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}.btm-nav{height:4rem;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));color:currentColor}.btm-nav>*{border-color:currentColor}.btm-nav>:not(.active){padding-top:.125rem}.btm-nav>:where(.active){border-top-width:2px;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.btm-nav>.disabled,.btm-nav>[disabled]{pointer-events:none;--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.btm-nav>.disabled:hover,.btm-nav>[disabled]:hover{pointer-events:none;--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.btm-nav>* .label{font-size:1rem;line-height:1.5rem}.breadcrumbs{padding-top:.5rem;padding-bottom:.5rem}.breadcrumbs>ol>li>a:focus,.breadcrumbs>ul>li>a:focus{outline:2px solid transparent;outline-offset:2px}.breadcrumbs>ol>li>a:focus-visible,.breadcrumbs>ul>li>a:focus-visible{outline:2px solid currentColor;outline-offset:2px}.breadcrumbs>ol>li+:before,.breadcrumbs>ul>li+:before{content:"";margin-left:.5rem;margin-right:.75rem;display:block;height:.375rem;width:.375rem;--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));opacity:.4;border-top:1px solid;border-right:1px solid;background-color:transparent}[dir=rtl] .breadcrumbs>ol>li+:before,[dir=rtl] .breadcrumbs>ul>li+:before{--tw-rotate:-135deg}.btn{gap:.5rem;font-weight:600;text-decoration-line:none;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);border-width:var(--border-btn,1px);transition-property:color,background-color,border-color,opacity,box-shadow,transform}@media (prefers-reduced-motion:no-preference){.btn{animation:button-pop var(--animation-btn,.25s) ease-out}}.btn:active:focus,.btn:active:hover{animation:button-pop 0s ease-out;transform:scale(var(--btn-focus-scale,.97))}.btn{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));text-decoration-line:none;--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:var(--fallback-bc,oklch(var(--bc)/1));background-color:oklch(var(--btn-color,var(--b2)) / var(--tw-bg-opacity));--tw-bg-opacity:1;border-color:oklch(var(--btn-color,var(--b2)) / var(--tw-border-opacity));--tw-border-opacity:1}@supports not (color:oklch(0% 0 0)){.btn{background-color:var(--btn-color,var(--fallback-b2));border-color:var(--btn-color,var(--fallback-b2))}}@media (hover:hover){.btn:hover{--tw-border-opacity:1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn:hover{background-color:color-mix(in oklab,oklch(var(--btn-color,var(--b2)) / var(--tw-bg-opacity,1)) 90%,#000);border-color:color-mix(in oklab,oklch(var(--btn-color,var(--b2)) / var(--tw-border-opacity,1)) 90%,#000)}}@supports not (color:oklch(0% 0 0)){.btn:hover{background-color:var(--btn-color,var(--fallback-b2));border-color:var(--btn-color,var(--fallback-b2))}}}@supports (color:color-mix(in oklab,black,black)){.btn-active{background-color:color-mix(in oklab,oklch(var(--btn-color,var(--b3)) / var(--tw-bg-opacity,1)) 90%,#000);border-color:color-mix(in oklab,oklch(var(--btn-color,var(--b3)) / var(--tw-border-opacity,1)) 90%,#000)}}.btn:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px}.btn-primary{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}@supports (color:oklch(0% 0 0)){.btn-primary{--btn-color:var(--p)}}@supports not (color:oklch(0% 0 0)){.btn-primary{--btn-color:var(--fallback-p)}}.btn-secondary{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}@supports (color:oklch(0% 0 0)){.btn-secondary{--btn-color:var(--s)}}@supports not (color:oklch(0% 0 0)){.btn-secondary{--btn-color:var(--fallback-s)}}.btn-accent{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}@supports (color:oklch(0% 0 0)){.btn-accent{--btn-color:var(--a)}}@supports not (color:oklch(0% 0 0)){.btn-accent{--btn-color:var(--fallback-a)}}.btn-neutral{--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));outline-color:var(--fallback-n,oklch(var(--n)/1))}@supports (color:oklch(0% 0 0)){.btn-neutral{--btn-color:var(--n)}}@supports not (color:oklch(0% 0 0)){.btn-neutral{--btn-color:var(--fallback-n)}}.btn-info{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}@supports (color:oklch(0% 0 0)){.btn-info{--btn-color:var(--in)}}@supports not (color:oklch(0% 0 0)){.btn-info{--btn-color:var(--fallback-in)}}.btn-success{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}@supports (color:oklch(0% 0 0)){.btn-success{--btn-color:var(--su)}}@supports not (color:oklch(0% 0 0)){.btn-success{--btn-color:var(--fallback-su)}}.btn-warning{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}@supports (color:oklch(0% 0 0)){.btn-warning{--btn-color:var(--wa)}}@supports not (color:oklch(0% 0 0)){.btn-warning{--btn-color:var(--fallback-wa)}}.btn-error{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}@supports (color:oklch(0% 0 0)){.btn-error{--btn-color:var(--er)}}@supports not (color:oklch(0% 0 0)){.btn-error{--btn-color:var(--fallback-er)}}.btn.glass{--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn.glass:hover{--glass-opacity:25%;--glass-border-opacity:15%}}.btn.glass.btn-active{--glass-opacity:25%;--glass-border-opacity:15%}.btn-ghost{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn-ghost:hover{border-color:transparent}@supports (color:oklch(0% 0 0)){.btn-ghost:hover{background-color:var(--fallback-bc,oklch(var(--bc)/.2))}}}.btn-ghost.btn-active{border-color:transparent;background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.btn-link{border-color:transparent;background-color:transparent;--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)));text-decoration-line:underline;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn-link:hover{border-color:transparent;background-color:transparent;text-decoration-line:underline}}.btn-link.btn-active{border-color:transparent;background-color:transparent;text-decoration-line:underline}.btn-outline{border-color:currentColor;background-color:transparent;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}@media (hover:hover){.btn-outline:hover{--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}}.btn-outline.btn-active{--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}.btn-outline.btn-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-primary:hover{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-primary:hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}}.btn-outline.btn-primary.btn-active{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-primary.btn-active{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}.btn-outline.btn-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-secondary:hover{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-secondary:hover{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000)}}}.btn-outline.btn-secondary.btn-active{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-secondary.btn-active{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000)}}.btn-outline.btn-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-accent:hover{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-accent:hover{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000)}}}.btn-outline.btn-accent.btn-active{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-accent.btn-active{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000)}}.btn-outline.btn-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-success:hover{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-success:hover{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000)}}}.btn-outline.btn-success.btn-active{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-success.btn-active{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000)}}.btn-outline.btn-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-info:hover{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-info:hover{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000)}}}.btn-outline.btn-info.btn-active{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-info.btn-active{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000)}}.btn-outline.btn-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-warning:hover{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-warning:hover{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000)}}}.btn-outline.btn-warning.btn-active{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-warning.btn-active{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000)}}.btn-outline.btn-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-error:hover{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-error:hover{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000)}}}.btn-outline.btn-error.btn-active{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-error.btn-active{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000)}}.btn.btn-disabled,.btn:disabled,.btn[disabled]{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.btn-disabled:hover,.btn:disabled:hover,.btn[disabled]:hover{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.btn:is(input[type=checkbox]:checked),.btn:is(input[type=radio]:checked){--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@media (hover:hover){@supports (color:color-mix(in oklab,black,black)){.btn:is(input[type=checkbox]:checked):hover,.btn:is(input[type=radio]:checked):hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}}.btn:is(input[type=checkbox]:checked):focus-visible,.btn:is(input[type=radio]:checked):focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}@keyframes button-pop{0%{transform:scale(var(--btn-focus-scale,.98))}40%{transform:scale(1.02)}100%{transform:scale(1)}}.card{border-radius:var(--rounded-box,1rem)}.card :where(figure:first-child){overflow:hidden;border-start-start-radius:inherit;border-start-end-radius:inherit;border-end-start-radius:unset;border-end-end-radius:unset}.card :where(figure:last-child){overflow:hidden;border-start-start-radius:unset;border-start-end-radius:unset;border-end-start-radius:inherit;border-end-end-radius:inherit}.card:focus-visible{outline:2px solid currentColor;outline-offset:2px}.card.bordered{border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.card-bordered{border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.card.compact .card-body{padding:1rem;font-size:.875rem;line-height:1.25rem}.card-body{padding:var(--padding-card,2rem);display:flex;flex-direction:column;gap:.5rem}.card-title{display:flex;align-items:center;gap:.5rem;font-size:1.25rem;line-height:1.75rem;font-weight:600}.card.image-full:before{z-index:10;border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));opacity:.75}.card.image-full>.card-body{z-index:20;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.card.image-full :where(figure){overflow:hidden;border-radius:inherit}.carousel{-ms-overflow-style:none;scrollbar-width:none}.carousel::-webkit-scrollbar{display:none}.chat-bubble{border-radius:var(--rounded-box,1rem);min-height:2.75rem;min-width:2.75rem}.chat-bubble{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.chat-bubble-primary{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.chat-bubble-secondary{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.chat-bubble-accent{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.chat-bubble-info{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.chat-bubble-success{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.chat-bubble-warning{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.chat-bubble-error{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.chat-start .chat-bubble{border-end-start-radius:0}.chat-start .chat-bubble:before{inset-inline-start:-.749rem}.chat-end .chat-bubble{border-end-end-radius:0}.chat-end .chat-bubble:before{inset-inline-start:99.9%}.checkbox{--chkbg:var(--fallback-bc,oklch(var(--bc)/1));--chkfg:var(--fallback-b1,oklch(var(--b1)/1));height:1.5rem;width:1.5rem;cursor:pointer;appearance:none;border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2}.checkbox:focus{box-shadow:none}.checkbox:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.checkbox:disabled{border-width:0}.checkbox:checked,.checkbox[aria-checked=true]{background-repeat:no-repeat;animation:checkmark var(--animation-input,.2s) ease-out;background-color:var(--chkbg);background-image:linear-gradient(-45deg,transparent 65%,var(--chkbg) 65.99%),linear-gradient(45deg,transparent 75%,var(--chkbg) 75.99%),linear-gradient(-45deg,var(--chkbg) 40%,transparent 40.99%),linear-gradient(45deg,var(--chkbg) 30%,var(--chkfg) 30.99%,var(--chkfg) 40%,transparent 40.99%),linear-gradient(-45deg,var(--chkfg) 50%,var(--chkbg) 50.99%)}.checkbox:indeterminate{--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-repeat:no-repeat;animation:checkmark var(--animation-input,.2s) ease-out;background-image:linear-gradient(90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(-90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(0deg,var(--chkbg) 43%,var(--chkfg) 43%,var(--chkfg) 57%,var(--chkbg) 57%)}.checkbox-primary{--chkbg:var(--fallback-p,oklch(var(--p)/1));--chkfg:var(--fallback-pc,oklch(var(--pc)/1));--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-primary:hover{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}}.checkbox-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.checkbox-primary:checked,.checkbox-primary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.checkbox-secondary{--chkbg:var(--fallback-s,oklch(var(--s)/1));--chkfg:var(--fallback-sc,oklch(var(--sc)/1));--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-secondary:hover{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}}.checkbox-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.checkbox-secondary:checked,.checkbox-secondary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.checkbox-accent{--chkbg:var(--fallback-a,oklch(var(--a)/1));--chkfg:var(--fallback-ac,oklch(var(--ac)/1));--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-accent:hover{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}}.checkbox-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.checkbox-accent:checked,.checkbox-accent[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.checkbox-success{--chkbg:var(--fallback-su,oklch(var(--su)/1));--chkfg:var(--fallback-suc,oklch(var(--suc)/1));--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-success:hover{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}}.checkbox-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.checkbox-success:checked,.checkbox-success[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.checkbox-warning{--chkbg:var(--fallback-wa,oklch(var(--wa)/1));--chkfg:var(--fallback-wac,oklch(var(--wac)/1));--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-warning:hover{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}}.checkbox-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.checkbox-warning:checked,.checkbox-warning[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.checkbox-info{--chkbg:var(--fallback-in,oklch(var(--in)/1));--chkfg:var(--fallback-inc,oklch(var(--inc)/1));--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-info:hover{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}}.checkbox-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.checkbox-info:checked,.checkbox-info[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.checkbox-error{--chkbg:var(--fallback-er,oklch(var(--er)/1));--chkfg:var(--fallback-erc,oklch(var(--erc)/1));--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-error:hover{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}}.checkbox-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.checkbox-error:checked,.checkbox-error[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.checkbox:disabled{cursor:not-allowed;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.2}@keyframes checkmark{0%{background-position-y:5px}50%{background-position-y:-2px}100%{background-position-y:0}}.checkbox-mark{display:none}.collapse{width:100%;border-radius:var(--rounded-box,1rem)}details.collapse{width:100%}details.collapse summary{position:relative;display:block}details.collapse summary::-webkit-details-marker{display:none}.collapse:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}details.collapse summary{outline:2px solid transparent;outline-offset:2px}.collapse:has(.collapse-title:focus-visible),.collapse:has(>input[type=checkbox]:focus-visible),.collapse:has(>input[type=radio]:focus-visible){outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse-arrow>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;--tw-translate-y:-100%;--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:150ms;transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;top:1.9rem;inset-inline-end:1.4rem;content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.collapse-plus>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);top:.9rem;inset-inline-end:1.4rem;content:"+";pointer-events:none}.collapse:not(.collapse-open):not(.collapse-close)>.collapse-title,.collapse:not(.collapse-open):not(.collapse-close)>input[type=checkbox],.collapse:not(.collapse-open):not(.collapse-close)>input[type=radio]:not(:checked){cursor:pointer}.collapse:focus:not(.collapse-open):not(.collapse-close):not(.collapse[open])>.collapse-title{cursor:unset}.collapse-title{position:relative}:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){z-index:1}.collapse-title,:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){width:100%;padding:1rem;padding-inline-end:3rem;min-height:3.75rem;transition:background-color .2s ease-out}.collapse-content{padding-left:1rem;padding-right:1rem;cursor:unset;transition:padding .2s ease-out,background-color .2s ease-out}.collapse-open>:where(.collapse-content),.collapse:focus:not(.collapse-close)>:where(.collapse-content),.collapse:not(.collapse-close)>:where(input[type=checkbox]:checked~.collapse-content),.collapse:not(.collapse-close)>:where(input[type=radio]:checked~.collapse-content),.collapse[open]>:where(.collapse-content){padding-bottom:1rem;transition:padding .2s ease-out,background-color .2s ease-out}.collapse-arrow:focus:not(.collapse-close)>.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after,.collapse-open.collapse-arrow>.collapse-title:after,.collapse[open].collapse-arrow>.collapse-title:after{--tw-translate-y:-50%;--tw-rotate:225deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.collapse-open.collapse-plus>.collapse-title:after,.collapse-plus:focus:not(.collapse-close)>.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after,.collapse[open].collapse-plus>.collapse-title:after{content:"−"}.countdown>:before{text-align:center;transition:all 1s cubic-bezier(1,0,0,1)}.diff-item-1:after{border-radius:9999px;border-width:2px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)));background-color:var(--fallback-b1,oklch(var(--b1)/.5));--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-style:solid;outline-offset:-3px;outline-color:var(--fallback-bc,oklch(var(--bc)/.05));--tw-backdrop-blur:blur(8px);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);translate:50% -50%}.diff-item-2{border-right-width:2px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)))}.divider{margin-top:1rem;margin-bottom:1rem;height:1rem;white-space:nowrap}.divider:after,.divider:before{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.divider:not(:empty){gap:1rem}.divider-neutral:after,.divider-neutral:before{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)))}.divider-primary:after,.divider-primary:before{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.divider-secondary:after,.divider-secondary:before{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.divider-accent:after,.divider-accent:before{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.divider-success:after,.divider-success:before{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.divider-warning:after,.divider-warning:before{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.divider-info:after,.divider-info:before{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.divider-error:after,.divider-error:before{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.drawer{width:100%}.drawer-side>.drawer-overlay{cursor:pointer;background-color:transparent;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1)}.drawer-toggle:checked~.drawer-side>.drawer-overlay{background-color:#0006}.drawer-toggle:focus-visible~.drawer-content label.drawer-button{outline-style:solid;outline-width:2px;outline-offset:2px}.dropdown:is(:not(details)) .dropdown-content{transform-origin:top;--tw-scale-x:.95;--tw-scale-y:.95;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1)}.dropdown-bottom .dropdown-content{transform-origin:top}.dropdown-top .dropdown-content{transform-origin:bottom}.dropdown-left .dropdown-content{transform-origin:right}.dropdown-right .dropdown-content{transform-origin:left}.dropdown.dropdown-open .dropdown-content,.dropdown:focus .dropdown-content,.dropdown:focus-within .dropdown-content{--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@media (hover:hover){.dropdown.dropdown-hover:hover .dropdown-content{--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}}.file-input{overflow:hidden;border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));font-size:1rem;line-height:1.5rem}.file-input::file-selector-button{border-style:solid;--tw-border-opacity:1;border-color:var(--fallback-n,oklch(var(--n)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));font-weight:600;text-transform:uppercase;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));text-decoration-line:none;border-width:var(--border-btn,1px);animation:button-pop var(--animation-btn,.25s) ease-out}.file-input-bordered{--tw-border-opacity:0.2}.file-input:focus{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.file-input-ghost{--tw-bg-opacity:0.05}.file-input-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.file-input-ghost::file-selector-button{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor}.file-input-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.file-input-primary:focus{outline-color:var(--fallback-p,oklch(var(--p)/1))}.file-input-primary::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.file-input-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.file-input-secondary:focus{outline-color:var(--fallback-s,oklch(var(--s)/1))}.file-input-secondary::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.file-input-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.file-input-accent:focus{outline-color:var(--fallback-a,oklch(var(--a)/1))}.file-input-accent::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.file-input-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.file-input-info:focus{outline-color:var(--fallback-in,oklch(var(--in)/1))}.file-input-info::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.file-input-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.file-input-success:focus{outline-color:var(--fallback-su,oklch(var(--su)/1))}.file-input-success::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.file-input-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.file-input-warning:focus{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.file-input-warning::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.file-input-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.file-input-error:focus{outline-color:var(--fallback-er,oklch(var(--er)/1))}.file-input-error::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.file-input-disabled,.file-input[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));--tw-text-opacity:0.2}.file-input-disabled::placeholder,.file-input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.file-input-disabled::file-selector-button,.file-input[disabled]::file-selector-button{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}.footer{column-gap:1rem;row-gap:2.5rem;font-size:.875rem;line-height:1.25rem}.footer>*{gap:.5rem}.footer-title{margin-bottom:.5rem;font-weight:700;text-transform:uppercase;opacity:.6}.label{padding-left:.25rem;padding-right:.25rem;padding-top:.5rem;padding-bottom:.5rem}.label-text{font-size:.875rem;line-height:1.25rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.label-text-alt{font-size:.75rem;line-height:1rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}@media(hover:hover){.label a:hover{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}}.hero-overlay{background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.5}.hero-content{max-width:80rem;gap:1rem;padding:1rem}.input{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));font-size:1rem;line-height:1.5rem}.input input{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));background-color:transparent}.input input:focus{outline:2px solid transparent;outline-offset:2px}.input[list]::-webkit-calendar-picker-indicator{line-height:1em}.input-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:focus,.input:focus-within{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input-ghost{--tw-bg-opacity:0.05}.input-ghost:focus,.input-ghost:focus-within{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.input-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.input-primary:focus,.input-primary:focus-within{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.input-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.input-secondary:focus,.input-secondary:focus-within{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.input-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.input-accent:focus,.input-accent:focus-within{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.input-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.input-info:focus,.input-info:focus-within{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.input-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.input-success:focus,.input-success:focus-within{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.input-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.input-warning:focus,.input-warning:focus-within{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.input-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.input-error:focus,.input-error:focus-within{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.input-disabled,.input:disabled,.input:has(>input[disabled]),.input[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.input-disabled::placeholder,.input:disabled::placeholder,.input:has(>input[disabled])::placeholder,.input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.input:has(>input[disabled])>input[disabled]{cursor:not-allowed}.input::-webkit-date-and-time-value{text-align:inherit}.join{border-radius:var(--rounded-btn,.5rem)}.join>:where(:not(:first-child)){margin-top:0;margin-bottom:0;margin-inline-start:-1px}.join>:where(:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1)}.join-item:focus{isolation:isolate}.kbd{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2;--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:.5rem;padding-right:.5rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));border-bottom-width:2px;min-height:2.2em;min-width:2.2em}.link-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-primary:hover{color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 80%,#000)}}}.link-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-secondary:hover{color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 80%,#000)}}}.link-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-accent:hover{color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 80%,#000)}}}.link-neutral{--tw-text-opacity:1;color:var(--fallback-n,oklch(var(--n)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-neutral:hover{color:color-mix(in oklab,var(--fallback-n,oklch(var(--n)/1)) 80%,#000)}}}.link-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-success:hover{color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 80%,#000)}}}.link-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-info:hover{color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 80%,#000)}}}.link-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-warning:hover{color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 80%,#000)}}}.link-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-error:hover{color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 80%,#000)}}}.link:focus{outline:2px solid transparent;outline-offset:2px}.link:focus-visible{outline:2px solid currentColor;outline-offset:2px}.loading{pointer-events:none;display:inline-block;aspect-ratio:1/1;width:1.5rem;background-color:currentColor;mask-size:100%;mask-repeat:no-repeat;mask-position:center;mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-spinner{mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-dots{mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E")}.loading-ring{mask-image:url("data:image/svg+xml,%3Csvg width='44' height='44' viewBox='0 0 44 44' xmlns='http://www.w3.org/2000/svg' stroke='%23fff'%3E%3Cg fill='none' fill-rule='evenodd' stroke-width='2'%3E%3Ccircle cx='22' cy='22' r='1'%3E%3Canimate attributeName='r' begin='0s' dur='1.8s' values='1; 20' calcMode='spline' keyTimes='0; 1' keySplines='0.165, 0.84, 0.44, 1' repeatCount='indefinite' /%3E%3Canimate attributeName='stroke-opacity' begin='0s' dur='1.8s' values='1; 0' calcMode='spline' keyTimes='0; 1' keySplines='0.3, 0.61, 0.355, 1' repeatCount='indefinite' /%3E%3C/circle%3E%3Ccircle cx='22' cy='22' r='1'%3E%3Canimate attributeName='r' begin='-0.9s' dur='1.8s' values='1; 20' calcMode='spline' keyTimes='0; 1' keySplines='0.165, 0.84, 0.44, 1' repeatCount='indefinite' /%3E%3Canimate attributeName='stroke-opacity' begin='-0.9s' dur='1.8s' values='1; 0' calcMode='spline' keyTimes='0; 1' keySplines='0.3, 0.61, 0.355, 1' repeatCount='indefinite' /%3E%3C/circle%3E%3C/g%3E%3C/svg%3E")}.loading-ball{mask-image:url("data:image/svg+xml,%0A%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_rXNP%7Banimation:spinner_YeBj .8s infinite%7D@keyframes spinner_YeBj%7B0%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);cy:5px%7D46.875%25%7Bcy:20px;rx:4px;ry:4px%7D50%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);cy:20.5px;rx:4.8px;ry:3px%7D53.125%25%7Brx:4px;ry:4px%7D100%25%7Bcy:5px%7D%7D%3C/style%3E%3Cellipse class='spinner_rXNP' cx='12' cy='5' rx='4' ry='4'/%3E%3C/svg%3E")}.loading-bars{mask-image:url("data:image/svg+xml,%0A%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_hzlK%7Banimation:spinner_vc4H .8s linear infinite;animation-delay:-.8s%7D.spinner_koGT%7Banimation-delay:-.65s%7D.spinner_YF1u%7Banimation-delay:-.5s%7D@keyframes spinner_vc4H%7B0%25%7By:1px;height:22px%7D93.75%25%7By:5px;height:14px;opacity:.2%7D%7D%3C/style%3E%3Crect class='spinner_hzlK' x='1' y='1' width='6' height='22'/%3E%3Crect class='spinner_hzlK spinner_koGT' x='9' y='1' width='6' height='22'/%3E%3Crect class='spinner_hzlK spinner_YF1u' x='17' y='1' width='6' height='22'/%3E%3C/svg%3E")}.loading-infinity{mask-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' style='shape-rendering: auto;' width='200px' height='200px' viewBox='0 0 100 100' preserveAspectRatio='xMidYMid'%3E%3Cpath fill='none' stroke='%230a0a0a' stroke-width='10' stroke-dasharray='205.271142578125 51.317785644531256' d='M24.3 30C11.4 30 5 43.3 5 50s6.4 20 19.3 20c19.3 0 32.1-40 51.4-40 C88.6 30 95 43.3 95 50s-6.4 20-19.3 20C56.4 70 43.6 30 24.3 30z' stroke-linecap='round' style='transform:scale(0.8);transform-origin:50px 50px'%3E%3Canimate attributeName='stroke-dashoffset' repeatCount='indefinite' dur='2s' keyTimes='0;1' values='0;256.58892822265625'%3E%3C/animate%3E%3C/path%3E%3C/svg%3E")}.loading-xs{width:1rem}.loading-sm{width:1.25rem}.loading-md{width:1.5rem}.loading-lg{width:2.5rem}.mask-squircle{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M100 0C20 0 0 20 0 100s20 100 100 100 100-20 100-100S180 0 100 0Z'/%3e%3c/svg%3e")}.mask-decagon{mask-image:url("data:image/svg+xml,%3csvg width='192' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 0 58.779 19.098 36.327 50v61.804l-36.327 50L96 200l-58.779-19.098-36.327-50V69.098l36.327-50z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-diamond{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m100 0 100 100-100 100L0 100z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-heart{mask-image:url("data:image/svg+xml,%3csvg width='200' height='185' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M100 184.606a15.384 15.384 0 0 1-8.653-2.678C53.565 156.28 37.205 138.695 28.182 127.7 8.952 104.264-.254 80.202.005 54.146.308 24.287 24.264 0 53.406 0c21.192 0 35.869 11.937 44.416 21.879a2.884 2.884 0 0 0 4.356 0C110.725 11.927 125.402 0 146.594 0c29.142 0 53.098 24.287 53.4 54.151.26 26.061-8.956 50.122-28.176 73.554-9.023 10.994-25.383 28.58-63.165 54.228a15.384 15.384 0 0 1-8.653 2.673Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-hexagon{mask-image:url("data:image/svg+xml,%3csvg width='182' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M.3 65.486c0-9.196 6.687-20.063 14.211-25.078l61.86-35.946c8.36-5.016 20.899-5.016 29.258 0l61.86 35.946c8.36 5.015 14.211 15.882 14.211 25.078v71.055c0 9.196-6.687 20.063-14.211 25.079l-61.86 35.945c-8.36 4.18-20.899 4.18-29.258 0L14.51 161.62C6.151 157.44.3 145.737.3 136.54V65.486Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-hexagon-2{mask-image:url("data:image/svg+xml,%3csvg width='200' height='182' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M64.786 181.4c-9.196 0-20.063-6.687-25.079-14.21L3.762 105.33c-5.016-8.36-5.016-20.9 0-29.259l35.945-61.86C44.723 5.851 55.59 0 64.786 0h71.055c9.196 0 20.063 6.688 25.079 14.211l35.945 61.86c4.18 8.36 4.18 20.899 0 29.258l-35.945 61.86c-4.18 8.36-15.883 14.211-25.079 14.211H64.786Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-circle{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle fill='black' cx='100' cy='100' r='100' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram{mask-image:url("data:image/svg+xml,%3csvg width='200' height='154' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M46.154 0H200l-46.154 153.846H0z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-2{mask-image:url("data:image/svg+xml,%3csvg width='200' height='154' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M153.846 0H0l46.154 153.846H200z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-3{mask-image:url("data:image/svg+xml,%3csvg width='154' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M.077 47.077v153.846l153.846-46.154V.923z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-4{mask-image:url("data:image/svg+xml,%3csvg width='154' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M153.923 47.077v153.846L.077 154.77V.923z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-pentagon{mask-image:url("data:image/svg+xml,%3csvg width='192' height='181' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 0 95.106 69.098-36.327 111.804H37.22L.894 69.098z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-square{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M0 0h200v200H0z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-star{mask-image:url("data:image/svg+xml,%3csvg width='192' height='180' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 137.263-58.779 42.024 22.163-68.389L.894 68.481l72.476-.243L96 0l22.63 68.238 72.476.243-58.49 42.417 22.163 68.389z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-star-2{mask-image:url("data:image/svg+xml,%3csvg width='192' height='180' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 153.044-58.779 26.243 7.02-63.513L.894 68.481l63.117-13.01L96 0l31.989 55.472 63.117 13.01-43.347 47.292 7.02 63.513z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle{mask-image:url("data:image/svg+xml,%3csvg width='174' height='149' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m87 148.476-86.603.185L43.86 74.423 87 0l43.14 74.423 43.463 74.238z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-2{mask-image:url("data:image/svg+xml,%3csvg width='174' height='150' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m87 .738 86.603-.184-43.463 74.238L87 149.214 43.86 74.792.397.554z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-3{mask-image:url("data:image/svg+xml,%3csvg width='150' height='174' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m149.369 87.107.185 86.603-74.239-43.463L.893 87.107l74.422-43.14L149.554.505z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-4{mask-image:url("data:image/svg+xml,%3csvg width='150' height='174' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M.631 87.107.446.505l74.239 43.462 74.422 43.14-74.422 43.14L.446 173.71z' fill-rule='evenodd'/%3e%3c/svg%3e")}.menu{padding:.5rem}:where(.menuli:empty){--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;margin:.5rem 1rem;height:1px}.menu :where(liul){margin-inline-start:1rem;padding-inline-start:.5rem}.menu :where(liul):before{position:absolute;bottom:.75rem;inset-inline-start:0;top:.75rem;width:1px;--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;content:""}.menu :where(li:not(.menu-title)>:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){border-radius:var(--rounded-btn,.5rem);padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;text-align:start;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);text-wrap:balance}:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(summary,.active,.btn).focus,:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(summary,.active,.btn):focus,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn).focus,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn):focus{cursor:pointer;background-color:var(--fallback-bc,oklch(var(--bc)/.1));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));outline:2px solid transparent;outline-offset:2px}@media (hover:hover){:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{cursor:pointer;outline:2px solid transparent;outline-offset:2px}@supports (color:oklch(0% 0 0)){:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}}}.menu li>:not(ul,.menu-title,details,.btn).active,.menu li>:not(ul,.menu-title,details,.btn):active,.menu li>details>summary:active{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}@media(hover:hover){.menu li>:not(ul,.menu-title,details,.btn).active,.menu li>:not(ul,.menu-title,details,.btn):active,.menu li>details>summary:active{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}}.menu li.disabled{color:var(--fallback-bc,oklch(var(--bc)/.3))}.menu :where(li>details>summary)::-webkit-details-marker{display:none}.menu :where(li>.menu-dropdown-toggle):after,.menu :where(li>details>summary):after{justify-self:end;display:block;margin-top:-.5rem;height:.5rem;width:.5rem;transform:rotate(45deg);transition-property:transform,margin-top;transition-duration:.3s;transition-timing-function:cubic-bezier(.4,0,.2,1);content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.menu :where(li>.menu-dropdown-toggle.menu-dropdown-show):after,.menu :where(li>details[open]>summary):after{transform:rotate(225deg);margin-top:0}.menu-title{padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;font-size:.875rem;line-height:1.25rem;font-weight:700;color:var(--fallback-bc,oklch(var(--bc)/.4))}.mockup-code{min-width:18rem;border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));padding-top:1.25rem;padding-bottom:1.25rem;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));direction:ltr}.mockup-code:before{content:"";margin-bottom:1rem;display:block;height:.75rem;width:.75rem;border-radius:9999px;opacity:.3;box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-code pre{padding-right:1.25rem}.mockup-code pre:before{content:"";margin-right:2ch}.mockup-code pre[data-prefix]:before{content:attr(data-prefix);width:2rem;opacity:.5}.mockup-window{display:flex;flex-direction:column;border-radius:var(--rounded-box,1rem);padding-top:1.25rem}.mockup-window:before{content:"";margin-bottom:1rem;display:block;aspect-ratio:1/1;height:.75rem;flex-shrink:0;align-self:flex-start;border-radius:9999px;opacity:.3}.mockup-window:where([dir=rtl],[dir=rtl]*):before{align-self:flex-end}.mockup-window:before{box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-phone{display:inline-block;border:4px solid #444;border-radius:50px;background-color:#000;padding:10px;margin:0 auto;overflow:hidden}.mockup-phone .camera{position:relative;top:0;left:0;background:#000;height:25px;width:150px;margin:0 auto;border-bottom-left-radius:17px;border-bottom-right-radius:17px;z-index:11}.mockup-phone .camera:before{content:"";position:absolute;top:35%;left:50%;width:50px;height:4px;border-radius:5px;background-color:#0c0b0e;transform:translate(-50%,-50%)}.mockup-phone .camera:after{content:"";position:absolute;top:20%;left:70%;width:8px;height:8px;border-radius:5px;background-color:#0f0b25}.mockup-phone .display{overflow:hidden;border-radius:40px;margin-top:-25px}.mockup-browser{border-radius:var(--rounded-box,1rem)}.mockup-browser .mockup-browser-toolbar{margin-top:.75rem;margin-bottom:.75rem;display:inline-flex;width:100%;align-items:center;padding-right:1.4em}.mockup-browser .mockup-browser-toolbar:where([dir=rtl],[dir=rtl]*){flex-direction:row-reverse}.mockup-browser .mockup-browser-toolbar:before{content:"";margin-right:4.8rem;display:inline-block;aspect-ratio:1/1;height:.75rem;border-radius:9999px;opacity:.3;box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-browser .mockup-browser-toolbar .input{position:relative;margin-left:auto;margin-right:auto;display:block;height:1.75rem;width:24rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:2rem;direction:ltr}.mockup-browser .mockup-browser-toolbar .input:before{content:"";position:absolute;left:.5rem;top:50%;aspect-ratio:1/1;height:.75rem;--tw-translate-y:-50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:2px;border-color:currentColor;opacity:.6}.mockup-browser .mockup-browser-toolbar .input:after{content:"";position:absolute;left:1.25rem;top:50%;height:.5rem;--tw-translate-y:25%;--tw-rotate:-45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:1px;border-color:currentColor;opacity:.6}.modal{background-color:transparent;color:inherit;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity,visibility;overflow-y:hidden;overscroll-behavior:contain}.modal::backdrop,.modal:not(dialog:not(.modal-open)){background-color:#0006;animation:modal-pop .2s ease-out}.modal-backdrop{z-index:-1;grid-column-start:1;grid-row-start:1;display:grid;align-self:stretch;justify-self:stretch;color:transparent}.modal-box{grid-column-start:1;grid-row-start:1;width:91.666667%;max-width:32rem;--tw-scale-x:.9;--tw-scale-y:.9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box,1rem);border-bottom-left-radius:var(--rounded-box,1rem);border-top-left-radius:var(--rounded-box,1rem);border-top-right-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding:1.5rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);box-shadow:rgba(0,0,0,.25) 0 25px 50px -12px;overflow-y:auto;overscroll-behavior:contain}.modal-open .modal-box,.modal-toggle:checked+.modal .modal-box,.modal:target .modal-box,.modal[open] .modal-box{--tw-translate-y:0px;--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.modal-action{margin-top:1.5rem;justify-content:flex-end}.modal-action>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}@keyframes modal-pop{0%{opacity:0}}.navbar{padding:var(--navbar-padding,.5rem);min-height:4rem;width:100%}.progress{height:.5rem;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.progress::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)))}.progress-primary::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.progress-secondary::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.progress-accent::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.progress-info::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.progress-success::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.progress-warning::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.progress-error::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.progress:indeterminate{--progress-color:var(--fallback-bc,oklch(var(--bc)/1))}.progress-primary:indeterminate{--progress-color:var(--fallback-p,oklch(var(--p)/1))}.progress-secondary:indeterminate{--progress-color:var(--fallback-s,oklch(var(--s)/1))}.progress-accent:indeterminate{--progress-color:var(--fallback-a,oklch(var(--a)/1))}.progress-info:indeterminate{--progress-color:var(--fallback-in,oklch(var(--in)/1))}.progress-success:indeterminate{--progress-color:var(--fallback-su,oklch(var(--su)/1))}.progress-warning:indeterminate{--progress-color:var(--fallback-wa,oklch(var(--wa)/1))}.progress-error:indeterminate{--progress-color:var(--fallback-er,oklch(var(--er)/1))}.progress::-webkit-progress-bar{border-radius:var(--rounded-box,1rem);background-color:transparent}.progress::-webkit-progress-value{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)))}.progress-primary::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.progress-secondary::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.progress-accent::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.progress-info::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.progress-success::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.progress-warning::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.progress-error::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.progress:indeterminate{background-image:repeating-linear-gradient(90deg,var(--progress-color) -1%,var(--progress-color) 10%,transparent 10%,transparent 90%);background-size:200%;background-position-x:15%;animation:progress-loading 5s ease-in-out infinite}.progress:indeterminate::-moz-progress-bar{background-color:transparent;background-image:repeating-linear-gradient(90deg,var(--progress-color) -1%,var(--progress-color) 10%,transparent 10%,transparent 90%);background-size:200%;background-position-x:15%;animation:progress-loading 5s ease-in-out infinite}@keyframes progress-loading{50%{background-position-x:-115%}}.radial-progress{--value:0;--size:5rem;--thickness:calc(var(--size) / 10)}.radial-progress:after{background-color:currentColor}.radio{--chkbg:var(--bc);height:1.5rem;width:1.5rem;cursor:pointer;appearance:none;border-radius:9999px;border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2}.radio:focus{box-shadow:none}.radio:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.radio:checked,.radio[aria-checked=true]{--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-image:none;animation:radiomark var(--animation-input,.2s) ease-out;box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}.radio-primary{--chkbg:var(--p);--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}@media(hover:hover){.radio-primary:hover{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}}.radio-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.radio-primary:checked,.radio-primary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.radio-secondary{--chkbg:var(--s);--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}@media(hover:hover){.radio-secondary:hover{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}}.radio-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.radio-secondary:checked,.radio-secondary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.radio-accent{--chkbg:var(--a);--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}@media(hover:hover){.radio-accent:hover{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}}.radio-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.radio-accent:checked,.radio-accent[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.radio-success{--chkbg:var(--su);--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}@media(hover:hover){.radio-success:hover{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}}.radio-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.radio-success:checked,.radio-success[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.radio-warning{--chkbg:var(--wa);--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}@media(hover:hover){.radio-warning:hover{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}}.radio-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.radio-warning:checked,.radio-warning[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.radio-info{--chkbg:var(--in);--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}@media(hover:hover){.radio-info:hover{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}}.radio-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.radio-info:checked,.radio-info[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.radio-error{--chkbg:var(--er);--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}@media(hover:hover){.radio-error:hover{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}}.radio-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.radio-error:checked,.radio-error[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.radio:disabled{cursor:not-allowed;opacity:.2}@keyframes radiomark{0%{box-shadow:0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset}50%{box-shadow:0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset}100%{box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}}.radio-mark{display:none}.range{appearance:none;-webkit-appearance:none;--range-shdw:var(--fallback-bc,oklch(var(--bc)/1));overflow:hidden;border-radius:var(--rounded-box,1rem);background-color:transparent}.range:focus-visible::-webkit-slider-thumb{--focus-shadow:0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 2rem var(--range-shdw) inset}.range:focus-visible::-moz-range-thumb{--focus-shadow:0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 2rem var(--range-shdw) inset}.range::-webkit-slider-runnable-track{height:.5rem;width:100%;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-moz-range-track{height:.5rem;width:100%;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-webkit-slider-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box,1rem);border-style:none;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));appearance:none;-webkit-appearance:none;top:50%;color:var(--range-shdw);transform:translateY(-50%);--filler-size:100rem;--filler-offset:0.6rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow,0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range::-moz-range-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box,1rem);border-style:none;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));top:50%;color:var(--range-shdw);--filler-size:100rem;--filler-offset:0.5rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow,0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range-primary{--range-shdw:var(--fallback-p,oklch(var(--p)/1))}.range-secondary{--range-shdw:var(--fallback-s,oklch(var(--s)/1))}.range-accent{--range-shdw:var(--fallback-a,oklch(var(--a)/1))}.range-success{--range-shdw:var(--fallback-su,oklch(var(--su)/1))}.range-warning{--range-shdw:var(--fallback-wa,oklch(var(--wa)/1))}.range-info{--range-shdw:var(--fallback-in,oklch(var(--in)/1))}.range-error{--range-shdw:var(--fallback-er,oklch(var(--er)/1))}.rating input{appearance:none;-webkit-appearance:none}.rating :where(input){animation:rating-pop var(--animation-input,.25s) ease-out;height:1.5rem;width:1.5rem;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-bg-opacity:1}.rating .rating-hidden{width:.5rem;background-color:transparent}.rating input[type=radio]:checked{background-image:none}.rating input:checked~input,.rating input[aria-checked=true]~input{--tw-bg-opacity:0.2}.rating input:focus-visible{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);transform:translateY(-.125em)}.rating input:active:focus{animation:none;transform:translateY(-.125em)}.rating-half :where(input:not(.rating-hidden)){width:.75rem}@keyframes rating-pop{0%{transform:translateY(-.125em)}40%{transform:translateY(-.125em)}100%{transform:translateY(0)}}.select{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding-inline-end:2.5rem}.select-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.select{background-image:linear-gradient(45deg,transparent 50%,currentColor 50%),linear-gradient(135deg,currentColor 50%,transparent 50%);background-position:calc(100% - 20px) calc(1px + 50%),calc(100% - 16.1px) calc(1px + 50%);background-size:4px 4px,4px 4px;background-repeat:no-repeat}.select:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.select-ghost{--tw-bg-opacity:0.05}.select-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.select-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.select-primary:focus{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.select-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.select-secondary:focus{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.select-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.select-accent:focus{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.select-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.select-info:focus{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.select-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.select-success:focus{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.select-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.select-warning:focus{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.select-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.select-error:focus{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.select-disabled,.select:disabled,.select[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.select-disabled::placeholder,.select:disabled::placeholder,.select[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.select-multiple,.select[multiple],.select[size].select:not([size="1"]){background-image:none;padding-right:1rem}[dir=rtl] .select{background-position:calc(0% + 12px) calc(1px + 50%),calc(0% + 16px) calc(1px + 50%)}.skeleton{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));will-change:background-position;animation:skeleton 1.8s ease-in-out infinite;background-image:linear-gradient(105deg,transparent 0,transparent 40%,var(--fallback-b1,oklch(var(--b1)/1)) 50%,transparent 60%,transparent 100%);background-size:200% auto;background-repeat:no-repeat;background-position-x:-50%}@media (prefers-reduced-motion){.skeleton{animation-duration:15s}}@keyframes skeleton{from{background-position:150%}to{background-position:-50%}}.stack{place-items:center;align-items:flex-end}.stack>*{width:100%;opacity:.6}.stack>:nth-child(2){opacity:.8}.stack>:nth-child(1){opacity:1}.stats{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}:where(.stats)>:not([hidden])~:not([hidden]){--tw-divide-x-reverse:0;border-right-width:calc(1px * var(--tw-divide-x-reverse));border-left-width:calc(1px * calc(1 - var(--tw-divide-x-reverse)));--tw-divide-y-reverse:0;border-top-width:calc(0px * calc(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(0px * var(--tw-divide-y-reverse))}:where(.stats){overflow-x:auto}[dir=rtl] .stats>:not([hidden])~:not([hidden]){--tw-divide-x-reverse:1}.stat{column-gap:1rem;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.1;padding-left:1.5rem;padding-right:1.5rem;padding-top:1rem;padding-bottom:1rem}.stat-title{color:var(--fallback-bc,oklch(var(--bc)/.6))}.stat-value{font-size:2.25rem;line-height:2.5rem;font-weight:800}.stat-desc{font-size:.75rem;line-height:1rem;color:var(--fallback-bc,oklch(var(--bc)/.6))}.stat-actions{margin-top:1rem}.steps .step{grid-template-rows:40px 1fr;grid-template-columns:auto;min-width:4rem}.steps .step:before{top:0;grid-column-start:1;grid-row-start:1;height:.5rem;width:100%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));content:"";margin-inline-start:-100%}.steps .step:after{content:counter(step);counter-increment:step;z-index:1;position:relative;grid-column-start:1;grid-row-start:1;display:grid;height:2rem;width:2rem;place-items:center;place-self:center;border-radius:9999px;--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.steps .step:first-child:before{content:none}.steps .step[data-content]:after{content:attr(data-content)}.steps .step-neutral+.step-neutral:before,.steps .step-neutral:after{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.steps .step-primary+.step-primary:before,.steps .step-primary:after{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.steps .step-secondary+.step-secondary:before,.steps .step-secondary:after{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.steps .step-accent+.step-accent:before,.steps .step-accent:after{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.steps .step-info+.step-info:before{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.steps .step-info:after{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.steps .step-success+.step-success:before{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.steps .step-success:after{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.steps .step-warning+.step-warning:before{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.steps .step-warning:after{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.steps .step-error+.step-error:before{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.steps .step-error:after{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.swap{cursor:pointer}.swap>*{transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity}.swap-rotate .swap-indeterminate,.swap-rotate .swap-on,.swap-rotate input:indeterminate~.swap-on{--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-active:where(.swap-rotate) .swap-off,.swap-rotate input:checked~.swap-off,.swap-rotate input:indeterminate~.swap-off{--tw-rotate:-45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-active:where(.swap-rotate) .swap-on,.swap-rotate input:checked~.swap-on,.swap-rotate input:indeterminate~.swap-indeterminate{--tw-rotate:0deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-flip{transform-style:preserve-3d;perspective:16em}.swap-flip .swap-indeterminate,.swap-flip .swap-on,.swap-flip input:indeterminate~.swap-on{transform:rotateY(180deg);backface-visibility:hidden;opacity:1}.swap-active:where(.swap-flip) .swap-off,.swap-flip input:checked~.swap-off,.swap-flip input:indeterminate~.swap-off{transform:rotateY(-180deg);backface-visibility:hidden;opacity:1}.swap-active:where(.swap-flip) .swap-on,.swap-flip input:checked~.swap-on,.swap-flip input:indeterminate~.swap-indeterminate{transform:rotateY(0)}.tabs-lifted>.tab:focus-visible{border-end-end-radius:0;border-end-start-radius:0}.tab{--tw-text-opacity:0.5}@media(hover:hover){.tab:hover{--tw-text-opacity:1}}.tab{--tab-color:var(--fallback-bc,oklch(var(--bc)/1));--tab-bg:var(--fallback-b1,oklch(var(--b1)/1));--tab-border-color:var(--fallback-b3,oklch(var(--b3)/1));color:var(--tab-color);padding-inline-start:var(--tab-padding,1rem);padding-inline-end:var(--tab-padding,1rem)}.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tab:is(input:checked){border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:1;--tw-text-opacity:1}.tab:focus{outline:2px solid transparent;outline-offset:2px}.tab:focus-visible{outline:2px solid currentColor;outline-offset:-5px}.tab-disabled,.tab[disabled]{cursor:not-allowed;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.tab[disabled],.tab[disabled]:hover{cursor:not-allowed;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.tabs-bordered>.tab{border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2;border-style:solid;border-bottom-width:calc(var(--tab-border,1px) + 1px)}.tabs-lifted>.tab{border:var(--tab-border,1px) solid transparent;border-width:0 0 var(--tab-border,1px) 0;border-start-start-radius:var(--tab-radius,.5rem);border-start-end-radius:var(--tab-radius,.5rem);border-bottom-color:var(--tab-border-color);padding-inline-start:var(--tab-padding,1rem);padding-inline-end:var(--tab-padding,1rem);padding-top:var(--tab-border,1px)}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tabs-lifted>.tab:is(input:checked){background-color:var(--tab-bg);border-width:var(--tab-border,1px) var(--tab-border,1px) 0 var(--tab-border,1px);border-inline-start-color:var(--tab-border-color);border-inline-end-color:var(--tab-border-color);border-top-color:var(--tab-border-color);padding-inline-start:calc(var(--tab-padding,1rem) - var(--tab-border,1px));padding-inline-end:calc(var(--tab-padding,1rem) - var(--tab-border,1px));padding-bottom:var(--tab-border,1px);padding-top:0}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):before,.tabs-lifted>.tab:is(input:checked):before{z-index:1;content:"";display:block;position:absolute;width:calc(100% + var(--tab-radius,.5rem) * 2);height:var(--tab-radius,.5rem);bottom:0;background-size:var(--tab-radius,.5rem);background-position:top left,top right;background-repeat:no-repeat;--tab-grad:calc(69% - var(--tab-border, 1px));--radius-start:radial-gradient(
-        circle at top left,
-        transparent var(--tab-grad),
-        var(--tab-border-color) calc(var(--tab-grad) + 0.25px),
-        var(--tab-border-color) calc(var(--tab-grad) + var(--tab-border, 1px)),
-        var(--tab-bg) calc(var(--tab-grad) + var(--tab-border, 1px) + 0.25px)
-      );--radius-end:radial-gradient(
-        circle at top right,
-        transparent var(--tab-grad),
-        var(--tab-border-color) calc(var(--tab-grad) + 0.25px),
-        var(--tab-border-color) calc(var(--tab-grad) + var(--tab-border, 1px)),
-        var(--tab-bg) calc(var(--tab-grad) + var(--tab-border, 1px) + 0.25px)
-      );background-image:var(--radius-start),var(--radius-end)}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):first-child:before,.tabs-lifted>.tab:is(input:checked):first-child:before{background-image:var(--radius-end);background-position:top right}[dir=rtl] .tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):first-child:before,[dir=rtl] .tabs-lifted>.tab:is(input:checked):first-child:before{background-image:var(--radius-start);background-position:top left}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):last-child:before,.tabs-lifted>.tab:is(input:checked):last-child:before{background-image:var(--radius-start);background-position:top left}[dir=rtl] .tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):last-child:before,[dir=rtl] .tabs-lifted>.tab:is(input:checked):last-child:before{background-image:var(--radius-end);background-position:top right}.tabs-lifted>.tab:is(input:checked)+.tabs-lifted .tab:is(input:checked):before,.tabs-lifted>:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled])+.tabs-lifted :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):before{background-image:var(--radius-end);background-position:top right}.tabs-boxed{border-radius:var(--rounded-btn,.5rem);--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding:.25rem}.tabs-boxed .tab{border-radius:var(--rounded-btn,.5rem)}.tabs-boxed :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tabs-boxed :is(input:checked){--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@media(hover:hover){.tabs-boxed :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):hover,.tabs-boxed :is(input:checked):hover{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}}.table{border-radius:var(--rounded-box,1rem);text-align:left;font-size:.875rem;line-height:1.25rem}.table:where([dir=rtl],[dir=rtl]*){text-align:right}.table :where(th,td){padding-left:1rem;padding-right:1rem;padding-top:.75rem;padding-bottom:.75rem;vertical-align:middle}.table tr.active,.table tr.active:nth-child(even),.table-zebra tbody tr:nth-child(even){--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}@media(hover:hover){.table tr.hover:hover,.table tr.hover:nth-child(even):hover{--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}}.table-zebra tr.active,.table-zebra tr.active:nth-child(even),.table-zebra-zebra tbody tr:nth-child(even){--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@media(hover:hover){.table-zebra tr.hover:hover,.table-zebra tr.hover:nth-child(even):hover{--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}}.table :where(theadtr,tbodytr:not(:last-child),tbodytr:first-child:last-child){border-bottom-width:1px;--tw-border-opacity:1;border-bottom-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.table :where(thead,tfoot){white-space:nowrap;font-size:.75rem;line-height:1rem;font-weight:700;color:var(--fallback-bc,oklch(var(--bc)/.6))}.table :where(tfoot){border-top-width:1px;--tw-border-opacity:1;border-top-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.textarea{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.textarea-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea-ghost{--tw-bg-opacity:0.05}.textarea-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.textarea-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.textarea-primary:focus{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.textarea-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.textarea-secondary:focus{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.textarea-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.textarea-accent:focus{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.textarea-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.textarea-info:focus{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.textarea-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.textarea-success:focus{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.textarea-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.textarea-warning:focus{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.textarea-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.textarea-error:focus{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.textarea-disabled,.textarea:disabled,.textarea[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.textarea-disabled::placeholder,.textarea:disabled::placeholder,.textarea[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.timeline hr{height:.25rem}:where(.timelinehr){--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}:where(.timeline:has(.timeline-middle)hr):first-child{border-start-end-radius:var(--rounded-badge,1.9rem);border-end-end-radius:var(--rounded-badge,1.9rem);border-start-start-radius:0;border-end-start-radius:0}:where(.timeline:has(.timeline-middle)hr):last-child{border-start-start-radius:var(--rounded-badge,1.9rem);border-end-start-radius:var(--rounded-badge,1.9rem);border-start-end-radius:0;border-end-end-radius:0}:where(.timeline:not(:has(.timeline-middle)):first-childhr:last-child){border-start-start-radius:var(--rounded-badge,1.9rem);border-end-start-radius:var(--rounded-badge,1.9rem);border-start-end-radius:0;border-end-end-radius:0}:where(.timeline:not(:has(.timeline-middle)):last-childhr:first-child){border-start-end-radius:var(--rounded-badge,1.9rem);border-end-end-radius:var(--rounded-badge,1.9rem);border-start-start-radius:0;border-end-start-radius:0}.timeline-box{border-radius:var(--rounded-box,1rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.toast{gap:.5rem;padding:1rem}.toast>*{animation:toast-pop .25s ease-out}@keyframes toast-pop{0%{transform:scale(.9);opacity:0}100%{transform:scale(1);opacity:1}}.toggle{--tglbg:var(--fallback-b1,oklch(var(--b1)/1));--handleoffset:1.5rem;--handleoffsetcalculator:calc(var(--handleoffset) * -1);--togglehandleborder:0 0;height:1.5rem;width:3rem;cursor:pointer;appearance:none;border-radius:var(--rounded-badge,1.9rem);border-width:1px;border-color:currentColor;background-color:currentColor;color:var(--fallback-bc,oklch(var(--bc)/.5));transition:background,box-shadow var(--animation-input,.2s) ease-out;box-shadow:var(--handleoffsetcalculator) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset,var(--togglehandleborder)}[dir=rtl] .toggle{--handleoffsetcalculator:calc(var(--handleoffset) * 1)}.toggle:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.toggle:hover{background-color:currentColor}.toggle:checked,.toggle[aria-checked=true]{background-image:none;--handleoffsetcalculator:var(--handleoffset);--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}[dir=rtl] .toggle:checked,[dir=rtl] .toggle[aria-checked=true]{--handleoffsetcalculator:calc(var(--handleoffset) * -1)}.toggle:indeterminate{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:calc(var(--handleoffset)/ 2) 0 0 2px var(--tglbg) inset,calc(var(--handleoffset)/ -2) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset}[dir=rtl] .toggle:indeterminate{box-shadow:calc(var(--handleoffset)/ 2) 0 0 2px var(--tglbg) inset,calc(var(--handleoffset)/ -2) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset}.toggle-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.toggle-primary:checked,.toggle-primary[aria-checked=true]{border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.toggle-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.toggle-secondary:checked,.toggle-secondary[aria-checked=true]{border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.toggle-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.toggle-accent:checked,.toggle-accent[aria-checked=true]{border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.toggle-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.toggle-success:checked,.toggle-success[aria-checked=true]{border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.toggle-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.toggle-warning:checked,.toggle-warning[aria-checked=true]{border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.toggle-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.toggle-info:checked,.toggle-info[aria-checked=true]{border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.toggle-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.toggle-error:checked,.toggle-error[aria-checked=true]{border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.toggle:disabled{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));background-color:transparent;opacity:.3;--togglehandleborder:0 0 0 3px var(--fallback-bc,oklch(var(--bc)/1)) inset,var(--handleoffsetcalculator) 0 0 3px var(--fallback-bc,oklch(var(--bc)/1)) inset}.toggle-mark{display:none}:root .prose{--tw-prose-body:var(--fallback-bc,oklch(var(--bc)/0.8));--tw-prose-headings:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-lead:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-links:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-bold:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-counters:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-bullets:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-hr:var(--fallback-bc,oklch(var(--bc)/0.2));--tw-prose-quotes:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-quote-borders:var(--fallback-bc,oklch(var(--bc)/0.2));--tw-prose-captions:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-code:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-pre-code:var(--fallback-nc,oklch(var(--nc)/1));--tw-prose-pre-bg:var(--fallback-n,oklch(var(--n)/1));--tw-prose-th-borders:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-td-borders:var(--fallback-bc,oklch(var(--bc)/0.2))}.prose :where(code):not(:where([class~=not-prose]*,pre*)){padding:1px 8px;border-radius:var(--rounded-badge);font-weight:initial;background-color:var(--fallback-bc,oklch(var(--bc)/.1))}@supports not (color:oklch(0% 0 0)){.prose :where(code):not(:where([class~=not-prose]*,pre*)){background-color:var(--fallback-b3,oklch(var(--b3)/1))}}.prose :where(code):not(:where([class~=not-prose],[class~=not-prose]*))::after,.prose :where(code):not(:where([class~=not-prose],[class~=not-prose]*))::before{display:none}.prose pre code{border-radius:0;padding:0}.prose :where(tbodytr,thead):not(:where([class~=not-prose]*)){border-bottom-color:var(--fallback-bc,oklch(var(--bc)/.2))}:root{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}@media (prefers-color-scheme:dark){:root{color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}}[data-theme=light]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}:root:has(input.theme-controller[value=light]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}[data-theme=dark]{color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}:root:has(input.theme-controller[value=dark]:checked){color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}[data-theme=cupcake]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:15.2344% 0.017892 200.026556;--sc:15.787% 0.020249 356.29965;--ac:15.8762% 0.029206 78.618794;--nc:84.7148% 0.013247 313.189598;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--p:76.172% 0.089459 200.026556;--s:78.9351% 0.101246 356.29965;--a:79.3811% 0.146032 78.618794;--n:23.5742% 0.066235 313.189598;--b1:97.7882% 0.00418 56.375637;--b2:93.9822% 0.007638 61.449292;--b3:91.5861% 0.006811 53.440502;--bc:23.5742% 0.066235 313.189598;--rounded-btn:1.9rem;--tab-border:2px;--tab-radius:0.7rem}:root:has(input.theme-controller[value=cupcake]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:15.2344% 0.017892 200.026556;--sc:15.787% 0.020249 356.29965;--ac:15.8762% 0.029206 78.618794;--nc:84.7148% 0.013247 313.189598;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--p:76.172% 0.089459 200.026556;--s:78.9351% 0.101246 356.29965;--a:79.3811% 0.146032 78.618794;--n:23.5742% 0.066235 313.189598;--b1:97.7882% 0.00418 56.375637;--b2:93.9822% 0.007638 61.449292;--b3:91.5861% 0.006811 53.440502;--bc:23.5742% 0.066235 313.189598;--rounded-btn:1.9rem;--tab-border:2px;--tab-radius:0.7rem}[data-theme=bumblebee]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--ac:16.254% 0.0314 56.52;--nc:82.55% 0.015 281.99;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:89.51% 0.2132 96.61;--pc:38.92% 0.046 96.61;--s:80.39% 0.194 70.76;--sc:39.38% 0.068 70.76;--a:81.27% 0.157 56.52;--n:12.75% 0.075 281.99;--b1:100% 0 0}:root:has(input.theme-controller[value=bumblebee]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--ac:16.254% 0.0314 56.52;--nc:82.55% 0.015 281.99;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:89.51% 0.2132 96.61;--pc:38.92% 0.046 96.61;--s:80.39% 0.194 70.76;--sc:39.38% 0.068 70.76;--a:81.27% 0.157 56.52;--n:12.75% 0.075 281.99;--b1:100% 0 0}[data-theme=emerald]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:76.6626% 0.135433 153.450024;--pc:33.3872% 0.040618 162.240129;--s:61.3028% 0.202368 261.294233;--sc:100% 0 0;--a:72.7725% 0.149783 33.200363;--ac:0% 0 0;--n:35.5192% 0.032071 262.988584;--nc:98.4625% 0.001706 247.838921;--b1:100% 0 0;--bc:35.5192% 0.032071 262.988584;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=emerald]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:76.6626% 0.135433 153.450024;--pc:33.3872% 0.040618 162.240129;--s:61.3028% 0.202368 261.294233;--sc:100% 0 0;--a:72.7725% 0.149783 33.200363;--ac:0% 0 0;--n:35.5192% 0.032071 262.988584;--nc:98.4625% 0.001706 247.838921;--b1:100% 0 0;--bc:35.5192% 0.032071 262.988584;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=corporate]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:12.078% 0.0456 269.1;--sc:13.0739% 0.010951 256.688055;--ac:15.3934% 0.022799 163.57888;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--border-btn:1px;--tab-border:1px;--p:60.39% 0.228 269.1;--s:65.3694% 0.054756 256.688055;--a:76.9669% 0.113994 163.57888;--n:22.3899% 0.031305 278.07229;--nc:95.8796% 0.008588 247.915135;--b1:100% 0 0;--bc:22.3899% 0.031305 278.07229;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem;--tab-radius:0.25rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=corporate]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:12.078% 0.0456 269.1;--sc:13.0739% 0.010951 256.688055;--ac:15.3934% 0.022799 163.57888;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--border-btn:1px;--tab-border:1px;--p:60.39% 0.228 269.1;--s:65.3694% 0.054756 256.688055;--a:76.9669% 0.113994 163.57888;--n:22.3899% 0.031305 278.07229;--nc:95.8796% 0.008588 247.915135;--b1:100% 0 0;--bc:22.3899% 0.031305 278.07229;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem;--tab-radius:0.25rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=synthwave]{color-scheme:dark;--b2:20.2941% 0.076211 287.835609;--b3:18.7665% 0.070475 287.835609;--pc:14.4421% 0.031903 342.009383;--sc:15.6543% 0.02362 227.382405;--ac:17.608% 0.0412 93.72;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:72.2105% 0.159514 342.009383;--s:78.2714% 0.118101 227.382405;--a:88.04% 0.206 93.72;--n:25.5554% 0.103537 286.507967;--nc:97.9365% 0.00819 301.358346;--b1:21.8216% 0.081948 287.835609;--bc:97.9365% 0.00819 301.358346;--in:76.5197% 0.12273 231.831603;--inc:23.5017% 0.096418 290.329844;--su:86.0572% 0.115038 178.624677;--suc:23.5017% 0.096418 290.329844;--wa:85.531% 0.122117 93.722227;--wac:23.5017% 0.096418 290.329844;--er:73.7005% 0.121339 32.639257;--erc:23.5017% 0.096418 290.329844}:root:has(input.theme-controller[value=synthwave]:checked){color-scheme:dark;--b2:20.2941% 0.076211 287.835609;--b3:18.7665% 0.070475 287.835609;--pc:14.4421% 0.031903 342.009383;--sc:15.6543% 0.02362 227.382405;--ac:17.608% 0.0412 93.72;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:72.2105% 0.159514 342.009383;--s:78.2714% 0.118101 227.382405;--a:88.04% 0.206 93.72;--n:25.5554% 0.103537 286.507967;--nc:97.9365% 0.00819 301.358346;--b1:21.8216% 0.081948 287.835609;--bc:97.9365% 0.00819 301.358346;--in:76.5197% 0.12273 231.831603;--inc:23.5017% 0.096418 290.329844;--su:86.0572% 0.115038 178.624677;--suc:23.5017% 0.096418 290.329844;--wa:85.531% 0.122117 93.722227;--wac:23.5017% 0.096418 290.329844;--er:73.7005% 0.121339 32.639257;--erc:23.5017% 0.096418 290.329844}[data-theme=retro]{color-scheme:light;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:76.8664% 0.104092 22.664655;--pc:26.5104% 0.006243 0.522862;--s:80.7415% 0.052534 159.094608;--sc:26.5104% 0.006243 0.522862;--a:70.3919% 0.125455 52.953428;--ac:26.5104% 0.006243 0.522862;--n:28.4181% 0.009519 355.534017;--nc:92.5604% 0.025113 89.217311;--b1:91.6374% 0.034554 90.51575;--b2:88.2722% 0.049418 91.774344;--b3:84.133% 0.065952 90.856665;--bc:26.5104% 0.006243 0.522862;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33;--rounded-box:0.4rem;--rounded-btn:0.4rem;--rounded-badge:0.4rem;--tab-radius:0.4rem}:root:has(input.theme-controller[value=retro]:checked){color-scheme:light;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:76.8664% 0.104092 22.664655;--pc:26.5104% 0.006243 0.522862;--s:80.7415% 0.052534 159.094608;--sc:26.5104% 0.006243 0.522862;--a:70.3919% 0.125455 52.953428;--ac:26.5104% 0.006243 0.522862;--n:28.4181% 0.009519 355.534017;--nc:92.5604% 0.025113 89.217311;--b1:91.6374% 0.034554 90.51575;--b2:88.2722% 0.049418 91.774344;--b3:84.133% 0.065952 90.856665;--bc:26.5104% 0.006243 0.522862;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33;--rounded-box:0.4rem;--rounded-btn:0.4rem;--rounded-badge:0.4rem;--tab-radius:0.4rem}[data-theme=cyberpunk]{color-scheme:light;--b2:87.8943% 0.16647 104.32;--b3:81.2786% 0.15394 104.32;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:18.902% 0.0358 104.32;--pc:14.844% 0.0418 6.35;--sc:16.666% 0.0368 204.72;--ac:14.372% 0.04352 310.43;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p:74.22% 0.209 6.35;--s:83.33% 0.184 204.72;--a:71.86% 0.2176 310.43;--n:23.04% 0.065 269.31;--nc:94.51% 0.179 104.32;--b1:94.51% 0.179 104.32;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--tab-radius:0}:root:has(input.theme-controller[value=cyberpunk]:checked){color-scheme:light;--b2:87.8943% 0.16647 104.32;--b3:81.2786% 0.15394 104.32;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:18.902% 0.0358 104.32;--pc:14.844% 0.0418 6.35;--sc:16.666% 0.0368 204.72;--ac:14.372% 0.04352 310.43;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p:74.22% 0.209 6.35;--s:83.33% 0.184 204.72;--a:71.86% 0.2176 310.43;--n:23.04% 0.065 269.31;--nc:94.51% 0.179 104.32;--b1:94.51% 0.179 104.32;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--tab-radius:0}[data-theme=valentine]{color-scheme:light;--b2:88.0567% 0.024834 337.06289;--b3:81.4288% 0.022964 337.06289;--pc:13.7239% 0.030755 15.066527;--sc:14.3942% 0.029258 293.189609;--ac:14.2537% 0.014961 197.828857;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.614% 0.0414 27.33;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:68.6197% 0.153774 15.066527;--s:71.971% 0.14629 293.189609;--a:71.2685% 0.074804 197.828857;--n:54.6053% 0.143342 358.004839;--nc:90.2701% 0.037202 336.955191;--b1:94.6846% 0.026703 337.06289;--bc:37.3085% 0.081131 4.606426;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.07% 0.207 27.33;--rounded-btn:1.9rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=valentine]:checked){color-scheme:light;--b2:88.0567% 0.024834 337.06289;--b3:81.4288% 0.022964 337.06289;--pc:13.7239% 0.030755 15.066527;--sc:14.3942% 0.029258 293.189609;--ac:14.2537% 0.014961 197.828857;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.614% 0.0414 27.33;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:68.6197% 0.153774 15.066527;--s:71.971% 0.14629 293.189609;--a:71.2685% 0.074804 197.828857;--n:54.6053% 0.143342 358.004839;--nc:90.2701% 0.037202 336.955191;--b1:94.6846% 0.026703 337.06289;--bc:37.3085% 0.081131 4.606426;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.07% 0.207 27.33;--rounded-btn:1.9rem;--tab-radius:0.7rem}[data-theme=halloween]{color-scheme:dark;--b2:23.0416% 0 0;--b3:21.3072% 0 0;--bc:84.9552% 0 0;--sc:89.196% 0.0496 305.03;--nc:84.8742% 0.009322 65.681484;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:77.48% 0.204 60.62;--pc:19.6935% 0.004671 196.779412;--s:45.98% 0.248 305.03;--a:64.8% 0.223 136.073479;--ac:0% 0 0;--n:24.371% 0.046608 65.681484;--b1:24.7759% 0 0;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33}:root:has(input.theme-controller[value=halloween]:checked){color-scheme:dark;--b2:23.0416% 0 0;--b3:21.3072% 0 0;--bc:84.9552% 0 0;--sc:89.196% 0.0496 305.03;--nc:84.8742% 0.009322 65.681484;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:77.48% 0.204 60.62;--pc:19.6935% 0.004671 196.779412;--s:45.98% 0.248 305.03;--a:64.8% 0.223 136.073479;--ac:0% 0 0;--n:24.371% 0.046608 65.681484;--b1:24.7759% 0 0;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33}[data-theme=garden]{color-scheme:light;--b2:86.4453% 0.002011 17.197414;--b3:79.9386% 0.00186 17.197414;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--sc:89.699% 0.022197 355.095988;--ac:11.2547% 0.010859 154.390187;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:62.45% 0.278 3.83636;--pc:100% 0 0;--s:48.4952% 0.110985 355.095988;--a:56.2735% 0.054297 154.390187;--n:24.1559% 0.049362 89.070594;--nc:92.9519% 0.002163 17.197414;--b1:92.9519% 0.002163 17.197414;--bc:16.9617% 0.001664 17.32068}:root:has(input.theme-controller[value=garden]:checked){color-scheme:light;--b2:86.4453% 0.002011 17.197414;--b3:79.9386% 0.00186 17.197414;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--sc:89.699% 0.022197 355.095988;--ac:11.2547% 0.010859 154.390187;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:62.45% 0.278 3.83636;--pc:100% 0 0;--s:48.4952% 0.110985 355.095988;--a:56.2735% 0.054297 154.390187;--n:24.1559% 0.049362 89.070594;--nc:92.9519% 0.002163 17.197414;--b1:92.9519% 0.002163 17.197414;--bc:16.9617% 0.001664 17.32068}[data-theme=forest]{color-scheme:dark;--b2:17.522% 0.007709 17.911578;--b3:16.2032% 0.007129 17.911578;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:83.7682% 0.001658 17.911578;--sc:13.9553% 0.027077 168.327128;--ac:14.1257% 0.02389 185.713193;--nc:86.1397% 0.007806 171.364646;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:68.6283% 0.185567 148.958922;--pc:0% 0 0;--s:69.7764% 0.135385 168.327128;--a:70.6285% 0.119451 185.713193;--n:30.6985% 0.039032 171.364646;--b1:18.8409% 0.00829 17.911578;--rounded-btn:1.9rem}:root:has(input.theme-controller[value=forest]:checked){color-scheme:dark;--b2:17.522% 0.007709 17.911578;--b3:16.2032% 0.007129 17.911578;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:83.7682% 0.001658 17.911578;--sc:13.9553% 0.027077 168.327128;--ac:14.1257% 0.02389 185.713193;--nc:86.1397% 0.007806 171.364646;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:68.6283% 0.185567 148.958922;--pc:0% 0 0;--s:69.7764% 0.135385 168.327128;--a:70.6285% 0.119451 185.713193;--n:30.6985% 0.039032 171.364646;--b1:18.8409% 0.00829 17.911578;--rounded-btn:1.9rem}[data-theme=aqua]{color-scheme:dark;--b2:45.3464% 0.118611 261.181672;--b3:41.9333% 0.109683 261.181672;--bc:89.7519% 0.025508 261.181672;--sc:12.1365% 0.02175 309.782946;--ac:18.6854% 0.020445 94.555431;--nc:12.2124% 0.023402 243.760661;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.79% 0.038 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:85.6617% 0.14498 198.6458;--pc:40.1249% 0.068266 197.603872;--s:60.6827% 0.108752 309.782946;--a:93.4269% 0.102225 94.555431;--n:61.0622% 0.117009 243.760661;--b1:48.7596% 0.127539 261.181672;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.95% 0.19 27.33}:root:has(input.theme-controller[value=aqua]:checked){color-scheme:dark;--b2:45.3464% 0.118611 261.181672;--b3:41.9333% 0.109683 261.181672;--bc:89.7519% 0.025508 261.181672;--sc:12.1365% 0.02175 309.782946;--ac:18.6854% 0.020445 94.555431;--nc:12.2124% 0.023402 243.760661;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.79% 0.038 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:85.6617% 0.14498 198.6458;--pc:40.1249% 0.068266 197.603872;--s:60.6827% 0.108752 309.782946;--a:93.4269% 0.102225 94.555431;--n:61.0622% 0.117009 243.760661;--b1:48.7596% 0.127539 261.181672;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.95% 0.19 27.33}[data-theme=lofi]{color-scheme:light;--inc:15.908% 0.0206 205.9;--suc:18.026% 0.0306 164.14;--wac:17.674% 0.027 79.94;--erc:15.732% 0.03 28.47;--border-btn:1px;--tab-border:1px;--p:15.9066% 0 0;--pc:100% 0 0;--s:21.455% 0.001566 17.278957;--sc:100% 0 0;--a:26.8618% 0 0;--ac:100% 0 0;--n:0% 0 0;--nc:100% 0 0;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.268% 0.001082 17.17934;--bc:0% 0 0;--in:79.54% 0.103 205.9;--su:90.13% 0.153 164.14;--wa:88.37% 0.135 79.94;--er:78.66% 0.15 28.47;--rounded-box:0.25rem;--rounded-btn:0.125rem;--rounded-badge:0.125rem;--tab-radius:0.125rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=lofi]:checked){color-scheme:light;--inc:15.908% 0.0206 205.9;--suc:18.026% 0.0306 164.14;--wac:17.674% 0.027 79.94;--erc:15.732% 0.03 28.47;--border-btn:1px;--tab-border:1px;--p:15.9066% 0 0;--pc:100% 0 0;--s:21.455% 0.001566 17.278957;--sc:100% 0 0;--a:26.8618% 0 0;--ac:100% 0 0;--n:0% 0 0;--nc:100% 0 0;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.268% 0.001082 17.17934;--bc:0% 0 0;--in:79.54% 0.103 205.9;--su:90.13% 0.153 164.14;--wa:88.37% 0.135 79.94;--er:78.66% 0.15 28.47;--rounded-box:0.25rem;--rounded-btn:0.125rem;--rounded-badge:0.125rem;--tab-radius:0.125rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=pastel]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--pc:16.6166% 0.006979 316.8737;--sc:17.6153% 0.009839 8.688364;--ac:17.8419% 0.012056 170.923263;--nc:14.2681% 0.014702 228.183906;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:83.0828% 0.034896 316.8737;--s:88.0763% 0.049197 8.688364;--a:89.2096% 0.06028 170.923263;--n:71.3406% 0.07351 228.183906;--b1:100% 0 0;--b2:98.4625% 0.001706 247.838921;--b3:87.1681% 0.009339 258.338227;--rounded-btn:1.9rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=pastel]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--pc:16.6166% 0.006979 316.8737;--sc:17.6153% 0.009839 8.688364;--ac:17.8419% 0.012056 170.923263;--nc:14.2681% 0.014702 228.183906;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:83.0828% 0.034896 316.8737;--s:88.0763% 0.049197 8.688364;--a:89.2096% 0.06028 170.923263;--n:71.3406% 0.07351 228.183906;--b1:100% 0 0;--b2:98.4625% 0.001706 247.838921;--b3:87.1681% 0.009339 258.338227;--rounded-btn:1.9rem;--tab-radius:0.7rem}[data-theme=fantasy]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:87.49% 0.0378 325.02;--sc:90.784% 0.0324 241.36;--ac:15.196% 0.0408 56.72;--nc:85.5616% 0.005919 256.847952;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:37.45% 0.189 325.02;--s:53.92% 0.162 241.36;--a:75.98% 0.204 56.72;--n:27.8078% 0.029596 256.847952;--b1:100% 0 0;--bc:27.8078% 0.029596 256.847952}:root:has(input.theme-controller[value=fantasy]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:87.49% 0.0378 325.02;--sc:90.784% 0.0324 241.36;--ac:15.196% 0.0408 56.72;--nc:85.5616% 0.005919 256.847952;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:37.45% 0.189 325.02;--s:53.92% 0.162 241.36;--a:75.98% 0.204 56.72;--n:27.8078% 0.029596 256.847952;--b1:100% 0 0;--bc:27.8078% 0.029596 256.847952}[data-theme=wireframe]{color-scheme:light;--bc:20% 0 0;--pc:15.6521% 0 0;--sc:15.6521% 0 0;--ac:15.6521% 0 0;--nc:18.8014% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:14.1626% 0.019994 108.702381;--erc:12.5591% 0.051537 29.233885;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:Chalkboard,comic sans ms,sans-serif;--p:78.2604% 0 0;--s:78.2604% 0 0;--a:78.2604% 0 0;--n:94.007% 0 0;--b1:100% 0 0;--b2:94.9119% 0 0;--b3:89.7547% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:70.8131% 0.099969 108.702381;--er:62.7955% 0.257683 29.233885;--rounded-box:0.2rem;--rounded-btn:0.2rem;--rounded-badge:0.2rem;--tab-radius:0.2rem}:root:has(input.theme-controller[value=wireframe]:checked){color-scheme:light;--bc:20% 0 0;--pc:15.6521% 0 0;--sc:15.6521% 0 0;--ac:15.6521% 0 0;--nc:18.8014% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:14.1626% 0.019994 108.702381;--erc:12.5591% 0.051537 29.233885;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:Chalkboard,comic sans ms,sans-serif;--p:78.2604% 0 0;--s:78.2604% 0 0;--a:78.2604% 0 0;--n:94.007% 0 0;--b1:100% 0 0;--b2:94.9119% 0 0;--b3:89.7547% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:70.8131% 0.099969 108.702381;--er:62.7955% 0.257683 29.233885;--rounded-box:0.2rem;--rounded-btn:0.2rem;--rounded-badge:0.2rem;--tab-radius:0.2rem}[data-theme=black]{color-scheme:dark;--pc:86.736% 0 0;--sc:86.736% 0 0;--ac:86.736% 0 0;--nc:86.736% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:19.3597% 0.042201 109.769232;--erc:12.5591% 0.051537 29.233885;--border-btn:1px;--tab-border:1px;--p:33.6799% 0 0;--s:33.6799% 0 0;--a:33.6799% 0 0;--b1:0% 0 0;--b2:19.1251% 0 0;--b3:26.8618% 0 0;--bc:87.6096% 0 0;--n:33.6799% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:96.7983% 0.211006 109.769232;--er:62.7955% 0.257683 29.233885;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--animation-btn:0;--animation-input:0;--btn-focus-scale:1;--tab-radius:0}:root:has(input.theme-controller[value=black]:checked){color-scheme:dark;--pc:86.736% 0 0;--sc:86.736% 0 0;--ac:86.736% 0 0;--nc:86.736% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:19.3597% 0.042201 109.769232;--erc:12.5591% 0.051537 29.233885;--border-btn:1px;--tab-border:1px;--p:33.6799% 0 0;--s:33.6799% 0 0;--a:33.6799% 0 0;--b1:0% 0 0;--b2:19.1251% 0 0;--b3:26.8618% 0 0;--bc:87.6096% 0 0;--n:33.6799% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:96.7983% 0.211006 109.769232;--er:62.7955% 0.257683 29.233885;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--animation-btn:0;--animation-input:0;--btn-focus-scale:1;--tab-radius:0}[data-theme=luxury]{color-scheme:dark;--pc:20% 0 0;--sc:85.5163% 0.012821 261.069149;--ac:87.3349% 0.010348 338.82597;--inc:15.8122% 0.024356 237.133883;--suc:15.6239% 0.038579 132.154381;--wac:17.2255% 0.027305 102.89115;--erc:14.3506% 0.035271 22.568916;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:100% 0 0;--s:27.5815% 0.064106 261.069149;--a:36.6744% 0.051741 338.82597;--n:24.27% 0.057015 59.825019;--nc:93.2033% 0.089631 90.861683;--b1:14.0765% 0.004386 285.822869;--b2:20.2191% 0.004211 308.22937;--b3:29.8961% 0.003818 308.318612;--bc:75.6879% 0.123666 76.890484;--in:79.0612% 0.121778 237.133883;--su:78.1197% 0.192894 132.154381;--wa:86.1274% 0.136524 102.89115;--er:71.7531% 0.176357 22.568916}:root:has(input.theme-controller[value=luxury]:checked){color-scheme:dark;--pc:20% 0 0;--sc:85.5163% 0.012821 261.069149;--ac:87.3349% 0.010348 338.82597;--inc:15.8122% 0.024356 237.133883;--suc:15.6239% 0.038579 132.154381;--wac:17.2255% 0.027305 102.89115;--erc:14.3506% 0.035271 22.568916;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:100% 0 0;--s:27.5815% 0.064106 261.069149;--a:36.6744% 0.051741 338.82597;--n:24.27% 0.057015 59.825019;--nc:93.2033% 0.089631 90.861683;--b1:14.0765% 0.004386 285.822869;--b2:20.2191% 0.004211 308.22937;--b3:29.8961% 0.003818 308.318612;--bc:75.6879% 0.123666 76.890484;--in:79.0612% 0.121778 237.133883;--su:78.1197% 0.192894 132.154381;--wa:86.1274% 0.136524 102.89115;--er:71.7531% 0.176357 22.568916}[data-theme=dracula]{color-scheme:dark;--b2:26.8053% 0.020556 277.508664;--b3:24.7877% 0.019009 277.508664;--pc:15.0922% 0.036614 346.812432;--sc:14.8405% 0.029709 301.883095;--ac:16.6785% 0.024826 66.558491;--nc:87.8891% 0.006515 275.524078;--inc:17.6526% 0.018676 212.846491;--suc:17.4199% 0.043903 148.024881;--wac:19.1068% 0.026849 112.757109;--erc:13.6441% 0.041266 24.430965;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.4611% 0.18307 346.812432;--s:74.2023% 0.148546 301.883095;--a:83.3927% 0.124132 66.558491;--n:39.4456% 0.032576 275.524078;--b1:28.8229% 0.022103 277.508664;--bc:97.7477% 0.007913 106.545019;--in:88.263% 0.09338 212.846491;--su:87.0995% 0.219516 148.024881;--wa:95.5338% 0.134246 112.757109;--er:68.2204% 0.206328 24.430965}:root:has(input.theme-controller[value=dracula]:checked){color-scheme:dark;--b2:26.8053% 0.020556 277.508664;--b3:24.7877% 0.019009 277.508664;--pc:15.0922% 0.036614 346.812432;--sc:14.8405% 0.029709 301.883095;--ac:16.6785% 0.024826 66.558491;--nc:87.8891% 0.006515 275.524078;--inc:17.6526% 0.018676 212.846491;--suc:17.4199% 0.043903 148.024881;--wac:19.1068% 0.026849 112.757109;--erc:13.6441% 0.041266 24.430965;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.4611% 0.18307 346.812432;--s:74.2023% 0.148546 301.883095;--a:83.3927% 0.124132 66.558491;--n:39.4456% 0.032576 275.524078;--b1:28.8229% 0.022103 277.508664;--bc:97.7477% 0.007913 106.545019;--in:88.263% 0.09338 212.846491;--su:87.0995% 0.219516 148.024881;--wa:95.5338% 0.134246 112.757109;--er:68.2204% 0.206328 24.430965}[data-theme=cmyk]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--bc:20% 0 0;--pc:14.3544% 0.02666 239.443325;--sc:12.8953% 0.040552 359.339283;--ac:18.8458% 0.037948 105.306968;--nc:84.3557% 0 0;--inc:13.6952% 0.0189 217.284104;--suc:89.3898% 0.032505 321.406278;--wac:14.2473% 0.031969 52.023412;--erc:12.4027% 0.041677 28.717543;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.7722% 0.133298 239.443325;--s:64.4766% 0.202758 359.339283;--a:94.2289% 0.189741 105.306968;--n:21.7787% 0 0;--b1:100% 0 0;--in:68.4759% 0.094499 217.284104;--su:46.949% 0.162524 321.406278;--wa:71.2364% 0.159843 52.023412;--er:62.0133% 0.208385 28.717543}:root:has(input.theme-controller[value=cmyk]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--bc:20% 0 0;--pc:14.3544% 0.02666 239.443325;--sc:12.8953% 0.040552 359.339283;--ac:18.8458% 0.037948 105.306968;--nc:84.3557% 0 0;--inc:13.6952% 0.0189 217.284104;--suc:89.3898% 0.032505 321.406278;--wac:14.2473% 0.031969 52.023412;--erc:12.4027% 0.041677 28.717543;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.7722% 0.133298 239.443325;--s:64.4766% 0.202758 359.339283;--a:94.2289% 0.189741 105.306968;--n:21.7787% 0 0;--b1:100% 0 0;--in:68.4759% 0.094499 217.284104;--su:46.949% 0.162524 321.406278;--wa:71.2364% 0.159843 52.023412;--er:62.0133% 0.208385 28.717543}[data-theme=autumn]{color-scheme:light;--b2:89.1077% 0 0;--b3:82.4006% 0 0;--bc:19.1629% 0 0;--pc:88.1446% 0.032232 17.530175;--sc:12.3353% 0.033821 23.865865;--ac:14.6851% 0.018999 60.729616;--nc:90.8734% 0.007475 51.902819;--inc:13.8449% 0.019596 207.284192;--suc:12.199% 0.016032 174.616213;--wac:14.0163% 0.032982 56.844303;--erc:90.614% 0.0482 24.16;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:40.7232% 0.16116 17.530175;--s:61.6763% 0.169105 23.865865;--a:73.4253% 0.094994 60.729616;--n:54.3672% 0.037374 51.902819;--b1:95.8147% 0 0;--in:69.2245% 0.097979 207.284192;--su:60.9951% 0.080159 174.616213;--wa:70.0817% 0.164909 56.844303;--er:53.07% 0.241 24.16}:root:has(input.theme-controller[value=autumn]:checked){color-scheme:light;--b2:89.1077% 0 0;--b3:82.4006% 0 0;--bc:19.1629% 0 0;--pc:88.1446% 0.032232 17.530175;--sc:12.3353% 0.033821 23.865865;--ac:14.6851% 0.018999 60.729616;--nc:90.8734% 0.007475 51.902819;--inc:13.8449% 0.019596 207.284192;--suc:12.199% 0.016032 174.616213;--wac:14.0163% 0.032982 56.844303;--erc:90.614% 0.0482 24.16;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:40.7232% 0.16116 17.530175;--s:61.6763% 0.169105 23.865865;--a:73.4253% 0.094994 60.729616;--n:54.3672% 0.037374 51.902819;--b1:95.8147% 0 0;--in:69.2245% 0.097979 207.284192;--su:60.9951% 0.080159 174.616213;--wa:70.0817% 0.164909 56.844303;--er:53.07% 0.241 24.16}[data-theme=business]{color-scheme:dark;--b2:22.6487% 0 0;--b3:20.944% 0 0;--bc:84.8707% 0 0;--pc:88.3407% 0.019811 251.473931;--sc:12.8185% 0.005481 229.389418;--ac:13.4542% 0.033545 35.791525;--nc:85.4882% 0.00265 253.041249;--inc:12.5233% 0.028702 240.033697;--suc:14.0454% 0.018919 156.59611;--wac:15.4965% 0.023141 81.519177;--erc:90.3221% 0.029356 29.674507;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:41.7036% 0.099057 251.473931;--s:64.0924% 0.027405 229.389418;--a:67.271% 0.167726 35.791525;--n:27.441% 0.01325 253.041249;--b1:24.3535% 0 0;--in:62.6163% 0.143511 240.033697;--su:70.2268% 0.094594 156.59611;--wa:77.4824% 0.115704 81.519177;--er:51.6105% 0.14678 29.674507;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem}:root:has(input.theme-controller[value=business]:checked){color-scheme:dark;--b2:22.6487% 0 0;--b3:20.944% 0 0;--bc:84.8707% 0 0;--pc:88.3407% 0.019811 251.473931;--sc:12.8185% 0.005481 229.389418;--ac:13.4542% 0.033545 35.791525;--nc:85.4882% 0.00265 253.041249;--inc:12.5233% 0.028702 240.033697;--suc:14.0454% 0.018919 156.59611;--wac:15.4965% 0.023141 81.519177;--erc:90.3221% 0.029356 29.674507;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:41.7036% 0.099057 251.473931;--s:64.0924% 0.027405 229.389418;--a:67.271% 0.167726 35.791525;--n:27.441% 0.01325 253.041249;--b1:24.3535% 0 0;--in:62.6163% 0.143511 240.033697;--su:70.2268% 0.094594 156.59611;--wa:77.4824% 0.115704 81.519177;--er:51.6105% 0.14678 29.674507;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem}[data-theme=acid]{color-scheme:light;--b2:91.6146% 0 0;--b3:84.7189% 0 0;--bc:19.7021% 0 0;--pc:14.38% 0.0714 330.759573;--sc:14.674% 0.0448 48.250878;--ac:18.556% 0.0528 122.962951;--nc:84.262% 0.0256 278.68;--inc:12.144% 0.0454 252.05;--suc:17.144% 0.0532 158.53;--wac:18.202% 0.0424 100.5;--erc:12.968% 0.0586 29.349188;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:71.9% 0.357 330.759573;--s:73.37% 0.224 48.250878;--a:92.78% 0.264 122.962951;--n:21.31% 0.128 278.68;--b1:98.5104% 0 0;--in:60.72% 0.227 252.05;--su:85.72% 0.266 158.53;--wa:91.01% 0.212 100.5;--er:64.84% 0.293 29.349188;--rounded-box:1.25rem;--rounded-btn:1rem;--rounded-badge:1rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=acid]:checked){color-scheme:light;--b2:91.6146% 0 0;--b3:84.7189% 0 0;--bc:19.7021% 0 0;--pc:14.38% 0.0714 330.759573;--sc:14.674% 0.0448 48.250878;--ac:18.556% 0.0528 122.962951;--nc:84.262% 0.0256 278.68;--inc:12.144% 0.0454 252.05;--suc:17.144% 0.0532 158.53;--wac:18.202% 0.0424 100.5;--erc:12.968% 0.0586 29.349188;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:71.9% 0.357 330.759573;--s:73.37% 0.224 48.250878;--a:92.78% 0.264 122.962951;--n:21.31% 0.128 278.68;--b1:98.5104% 0 0;--in:60.72% 0.227 252.05;--su:85.72% 0.266 158.53;--wa:91.01% 0.212 100.5;--er:64.84% 0.293 29.349188;--rounded-box:1.25rem;--rounded-btn:1rem;--rounded-badge:1rem;--tab-radius:0.7rem}[data-theme=lemonade]{color-scheme:light;--b2:91.8003% 0.0186 123.72;--b3:84.8906% 0.0172 123.72;--bc:19.742% 0.004 123.72;--pc:11.784% 0.0398 134.6;--sc:15.55% 0.0392 111.09;--ac:17.078% 0.0402 100.73;--nc:86.196% 0.015 108.6;--inc:17.238% 0.0094 224.14;--suc:17.238% 0.0094 157.85;--wac:17.238% 0.0094 102.15;--erc:17.238% 0.0094 25.85;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:58.92% 0.199 134.6;--s:77.75% 0.196 111.09;--a:85.39% 0.201 100.73;--n:30.98% 0.075 108.6;--b1:98.71% 0.02 123.72;--in:86.19% 0.047 224.14;--su:86.19% 0.047 157.85;--wa:86.19% 0.047 102.15;--er:86.19% 0.047 25.85}:root:has(input.theme-controller[value=lemonade]:checked){color-scheme:light;--b2:91.8003% 0.0186 123.72;--b3:84.8906% 0.0172 123.72;--bc:19.742% 0.004 123.72;--pc:11.784% 0.0398 134.6;--sc:15.55% 0.0392 111.09;--ac:17.078% 0.0402 100.73;--nc:86.196% 0.015 108.6;--inc:17.238% 0.0094 224.14;--suc:17.238% 0.0094 157.85;--wac:17.238% 0.0094 102.15;--erc:17.238% 0.0094 25.85;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:58.92% 0.199 134.6;--s:77.75% 0.196 111.09;--a:85.39% 0.201 100.73;--n:30.98% 0.075 108.6;--b1:98.71% 0.02 123.72;--in:86.19% 0.047 224.14;--su:86.19% 0.047 157.85;--wa:86.19% 0.047 102.15;--er:86.19% 0.047 25.85}[data-theme=night]{color-scheme:dark;--b2:19.3144% 0.037037 265.754874;--b3:17.8606% 0.034249 265.754874;--bc:84.1536% 0.007965 265.754874;--pc:15.0703% 0.027798 232.66148;--sc:13.6023% 0.031661 276.934902;--ac:14.4721% 0.035244 350.048739;--nc:85.5899% 0.00737 260.030984;--suc:15.6904% 0.026506 181.911977;--wac:16.6486% 0.027912 82.95003;--erc:14.3572% 0.034051 13.11834;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.3513% 0.138989 232.66148;--s:68.0113% 0.158303 276.934902;--a:72.3603% 0.176218 350.048739;--n:27.9495% 0.036848 260.030984;--b1:20.7682% 0.039824 265.754874;--in:68.4553% 0.148062 237.25135;--inc:0% 0 0;--su:78.452% 0.132529 181.911977;--wa:83.2428% 0.139558 82.95003;--er:71.7858% 0.170255 13.11834}:root:has(input.theme-controller[value=night]:checked){color-scheme:dark;--b2:19.3144% 0.037037 265.754874;--b3:17.8606% 0.034249 265.754874;--bc:84.1536% 0.007965 265.754874;--pc:15.0703% 0.027798 232.66148;--sc:13.6023% 0.031661 276.934902;--ac:14.4721% 0.035244 350.048739;--nc:85.5899% 0.00737 260.030984;--suc:15.6904% 0.026506 181.911977;--wac:16.6486% 0.027912 82.95003;--erc:14.3572% 0.034051 13.11834;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.3513% 0.138989 232.66148;--s:68.0113% 0.158303 276.934902;--a:72.3603% 0.176218 350.048739;--n:27.9495% 0.036848 260.030984;--b1:20.7682% 0.039824 265.754874;--in:68.4553% 0.148062 237.25135;--inc:0% 0 0;--su:78.452% 0.132529 181.911977;--wa:83.2428% 0.139558 82.95003;--er:71.7858% 0.170255 13.11834}[data-theme=coffee]{color-scheme:dark;--b2:20.1585% 0.021457 329.708637;--b3:18.6412% 0.019842 329.708637;--pc:14.3993% 0.024765 62.756393;--sc:86.893% 0.00597 199.19444;--ac:88.5243% 0.014881 224.389184;--nc:83.3022% 0.003149 326.261446;--inc:15.898% 0.012774 184.558367;--suc:14.9445% 0.014491 131.116276;--wac:17.6301% 0.028162 87.722413;--erc:15.4637% 0.025644 31.871922;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.9967% 0.123825 62.756393;--s:34.465% 0.029849 199.19444;--a:42.6213% 0.074405 224.389184;--n:16.5109% 0.015743 326.261446;--b1:21.6758% 0.023072 329.708637;--bc:72.3547% 0.092794 79.129387;--in:79.4902% 0.063869 184.558367;--su:74.7224% 0.072456 131.116276;--wa:88.1503% 0.140812 87.722413;--er:77.3187% 0.12822 31.871922}:root:has(input.theme-controller[value=coffee]:checked){color-scheme:dark;--b2:20.1585% 0.021457 329.708637;--b3:18.6412% 0.019842 329.708637;--pc:14.3993% 0.024765 62.756393;--sc:86.893% 0.00597 199.19444;--ac:88.5243% 0.014881 224.389184;--nc:83.3022% 0.003149 326.261446;--inc:15.898% 0.012774 184.558367;--suc:14.9445% 0.014491 131.116276;--wac:17.6301% 0.028162 87.722413;--erc:15.4637% 0.025644 31.871922;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.9967% 0.123825 62.756393;--s:34.465% 0.029849 199.19444;--a:42.6213% 0.074405 224.389184;--n:16.5109% 0.015743 326.261446;--b1:21.6758% 0.023072 329.708637;--bc:72.3547% 0.092794 79.129387;--in:79.4902% 0.063869 184.558367;--su:74.7224% 0.072456 131.116276;--wa:88.1503% 0.140812 87.722413;--er:77.3187% 0.12822 31.871922}[data-theme=winter]{color-scheme:light;--pc:91.372% 0.051 257.57;--sc:88.5103% 0.03222 282.339433;--ac:11.988% 0.038303 335.171434;--nc:83.9233% 0.012704 257.651965;--inc:17.6255% 0.017178 214.515264;--suc:16.0988% 0.015404 197.823719;--wac:17.8345% 0.009167 71.47031;--erc:14.6185% 0.022037 20.076293;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:56.86% 0.255 257.57;--s:42.5516% 0.161098 282.339433;--a:59.9398% 0.191515 335.171434;--n:19.6166% 0.063518 257.651965;--b1:100% 0 0;--b2:97.4663% 0.011947 259.822565;--b3:93.2686% 0.016223 262.751375;--bc:41.8869% 0.053885 255.824911;--in:88.1275% 0.085888 214.515264;--su:80.4941% 0.077019 197.823719;--wa:89.1725% 0.045833 71.47031;--er:73.0926% 0.110185 20.076293}:root:has(input.theme-controller[value=winter]:checked){color-scheme:light;--pc:91.372% 0.051 257.57;--sc:88.5103% 0.03222 282.339433;--ac:11.988% 0.038303 335.171434;--nc:83.9233% 0.012704 257.651965;--inc:17.6255% 0.017178 214.515264;--suc:16.0988% 0.015404 197.823719;--wac:17.8345% 0.009167 71.47031;--erc:14.6185% 0.022037 20.076293;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:56.86% 0.255 257.57;--s:42.5516% 0.161098 282.339433;--a:59.9398% 0.191515 335.171434;--n:19.6166% 0.063518 257.651965;--b1:100% 0 0;--b2:97.4663% 0.011947 259.822565;--b3:93.2686% 0.016223 262.751375;--bc:41.8869% 0.053885 255.824911;--in:88.1275% 0.085888 214.515264;--su:80.4941% 0.077019 197.823719;--wa:89.1725% 0.045833 71.47031;--er:73.0926% 0.110185 20.076293}[data-theme=dim]{color-scheme:dark;--pc:17.2267% 0.028331 139.549991;--sc:14.6752% 0.033181 35.353059;--ac:14.8459% 0.026728 311.37924;--inc:17.2157% 0.028409 206.182959;--suc:17.2343% 0.028437 166.534048;--wac:17.2327% 0.028447 94.818679;--erc:16.4838% 0.019914 33.756357;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:86.1335% 0.141656 139.549991;--s:73.3759% 0.165904 35.353059;--a:74.2296% 0.133641 311.37924;--n:24.7311% 0.020483 264.094728;--nc:82.9011% 0.031335 222.959324;--b1:30.8577% 0.023243 264.149498;--b2:28.0368% 0.01983 264.182074;--b3:26.3469% 0.018403 262.177739;--bc:82.9011% 0.031335 222.959324;--in:86.0785% 0.142046 206.182959;--su:86.1717% 0.142187 166.534048;--wa:86.1634% 0.142236 94.818679;--er:82.4189% 0.09957 33.756357}:root:has(input.theme-controller[value=dim]:checked){color-scheme:dark;--pc:17.2267% 0.028331 139.549991;--sc:14.6752% 0.033181 35.353059;--ac:14.8459% 0.026728 311.37924;--inc:17.2157% 0.028409 206.182959;--suc:17.2343% 0.028437 166.534048;--wac:17.2327% 0.028447 94.818679;--erc:16.4838% 0.019914 33.756357;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:86.1335% 0.141656 139.549991;--s:73.3759% 0.165904 35.353059;--a:74.2296% 0.133641 311.37924;--n:24.7311% 0.020483 264.094728;--nc:82.9011% 0.031335 222.959324;--b1:30.8577% 0.023243 264.149498;--b2:28.0368% 0.01983 264.182074;--b3:26.3469% 0.018403 262.177739;--bc:82.9011% 0.031335 222.959324;--in:86.0785% 0.142046 206.182959;--su:86.1717% 0.142187 166.534048;--wa:86.1634% 0.142236 94.818679;--er:82.4189% 0.09957 33.756357}[data-theme=nord]{color-scheme:light;--pc:11.8872% 0.015449 254.027774;--sc:13.9303% 0.011822 248.687186;--ac:15.4929% 0.01245 217.469017;--inc:13.8414% 0.012499 332.664922;--suc:15.3654% 0.01498 131.063061;--wac:17.0972% 0.017847 84.093335;--erc:12.122% 0.024119 15.341883;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:59.4359% 0.077246 254.027774;--s:69.6516% 0.059108 248.687186;--a:77.4643% 0.062249 217.469017;--n:45.229% 0.035214 264.1312;--nc:89.9258% 0.016374 262.749256;--b1:95.1276% 0.007445 260.731539;--b2:93.2996% 0.010389 261.788485;--b3:89.9258% 0.016374 262.749256;--bc:32.4374% 0.022945 264.182036;--in:69.2072% 0.062496 332.664922;--su:76.827% 0.074899 131.063061;--wa:85.4862% 0.089234 84.093335;--er:60.61% 0.120594 15.341883;--rounded-box:0.4rem;--rounded-btn:0.2rem;--rounded-badge:0.4rem;--tab-radius:0.2rem}:root:has(input.theme-controller[value=nord]:checked){color-scheme:light;--pc:11.8872% 0.015449 254.027774;--sc:13.9303% 0.011822 248.687186;--ac:15.4929% 0.01245 217.469017;--inc:13.8414% 0.012499 332.664922;--suc:15.3654% 0.01498 131.063061;--wac:17.0972% 0.017847 84.093335;--erc:12.122% 0.024119 15.341883;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:59.4359% 0.077246 254.027774;--s:69.6516% 0.059108 248.687186;--a:77.4643% 0.062249 217.469017;--n:45.229% 0.035214 264.1312;--nc:89.9258% 0.016374 262.749256;--b1:95.1276% 0.007445 260.731539;--b2:93.2996% 0.010389 261.788485;--b3:89.9258% 0.016374 262.749256;--bc:32.4374% 0.022945 264.182036;--in:69.2072% 0.062496 332.664922;--su:76.827% 0.074899 131.063061;--wa:85.4862% 0.089234 84.093335;--er:60.61% 0.120594 15.341883;--rounded-box:0.4rem;--rounded-btn:0.2rem;--rounded-badge:0.4rem;--tab-radius:0.2rem}[data-theme=sunset]{color-scheme:dark;--pc:14.9408% 0.031656 39.94703;--sc:14.5075% 0.035531 2.72034;--ac:14.2589% 0.033336 299.844533;--inc:17.1119% 0.017054 206.015183;--suc:17.1122% 0.017172 144.77874;--wac:17.1139% 0.016961 74.427797;--erc:17.1023% 0.015778 16.886379;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:74.7039% 0.158278 39.94703;--s:72.5375% 0.177654 2.72034;--a:71.2947% 0.166678 299.844533;--n:26% 0.019 237.69;--nc:70% 0.019 237.69;--b1:22% 0.019 237.69;--b2:20% 0.019 237.69;--b3:18% 0.019 237.69;--bc:77.3835% 0.043586 245.096534;--in:85.5596% 0.085271 206.015183;--su:85.5609% 0.08586 144.77874;--wa:85.5695% 0.084806 74.427797;--er:85.5116% 0.07889 16.886379;--rounded-box:1.2rem;--rounded-btn:0.8rem;--rounded-badge:0.4rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=sunset]:checked){color-scheme:dark;--pc:14.9408% 0.031656 39.94703;--sc:14.5075% 0.035531 2.72034;--ac:14.2589% 0.033336 299.844533;--inc:17.1119% 0.017054 206.015183;--suc:17.1122% 0.017172 144.77874;--wac:17.1139% 0.016961 74.427797;--erc:17.1023% 0.015778 16.886379;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:74.7039% 0.158278 39.94703;--s:72.5375% 0.177654 2.72034;--a:71.2947% 0.166678 299.844533;--n:26% 0.019 237.69;--nc:70% 0.019 237.69;--b1:22% 0.019 237.69;--b2:20% 0.019 237.69;--b3:18% 0.019 237.69;--bc:77.3835% 0.043586 245.096534;--in:85.5596% 0.085271 206.015183;--su:85.5609% 0.08586 144.77874;--wa:85.5695% 0.084806 74.427797;--er:85.5116% 0.07889 16.886379;--rounded-box:1.2rem;--rounded-btn:0.8rem;--rounded-badge:0.4rem;--tab-radius:0.7rem}
diff --git a/examples/server/public/deps_markdown-it.js b/examples/server/public/deps_markdown-it.js
deleted file mode 100644
index 1be0cebe6..000000000
--- a/examples/server/public/deps_markdown-it.js
+++ /dev/null
@@ -1,8442 +0,0 @@
-/*! markdown-it 13.0.2 https://github.com/markdown-it/markdown-it @license MIT */
-(function(global, factory) {
-  typeof exports === "object" && typeof module !== "undefined" ? module.exports = factory() : typeof define === "function" && define.amd ? define(factory) : (global = typeof globalThis !== "undefined" ? globalThis : global || self, 
-  global.markdownit = factory());
-})(this, (function() {
-  "use strict";
-  function createCommonjsModule(fn, basedir, module) {
-    return module = {
-      path: basedir,
-      exports: {},
-      require: function(path, base) {
-        return commonjsRequire(path, base === undefined || base === null ? module.path : base);
-      }
-    }, fn(module, module.exports), module.exports;
-  }
-  function getAugmentedNamespace(n) {
-    if (n.__esModule) return n;
-    var a = Object.defineProperty({}, "__esModule", {
-      value: true
-    });
-    Object.keys(n).forEach((function(k) {
-      var d = Object.getOwnPropertyDescriptor(n, k);
-      Object.defineProperty(a, k, d.get ? d : {
-        enumerable: true,
-        get: function() {
-          return n[k];
-        }
-      });
-    }));
-    return a;
-  }
-  function commonjsRequire() {
-    throw new Error("Dynamic requires are not currently supported by @rollup/plugin-commonjs");
-  }
-  var require$$0 = {
-    Aacute: "\xc1",
-    aacute: "\xe1",
-    Abreve: "\u0102",
-    abreve: "\u0103",
-    ac: "\u223e",
-    acd: "\u223f",
-    acE: "\u223e\u0333",
-    Acirc: "\xc2",
-    acirc: "\xe2",
-    acute: "\xb4",
-    Acy: "\u0410",
-    acy: "\u0430",
-    AElig: "\xc6",
-    aelig: "\xe6",
-    af: "\u2061",
-    Afr: "\ud835\udd04",
-    afr: "\ud835\udd1e",
-    Agrave: "\xc0",
-    agrave: "\xe0",
-    alefsym: "\u2135",
-    aleph: "\u2135",
-    Alpha: "\u0391",
-    alpha: "\u03b1",
-    Amacr: "\u0100",
-    amacr: "\u0101",
-    amalg: "\u2a3f",
-    amp: "&",
-    AMP: "&",
-    andand: "\u2a55",
-    And: "\u2a53",
-    and: "\u2227",
-    andd: "\u2a5c",
-    andslope: "\u2a58",
-    andv: "\u2a5a",
-    ang: "\u2220",
-    ange: "\u29a4",
-    angle: "\u2220",
-    angmsdaa: "\u29a8",
-    angmsdab: "\u29a9",
-    angmsdac: "\u29aa",
-    angmsdad: "\u29ab",
-    angmsdae: "\u29ac",
-    angmsdaf: "\u29ad",
-    angmsdag: "\u29ae",
-    angmsdah: "\u29af",
-    angmsd: "\u2221",
-    angrt: "\u221f",
-    angrtvb: "\u22be",
-    angrtvbd: "\u299d",
-    angsph: "\u2222",
-    angst: "\xc5",
-    angzarr: "\u237c",
-    Aogon: "\u0104",
-    aogon: "\u0105",
-    Aopf: "\ud835\udd38",
-    aopf: "\ud835\udd52",
-    apacir: "\u2a6f",
-    ap: "\u2248",
-    apE: "\u2a70",
-    ape: "\u224a",
-    apid: "\u224b",
-    apos: "'",
-    ApplyFunction: "\u2061",
-    approx: "\u2248",
-    approxeq: "\u224a",
-    Aring: "\xc5",
-    aring: "\xe5",
-    Ascr: "\ud835\udc9c",
-    ascr: "\ud835\udcb6",
-    Assign: "\u2254",
-    ast: "*",
-    asymp: "\u2248",
-    asympeq: "\u224d",
-    Atilde: "\xc3",
-    atilde: "\xe3",
-    Auml: "\xc4",
-    auml: "\xe4",
-    awconint: "\u2233",
-    awint: "\u2a11",
-    backcong: "\u224c",
-    backepsilon: "\u03f6",
-    backprime: "\u2035",
-    backsim: "\u223d",
-    backsimeq: "\u22cd",
-    Backslash: "\u2216",
-    Barv: "\u2ae7",
-    barvee: "\u22bd",
-    barwed: "\u2305",
-    Barwed: "\u2306",
-    barwedge: "\u2305",
-    bbrk: "\u23b5",
-    bbrktbrk: "\u23b6",
-    bcong: "\u224c",
-    Bcy: "\u0411",
-    bcy: "\u0431",
-    bdquo: "\u201e",
-    becaus: "\u2235",
-    because: "\u2235",
-    Because: "\u2235",
-    bemptyv: "\u29b0",
-    bepsi: "\u03f6",
-    bernou: "\u212c",
-    Bernoullis: "\u212c",
-    Beta: "\u0392",
-    beta: "\u03b2",
-    beth: "\u2136",
-    between: "\u226c",
-    Bfr: "\ud835\udd05",
-    bfr: "\ud835\udd1f",
-    bigcap: "\u22c2",
-    bigcirc: "\u25ef",
-    bigcup: "\u22c3",
-    bigodot: "\u2a00",
-    bigoplus: "\u2a01",
-    bigotimes: "\u2a02",
-    bigsqcup: "\u2a06",
-    bigstar: "\u2605",
-    bigtriangledown: "\u25bd",
-    bigtriangleup: "\u25b3",
-    biguplus: "\u2a04",
-    bigvee: "\u22c1",
-    bigwedge: "\u22c0",
-    bkarow: "\u290d",
-    blacklozenge: "\u29eb",
-    blacksquare: "\u25aa",
-    blacktriangle: "\u25b4",
-    blacktriangledown: "\u25be",
-    blacktriangleleft: "\u25c2",
-    blacktriangleright: "\u25b8",
-    blank: "\u2423",
-    blk12: "\u2592",
-    blk14: "\u2591",
-    blk34: "\u2593",
-    block: "\u2588",
-    bne: "=\u20e5",
-    bnequiv: "\u2261\u20e5",
-    bNot: "\u2aed",
-    bnot: "\u2310",
-    Bopf: "\ud835\udd39",
-    bopf: "\ud835\udd53",
-    bot: "\u22a5",
-    bottom: "\u22a5",
-    bowtie: "\u22c8",
-    boxbox: "\u29c9",
-    boxdl: "\u2510",
-    boxdL: "\u2555",
-    boxDl: "\u2556",
-    boxDL: "\u2557",
-    boxdr: "\u250c",
-    boxdR: "\u2552",
-    boxDr: "\u2553",
-    boxDR: "\u2554",
-    boxh: "\u2500",
-    boxH: "\u2550",
-    boxhd: "\u252c",
-    boxHd: "\u2564",
-    boxhD: "\u2565",
-    boxHD: "\u2566",
-    boxhu: "\u2534",
-    boxHu: "\u2567",
-    boxhU: "\u2568",
-    boxHU: "\u2569",
-    boxminus: "\u229f",
-    boxplus: "\u229e",
-    boxtimes: "\u22a0",
-    boxul: "\u2518",
-    boxuL: "\u255b",
-    boxUl: "\u255c",
-    boxUL: "\u255d",
-    boxur: "\u2514",
-    boxuR: "\u2558",
-    boxUr: "\u2559",
-    boxUR: "\u255a",
-    boxv: "\u2502",
-    boxV: "\u2551",
-    boxvh: "\u253c",
-    boxvH: "\u256a",
-    boxVh: "\u256b",
-    boxVH: "\u256c",
-    boxvl: "\u2524",
-    boxvL: "\u2561",
-    boxVl: "\u2562",
-    boxVL: "\u2563",
-    boxvr: "\u251c",
-    boxvR: "\u255e",
-    boxVr: "\u255f",
-    boxVR: "\u2560",
-    bprime: "\u2035",
-    breve: "\u02d8",
-    Breve: "\u02d8",
-    brvbar: "\xa6",
-    bscr: "\ud835\udcb7",
-    Bscr: "\u212c",
-    bsemi: "\u204f",
-    bsim: "\u223d",
-    bsime: "\u22cd",
-    bsolb: "\u29c5",
-    bsol: "\\",
-    bsolhsub: "\u27c8",
-    bull: "\u2022",
-    bullet: "\u2022",
-    bump: "\u224e",
-    bumpE: "\u2aae",
-    bumpe: "\u224f",
-    Bumpeq: "\u224e",
-    bumpeq: "\u224f",
-    Cacute: "\u0106",
-    cacute: "\u0107",
-    capand: "\u2a44",
-    capbrcup: "\u2a49",
-    capcap: "\u2a4b",
-    cap: "\u2229",
-    Cap: "\u22d2",
-    capcup: "\u2a47",
-    capdot: "\u2a40",
-    CapitalDifferentialD: "\u2145",
-    caps: "\u2229\ufe00",
-    caret: "\u2041",
-    caron: "\u02c7",
-    Cayleys: "\u212d",
-    ccaps: "\u2a4d",
-    Ccaron: "\u010c",
-    ccaron: "\u010d",
-    Ccedil: "\xc7",
-    ccedil: "\xe7",
-    Ccirc: "\u0108",
-    ccirc: "\u0109",
-    Cconint: "\u2230",
-    ccups: "\u2a4c",
-    ccupssm: "\u2a50",
-    Cdot: "\u010a",
-    cdot: "\u010b",
-    cedil: "\xb8",
-    Cedilla: "\xb8",
-    cemptyv: "\u29b2",
-    cent: "\xa2",
-    centerdot: "\xb7",
-    CenterDot: "\xb7",
-    cfr: "\ud835\udd20",
-    Cfr: "\u212d",
-    CHcy: "\u0427",
-    chcy: "\u0447",
-    check: "\u2713",
-    checkmark: "\u2713",
-    Chi: "\u03a7",
-    chi: "\u03c7",
-    circ: "\u02c6",
-    circeq: "\u2257",
-    circlearrowleft: "\u21ba",
-    circlearrowright: "\u21bb",
-    circledast: "\u229b",
-    circledcirc: "\u229a",
-    circleddash: "\u229d",
-    CircleDot: "\u2299",
-    circledR: "\xae",
-    circledS: "\u24c8",
-    CircleMinus: "\u2296",
-    CirclePlus: "\u2295",
-    CircleTimes: "\u2297",
-    cir: "\u25cb",
-    cirE: "\u29c3",
-    cire: "\u2257",
-    cirfnint: "\u2a10",
-    cirmid: "\u2aef",
-    cirscir: "\u29c2",
-    ClockwiseContourIntegral: "\u2232",
-    CloseCurlyDoubleQuote: "\u201d",
-    CloseCurlyQuote: "\u2019",
-    clubs: "\u2663",
-    clubsuit: "\u2663",
-    colon: ":",
-    Colon: "\u2237",
-    Colone: "\u2a74",
-    colone: "\u2254",
-    coloneq: "\u2254",
-    comma: ",",
-    commat: "@",
-    comp: "\u2201",
-    compfn: "\u2218",
-    complement: "\u2201",
-    complexes: "\u2102",
-    cong: "\u2245",
-    congdot: "\u2a6d",
-    Congruent: "\u2261",
-    conint: "\u222e",
-    Conint: "\u222f",
-    ContourIntegral: "\u222e",
-    copf: "\ud835\udd54",
-    Copf: "\u2102",
-    coprod: "\u2210",
-    Coproduct: "\u2210",
-    copy: "\xa9",
-    COPY: "\xa9",
-    copysr: "\u2117",
-    CounterClockwiseContourIntegral: "\u2233",
-    crarr: "\u21b5",
-    cross: "\u2717",
-    Cross: "\u2a2f",
-    Cscr: "\ud835\udc9e",
-    cscr: "\ud835\udcb8",
-    csub: "\u2acf",
-    csube: "\u2ad1",
-    csup: "\u2ad0",
-    csupe: "\u2ad2",
-    ctdot: "\u22ef",
-    cudarrl: "\u2938",
-    cudarrr: "\u2935",
-    cuepr: "\u22de",
-    cuesc: "\u22df",
-    cularr: "\u21b6",
-    cularrp: "\u293d",
-    cupbrcap: "\u2a48",
-    cupcap: "\u2a46",
-    CupCap: "\u224d",
-    cup: "\u222a",
-    Cup: "\u22d3",
-    cupcup: "\u2a4a",
-    cupdot: "\u228d",
-    cupor: "\u2a45",
-    cups: "\u222a\ufe00",
-    curarr: "\u21b7",
-    curarrm: "\u293c",
-    curlyeqprec: "\u22de",
-    curlyeqsucc: "\u22df",
-    curlyvee: "\u22ce",
-    curlywedge: "\u22cf",
-    curren: "\xa4",
-    curvearrowleft: "\u21b6",
-    curvearrowright: "\u21b7",
-    cuvee: "\u22ce",
-    cuwed: "\u22cf",
-    cwconint: "\u2232",
-    cwint: "\u2231",
-    cylcty: "\u232d",
-    dagger: "\u2020",
-    Dagger: "\u2021",
-    daleth: "\u2138",
-    darr: "\u2193",
-    Darr: "\u21a1",
-    dArr: "\u21d3",
-    dash: "\u2010",
-    Dashv: "\u2ae4",
-    dashv: "\u22a3",
-    dbkarow: "\u290f",
-    dblac: "\u02dd",
-    Dcaron: "\u010e",
-    dcaron: "\u010f",
-    Dcy: "\u0414",
-    dcy: "\u0434",
-    ddagger: "\u2021",
-    ddarr: "\u21ca",
-    DD: "\u2145",
-    dd: "\u2146",
-    DDotrahd: "\u2911",
-    ddotseq: "\u2a77",
-    deg: "\xb0",
-    Del: "\u2207",
-    Delta: "\u0394",
-    delta: "\u03b4",
-    demptyv: "\u29b1",
-    dfisht: "\u297f",
-    Dfr: "\ud835\udd07",
-    dfr: "\ud835\udd21",
-    dHar: "\u2965",
-    dharl: "\u21c3",
-    dharr: "\u21c2",
-    DiacriticalAcute: "\xb4",
-    DiacriticalDot: "\u02d9",
-    DiacriticalDoubleAcute: "\u02dd",
-    DiacriticalGrave: "`",
-    DiacriticalTilde: "\u02dc",
-    diam: "\u22c4",
-    diamond: "\u22c4",
-    Diamond: "\u22c4",
-    diamondsuit: "\u2666",
-    diams: "\u2666",
-    die: "\xa8",
-    DifferentialD: "\u2146",
-    digamma: "\u03dd",
-    disin: "\u22f2",
-    div: "\xf7",
-    divide: "\xf7",
-    divideontimes: "\u22c7",
-    divonx: "\u22c7",
-    DJcy: "\u0402",
-    djcy: "\u0452",
-    dlcorn: "\u231e",
-    dlcrop: "\u230d",
-    dollar: "$",
-    Dopf: "\ud835\udd3b",
-    dopf: "\ud835\udd55",
-    Dot: "\xa8",
-    dot: "\u02d9",
-    DotDot: "\u20dc",
-    doteq: "\u2250",
-    doteqdot: "\u2251",
-    DotEqual: "\u2250",
-    dotminus: "\u2238",
-    dotplus: "\u2214",
-    dotsquare: "\u22a1",
-    doublebarwedge: "\u2306",
-    DoubleContourIntegral: "\u222f",
-    DoubleDot: "\xa8",
-    DoubleDownArrow: "\u21d3",
-    DoubleLeftArrow: "\u21d0",
-    DoubleLeftRightArrow: "\u21d4",
-    DoubleLeftTee: "\u2ae4",
-    DoubleLongLeftArrow: "\u27f8",
-    DoubleLongLeftRightArrow: "\u27fa",
-    DoubleLongRightArrow: "\u27f9",
-    DoubleRightArrow: "\u21d2",
-    DoubleRightTee: "\u22a8",
-    DoubleUpArrow: "\u21d1",
-    DoubleUpDownArrow: "\u21d5",
-    DoubleVerticalBar: "\u2225",
-    DownArrowBar: "\u2913",
-    downarrow: "\u2193",
-    DownArrow: "\u2193",
-    Downarrow: "\u21d3",
-    DownArrowUpArrow: "\u21f5",
-    DownBreve: "\u0311",
-    downdownarrows: "\u21ca",
-    downharpoonleft: "\u21c3",
-    downharpoonright: "\u21c2",
-    DownLeftRightVector: "\u2950",
-    DownLeftTeeVector: "\u295e",
-    DownLeftVectorBar: "\u2956",
-    DownLeftVector: "\u21bd",
-    DownRightTeeVector: "\u295f",
-    DownRightVectorBar: "\u2957",
-    DownRightVector: "\u21c1",
-    DownTeeArrow: "\u21a7",
-    DownTee: "\u22a4",
-    drbkarow: "\u2910",
-    drcorn: "\u231f",
-    drcrop: "\u230c",
-    Dscr: "\ud835\udc9f",
-    dscr: "\ud835\udcb9",
-    DScy: "\u0405",
-    dscy: "\u0455",
-    dsol: "\u29f6",
-    Dstrok: "\u0110",
-    dstrok: "\u0111",
-    dtdot: "\u22f1",
-    dtri: "\u25bf",
-    dtrif: "\u25be",
-    duarr: "\u21f5",
-    duhar: "\u296f",
-    dwangle: "\u29a6",
-    DZcy: "\u040f",
-    dzcy: "\u045f",
-    dzigrarr: "\u27ff",
-    Eacute: "\xc9",
-    eacute: "\xe9",
-    easter: "\u2a6e",
-    Ecaron: "\u011a",
-    ecaron: "\u011b",
-    Ecirc: "\xca",
-    ecirc: "\xea",
-    ecir: "\u2256",
-    ecolon: "\u2255",
-    Ecy: "\u042d",
-    ecy: "\u044d",
-    eDDot: "\u2a77",
-    Edot: "\u0116",
-    edot: "\u0117",
-    eDot: "\u2251",
-    ee: "\u2147",
-    efDot: "\u2252",
-    Efr: "\ud835\udd08",
-    efr: "\ud835\udd22",
-    eg: "\u2a9a",
-    Egrave: "\xc8",
-    egrave: "\xe8",
-    egs: "\u2a96",
-    egsdot: "\u2a98",
-    el: "\u2a99",
-    Element: "\u2208",
-    elinters: "\u23e7",
-    ell: "\u2113",
-    els: "\u2a95",
-    elsdot: "\u2a97",
-    Emacr: "\u0112",
-    emacr: "\u0113",
-    empty: "\u2205",
-    emptyset: "\u2205",
-    EmptySmallSquare: "\u25fb",
-    emptyv: "\u2205",
-    EmptyVerySmallSquare: "\u25ab",
-    emsp13: "\u2004",
-    emsp14: "\u2005",
-    emsp: "\u2003",
-    ENG: "\u014a",
-    eng: "\u014b",
-    ensp: "\u2002",
-    Eogon: "\u0118",
-    eogon: "\u0119",
-    Eopf: "\ud835\udd3c",
-    eopf: "\ud835\udd56",
-    epar: "\u22d5",
-    eparsl: "\u29e3",
-    eplus: "\u2a71",
-    epsi: "\u03b5",
-    Epsilon: "\u0395",
-    epsilon: "\u03b5",
-    epsiv: "\u03f5",
-    eqcirc: "\u2256",
-    eqcolon: "\u2255",
-    eqsim: "\u2242",
-    eqslantgtr: "\u2a96",
-    eqslantless: "\u2a95",
-    Equal: "\u2a75",
-    equals: "=",
-    EqualTilde: "\u2242",
-    equest: "\u225f",
-    Equilibrium: "\u21cc",
-    equiv: "\u2261",
-    equivDD: "\u2a78",
-    eqvparsl: "\u29e5",
-    erarr: "\u2971",
-    erDot: "\u2253",
-    escr: "\u212f",
-    Escr: "\u2130",
-    esdot: "\u2250",
-    Esim: "\u2a73",
-    esim: "\u2242",
-    Eta: "\u0397",
-    eta: "\u03b7",
-    ETH: "\xd0",
-    eth: "\xf0",
-    Euml: "\xcb",
-    euml: "\xeb",
-    euro: "\u20ac",
-    excl: "!",
-    exist: "\u2203",
-    Exists: "\u2203",
-    expectation: "\u2130",
-    exponentiale: "\u2147",
-    ExponentialE: "\u2147",
-    fallingdotseq: "\u2252",
-    Fcy: "\u0424",
-    fcy: "\u0444",
-    female: "\u2640",
-    ffilig: "\ufb03",
-    fflig: "\ufb00",
-    ffllig: "\ufb04",
-    Ffr: "\ud835\udd09",
-    ffr: "\ud835\udd23",
-    filig: "\ufb01",
-    FilledSmallSquare: "\u25fc",
-    FilledVerySmallSquare: "\u25aa",
-    fjlig: "fj",
-    flat: "\u266d",
-    fllig: "\ufb02",
-    fltns: "\u25b1",
-    fnof: "\u0192",
-    Fopf: "\ud835\udd3d",
-    fopf: "\ud835\udd57",
-    forall: "\u2200",
-    ForAll: "\u2200",
-    fork: "\u22d4",
-    forkv: "\u2ad9",
-    Fouriertrf: "\u2131",
-    fpartint: "\u2a0d",
-    frac12: "\xbd",
-    frac13: "\u2153",
-    frac14: "\xbc",
-    frac15: "\u2155",
-    frac16: "\u2159",
-    frac18: "\u215b",
-    frac23: "\u2154",
-    frac25: "\u2156",
-    frac34: "\xbe",
-    frac35: "\u2157",
-    frac38: "\u215c",
-    frac45: "\u2158",
-    frac56: "\u215a",
-    frac58: "\u215d",
-    frac78: "\u215e",
-    frasl: "\u2044",
-    frown: "\u2322",
-    fscr: "\ud835\udcbb",
-    Fscr: "\u2131",
-    gacute: "\u01f5",
-    Gamma: "\u0393",
-    gamma: "\u03b3",
-    Gammad: "\u03dc",
-    gammad: "\u03dd",
-    gap: "\u2a86",
-    Gbreve: "\u011e",
-    gbreve: "\u011f",
-    Gcedil: "\u0122",
-    Gcirc: "\u011c",
-    gcirc: "\u011d",
-    Gcy: "\u0413",
-    gcy: "\u0433",
-    Gdot: "\u0120",
-    gdot: "\u0121",
-    ge: "\u2265",
-    gE: "\u2267",
-    gEl: "\u2a8c",
-    gel: "\u22db",
-    geq: "\u2265",
-    geqq: "\u2267",
-    geqslant: "\u2a7e",
-    gescc: "\u2aa9",
-    ges: "\u2a7e",
-    gesdot: "\u2a80",
-    gesdoto: "\u2a82",
-    gesdotol: "\u2a84",
-    gesl: "\u22db\ufe00",
-    gesles: "\u2a94",
-    Gfr: "\ud835\udd0a",
-    gfr: "\ud835\udd24",
-    gg: "\u226b",
-    Gg: "\u22d9",
-    ggg: "\u22d9",
-    gimel: "\u2137",
-    GJcy: "\u0403",
-    gjcy: "\u0453",
-    gla: "\u2aa5",
-    gl: "\u2277",
-    glE: "\u2a92",
-    glj: "\u2aa4",
-    gnap: "\u2a8a",
-    gnapprox: "\u2a8a",
-    gne: "\u2a88",
-    gnE: "\u2269",
-    gneq: "\u2a88",
-    gneqq: "\u2269",
-    gnsim: "\u22e7",
-    Gopf: "\ud835\udd3e",
-    gopf: "\ud835\udd58",
-    grave: "`",
-    GreaterEqual: "\u2265",
-    GreaterEqualLess: "\u22db",
-    GreaterFullEqual: "\u2267",
-    GreaterGreater: "\u2aa2",
-    GreaterLess: "\u2277",
-    GreaterSlantEqual: "\u2a7e",
-    GreaterTilde: "\u2273",
-    Gscr: "\ud835\udca2",
-    gscr: "\u210a",
-    gsim: "\u2273",
-    gsime: "\u2a8e",
-    gsiml: "\u2a90",
-    gtcc: "\u2aa7",
-    gtcir: "\u2a7a",
-    gt: ">",
-    GT: ">",
-    Gt: "\u226b",
-    gtdot: "\u22d7",
-    gtlPar: "\u2995",
-    gtquest: "\u2a7c",
-    gtrapprox: "\u2a86",
-    gtrarr: "\u2978",
-    gtrdot: "\u22d7",
-    gtreqless: "\u22db",
-    gtreqqless: "\u2a8c",
-    gtrless: "\u2277",
-    gtrsim: "\u2273",
-    gvertneqq: "\u2269\ufe00",
-    gvnE: "\u2269\ufe00",
-    Hacek: "\u02c7",
-    hairsp: "\u200a",
-    half: "\xbd",
-    hamilt: "\u210b",
-    HARDcy: "\u042a",
-    hardcy: "\u044a",
-    harrcir: "\u2948",
-    harr: "\u2194",
-    hArr: "\u21d4",
-    harrw: "\u21ad",
-    Hat: "^",
-    hbar: "\u210f",
-    Hcirc: "\u0124",
-    hcirc: "\u0125",
-    hearts: "\u2665",
-    heartsuit: "\u2665",
-    hellip: "\u2026",
-    hercon: "\u22b9",
-    hfr: "\ud835\udd25",
-    Hfr: "\u210c",
-    HilbertSpace: "\u210b",
-    hksearow: "\u2925",
-    hkswarow: "\u2926",
-    hoarr: "\u21ff",
-    homtht: "\u223b",
-    hookleftarrow: "\u21a9",
-    hookrightarrow: "\u21aa",
-    hopf: "\ud835\udd59",
-    Hopf: "\u210d",
-    horbar: "\u2015",
-    HorizontalLine: "\u2500",
-    hscr: "\ud835\udcbd",
-    Hscr: "\u210b",
-    hslash: "\u210f",
-    Hstrok: "\u0126",
-    hstrok: "\u0127",
-    HumpDownHump: "\u224e",
-    HumpEqual: "\u224f",
-    hybull: "\u2043",
-    hyphen: "\u2010",
-    Iacute: "\xcd",
-    iacute: "\xed",
-    ic: "\u2063",
-    Icirc: "\xce",
-    icirc: "\xee",
-    Icy: "\u0418",
-    icy: "\u0438",
-    Idot: "\u0130",
-    IEcy: "\u0415",
-    iecy: "\u0435",
-    iexcl: "\xa1",
-    iff: "\u21d4",
-    ifr: "\ud835\udd26",
-    Ifr: "\u2111",
-    Igrave: "\xcc",
-    igrave: "\xec",
-    ii: "\u2148",
-    iiiint: "\u2a0c",
-    iiint: "\u222d",
-    iinfin: "\u29dc",
-    iiota: "\u2129",
-    IJlig: "\u0132",
-    ijlig: "\u0133",
-    Imacr: "\u012a",
-    imacr: "\u012b",
-    image: "\u2111",
-    ImaginaryI: "\u2148",
-    imagline: "\u2110",
-    imagpart: "\u2111",
-    imath: "\u0131",
-    Im: "\u2111",
-    imof: "\u22b7",
-    imped: "\u01b5",
-    Implies: "\u21d2",
-    incare: "\u2105",
-    in: "\u2208",
-    infin: "\u221e",
-    infintie: "\u29dd",
-    inodot: "\u0131",
-    intcal: "\u22ba",
-    int: "\u222b",
-    Int: "\u222c",
-    integers: "\u2124",
-    Integral: "\u222b",
-    intercal: "\u22ba",
-    Intersection: "\u22c2",
-    intlarhk: "\u2a17",
-    intprod: "\u2a3c",
-    InvisibleComma: "\u2063",
-    InvisibleTimes: "\u2062",
-    IOcy: "\u0401",
-    iocy: "\u0451",
-    Iogon: "\u012e",
-    iogon: "\u012f",
-    Iopf: "\ud835\udd40",
-    iopf: "\ud835\udd5a",
-    Iota: "\u0399",
-    iota: "\u03b9",
-    iprod: "\u2a3c",
-    iquest: "\xbf",
-    iscr: "\ud835\udcbe",
-    Iscr: "\u2110",
-    isin: "\u2208",
-    isindot: "\u22f5",
-    isinE: "\u22f9",
-    isins: "\u22f4",
-    isinsv: "\u22f3",
-    isinv: "\u2208",
-    it: "\u2062",
-    Itilde: "\u0128",
-    itilde: "\u0129",
-    Iukcy: "\u0406",
-    iukcy: "\u0456",
-    Iuml: "\xcf",
-    iuml: "\xef",
-    Jcirc: "\u0134",
-    jcirc: "\u0135",
-    Jcy: "\u0419",
-    jcy: "\u0439",
-    Jfr: "\ud835\udd0d",
-    jfr: "\ud835\udd27",
-    jmath: "\u0237",
-    Jopf: "\ud835\udd41",
-    jopf: "\ud835\udd5b",
-    Jscr: "\ud835\udca5",
-    jscr: "\ud835\udcbf",
-    Jsercy: "\u0408",
-    jsercy: "\u0458",
-    Jukcy: "\u0404",
-    jukcy: "\u0454",
-    Kappa: "\u039a",
-    kappa: "\u03ba",
-    kappav: "\u03f0",
-    Kcedil: "\u0136",
-    kcedil: "\u0137",
-    Kcy: "\u041a",
-    kcy: "\u043a",
-    Kfr: "\ud835\udd0e",
-    kfr: "\ud835\udd28",
-    kgreen: "\u0138",
-    KHcy: "\u0425",
-    khcy: "\u0445",
-    KJcy: "\u040c",
-    kjcy: "\u045c",
-    Kopf: "\ud835\udd42",
-    kopf: "\ud835\udd5c",
-    Kscr: "\ud835\udca6",
-    kscr: "\ud835\udcc0",
-    lAarr: "\u21da",
-    Lacute: "\u0139",
-    lacute: "\u013a",
-    laemptyv: "\u29b4",
-    lagran: "\u2112",
-    Lambda: "\u039b",
-    lambda: "\u03bb",
-    lang: "\u27e8",
-    Lang: "\u27ea",
-    langd: "\u2991",
-    langle: "\u27e8",
-    lap: "\u2a85",
-    Laplacetrf: "\u2112",
-    laquo: "\xab",
-    larrb: "\u21e4",
-    larrbfs: "\u291f",
-    larr: "\u2190",
-    Larr: "\u219e",
-    lArr: "\u21d0",
-    larrfs: "\u291d",
-    larrhk: "\u21a9",
-    larrlp: "\u21ab",
-    larrpl: "\u2939",
-    larrsim: "\u2973",
-    larrtl: "\u21a2",
-    latail: "\u2919",
-    lAtail: "\u291b",
-    lat: "\u2aab",
-    late: "\u2aad",
-    lates: "\u2aad\ufe00",
-    lbarr: "\u290c",
-    lBarr: "\u290e",
-    lbbrk: "\u2772",
-    lbrace: "{",
-    lbrack: "[",
-    lbrke: "\u298b",
-    lbrksld: "\u298f",
-    lbrkslu: "\u298d",
-    Lcaron: "\u013d",
-    lcaron: "\u013e",
-    Lcedil: "\u013b",
-    lcedil: "\u013c",
-    lceil: "\u2308",
-    lcub: "{",
-    Lcy: "\u041b",
-    lcy: "\u043b",
-    ldca: "\u2936",
-    ldquo: "\u201c",
-    ldquor: "\u201e",
-    ldrdhar: "\u2967",
-    ldrushar: "\u294b",
-    ldsh: "\u21b2",
-    le: "\u2264",
-    lE: "\u2266",
-    LeftAngleBracket: "\u27e8",
-    LeftArrowBar: "\u21e4",
-    leftarrow: "\u2190",
-    LeftArrow: "\u2190",
-    Leftarrow: "\u21d0",
-    LeftArrowRightArrow: "\u21c6",
-    leftarrowtail: "\u21a2",
-    LeftCeiling: "\u2308",
-    LeftDoubleBracket: "\u27e6",
-    LeftDownTeeVector: "\u2961",
-    LeftDownVectorBar: "\u2959",
-    LeftDownVector: "\u21c3",
-    LeftFloor: "\u230a",
-    leftharpoondown: "\u21bd",
-    leftharpoonup: "\u21bc",
-    leftleftarrows: "\u21c7",
-    leftrightarrow: "\u2194",
-    LeftRightArrow: "\u2194",
-    Leftrightarrow: "\u21d4",
-    leftrightarrows: "\u21c6",
-    leftrightharpoons: "\u21cb",
-    leftrightsquigarrow: "\u21ad",
-    LeftRightVector: "\u294e",
-    LeftTeeArrow: "\u21a4",
-    LeftTee: "\u22a3",
-    LeftTeeVector: "\u295a",
-    leftthreetimes: "\u22cb",
-    LeftTriangleBar: "\u29cf",
-    LeftTriangle: "\u22b2",
-    LeftTriangleEqual: "\u22b4",
-    LeftUpDownVector: "\u2951",
-    LeftUpTeeVector: "\u2960",
-    LeftUpVectorBar: "\u2958",
-    LeftUpVector: "\u21bf",
-    LeftVectorBar: "\u2952",
-    LeftVector: "\u21bc",
-    lEg: "\u2a8b",
-    leg: "\u22da",
-    leq: "\u2264",
-    leqq: "\u2266",
-    leqslant: "\u2a7d",
-    lescc: "\u2aa8",
-    les: "\u2a7d",
-    lesdot: "\u2a7f",
-    lesdoto: "\u2a81",
-    lesdotor: "\u2a83",
-    lesg: "\u22da\ufe00",
-    lesges: "\u2a93",
-    lessapprox: "\u2a85",
-    lessdot: "\u22d6",
-    lesseqgtr: "\u22da",
-    lesseqqgtr: "\u2a8b",
-    LessEqualGreater: "\u22da",
-    LessFullEqual: "\u2266",
-    LessGreater: "\u2276",
-    lessgtr: "\u2276",
-    LessLess: "\u2aa1",
-    lesssim: "\u2272",
-    LessSlantEqual: "\u2a7d",
-    LessTilde: "\u2272",
-    lfisht: "\u297c",
-    lfloor: "\u230a",
-    Lfr: "\ud835\udd0f",
-    lfr: "\ud835\udd29",
-    lg: "\u2276",
-    lgE: "\u2a91",
-    lHar: "\u2962",
-    lhard: "\u21bd",
-    lharu: "\u21bc",
-    lharul: "\u296a",
-    lhblk: "\u2584",
-    LJcy: "\u0409",
-    ljcy: "\u0459",
-    llarr: "\u21c7",
-    ll: "\u226a",
-    Ll: "\u22d8",
-    llcorner: "\u231e",
-    Lleftarrow: "\u21da",
-    llhard: "\u296b",
-    lltri: "\u25fa",
-    Lmidot: "\u013f",
-    lmidot: "\u0140",
-    lmoustache: "\u23b0",
-    lmoust: "\u23b0",
-    lnap: "\u2a89",
-    lnapprox: "\u2a89",
-    lne: "\u2a87",
-    lnE: "\u2268",
-    lneq: "\u2a87",
-    lneqq: "\u2268",
-    lnsim: "\u22e6",
-    loang: "\u27ec",
-    loarr: "\u21fd",
-    lobrk: "\u27e6",
-    longleftarrow: "\u27f5",
-    LongLeftArrow: "\u27f5",
-    Longleftarrow: "\u27f8",
-    longleftrightarrow: "\u27f7",
-    LongLeftRightArrow: "\u27f7",
-    Longleftrightarrow: "\u27fa",
-    longmapsto: "\u27fc",
-    longrightarrow: "\u27f6",
-    LongRightArrow: "\u27f6",
-    Longrightarrow: "\u27f9",
-    looparrowleft: "\u21ab",
-    looparrowright: "\u21ac",
-    lopar: "\u2985",
-    Lopf: "\ud835\udd43",
-    lopf: "\ud835\udd5d",
-    loplus: "\u2a2d",
-    lotimes: "\u2a34",
-    lowast: "\u2217",
-    lowbar: "_",
-    LowerLeftArrow: "\u2199",
-    LowerRightArrow: "\u2198",
-    loz: "\u25ca",
-    lozenge: "\u25ca",
-    lozf: "\u29eb",
-    lpar: "(",
-    lparlt: "\u2993",
-    lrarr: "\u21c6",
-    lrcorner: "\u231f",
-    lrhar: "\u21cb",
-    lrhard: "\u296d",
-    lrm: "\u200e",
-    lrtri: "\u22bf",
-    lsaquo: "\u2039",
-    lscr: "\ud835\udcc1",
-    Lscr: "\u2112",
-    lsh: "\u21b0",
-    Lsh: "\u21b0",
-    lsim: "\u2272",
-    lsime: "\u2a8d",
-    lsimg: "\u2a8f",
-    lsqb: "[",
-    lsquo: "\u2018",
-    lsquor: "\u201a",
-    Lstrok: "\u0141",
-    lstrok: "\u0142",
-    ltcc: "\u2aa6",
-    ltcir: "\u2a79",
-    lt: "<",
-    LT: "<",
-    Lt: "\u226a",
-    ltdot: "\u22d6",
-    lthree: "\u22cb",
-    ltimes: "\u22c9",
-    ltlarr: "\u2976",
-    ltquest: "\u2a7b",
-    ltri: "\u25c3",
-    ltrie: "\u22b4",
-    ltrif: "\u25c2",
-    ltrPar: "\u2996",
-    lurdshar: "\u294a",
-    luruhar: "\u2966",
-    lvertneqq: "\u2268\ufe00",
-    lvnE: "\u2268\ufe00",
-    macr: "\xaf",
-    male: "\u2642",
-    malt: "\u2720",
-    maltese: "\u2720",
-    Map: "\u2905",
-    map: "\u21a6",
-    mapsto: "\u21a6",
-    mapstodown: "\u21a7",
-    mapstoleft: "\u21a4",
-    mapstoup: "\u21a5",
-    marker: "\u25ae",
-    mcomma: "\u2a29",
-    Mcy: "\u041c",
-    mcy: "\u043c",
-    mdash: "\u2014",
-    mDDot: "\u223a",
-    measuredangle: "\u2221",
-    MediumSpace: "\u205f",
-    Mellintrf: "\u2133",
-    Mfr: "\ud835\udd10",
-    mfr: "\ud835\udd2a",
-    mho: "\u2127",
-    micro: "\xb5",
-    midast: "*",
-    midcir: "\u2af0",
-    mid: "\u2223",
-    middot: "\xb7",
-    minusb: "\u229f",
-    minus: "\u2212",
-    minusd: "\u2238",
-    minusdu: "\u2a2a",
-    MinusPlus: "\u2213",
-    mlcp: "\u2adb",
-    mldr: "\u2026",
-    mnplus: "\u2213",
-    models: "\u22a7",
-    Mopf: "\ud835\udd44",
-    mopf: "\ud835\udd5e",
-    mp: "\u2213",
-    mscr: "\ud835\udcc2",
-    Mscr: "\u2133",
-    mstpos: "\u223e",
-    Mu: "\u039c",
-    mu: "\u03bc",
-    multimap: "\u22b8",
-    mumap: "\u22b8",
-    nabla: "\u2207",
-    Nacute: "\u0143",
-    nacute: "\u0144",
-    nang: "\u2220\u20d2",
-    nap: "\u2249",
-    napE: "\u2a70\u0338",
-    napid: "\u224b\u0338",
-    napos: "\u0149",
-    napprox: "\u2249",
-    natural: "\u266e",
-    naturals: "\u2115",
-    natur: "\u266e",
-    nbsp: "\xa0",
-    nbump: "\u224e\u0338",
-    nbumpe: "\u224f\u0338",
-    ncap: "\u2a43",
-    Ncaron: "\u0147",
-    ncaron: "\u0148",
-    Ncedil: "\u0145",
-    ncedil: "\u0146",
-    ncong: "\u2247",
-    ncongdot: "\u2a6d\u0338",
-    ncup: "\u2a42",
-    Ncy: "\u041d",
-    ncy: "\u043d",
-    ndash: "\u2013",
-    nearhk: "\u2924",
-    nearr: "\u2197",
-    neArr: "\u21d7",
-    nearrow: "\u2197",
-    ne: "\u2260",
-    nedot: "\u2250\u0338",
-    NegativeMediumSpace: "\u200b",
-    NegativeThickSpace: "\u200b",
-    NegativeThinSpace: "\u200b",
-    NegativeVeryThinSpace: "\u200b",
-    nequiv: "\u2262",
-    nesear: "\u2928",
-    nesim: "\u2242\u0338",
-    NestedGreaterGreater: "\u226b",
-    NestedLessLess: "\u226a",
-    NewLine: "\n",
-    nexist: "\u2204",
-    nexists: "\u2204",
-    Nfr: "\ud835\udd11",
-    nfr: "\ud835\udd2b",
-    ngE: "\u2267\u0338",
-    nge: "\u2271",
-    ngeq: "\u2271",
-    ngeqq: "\u2267\u0338",
-    ngeqslant: "\u2a7e\u0338",
-    nges: "\u2a7e\u0338",
-    nGg: "\u22d9\u0338",
-    ngsim: "\u2275",
-    nGt: "\u226b\u20d2",
-    ngt: "\u226f",
-    ngtr: "\u226f",
-    nGtv: "\u226b\u0338",
-    nharr: "\u21ae",
-    nhArr: "\u21ce",
-    nhpar: "\u2af2",
-    ni: "\u220b",
-    nis: "\u22fc",
-    nisd: "\u22fa",
-    niv: "\u220b",
-    NJcy: "\u040a",
-    njcy: "\u045a",
-    nlarr: "\u219a",
-    nlArr: "\u21cd",
-    nldr: "\u2025",
-    nlE: "\u2266\u0338",
-    nle: "\u2270",
-    nleftarrow: "\u219a",
-    nLeftarrow: "\u21cd",
-    nleftrightarrow: "\u21ae",
-    nLeftrightarrow: "\u21ce",
-    nleq: "\u2270",
-    nleqq: "\u2266\u0338",
-    nleqslant: "\u2a7d\u0338",
-    nles: "\u2a7d\u0338",
-    nless: "\u226e",
-    nLl: "\u22d8\u0338",
-    nlsim: "\u2274",
-    nLt: "\u226a\u20d2",
-    nlt: "\u226e",
-    nltri: "\u22ea",
-    nltrie: "\u22ec",
-    nLtv: "\u226a\u0338",
-    nmid: "\u2224",
-    NoBreak: "\u2060",
-    NonBreakingSpace: "\xa0",
-    nopf: "\ud835\udd5f",
-    Nopf: "\u2115",
-    Not: "\u2aec",
-    not: "\xac",
-    NotCongruent: "\u2262",
-    NotCupCap: "\u226d",
-    NotDoubleVerticalBar: "\u2226",
-    NotElement: "\u2209",
-    NotEqual: "\u2260",
-    NotEqualTilde: "\u2242\u0338",
-    NotExists: "\u2204",
-    NotGreater: "\u226f",
-    NotGreaterEqual: "\u2271",
-    NotGreaterFullEqual: "\u2267\u0338",
-    NotGreaterGreater: "\u226b\u0338",
-    NotGreaterLess: "\u2279",
-    NotGreaterSlantEqual: "\u2a7e\u0338",
-    NotGreaterTilde: "\u2275",
-    NotHumpDownHump: "\u224e\u0338",
-    NotHumpEqual: "\u224f\u0338",
-    notin: "\u2209",
-    notindot: "\u22f5\u0338",
-    notinE: "\u22f9\u0338",
-    notinva: "\u2209",
-    notinvb: "\u22f7",
-    notinvc: "\u22f6",
-    NotLeftTriangleBar: "\u29cf\u0338",
-    NotLeftTriangle: "\u22ea",
-    NotLeftTriangleEqual: "\u22ec",
-    NotLess: "\u226e",
-    NotLessEqual: "\u2270",
-    NotLessGreater: "\u2278",
-    NotLessLess: "\u226a\u0338",
-    NotLessSlantEqual: "\u2a7d\u0338",
-    NotLessTilde: "\u2274",
-    NotNestedGreaterGreater: "\u2aa2\u0338",
-    NotNestedLessLess: "\u2aa1\u0338",
-    notni: "\u220c",
-    notniva: "\u220c",
-    notnivb: "\u22fe",
-    notnivc: "\u22fd",
-    NotPrecedes: "\u2280",
-    NotPrecedesEqual: "\u2aaf\u0338",
-    NotPrecedesSlantEqual: "\u22e0",
-    NotReverseElement: "\u220c",
-    NotRightTriangleBar: "\u29d0\u0338",
-    NotRightTriangle: "\u22eb",
-    NotRightTriangleEqual: "\u22ed",
-    NotSquareSubset: "\u228f\u0338",
-    NotSquareSubsetEqual: "\u22e2",
-    NotSquareSuperset: "\u2290\u0338",
-    NotSquareSupersetEqual: "\u22e3",
-    NotSubset: "\u2282\u20d2",
-    NotSubsetEqual: "\u2288",
-    NotSucceeds: "\u2281",
-    NotSucceedsEqual: "\u2ab0\u0338",
-    NotSucceedsSlantEqual: "\u22e1",
-    NotSucceedsTilde: "\u227f\u0338",
-    NotSuperset: "\u2283\u20d2",
-    NotSupersetEqual: "\u2289",
-    NotTilde: "\u2241",
-    NotTildeEqual: "\u2244",
-    NotTildeFullEqual: "\u2247",
-    NotTildeTilde: "\u2249",
-    NotVerticalBar: "\u2224",
-    nparallel: "\u2226",
-    npar: "\u2226",
-    nparsl: "\u2afd\u20e5",
-    npart: "\u2202\u0338",
-    npolint: "\u2a14",
-    npr: "\u2280",
-    nprcue: "\u22e0",
-    nprec: "\u2280",
-    npreceq: "\u2aaf\u0338",
-    npre: "\u2aaf\u0338",
-    nrarrc: "\u2933\u0338",
-    nrarr: "\u219b",
-    nrArr: "\u21cf",
-    nrarrw: "\u219d\u0338",
-    nrightarrow: "\u219b",
-    nRightarrow: "\u21cf",
-    nrtri: "\u22eb",
-    nrtrie: "\u22ed",
-    nsc: "\u2281",
-    nsccue: "\u22e1",
-    nsce: "\u2ab0\u0338",
-    Nscr: "\ud835\udca9",
-    nscr: "\ud835\udcc3",
-    nshortmid: "\u2224",
-    nshortparallel: "\u2226",
-    nsim: "\u2241",
-    nsime: "\u2244",
-    nsimeq: "\u2244",
-    nsmid: "\u2224",
-    nspar: "\u2226",
-    nsqsube: "\u22e2",
-    nsqsupe: "\u22e3",
-    nsub: "\u2284",
-    nsubE: "\u2ac5\u0338",
-    nsube: "\u2288",
-    nsubset: "\u2282\u20d2",
-    nsubseteq: "\u2288",
-    nsubseteqq: "\u2ac5\u0338",
-    nsucc: "\u2281",
-    nsucceq: "\u2ab0\u0338",
-    nsup: "\u2285",
-    nsupE: "\u2ac6\u0338",
-    nsupe: "\u2289",
-    nsupset: "\u2283\u20d2",
-    nsupseteq: "\u2289",
-    nsupseteqq: "\u2ac6\u0338",
-    ntgl: "\u2279",
-    Ntilde: "\xd1",
-    ntilde: "\xf1",
-    ntlg: "\u2278",
-    ntriangleleft: "\u22ea",
-    ntrianglelefteq: "\u22ec",
-    ntriangleright: "\u22eb",
-    ntrianglerighteq: "\u22ed",
-    Nu: "\u039d",
-    nu: "\u03bd",
-    num: "#",
-    numero: "\u2116",
-    numsp: "\u2007",
-    nvap: "\u224d\u20d2",
-    nvdash: "\u22ac",
-    nvDash: "\u22ad",
-    nVdash: "\u22ae",
-    nVDash: "\u22af",
-    nvge: "\u2265\u20d2",
-    nvgt: ">\u20d2",
-    nvHarr: "\u2904",
-    nvinfin: "\u29de",
-    nvlArr: "\u2902",
-    nvle: "\u2264\u20d2",
-    nvlt: "<\u20d2",
-    nvltrie: "\u22b4\u20d2",
-    nvrArr: "\u2903",
-    nvrtrie: "\u22b5\u20d2",
-    nvsim: "\u223c\u20d2",
-    nwarhk: "\u2923",
-    nwarr: "\u2196",
-    nwArr: "\u21d6",
-    nwarrow: "\u2196",
-    nwnear: "\u2927",
-    Oacute: "\xd3",
-    oacute: "\xf3",
-    oast: "\u229b",
-    Ocirc: "\xd4",
-    ocirc: "\xf4",
-    ocir: "\u229a",
-    Ocy: "\u041e",
-    ocy: "\u043e",
-    odash: "\u229d",
-    Odblac: "\u0150",
-    odblac: "\u0151",
-    odiv: "\u2a38",
-    odot: "\u2299",
-    odsold: "\u29bc",
-    OElig: "\u0152",
-    oelig: "\u0153",
-    ofcir: "\u29bf",
-    Ofr: "\ud835\udd12",
-    ofr: "\ud835\udd2c",
-    ogon: "\u02db",
-    Ograve: "\xd2",
-    ograve: "\xf2",
-    ogt: "\u29c1",
-    ohbar: "\u29b5",
-    ohm: "\u03a9",
-    oint: "\u222e",
-    olarr: "\u21ba",
-    olcir: "\u29be",
-    olcross: "\u29bb",
-    oline: "\u203e",
-    olt: "\u29c0",
-    Omacr: "\u014c",
-    omacr: "\u014d",
-    Omega: "\u03a9",
-    omega: "\u03c9",
-    Omicron: "\u039f",
-    omicron: "\u03bf",
-    omid: "\u29b6",
-    ominus: "\u2296",
-    Oopf: "\ud835\udd46",
-    oopf: "\ud835\udd60",
-    opar: "\u29b7",
-    OpenCurlyDoubleQuote: "\u201c",
-    OpenCurlyQuote: "\u2018",
-    operp: "\u29b9",
-    oplus: "\u2295",
-    orarr: "\u21bb",
-    Or: "\u2a54",
-    or: "\u2228",
-    ord: "\u2a5d",
-    order: "\u2134",
-    orderof: "\u2134",
-    ordf: "\xaa",
-    ordm: "\xba",
-    origof: "\u22b6",
-    oror: "\u2a56",
-    orslope: "\u2a57",
-    orv: "\u2a5b",
-    oS: "\u24c8",
-    Oscr: "\ud835\udcaa",
-    oscr: "\u2134",
-    Oslash: "\xd8",
-    oslash: "\xf8",
-    osol: "\u2298",
-    Otilde: "\xd5",
-    otilde: "\xf5",
-    otimesas: "\u2a36",
-    Otimes: "\u2a37",
-    otimes: "\u2297",
-    Ouml: "\xd6",
-    ouml: "\xf6",
-    ovbar: "\u233d",
-    OverBar: "\u203e",
-    OverBrace: "\u23de",
-    OverBracket: "\u23b4",
-    OverParenthesis: "\u23dc",
-    para: "\xb6",
-    parallel: "\u2225",
-    par: "\u2225",
-    parsim: "\u2af3",
-    parsl: "\u2afd",
-    part: "\u2202",
-    PartialD: "\u2202",
-    Pcy: "\u041f",
-    pcy: "\u043f",
-    percnt: "%",
-    period: ".",
-    permil: "\u2030",
-    perp: "\u22a5",
-    pertenk: "\u2031",
-    Pfr: "\ud835\udd13",
-    pfr: "\ud835\udd2d",
-    Phi: "\u03a6",
-    phi: "\u03c6",
-    phiv: "\u03d5",
-    phmmat: "\u2133",
-    phone: "\u260e",
-    Pi: "\u03a0",
-    pi: "\u03c0",
-    pitchfork: "\u22d4",
-    piv: "\u03d6",
-    planck: "\u210f",
-    planckh: "\u210e",
-    plankv: "\u210f",
-    plusacir: "\u2a23",
-    plusb: "\u229e",
-    pluscir: "\u2a22",
-    plus: "+",
-    plusdo: "\u2214",
-    plusdu: "\u2a25",
-    pluse: "\u2a72",
-    PlusMinus: "\xb1",
-    plusmn: "\xb1",
-    plussim: "\u2a26",
-    plustwo: "\u2a27",
-    pm: "\xb1",
-    Poincareplane: "\u210c",
-    pointint: "\u2a15",
-    popf: "\ud835\udd61",
-    Popf: "\u2119",
-    pound: "\xa3",
-    prap: "\u2ab7",
-    Pr: "\u2abb",
-    pr: "\u227a",
-    prcue: "\u227c",
-    precapprox: "\u2ab7",
-    prec: "\u227a",
-    preccurlyeq: "\u227c",
-    Precedes: "\u227a",
-    PrecedesEqual: "\u2aaf",
-    PrecedesSlantEqual: "\u227c",
-    PrecedesTilde: "\u227e",
-    preceq: "\u2aaf",
-    precnapprox: "\u2ab9",
-    precneqq: "\u2ab5",
-    precnsim: "\u22e8",
-    pre: "\u2aaf",
-    prE: "\u2ab3",
-    precsim: "\u227e",
-    prime: "\u2032",
-    Prime: "\u2033",
-    primes: "\u2119",
-    prnap: "\u2ab9",
-    prnE: "\u2ab5",
-    prnsim: "\u22e8",
-    prod: "\u220f",
-    Product: "\u220f",
-    profalar: "\u232e",
-    profline: "\u2312",
-    profsurf: "\u2313",
-    prop: "\u221d",
-    Proportional: "\u221d",
-    Proportion: "\u2237",
-    propto: "\u221d",
-    prsim: "\u227e",
-    prurel: "\u22b0",
-    Pscr: "\ud835\udcab",
-    pscr: "\ud835\udcc5",
-    Psi: "\u03a8",
-    psi: "\u03c8",
-    puncsp: "\u2008",
-    Qfr: "\ud835\udd14",
-    qfr: "\ud835\udd2e",
-    qint: "\u2a0c",
-    qopf: "\ud835\udd62",
-    Qopf: "\u211a",
-    qprime: "\u2057",
-    Qscr: "\ud835\udcac",
-    qscr: "\ud835\udcc6",
-    quaternions: "\u210d",
-    quatint: "\u2a16",
-    quest: "?",
-    questeq: "\u225f",
-    quot: '"',
-    QUOT: '"',
-    rAarr: "\u21db",
-    race: "\u223d\u0331",
-    Racute: "\u0154",
-    racute: "\u0155",
-    radic: "\u221a",
-    raemptyv: "\u29b3",
-    rang: "\u27e9",
-    Rang: "\u27eb",
-    rangd: "\u2992",
-    range: "\u29a5",
-    rangle: "\u27e9",
-    raquo: "\xbb",
-    rarrap: "\u2975",
-    rarrb: "\u21e5",
-    rarrbfs: "\u2920",
-    rarrc: "\u2933",
-    rarr: "\u2192",
-    Rarr: "\u21a0",
-    rArr: "\u21d2",
-    rarrfs: "\u291e",
-    rarrhk: "\u21aa",
-    rarrlp: "\u21ac",
-    rarrpl: "\u2945",
-    rarrsim: "\u2974",
-    Rarrtl: "\u2916",
-    rarrtl: "\u21a3",
-    rarrw: "\u219d",
-    ratail: "\u291a",
-    rAtail: "\u291c",
-    ratio: "\u2236",
-    rationals: "\u211a",
-    rbarr: "\u290d",
-    rBarr: "\u290f",
-    RBarr: "\u2910",
-    rbbrk: "\u2773",
-    rbrace: "}",
-    rbrack: "]",
-    rbrke: "\u298c",
-    rbrksld: "\u298e",
-    rbrkslu: "\u2990",
-    Rcaron: "\u0158",
-    rcaron: "\u0159",
-    Rcedil: "\u0156",
-    rcedil: "\u0157",
-    rceil: "\u2309",
-    rcub: "}",
-    Rcy: "\u0420",
-    rcy: "\u0440",
-    rdca: "\u2937",
-    rdldhar: "\u2969",
-    rdquo: "\u201d",
-    rdquor: "\u201d",
-    rdsh: "\u21b3",
-    real: "\u211c",
-    realine: "\u211b",
-    realpart: "\u211c",
-    reals: "\u211d",
-    Re: "\u211c",
-    rect: "\u25ad",
-    reg: "\xae",
-    REG: "\xae",
-    ReverseElement: "\u220b",
-    ReverseEquilibrium: "\u21cb",
-    ReverseUpEquilibrium: "\u296f",
-    rfisht: "\u297d",
-    rfloor: "\u230b",
-    rfr: "\ud835\udd2f",
-    Rfr: "\u211c",
-    rHar: "\u2964",
-    rhard: "\u21c1",
-    rharu: "\u21c0",
-    rharul: "\u296c",
-    Rho: "\u03a1",
-    rho: "\u03c1",
-    rhov: "\u03f1",
-    RightAngleBracket: "\u27e9",
-    RightArrowBar: "\u21e5",
-    rightarrow: "\u2192",
-    RightArrow: "\u2192",
-    Rightarrow: "\u21d2",
-    RightArrowLeftArrow: "\u21c4",
-    rightarrowtail: "\u21a3",
-    RightCeiling: "\u2309",
-    RightDoubleBracket: "\u27e7",
-    RightDownTeeVector: "\u295d",
-    RightDownVectorBar: "\u2955",
-    RightDownVector: "\u21c2",
-    RightFloor: "\u230b",
-    rightharpoondown: "\u21c1",
-    rightharpoonup: "\u21c0",
-    rightleftarrows: "\u21c4",
-    rightleftharpoons: "\u21cc",
-    rightrightarrows: "\u21c9",
-    rightsquigarrow: "\u219d",
-    RightTeeArrow: "\u21a6",
-    RightTee: "\u22a2",
-    RightTeeVector: "\u295b",
-    rightthreetimes: "\u22cc",
-    RightTriangleBar: "\u29d0",
-    RightTriangle: "\u22b3",
-    RightTriangleEqual: "\u22b5",
-    RightUpDownVector: "\u294f",
-    RightUpTeeVector: "\u295c",
-    RightUpVectorBar: "\u2954",
-    RightUpVector: "\u21be",
-    RightVectorBar: "\u2953",
-    RightVector: "\u21c0",
-    ring: "\u02da",
-    risingdotseq: "\u2253",
-    rlarr: "\u21c4",
-    rlhar: "\u21cc",
-    rlm: "\u200f",
-    rmoustache: "\u23b1",
-    rmoust: "\u23b1",
-    rnmid: "\u2aee",
-    roang: "\u27ed",
-    roarr: "\u21fe",
-    robrk: "\u27e7",
-    ropar: "\u2986",
-    ropf: "\ud835\udd63",
-    Ropf: "\u211d",
-    roplus: "\u2a2e",
-    rotimes: "\u2a35",
-    RoundImplies: "\u2970",
-    rpar: ")",
-    rpargt: "\u2994",
-    rppolint: "\u2a12",
-    rrarr: "\u21c9",
-    Rrightarrow: "\u21db",
-    rsaquo: "\u203a",
-    rscr: "\ud835\udcc7",
-    Rscr: "\u211b",
-    rsh: "\u21b1",
-    Rsh: "\u21b1",
-    rsqb: "]",
-    rsquo: "\u2019",
-    rsquor: "\u2019",
-    rthree: "\u22cc",
-    rtimes: "\u22ca",
-    rtri: "\u25b9",
-    rtrie: "\u22b5",
-    rtrif: "\u25b8",
-    rtriltri: "\u29ce",
-    RuleDelayed: "\u29f4",
-    ruluhar: "\u2968",
-    rx: "\u211e",
-    Sacute: "\u015a",
-    sacute: "\u015b",
-    sbquo: "\u201a",
-    scap: "\u2ab8",
-    Scaron: "\u0160",
-    scaron: "\u0161",
-    Sc: "\u2abc",
-    sc: "\u227b",
-    sccue: "\u227d",
-    sce: "\u2ab0",
-    scE: "\u2ab4",
-    Scedil: "\u015e",
-    scedil: "\u015f",
-    Scirc: "\u015c",
-    scirc: "\u015d",
-    scnap: "\u2aba",
-    scnE: "\u2ab6",
-    scnsim: "\u22e9",
-    scpolint: "\u2a13",
-    scsim: "\u227f",
-    Scy: "\u0421",
-    scy: "\u0441",
-    sdotb: "\u22a1",
-    sdot: "\u22c5",
-    sdote: "\u2a66",
-    searhk: "\u2925",
-    searr: "\u2198",
-    seArr: "\u21d8",
-    searrow: "\u2198",
-    sect: "\xa7",
-    semi: ";",
-    seswar: "\u2929",
-    setminus: "\u2216",
-    setmn: "\u2216",
-    sext: "\u2736",
-    Sfr: "\ud835\udd16",
-    sfr: "\ud835\udd30",
-    sfrown: "\u2322",
-    sharp: "\u266f",
-    SHCHcy: "\u0429",
-    shchcy: "\u0449",
-    SHcy: "\u0428",
-    shcy: "\u0448",
-    ShortDownArrow: "\u2193",
-    ShortLeftArrow: "\u2190",
-    shortmid: "\u2223",
-    shortparallel: "\u2225",
-    ShortRightArrow: "\u2192",
-    ShortUpArrow: "\u2191",
-    shy: "\xad",
-    Sigma: "\u03a3",
-    sigma: "\u03c3",
-    sigmaf: "\u03c2",
-    sigmav: "\u03c2",
-    sim: "\u223c",
-    simdot: "\u2a6a",
-    sime: "\u2243",
-    simeq: "\u2243",
-    simg: "\u2a9e",
-    simgE: "\u2aa0",
-    siml: "\u2a9d",
-    simlE: "\u2a9f",
-    simne: "\u2246",
-    simplus: "\u2a24",
-    simrarr: "\u2972",
-    slarr: "\u2190",
-    SmallCircle: "\u2218",
-    smallsetminus: "\u2216",
-    smashp: "\u2a33",
-    smeparsl: "\u29e4",
-    smid: "\u2223",
-    smile: "\u2323",
-    smt: "\u2aaa",
-    smte: "\u2aac",
-    smtes: "\u2aac\ufe00",
-    SOFTcy: "\u042c",
-    softcy: "\u044c",
-    solbar: "\u233f",
-    solb: "\u29c4",
-    sol: "/",
-    Sopf: "\ud835\udd4a",
-    sopf: "\ud835\udd64",
-    spades: "\u2660",
-    spadesuit: "\u2660",
-    spar: "\u2225",
-    sqcap: "\u2293",
-    sqcaps: "\u2293\ufe00",
-    sqcup: "\u2294",
-    sqcups: "\u2294\ufe00",
-    Sqrt: "\u221a",
-    sqsub: "\u228f",
-    sqsube: "\u2291",
-    sqsubset: "\u228f",
-    sqsubseteq: "\u2291",
-    sqsup: "\u2290",
-    sqsupe: "\u2292",
-    sqsupset: "\u2290",
-    sqsupseteq: "\u2292",
-    square: "\u25a1",
-    Square: "\u25a1",
-    SquareIntersection: "\u2293",
-    SquareSubset: "\u228f",
-    SquareSubsetEqual: "\u2291",
-    SquareSuperset: "\u2290",
-    SquareSupersetEqual: "\u2292",
-    SquareUnion: "\u2294",
-    squarf: "\u25aa",
-    squ: "\u25a1",
-    squf: "\u25aa",
-    srarr: "\u2192",
-    Sscr: "\ud835\udcae",
-    sscr: "\ud835\udcc8",
-    ssetmn: "\u2216",
-    ssmile: "\u2323",
-    sstarf: "\u22c6",
-    Star: "\u22c6",
-    star: "\u2606",
-    starf: "\u2605",
-    straightepsilon: "\u03f5",
-    straightphi: "\u03d5",
-    strns: "\xaf",
-    sub: "\u2282",
-    Sub: "\u22d0",
-    subdot: "\u2abd",
-    subE: "\u2ac5",
-    sube: "\u2286",
-    subedot: "\u2ac3",
-    submult: "\u2ac1",
-    subnE: "\u2acb",
-    subne: "\u228a",
-    subplus: "\u2abf",
-    subrarr: "\u2979",
-    subset: "\u2282",
-    Subset: "\u22d0",
-    subseteq: "\u2286",
-    subseteqq: "\u2ac5",
-    SubsetEqual: "\u2286",
-    subsetneq: "\u228a",
-    subsetneqq: "\u2acb",
-    subsim: "\u2ac7",
-    subsub: "\u2ad5",
-    subsup: "\u2ad3",
-    succapprox: "\u2ab8",
-    succ: "\u227b",
-    succcurlyeq: "\u227d",
-    Succeeds: "\u227b",
-    SucceedsEqual: "\u2ab0",
-    SucceedsSlantEqual: "\u227d",
-    SucceedsTilde: "\u227f",
-    succeq: "\u2ab0",
-    succnapprox: "\u2aba",
-    succneqq: "\u2ab6",
-    succnsim: "\u22e9",
-    succsim: "\u227f",
-    SuchThat: "\u220b",
-    sum: "\u2211",
-    Sum: "\u2211",
-    sung: "\u266a",
-    sup1: "\xb9",
-    sup2: "\xb2",
-    sup3: "\xb3",
-    sup: "\u2283",
-    Sup: "\u22d1",
-    supdot: "\u2abe",
-    supdsub: "\u2ad8",
-    supE: "\u2ac6",
-    supe: "\u2287",
-    supedot: "\u2ac4",
-    Superset: "\u2283",
-    SupersetEqual: "\u2287",
-    suphsol: "\u27c9",
-    suphsub: "\u2ad7",
-    suplarr: "\u297b",
-    supmult: "\u2ac2",
-    supnE: "\u2acc",
-    supne: "\u228b",
-    supplus: "\u2ac0",
-    supset: "\u2283",
-    Supset: "\u22d1",
-    supseteq: "\u2287",
-    supseteqq: "\u2ac6",
-    supsetneq: "\u228b",
-    supsetneqq: "\u2acc",
-    supsim: "\u2ac8",
-    supsub: "\u2ad4",
-    supsup: "\u2ad6",
-    swarhk: "\u2926",
-    swarr: "\u2199",
-    swArr: "\u21d9",
-    swarrow: "\u2199",
-    swnwar: "\u292a",
-    szlig: "\xdf",
-    Tab: "\t",
-    target: "\u2316",
-    Tau: "\u03a4",
-    tau: "\u03c4",
-    tbrk: "\u23b4",
-    Tcaron: "\u0164",
-    tcaron: "\u0165",
-    Tcedil: "\u0162",
-    tcedil: "\u0163",
-    Tcy: "\u0422",
-    tcy: "\u0442",
-    tdot: "\u20db",
-    telrec: "\u2315",
-    Tfr: "\ud835\udd17",
-    tfr: "\ud835\udd31",
-    there4: "\u2234",
-    therefore: "\u2234",
-    Therefore: "\u2234",
-    Theta: "\u0398",
-    theta: "\u03b8",
-    thetasym: "\u03d1",
-    thetav: "\u03d1",
-    thickapprox: "\u2248",
-    thicksim: "\u223c",
-    ThickSpace: "\u205f\u200a",
-    ThinSpace: "\u2009",
-    thinsp: "\u2009",
-    thkap: "\u2248",
-    thksim: "\u223c",
-    THORN: "\xde",
-    thorn: "\xfe",
-    tilde: "\u02dc",
-    Tilde: "\u223c",
-    TildeEqual: "\u2243",
-    TildeFullEqual: "\u2245",
-    TildeTilde: "\u2248",
-    timesbar: "\u2a31",
-    timesb: "\u22a0",
-    times: "\xd7",
-    timesd: "\u2a30",
-    tint: "\u222d",
-    toea: "\u2928",
-    topbot: "\u2336",
-    topcir: "\u2af1",
-    top: "\u22a4",
-    Topf: "\ud835\udd4b",
-    topf: "\ud835\udd65",
-    topfork: "\u2ada",
-    tosa: "\u2929",
-    tprime: "\u2034",
-    trade: "\u2122",
-    TRADE: "\u2122",
-    triangle: "\u25b5",
-    triangledown: "\u25bf",
-    triangleleft: "\u25c3",
-    trianglelefteq: "\u22b4",
-    triangleq: "\u225c",
-    triangleright: "\u25b9",
-    trianglerighteq: "\u22b5",
-    tridot: "\u25ec",
-    trie: "\u225c",
-    triminus: "\u2a3a",
-    TripleDot: "\u20db",
-    triplus: "\u2a39",
-    trisb: "\u29cd",
-    tritime: "\u2a3b",
-    trpezium: "\u23e2",
-    Tscr: "\ud835\udcaf",
-    tscr: "\ud835\udcc9",
-    TScy: "\u0426",
-    tscy: "\u0446",
-    TSHcy: "\u040b",
-    tshcy: "\u045b",
-    Tstrok: "\u0166",
-    tstrok: "\u0167",
-    twixt: "\u226c",
-    twoheadleftarrow: "\u219e",
-    twoheadrightarrow: "\u21a0",
-    Uacute: "\xda",
-    uacute: "\xfa",
-    uarr: "\u2191",
-    Uarr: "\u219f",
-    uArr: "\u21d1",
-    Uarrocir: "\u2949",
-    Ubrcy: "\u040e",
-    ubrcy: "\u045e",
-    Ubreve: "\u016c",
-    ubreve: "\u016d",
-    Ucirc: "\xdb",
-    ucirc: "\xfb",
-    Ucy: "\u0423",
-    ucy: "\u0443",
-    udarr: "\u21c5",
-    Udblac: "\u0170",
-    udblac: "\u0171",
-    udhar: "\u296e",
-    ufisht: "\u297e",
-    Ufr: "\ud835\udd18",
-    ufr: "\ud835\udd32",
-    Ugrave: "\xd9",
-    ugrave: "\xf9",
-    uHar: "\u2963",
-    uharl: "\u21bf",
-    uharr: "\u21be",
-    uhblk: "\u2580",
-    ulcorn: "\u231c",
-    ulcorner: "\u231c",
-    ulcrop: "\u230f",
-    ultri: "\u25f8",
-    Umacr: "\u016a",
-    umacr: "\u016b",
-    uml: "\xa8",
-    UnderBar: "_",
-    UnderBrace: "\u23df",
-    UnderBracket: "\u23b5",
-    UnderParenthesis: "\u23dd",
-    Union: "\u22c3",
-    UnionPlus: "\u228e",
-    Uogon: "\u0172",
-    uogon: "\u0173",
-    Uopf: "\ud835\udd4c",
-    uopf: "\ud835\udd66",
-    UpArrowBar: "\u2912",
-    uparrow: "\u2191",
-    UpArrow: "\u2191",
-    Uparrow: "\u21d1",
-    UpArrowDownArrow: "\u21c5",
-    updownarrow: "\u2195",
-    UpDownArrow: "\u2195",
-    Updownarrow: "\u21d5",
-    UpEquilibrium: "\u296e",
-    upharpoonleft: "\u21bf",
-    upharpoonright: "\u21be",
-    uplus: "\u228e",
-    UpperLeftArrow: "\u2196",
-    UpperRightArrow: "\u2197",
-    upsi: "\u03c5",
-    Upsi: "\u03d2",
-    upsih: "\u03d2",
-    Upsilon: "\u03a5",
-    upsilon: "\u03c5",
-    UpTeeArrow: "\u21a5",
-    UpTee: "\u22a5",
-    upuparrows: "\u21c8",
-    urcorn: "\u231d",
-    urcorner: "\u231d",
-    urcrop: "\u230e",
-    Uring: "\u016e",
-    uring: "\u016f",
-    urtri: "\u25f9",
-    Uscr: "\ud835\udcb0",
-    uscr: "\ud835\udcca",
-    utdot: "\u22f0",
-    Utilde: "\u0168",
-    utilde: "\u0169",
-    utri: "\u25b5",
-    utrif: "\u25b4",
-    uuarr: "\u21c8",
-    Uuml: "\xdc",
-    uuml: "\xfc",
-    uwangle: "\u29a7",
-    vangrt: "\u299c",
-    varepsilon: "\u03f5",
-    varkappa: "\u03f0",
-    varnothing: "\u2205",
-    varphi: "\u03d5",
-    varpi: "\u03d6",
-    varpropto: "\u221d",
-    varr: "\u2195",
-    vArr: "\u21d5",
-    varrho: "\u03f1",
-    varsigma: "\u03c2",
-    varsubsetneq: "\u228a\ufe00",
-    varsubsetneqq: "\u2acb\ufe00",
-    varsupsetneq: "\u228b\ufe00",
-    varsupsetneqq: "\u2acc\ufe00",
-    vartheta: "\u03d1",
-    vartriangleleft: "\u22b2",
-    vartriangleright: "\u22b3",
-    vBar: "\u2ae8",
-    Vbar: "\u2aeb",
-    vBarv: "\u2ae9",
-    Vcy: "\u0412",
-    vcy: "\u0432",
-    vdash: "\u22a2",
-    vDash: "\u22a8",
-    Vdash: "\u22a9",
-    VDash: "\u22ab",
-    Vdashl: "\u2ae6",
-    veebar: "\u22bb",
-    vee: "\u2228",
-    Vee: "\u22c1",
-    veeeq: "\u225a",
-    vellip: "\u22ee",
-    verbar: "|",
-    Verbar: "\u2016",
-    vert: "|",
-    Vert: "\u2016",
-    VerticalBar: "\u2223",
-    VerticalLine: "|",
-    VerticalSeparator: "\u2758",
-    VerticalTilde: "\u2240",
-    VeryThinSpace: "\u200a",
-    Vfr: "\ud835\udd19",
-    vfr: "\ud835\udd33",
-    vltri: "\u22b2",
-    vnsub: "\u2282\u20d2",
-    vnsup: "\u2283\u20d2",
-    Vopf: "\ud835\udd4d",
-    vopf: "\ud835\udd67",
-    vprop: "\u221d",
-    vrtri: "\u22b3",
-    Vscr: "\ud835\udcb1",
-    vscr: "\ud835\udccb",
-    vsubnE: "\u2acb\ufe00",
-    vsubne: "\u228a\ufe00",
-    vsupnE: "\u2acc\ufe00",
-    vsupne: "\u228b\ufe00",
-    Vvdash: "\u22aa",
-    vzigzag: "\u299a",
-    Wcirc: "\u0174",
-    wcirc: "\u0175",
-    wedbar: "\u2a5f",
-    wedge: "\u2227",
-    Wedge: "\u22c0",
-    wedgeq: "\u2259",
-    weierp: "\u2118",
-    Wfr: "\ud835\udd1a",
-    wfr: "\ud835\udd34",
-    Wopf: "\ud835\udd4e",
-    wopf: "\ud835\udd68",
-    wp: "\u2118",
-    wr: "\u2240",
-    wreath: "\u2240",
-    Wscr: "\ud835\udcb2",
-    wscr: "\ud835\udccc",
-    xcap: "\u22c2",
-    xcirc: "\u25ef",
-    xcup: "\u22c3",
-    xdtri: "\u25bd",
-    Xfr: "\ud835\udd1b",
-    xfr: "\ud835\udd35",
-    xharr: "\u27f7",
-    xhArr: "\u27fa",
-    Xi: "\u039e",
-    xi: "\u03be",
-    xlarr: "\u27f5",
-    xlArr: "\u27f8",
-    xmap: "\u27fc",
-    xnis: "\u22fb",
-    xodot: "\u2a00",
-    Xopf: "\ud835\udd4f",
-    xopf: "\ud835\udd69",
-    xoplus: "\u2a01",
-    xotime: "\u2a02",
-    xrarr: "\u27f6",
-    xrArr: "\u27f9",
-    Xscr: "\ud835\udcb3",
-    xscr: "\ud835\udccd",
-    xsqcup: "\u2a06",
-    xuplus: "\u2a04",
-    xutri: "\u25b3",
-    xvee: "\u22c1",
-    xwedge: "\u22c0",
-    Yacute: "\xdd",
-    yacute: "\xfd",
-    YAcy: "\u042f",
-    yacy: "\u044f",
-    Ycirc: "\u0176",
-    ycirc: "\u0177",
-    Ycy: "\u042b",
-    ycy: "\u044b",
-    yen: "\xa5",
-    Yfr: "\ud835\udd1c",
-    yfr: "\ud835\udd36",
-    YIcy: "\u0407",
-    yicy: "\u0457",
-    Yopf: "\ud835\udd50",
-    yopf: "\ud835\udd6a",
-    Yscr: "\ud835\udcb4",
-    yscr: "\ud835\udcce",
-    YUcy: "\u042e",
-    yucy: "\u044e",
-    yuml: "\xff",
-    Yuml: "\u0178",
-    Zacute: "\u0179",
-    zacute: "\u017a",
-    Zcaron: "\u017d",
-    zcaron: "\u017e",
-    Zcy: "\u0417",
-    zcy: "\u0437",
-    Zdot: "\u017b",
-    zdot: "\u017c",
-    zeetrf: "\u2128",
-    ZeroWidthSpace: "\u200b",
-    Zeta: "\u0396",
-    zeta: "\u03b6",
-    zfr: "\ud835\udd37",
-    Zfr: "\u2128",
-    ZHcy: "\u0416",
-    zhcy: "\u0436",
-    zigrarr: "\u21dd",
-    zopf: "\ud835\udd6b",
-    Zopf: "\u2124",
-    Zscr: "\ud835\udcb5",
-    zscr: "\ud835\udccf",
-    zwj: "\u200d",
-    zwnj: "\u200c"
-  };
-  /*eslint quotes:0*/  var entities = require$$0;
-  var regex$4 = /[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]/;
-  var encodeCache = {};
-  // Create a lookup array where anything but characters in `chars` string
-  // and alphanumeric chars is percent-encoded.
-  
-    function getEncodeCache(exclude) {
-    var i, ch, cache = encodeCache[exclude];
-    if (cache) {
-      return cache;
-    }
-    cache = encodeCache[exclude] = [];
-    for (i = 0; i < 128; i++) {
-      ch = String.fromCharCode(i);
-      if (/^[0-9a-z]$/i.test(ch)) {
-        // always allow unencoded alphanumeric characters
-        cache.push(ch);
-      } else {
-        cache.push("%" + ("0" + i.toString(16).toUpperCase()).slice(-2));
-      }
-    }
-    for (i = 0; i < exclude.length; i++) {
-      cache[exclude.charCodeAt(i)] = exclude[i];
-    }
-    return cache;
-  }
-  // Encode unsafe characters with percent-encoding, skipping already
-  // encoded sequences.
-  
-  //  - string       - string to encode
-  //  - exclude      - list of characters to ignore (in addition to a-zA-Z0-9)
-  //  - keepEscaped  - don't encode '%' in a correct escape sequence (default: true)
-  
-    function encode$2(string, exclude, keepEscaped) {
-    var i, l, code, nextCode, cache, result = "";
-    if (typeof exclude !== "string") {
-      // encode(string, keepEscaped)
-      keepEscaped = exclude;
-      exclude = encode$2.defaultChars;
-    }
-    if (typeof keepEscaped === "undefined") {
-      keepEscaped = true;
-    }
-    cache = getEncodeCache(exclude);
-    for (i = 0, l = string.length; i < l; i++) {
-      code = string.charCodeAt(i);
-      if (keepEscaped && code === 37 /* % */ && i + 2 < l) {
-        if (/^[0-9a-f]{2}$/i.test(string.slice(i + 1, i + 3))) {
-          result += string.slice(i, i + 3);
-          i += 2;
-          continue;
-        }
-      }
-      if (code < 128) {
-        result += cache[code];
-        continue;
-      }
-      if (code >= 55296 && code <= 57343) {
-        if (code >= 55296 && code <= 56319 && i + 1 < l) {
-          nextCode = string.charCodeAt(i + 1);
-          if (nextCode >= 56320 && nextCode <= 57343) {
-            result += encodeURIComponent(string[i] + string[i + 1]);
-            i++;
-            continue;
-          }
-        }
-        result += "%EF%BF%BD";
-        continue;
-      }
-      result += encodeURIComponent(string[i]);
-    }
-    return result;
-  }
-  encode$2.defaultChars = ";/?:@&=+$,-_.!~*'()#";
-  encode$2.componentChars = "-_.!~*'()";
-  var encode_1 = encode$2;
-  /* eslint-disable no-bitwise */  var decodeCache = {};
-  function getDecodeCache(exclude) {
-    var i, ch, cache = decodeCache[exclude];
-    if (cache) {
-      return cache;
-    }
-    cache = decodeCache[exclude] = [];
-    for (i = 0; i < 128; i++) {
-      ch = String.fromCharCode(i);
-      cache.push(ch);
-    }
-    for (i = 0; i < exclude.length; i++) {
-      ch = exclude.charCodeAt(i);
-      cache[ch] = "%" + ("0" + ch.toString(16).toUpperCase()).slice(-2);
-    }
-    return cache;
-  }
-  // Decode percent-encoded string.
-  
-    function decode$2(string, exclude) {
-    var cache;
-    if (typeof exclude !== "string") {
-      exclude = decode$2.defaultChars;
-    }
-    cache = getDecodeCache(exclude);
-    return string.replace(/(%[a-f0-9]{2})+/gi, (function(seq) {
-      var i, l, b1, b2, b3, b4, chr, result = "";
-      for (i = 0, l = seq.length; i < l; i += 3) {
-        b1 = parseInt(seq.slice(i + 1, i + 3), 16);
-        if (b1 < 128) {
-          result += cache[b1];
-          continue;
-        }
-        if ((b1 & 224) === 192 && i + 3 < l) {
-          // 110xxxxx 10xxxxxx
-          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
-          if ((b2 & 192) === 128) {
-            chr = b1 << 6 & 1984 | b2 & 63;
-            if (chr < 128) {
-              result += "\ufffd\ufffd";
-            } else {
-              result += String.fromCharCode(chr);
-            }
-            i += 3;
-            continue;
-          }
-        }
-        if ((b1 & 240) === 224 && i + 6 < l) {
-          // 1110xxxx 10xxxxxx 10xxxxxx
-          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
-          b3 = parseInt(seq.slice(i + 7, i + 9), 16);
-          if ((b2 & 192) === 128 && (b3 & 192) === 128) {
-            chr = b1 << 12 & 61440 | b2 << 6 & 4032 | b3 & 63;
-            if (chr < 2048 || chr >= 55296 && chr <= 57343) {
-              result += "\ufffd\ufffd\ufffd";
-            } else {
-              result += String.fromCharCode(chr);
-            }
-            i += 6;
-            continue;
-          }
-        }
-        if ((b1 & 248) === 240 && i + 9 < l) {
-          // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
-          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
-          b3 = parseInt(seq.slice(i + 7, i + 9), 16);
-          b4 = parseInt(seq.slice(i + 10, i + 12), 16);
-          if ((b2 & 192) === 128 && (b3 & 192) === 128 && (b4 & 192) === 128) {
-            chr = b1 << 18 & 1835008 | b2 << 12 & 258048 | b3 << 6 & 4032 | b4 & 63;
-            if (chr < 65536 || chr > 1114111) {
-              result += "\ufffd\ufffd\ufffd\ufffd";
-            } else {
-              chr -= 65536;
-              result += String.fromCharCode(55296 + (chr >> 10), 56320 + (chr & 1023));
-            }
-            i += 9;
-            continue;
-          }
-        }
-        result += "\ufffd";
-      }
-      return result;
-    }));
-  }
-  decode$2.defaultChars = ";/?:@&=+$,#";
-  decode$2.componentChars = "";
-  var decode_1 = decode$2;
-  var format$1 = function format(url) {
-    var result = "";
-    result += url.protocol || "";
-    result += url.slashes ? "//" : "";
-    result += url.auth ? url.auth + "@" : "";
-    if (url.hostname && url.hostname.indexOf(":") !== -1) {
-      // ipv6 address
-      result += "[" + url.hostname + "]";
-    } else {
-      result += url.hostname || "";
-    }
-    result += url.port ? ":" + url.port : "";
-    result += url.pathname || "";
-    result += url.search || "";
-    result += url.hash || "";
-    return result;
-  };
-  // Copyright Joyent, Inc. and other Node contributors.
-  
-  // Changes from joyent/node:
-  
-  // 1. No leading slash in paths,
-  //    e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
-  
-  // 2. Backslashes are not replaced with slashes,
-  //    so `http:\\example.org\` is treated like a relative path
-  
-  // 3. Trailing colon is treated like a part of the path,
-  //    i.e. in `http://example.org:foo` pathname is `:foo`
-  
-  // 4. Nothing is URL-encoded in the resulting object,
-  //    (in joyent/node some chars in auth and paths are encoded)
-  
-  // 5. `url.parse()` does not have `parseQueryString` argument
-  
-  // 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
-  //    which can be constructed using other parts of the url.
-  
-    function Url() {
-    this.protocol = null;
-    this.slashes = null;
-    this.auth = null;
-    this.port = null;
-    this.hostname = null;
-    this.hash = null;
-    this.search = null;
-    this.pathname = null;
-  }
-  // Reference: RFC 3986, RFC 1808, RFC 2396
-  // define these here so at least they only have to be
-  // compiled once on the first module load.
-    var protocolPattern = /^([a-z0-9.+-]+:)/i, portPattern = /:[0-9]*$/, 
-  // Special case for a simple path URL
-  simplePathPattern = /^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/, 
-  // RFC 2396: characters reserved for delimiting URLs.
-  // We actually just auto-escape these.
-  delims = [ "<", ">", '"', "`", " ", "\r", "\n", "\t" ], 
-  // RFC 2396: characters not allowed for various reasons.
-  unwise = [ "{", "}", "|", "\\", "^", "`" ].concat(delims), 
-  // Allowed by RFCs, but cause of XSS attacks.  Always escape these.
-  autoEscape = [ "'" ].concat(unwise), 
-  // Characters that are never ever allowed in a hostname.
-  // Note that any invalid chars are also handled, but these
-  // are the ones that are *expected* to be seen, so we fast-path
-  // them.
-  nonHostChars = [ "%", "/", "?", ";", "#" ].concat(autoEscape), hostEndingChars = [ "/", "?", "#" ], hostnameMaxLen = 255, hostnamePartPattern = /^[+a-z0-9A-Z_-]{0,63}$/, hostnamePartStart = /^([+a-z0-9A-Z_-]{0,63})(.*)$/, 
-  // protocols that can allow "unsafe" and "unwise" chars.
-  /* eslint-disable no-script-url */
-  // protocols that never have a hostname.
-  hostlessProtocol = {
-    javascript: true,
-    "javascript:": true
-  }, 
-  // protocols that always contain a // bit.
-  slashedProtocol = {
-    http: true,
-    https: true,
-    ftp: true,
-    gopher: true,
-    file: true,
-    "http:": true,
-    "https:": true,
-    "ftp:": true,
-    "gopher:": true,
-    "file:": true
-  };
-  /* eslint-enable no-script-url */  function urlParse(url, slashesDenoteHost) {
-    if (url && url instanceof Url) {
-      return url;
-    }
-    var u = new Url;
-    u.parse(url, slashesDenoteHost);
-    return u;
-  }
-  Url.prototype.parse = function(url, slashesDenoteHost) {
-    var i, l, lowerProto, hec, slashes, rest = url;
-    // trim before proceeding.
-    // This is to support parse stuff like "  http://foo.com  \n"
-        rest = rest.trim();
-    if (!slashesDenoteHost && url.split("#").length === 1) {
-      // Try fast path regexp
-      var simplePath = simplePathPattern.exec(rest);
-      if (simplePath) {
-        this.pathname = simplePath[1];
-        if (simplePath[2]) {
-          this.search = simplePath[2];
-        }
-        return this;
-      }
-    }
-    var proto = protocolPattern.exec(rest);
-    if (proto) {
-      proto = proto[0];
-      lowerProto = proto.toLowerCase();
-      this.protocol = proto;
-      rest = rest.substr(proto.length);
-    }
-    // figure out if it's got a host
-    // user@server is *always* interpreted as a hostname, and url
-    // resolution will treat //foo/bar as host=foo,path=bar because that's
-    // how the browser resolves relative URLs.
-        if (slashesDenoteHost || proto || rest.match(/^\/\/[^@\/]+@[^@\/]+/)) {
-      slashes = rest.substr(0, 2) === "//";
-      if (slashes && !(proto && hostlessProtocol[proto])) {
-        rest = rest.substr(2);
-        this.slashes = true;
-      }
-    }
-    if (!hostlessProtocol[proto] && (slashes || proto && !slashedProtocol[proto])) {
-      // there's a hostname.
-      // the first instance of /, ?, ;, or # ends the host.
-      // If there is an @ in the hostname, then non-host chars *are* allowed
-      // to the left of the last @ sign, unless some host-ending character
-      // comes *before* the @-sign.
-      // URLs are obnoxious.
-      // ex:
-      // http://a@b@c/ => user:a@b host:c
-      // http://a@b?@c => user:a host:c path:/?@c
-      // v0.12 TODO(isaacs): This is not quite how Chrome does things.
-      // Review our test case against browsers more comprehensively.
-      // find the first instance of any hostEndingChars
-      var hostEnd = -1;
-      for (i = 0; i < hostEndingChars.length; i++) {
-        hec = rest.indexOf(hostEndingChars[i]);
-        if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) {
-          hostEnd = hec;
-        }
-      }
-      // at this point, either we have an explicit point where the
-      // auth portion cannot go past, or the last @ char is the decider.
-            var auth, atSign;
-      if (hostEnd === -1) {
-        // atSign can be anywhere.
-        atSign = rest.lastIndexOf("@");
-      } else {
-        // atSign must be in auth portion.
-        // http://a@b/c@d => host:b auth:a path:/c@d
-        atSign = rest.lastIndexOf("@", hostEnd);
-      }
-      // Now we have a portion which is definitely the auth.
-      // Pull that off.
-            if (atSign !== -1) {
-        auth = rest.slice(0, atSign);
-        rest = rest.slice(atSign + 1);
-        this.auth = auth;
-      }
-      // the host is the remaining to the left of the first non-host char
-            hostEnd = -1;
-      for (i = 0; i < nonHostChars.length; i++) {
-        hec = rest.indexOf(nonHostChars[i]);
-        if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) {
-          hostEnd = hec;
-        }
-      }
-      // if we still have not hit it, then the entire thing is a host.
-            if (hostEnd === -1) {
-        hostEnd = rest.length;
-      }
-      if (rest[hostEnd - 1] === ":") {
-        hostEnd--;
-      }
-      var host = rest.slice(0, hostEnd);
-      rest = rest.slice(hostEnd);
-      // pull out port.
-            this.parseHost(host);
-      // we've indicated that there is a hostname,
-      // so even if it's empty, it has to be present.
-            this.hostname = this.hostname || "";
-      // if hostname begins with [ and ends with ]
-      // assume that it's an IPv6 address.
-            var ipv6Hostname = this.hostname[0] === "[" && this.hostname[this.hostname.length - 1] === "]";
-      // validate a little.
-            if (!ipv6Hostname) {
-        var hostparts = this.hostname.split(/\./);
-        for (i = 0, l = hostparts.length; i < l; i++) {
-          var part = hostparts[i];
-          if (!part) {
-            continue;
-          }
-          if (!part.match(hostnamePartPattern)) {
-            var newpart = "";
-            for (var j = 0, k = part.length; j < k; j++) {
-              if (part.charCodeAt(j) > 127) {
-                // we replace non-ASCII char with a temporary placeholder
-                // we need this to make sure size of hostname is not
-                // broken by replacing non-ASCII by nothing
-                newpart += "x";
-              } else {
-                newpart += part[j];
-              }
-            }
-            // we test again with ASCII char only
-                        if (!newpart.match(hostnamePartPattern)) {
-              var validParts = hostparts.slice(0, i);
-              var notHost = hostparts.slice(i + 1);
-              var bit = part.match(hostnamePartStart);
-              if (bit) {
-                validParts.push(bit[1]);
-                notHost.unshift(bit[2]);
-              }
-              if (notHost.length) {
-                rest = notHost.join(".") + rest;
-              }
-              this.hostname = validParts.join(".");
-              break;
-            }
-          }
-        }
-      }
-      if (this.hostname.length > hostnameMaxLen) {
-        this.hostname = "";
-      }
-      // strip [ and ] from the hostname
-      // the host field still retains them, though
-            if (ipv6Hostname) {
-        this.hostname = this.hostname.substr(1, this.hostname.length - 2);
-      }
-    }
-    // chop off from the tail first.
-        var hash = rest.indexOf("#");
-    if (hash !== -1) {
-      // got a fragment string.
-      this.hash = rest.substr(hash);
-      rest = rest.slice(0, hash);
-    }
-    var qm = rest.indexOf("?");
-    if (qm !== -1) {
-      this.search = rest.substr(qm);
-      rest = rest.slice(0, qm);
-    }
-    if (rest) {
-      this.pathname = rest;
-    }
-    if (slashedProtocol[lowerProto] && this.hostname && !this.pathname) {
-      this.pathname = "";
-    }
-    return this;
-  };
-  Url.prototype.parseHost = function(host) {
-    var port = portPattern.exec(host);
-    if (port) {
-      port = port[0];
-      if (port !== ":") {
-        this.port = port.substr(1);
-      }
-      host = host.substr(0, host.length - port.length);
-    }
-    if (host) {
-      this.hostname = host;
-    }
-  };
-  var parse$1 = urlParse;
-  var encode$1 = encode_1;
-  var decode$1 = decode_1;
-  var format = format$1;
-  var parse = parse$1;
-  var mdurl = {
-    encode: encode$1,
-    decode: decode$1,
-    format: format,
-    parse: parse
-  };
-  var regex$3 = /[\0-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/;
-  var regex$2 = /[\0-\x1F\x7F-\x9F]/;
-  var regex$1 = /[\xAD\u0600-\u0605\u061C\u06DD\u070F\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F]/;
-  var regex = /[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/;
-  var Any = regex$3;
-  var Cc = regex$2;
-  var Cf = regex$1;
-  var P = regex$4;
-  var Z = regex;
-  var uc_micro = {
-    Any: Any,
-    Cc: Cc,
-    Cf: Cf,
-    P: P,
-    Z: Z
-  };
-  var utils = createCommonjsModule((function(module, exports) {
-    function _class(obj) {
-      return Object.prototype.toString.call(obj);
-    }
-    function isString(obj) {
-      return _class(obj) === "[object String]";
-    }
-    var _hasOwnProperty = Object.prototype.hasOwnProperty;
-    function has(object, key) {
-      return _hasOwnProperty.call(object, key);
-    }
-    // Merge objects
-    
-        function assign(obj /*from1, from2, from3, ...*/) {
-      var sources = Array.prototype.slice.call(arguments, 1);
-      sources.forEach((function(source) {
-        if (!source) {
-          return;
-        }
-        if (typeof source !== "object") {
-          throw new TypeError(source + "must be object");
-        }
-        Object.keys(source).forEach((function(key) {
-          obj[key] = source[key];
-        }));
-      }));
-      return obj;
-    }
-    // Remove element from array and put another array at those position.
-    // Useful for some operations with tokens
-        function arrayReplaceAt(src, pos, newElements) {
-      return [].concat(src.slice(0, pos), newElements, src.slice(pos + 1));
-    }
-    ////////////////////////////////////////////////////////////////////////////////
-        function isValidEntityCode(c) {
-      /*eslint no-bitwise:0*/
-      // broken sequence
-      if (c >= 55296 && c <= 57343) {
-        return false;
-      }
-      // never used
-            if (c >= 64976 && c <= 65007) {
-        return false;
-      }
-      if ((c & 65535) === 65535 || (c & 65535) === 65534) {
-        return false;
-      }
-      // control codes
-            if (c >= 0 && c <= 8) {
-        return false;
-      }
-      if (c === 11) {
-        return false;
-      }
-      if (c >= 14 && c <= 31) {
-        return false;
-      }
-      if (c >= 127 && c <= 159) {
-        return false;
-      }
-      // out of range
-            if (c > 1114111) {
-        return false;
-      }
-      return true;
-    }
-    function fromCodePoint(c) {
-      /*eslint no-bitwise:0*/
-      if (c > 65535) {
-        c -= 65536;
-        var surrogate1 = 55296 + (c >> 10), surrogate2 = 56320 + (c & 1023);
-        return String.fromCharCode(surrogate1, surrogate2);
-      }
-      return String.fromCharCode(c);
-    }
-    var UNESCAPE_MD_RE = /\\([!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])/g;
-    var ENTITY_RE = /&([a-z#][a-z0-9]{1,31});/gi;
-    var UNESCAPE_ALL_RE = new RegExp(UNESCAPE_MD_RE.source + "|" + ENTITY_RE.source, "gi");
-    var DIGITAL_ENTITY_TEST_RE = /^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i;
-    function replaceEntityPattern(match, name) {
-      var code;
-      if (has(entities, name)) {
-        return entities[name];
-      }
-      if (name.charCodeAt(0) === 35 /* # */ && DIGITAL_ENTITY_TEST_RE.test(name)) {
-        code = name[1].toLowerCase() === "x" ? parseInt(name.slice(2), 16) : parseInt(name.slice(1), 10);
-        if (isValidEntityCode(code)) {
-          return fromCodePoint(code);
-        }
-      }
-      return match;
-    }
-    /*function replaceEntities(str) {
-	  if (str.indexOf('&') < 0) { return str; }
-
-	  return str.replace(ENTITY_RE, replaceEntityPattern);
-	}*/    function unescapeMd(str) {
-      if (str.indexOf("\\") < 0) {
-        return str;
-      }
-      return str.replace(UNESCAPE_MD_RE, "$1");
-    }
-    function unescapeAll(str) {
-      if (str.indexOf("\\") < 0 && str.indexOf("&") < 0) {
-        return str;
-      }
-      return str.replace(UNESCAPE_ALL_RE, (function(match, escaped, entity) {
-        if (escaped) {
-          return escaped;
-        }
-        return replaceEntityPattern(match, entity);
-      }));
-    }
-    ////////////////////////////////////////////////////////////////////////////////
-        var HTML_ESCAPE_TEST_RE = /[&<>"]/;
-    var HTML_ESCAPE_REPLACE_RE = /[&<>"]/g;
-    var HTML_REPLACEMENTS = {
-      "&": "&amp;",
-      "<": "&lt;",
-      ">": "&gt;",
-      '"': "&quot;"
-    };
-    function replaceUnsafeChar(ch) {
-      return HTML_REPLACEMENTS[ch];
-    }
-    function escapeHtml(str) {
-      if (HTML_ESCAPE_TEST_RE.test(str)) {
-        return str.replace(HTML_ESCAPE_REPLACE_RE, replaceUnsafeChar);
-      }
-      return str;
-    }
-    ////////////////////////////////////////////////////////////////////////////////
-        var REGEXP_ESCAPE_RE = /[.?*+^$[\]\\(){}|-]/g;
-    function escapeRE(str) {
-      return str.replace(REGEXP_ESCAPE_RE, "\\$&");
-    }
-    ////////////////////////////////////////////////////////////////////////////////
-        function isSpace(code) {
-      switch (code) {
-       case 9:
-       case 32:
-        return true;
-      }
-      return false;
-    }
-    // Zs (unicode class) || [\t\f\v\r\n]
-        function isWhiteSpace(code) {
-      if (code >= 8192 && code <= 8202) {
-        return true;
-      }
-      switch (code) {
-       case 9:
- // \t
-               case 10:
- // \n
-               case 11:
- // \v
-               case 12:
- // \f
-               case 13:
- // \r
-               case 32:
-       case 160:
-       case 5760:
-       case 8239:
-       case 8287:
-       case 12288:
-        return true;
-      }
-      return false;
-    }
-    ////////////////////////////////////////////////////////////////////////////////
-    /*eslint-disable max-len*/
-    // Currently without astral characters support.
-        function isPunctChar(ch) {
-      return regex$4.test(ch);
-    }
-    // Markdown ASCII punctuation characters.
-    
-    // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~
-    // http://spec.commonmark.org/0.15/#ascii-punctuation-character
-    
-    // Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
-    
-        function isMdAsciiPunct(ch) {
-      switch (ch) {
-       case 33 /* ! */ :
-       case 34 /* " */ :
-       case 35 /* # */ :
-       case 36 /* $ */ :
-       case 37 /* % */ :
-       case 38 /* & */ :
-       case 39 /* ' */ :
-       case 40 /* ( */ :
-       case 41 /* ) */ :
-       case 42 /* * */ :
-       case 43 /* + */ :
-       case 44 /* , */ :
-       case 45 /* - */ :
-       case 46 /* . */ :
-       case 47 /* / */ :
-       case 58 /* : */ :
-       case 59 /* ; */ :
-       case 60 /* < */ :
-       case 61 /* = */ :
-       case 62 /* > */ :
-       case 63 /* ? */ :
-       case 64 /* @ */ :
-       case 91 /* [ */ :
-       case 92 /* \ */ :
-       case 93 /* ] */ :
-       case 94 /* ^ */ :
-       case 95 /* _ */ :
-       case 96 /* ` */ :
-       case 123 /* { */ :
-       case 124 /* | */ :
-       case 125 /* } */ :
-       case 126 /* ~ */ :
-        return true;
-
-       default:
-        return false;
-      }
-    }
-    // Hepler to unify [reference labels].
-    
-        function normalizeReference(str) {
-      // Trim and collapse whitespace
-      str = str.trim().replace(/\s+/g, " ");
-      // In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
-      // fixed in v12 (couldn't find any details).
-      
-      // So treat this one as a special case
-      // (remove this when node v10 is no longer supported).
-      
-            if ("\u1e9e".toLowerCase() === "\u1e7e") {
-        str = str.replace(/\u1e9e/g, "\xdf");
-      }
-      // .toLowerCase().toUpperCase() should get rid of all differences
-      // between letter variants.
-      
-      // Simple .toLowerCase() doesn't normalize 125 code points correctly,
-      // and .toUpperCase doesn't normalize 6 of them (list of exceptions:
-      // İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
-      // uppercased versions).
-      
-      // Here's an example showing how it happens. Lets take greek letter omega:
-      // uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
-      
-      // Unicode entries:
-      // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8;
-      // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
-      // 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
-      // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
-      
-      // Case-insensitive comparison should treat all of them as equivalent.
-      
-      // But .toLowerCase() doesn't change ϑ (it's already lowercase),
-      // and .toUpperCase() doesn't change ϴ (already uppercase).
-      
-      // Applying first lower then upper case normalizes any character:
-      // '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
-      
-      // Note: this is equivalent to unicode case folding; unicode normalization
-      // is a different step that is not required here.
-      
-      // Final result should be uppercased, because it's later stored in an object
-      // (this avoid a conflict with Object.prototype members,
-      // most notably, `__proto__`)
-      
-            return str.toLowerCase().toUpperCase();
-    }
-    ////////////////////////////////////////////////////////////////////////////////
-    // Re-export libraries commonly used in both markdown-it and its plugins,
-    // so plugins won't have to depend on them explicitly, which reduces their
-    // bundled size (e.g. a browser build).
-    
-        exports.lib = {};
-    exports.lib.mdurl = mdurl;
-    exports.lib.ucmicro = uc_micro;
-    exports.assign = assign;
-    exports.isString = isString;
-    exports.has = has;
-    exports.unescapeMd = unescapeMd;
-    exports.unescapeAll = unescapeAll;
-    exports.isValidEntityCode = isValidEntityCode;
-    exports.fromCodePoint = fromCodePoint;
-    // exports.replaceEntities     = replaceEntities;
-        exports.escapeHtml = escapeHtml;
-    exports.arrayReplaceAt = arrayReplaceAt;
-    exports.isSpace = isSpace;
-    exports.isWhiteSpace = isWhiteSpace;
-    exports.isMdAsciiPunct = isMdAsciiPunct;
-    exports.isPunctChar = isPunctChar;
-    exports.escapeRE = escapeRE;
-    exports.normalizeReference = normalizeReference;
-  }));
-  // Parse link label
-    var parse_link_label = function parseLinkLabel(state, start, disableNested) {
-    var level, found, marker, prevPos, labelEnd = -1, max = state.posMax, oldPos = state.pos;
-    state.pos = start + 1;
-    level = 1;
-    while (state.pos < max) {
-      marker = state.src.charCodeAt(state.pos);
-      if (marker === 93 /* ] */) {
-        level--;
-        if (level === 0) {
-          found = true;
-          break;
-        }
-      }
-      prevPos = state.pos;
-      state.md.inline.skipToken(state);
-      if (marker === 91 /* [ */) {
-        if (prevPos === state.pos - 1) {
-          // increase level if we find text `[`, which is not a part of any token
-          level++;
-        } else if (disableNested) {
-          state.pos = oldPos;
-          return -1;
-        }
-      }
-    }
-    if (found) {
-      labelEnd = state.pos;
-    }
-    // restore old state
-        state.pos = oldPos;
-    return labelEnd;
-  };
-  var unescapeAll$2 = utils.unescapeAll;
-  var parse_link_destination = function parseLinkDestination(str, start, max) {
-    var code, level, pos = start, result = {
-      ok: false,
-      pos: 0,
-      lines: 0,
-      str: ""
-    };
-    if (str.charCodeAt(pos) === 60 /* < */) {
-      pos++;
-      while (pos < max) {
-        code = str.charCodeAt(pos);
-        if (code === 10 /* \n */) {
-          return result;
-        }
-        if (code === 60 /* < */) {
-          return result;
-        }
-        if (code === 62 /* > */) {
-          result.pos = pos + 1;
-          result.str = unescapeAll$2(str.slice(start + 1, pos));
-          result.ok = true;
-          return result;
-        }
-        if (code === 92 /* \ */ && pos + 1 < max) {
-          pos += 2;
-          continue;
-        }
-        pos++;
-      }
-      // no closing '>'
-            return result;
-    }
-    // this should be ... } else { ... branch
-        level = 0;
-    while (pos < max) {
-      code = str.charCodeAt(pos);
-      if (code === 32) {
-        break;
-      }
-      // ascii control characters
-            if (code < 32 || code === 127) {
-        break;
-      }
-      if (code === 92 /* \ */ && pos + 1 < max) {
-        if (str.charCodeAt(pos + 1) === 32) {
-          break;
-        }
-        pos += 2;
-        continue;
-      }
-      if (code === 40 /* ( */) {
-        level++;
-        if (level > 32) {
-          return result;
-        }
-      }
-      if (code === 41 /* ) */) {
-        if (level === 0) {
-          break;
-        }
-        level--;
-      }
-      pos++;
-    }
-    if (start === pos) {
-      return result;
-    }
-    if (level !== 0) {
-      return result;
-    }
-    result.str = unescapeAll$2(str.slice(start, pos));
-    result.pos = pos;
-    result.ok = true;
-    return result;
-  };
-  var unescapeAll$1 = utils.unescapeAll;
-  var parse_link_title = function parseLinkTitle(str, start, max) {
-    var code, marker, lines = 0, pos = start, result = {
-      ok: false,
-      pos: 0,
-      lines: 0,
-      str: ""
-    };
-    if (pos >= max) {
-      return result;
-    }
-    marker = str.charCodeAt(pos);
-    if (marker !== 34 /* " */ && marker !== 39 /* ' */ && marker !== 40 /* ( */) {
-      return result;
-    }
-    pos++;
-    // if opening marker is "(", switch it to closing marker ")"
-        if (marker === 40) {
-      marker = 41;
-    }
-    while (pos < max) {
-      code = str.charCodeAt(pos);
-      if (code === marker) {
-        result.pos = pos + 1;
-        result.lines = lines;
-        result.str = unescapeAll$1(str.slice(start + 1, pos));
-        result.ok = true;
-        return result;
-      } else if (code === 40 /* ( */ && marker === 41 /* ) */) {
-        return result;
-      } else if (code === 10) {
-        lines++;
-      } else if (code === 92 /* \ */ && pos + 1 < max) {
-        pos++;
-        if (str.charCodeAt(pos) === 10) {
-          lines++;
-        }
-      }
-      pos++;
-    }
-    return result;
-  };
-  var parseLinkLabel = parse_link_label;
-  var parseLinkDestination = parse_link_destination;
-  var parseLinkTitle = parse_link_title;
-  var helpers = {
-    parseLinkLabel: parseLinkLabel,
-    parseLinkDestination: parseLinkDestination,
-    parseLinkTitle: parseLinkTitle
-  };
-  var assign$1 = utils.assign;
-  var unescapeAll = utils.unescapeAll;
-  var escapeHtml = utils.escapeHtml;
-  ////////////////////////////////////////////////////////////////////////////////
-    var default_rules = {};
-  default_rules.code_inline = function(tokens, idx, options, env, slf) {
-    var token = tokens[idx];
-    return "<code" + slf.renderAttrs(token) + ">" + escapeHtml(token.content) + "</code>";
-  };
-  default_rules.code_block = function(tokens, idx, options, env, slf) {
-    var token = tokens[idx];
-    return "<pre" + slf.renderAttrs(token) + "><code>" + escapeHtml(tokens[idx].content) + "</code></pre>\n";
-  };
-  default_rules.fence = function(tokens, idx, options, env, slf) {
-    var token = tokens[idx], info = token.info ? unescapeAll(token.info).trim() : "", langName = "", langAttrs = "", highlighted, i, arr, tmpAttrs, tmpToken;
-    if (info) {
-      arr = info.split(/(\s+)/g);
-      langName = arr[0];
-      langAttrs = arr.slice(2).join("");
-    }
-    if (options.highlight) {
-      highlighted = options.highlight(token.content, langName, langAttrs) || escapeHtml(token.content);
-    } else {
-      highlighted = escapeHtml(token.content);
-    }
-    if (highlighted.indexOf("<pre") === 0) {
-      return highlighted + "\n";
-    }
-    // If language exists, inject class gently, without modifying original token.
-    // May be, one day we will add .deepClone() for token and simplify this part, but
-    // now we prefer to keep things local.
-        if (info) {
-      i = token.attrIndex("class");
-      tmpAttrs = token.attrs ? token.attrs.slice() : [];
-      if (i < 0) {
-        tmpAttrs.push([ "class", options.langPrefix + langName ]);
-      } else {
-        tmpAttrs[i] = tmpAttrs[i].slice();
-        tmpAttrs[i][1] += " " + options.langPrefix + langName;
-      }
-      // Fake token just to render attributes
-            tmpToken = {
-        attrs: tmpAttrs
-      };
-      return "<pre><code" + slf.renderAttrs(tmpToken) + ">" + highlighted + "</code></pre>\n";
-    }
-    return "<pre><code" + slf.renderAttrs(token) + ">" + highlighted + "</code></pre>\n";
-  };
-  default_rules.image = function(tokens, idx, options, env, slf) {
-    var token = tokens[idx];
-    // "alt" attr MUST be set, even if empty. Because it's mandatory and
-    // should be placed on proper position for tests.
-    
-    // Replace content with actual value
-        token.attrs[token.attrIndex("alt")][1] = slf.renderInlineAsText(token.children, options, env);
-    return slf.renderToken(tokens, idx, options);
-  };
-  default_rules.hardbreak = function(tokens, idx, options /*, env */) {
-    return options.xhtmlOut ? "\n" : "\n";
-  };
-  default_rules.softbreak = function(tokens, idx, options /*, env */) {
-    return options.breaks ? options.xhtmlOut ? "\n" : "\n" : "\n";
-  };
-  default_rules.text = function(tokens, idx /*, options, env */) {
-    return escapeHtml(tokens[idx].content);
-  };
-  default_rules.html_block = function(tokens, idx /*, options, env */) {
-    return tokens[idx].content;
-  };
-  default_rules.html_inline = function(tokens, idx /*, options, env */) {
-    return tokens[idx].content;
-  };
-  /**
-	 * new Renderer()
-	 *
-	 * Creates new [[Renderer]] instance and fill [[Renderer#rules]] with defaults.
-	 **/  function Renderer() {
-    /**
-	   * Renderer#rules -> Object
-	   *
-	   * Contains render rules for tokens. Can be updated and extended.
-	   *
-	   * ##### Example
-	   *
-	   * ```javascript
-	   * var md = require('markdown-it')();
-	   *
-	   * md.renderer.rules.strong_open  = function () { return '<b>'; };
-	   * md.renderer.rules.strong_close = function () { return '</b>'; };
-	   *
-	   * var result = md.renderInline(...);
-	   * ```
-	   *
-	   * Each rule is called as independent static function with fixed signature:
-	   *
-	   * ```javascript
-	   * function my_token_render(tokens, idx, options, env, renderer) {
-	   *   // ...
-	   *   return renderedHTML;
-	   * }
-	   * ```
-	   *
-	   * See [source code](https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js)
-	   * for more details and examples.
-	   **/
-    this.rules = assign$1({}, default_rules);
-  }
-  /**
-	 * Renderer.renderAttrs(token) -> String
-	 *
-	 * Render token attributes to string.
-	 **/  Renderer.prototype.renderAttrs = function renderAttrs(token) {
-    var i, l, result;
-    if (!token.attrs) {
-      return "";
-    }
-    result = "";
-    for (i = 0, l = token.attrs.length; i < l; i++) {
-      result += " " + escapeHtml(token.attrs[i][0]) + '="' + escapeHtml(token.attrs[i][1]) + '"';
-    }
-    return result;
-  };
-  /**
-	 * Renderer.renderToken(tokens, idx, options) -> String
-	 * - tokens (Array): list of tokens
-	 * - idx (Numbed): token index to render
-	 * - options (Object): params of parser instance
-	 *
-	 * Default token renderer. Can be overriden by custom function
-	 * in [[Renderer#rules]].
-	 **/  Renderer.prototype.renderToken = function renderToken(tokens, idx, options) {
-    var nextToken, result = "", needLf = false, token = tokens[idx];
-    // Tight list paragraphs
-        if (token.hidden) {
-      return "";
-    }
-    // Insert a newline between hidden paragraph and subsequent opening
-    // block-level tag.
-    
-    // For example, here we should insert a newline before blockquote:
-    //  - a
-    //    >
-    
-        if (token.block && token.nesting !== -1 && idx && tokens[idx - 1].hidden) {
-      result += "\n";
-    }
-    // Add token name, e.g. `<img`
-        result += (token.nesting === -1 ? "</" : "<") + token.tag;
-    // Encode attributes, e.g. `<img src="foo"`
-        result += this.renderAttrs(token);
-    // Add a slash for self-closing tags, e.g. `<img src="foo" /`
-        if (token.nesting === 0 && options.xhtmlOut) {
-      result += " /";
-    }
-    // Check if we need to add a newline after this tag
-        if (token.block) {
-      needLf = true;
-      if (token.nesting === 1) {
-        if (idx + 1 < tokens.length) {
-          nextToken = tokens[idx + 1];
-          if (nextToken.type === "inline" || nextToken.hidden) {
-            // Block-level tag containing an inline tag.
-            needLf = false;
-          } else if (nextToken.nesting === -1 && nextToken.tag === token.tag) {
-            // Opening tag + closing tag of the same type. E.g. `<li></li>`.
-            needLf = false;
-          }
-        }
-      }
-    }
-    result += needLf ? ">\n" : ">";
-    return result;
-  };
-  /**
-	 * Renderer.renderInline(tokens, options, env) -> String
-	 * - tokens (Array): list on block tokens to render
-	 * - options (Object): params of parser instance
-	 * - env (Object): additional data from parsed input (references, for example)
-	 *
-	 * The same as [[Renderer.render]], but for single token of `inline` type.
-	 **/  Renderer.prototype.renderInline = function(tokens, options, env) {
-    var type, result = "", rules = this.rules;
-    for (var i = 0, len = tokens.length; i < len; i++) {
-      type = tokens[i].type;
-      if (typeof rules[type] !== "undefined") {
-        result += rules[type](tokens, i, options, env, this);
-      } else {
-        result += this.renderToken(tokens, i, options);
-      }
-    }
-    return result;
-  };
-  /** internal
-	 * Renderer.renderInlineAsText(tokens, options, env) -> String
-	 * - tokens (Array): list on block tokens to render
-	 * - options (Object): params of parser instance
-	 * - env (Object): additional data from parsed input (references, for example)
-	 *
-	 * Special kludge for image `alt` attributes to conform CommonMark spec.
-	 * Don't try to use it! Spec requires to show `alt` content with stripped markup,
-	 * instead of simple escaping.
-	 **/  Renderer.prototype.renderInlineAsText = function(tokens, options, env) {
-    var result = "";
-    for (var i = 0, len = tokens.length; i < len; i++) {
-      if (tokens[i].type === "text") {
-        result += tokens[i].content;
-      } else if (tokens[i].type === "image") {
-        result += this.renderInlineAsText(tokens[i].children, options, env);
-      } else if (tokens[i].type === "softbreak") {
-        result += "\n";
-      }
-    }
-    return result;
-  };
-  /**
-	 * Renderer.render(tokens, options, env) -> String
-	 * - tokens (Array): list on block tokens to render
-	 * - options (Object): params of parser instance
-	 * - env (Object): additional data from parsed input (references, for example)
-	 *
-	 * Takes token stream and generates HTML. Probably, you will never need to call
-	 * this method directly.
-	 **/  Renderer.prototype.render = function(tokens, options, env) {
-    var i, len, type, result = "", rules = this.rules;
-    for (i = 0, len = tokens.length; i < len; i++) {
-      type = tokens[i].type;
-      if (type === "inline") {
-        result += this.renderInline(tokens[i].children, options, env);
-      } else if (typeof rules[type] !== "undefined") {
-        result += rules[type](tokens, i, options, env, this);
-      } else {
-        result += this.renderToken(tokens, i, options, env);
-      }
-    }
-    return result;
-  };
-  var renderer = Renderer;
-  /**
-	 * class Ruler
-	 *
-	 * Helper class, used by [[MarkdownIt#core]], [[MarkdownIt#block]] and
-	 * [[MarkdownIt#inline]] to manage sequences of functions (rules):
-	 *
-	 * - keep rules in defined order
-	 * - assign the name to each rule
-	 * - enable/disable rules
-	 * - add/replace rules
-	 * - allow assign rules to additional named chains (in the same)
-	 * - cacheing lists of active rules
-	 *
-	 * You will not need use this class directly until write plugins. For simple
-	 * rules control use [[MarkdownIt.disable]], [[MarkdownIt.enable]] and
-	 * [[MarkdownIt.use]].
-	 **/
-  /**
-	 * new Ruler()
-	 **/  function Ruler() {
-    // List of added rules. Each element is:
-    // {
-    //   name: XXX,
-    //   enabled: Boolean,
-    //   fn: Function(),
-    //   alt: [ name2, name3 ]
-    // }
-    this.__rules__ = [];
-    // Cached rule chains.
-    
-    // First level - chain name, '' for default.
-    // Second level - diginal anchor for fast filtering by charcodes.
-    
-        this.__cache__ = null;
-  }
-  ////////////////////////////////////////////////////////////////////////////////
-  // Helper methods, should not be used directly
-  // Find rule index by name
-  
-    Ruler.prototype.__find__ = function(name) {
-    for (var i = 0; i < this.__rules__.length; i++) {
-      if (this.__rules__[i].name === name) {
-        return i;
-      }
-    }
-    return -1;
-  };
-  // Build rules lookup cache
-  
-    Ruler.prototype.__compile__ = function() {
-    var self = this;
-    var chains = [ "" ];
-    // collect unique names
-        self.__rules__.forEach((function(rule) {
-      if (!rule.enabled) {
-        return;
-      }
-      rule.alt.forEach((function(altName) {
-        if (chains.indexOf(altName) < 0) {
-          chains.push(altName);
-        }
-      }));
-    }));
-    self.__cache__ = {};
-    chains.forEach((function(chain) {
-      self.__cache__[chain] = [];
-      self.__rules__.forEach((function(rule) {
-        if (!rule.enabled) {
-          return;
-        }
-        if (chain && rule.alt.indexOf(chain) < 0) {
-          return;
-        }
-        self.__cache__[chain].push(rule.fn);
-      }));
-    }));
-  };
-  /**
-	 * Ruler.at(name, fn [, options])
-	 * - name (String): rule name to replace.
-	 * - fn (Function): new rule function.
-	 * - options (Object): new rule options (not mandatory).
-	 *
-	 * Replace rule by name with new function & options. Throws error if name not
-	 * found.
-	 *
-	 * ##### Options:
-	 *
-	 * - __alt__ - array with names of "alternate" chains.
-	 *
-	 * ##### Example
-	 *
-	 * Replace existing typographer replacement rule with new one:
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')();
-	 *
-	 * md.core.ruler.at('replacements', function replace(state) {
-	 *   //...
-	 * });
-	 * ```
-	 **/  Ruler.prototype.at = function(name, fn, options) {
-    var index = this.__find__(name);
-    var opt = options || {};
-    if (index === -1) {
-      throw new Error("Parser rule not found: " + name);
-    }
-    this.__rules__[index].fn = fn;
-    this.__rules__[index].alt = opt.alt || [];
-    this.__cache__ = null;
-  };
-  /**
-	 * Ruler.before(beforeName, ruleName, fn [, options])
-	 * - beforeName (String): new rule will be added before this one.
-	 * - ruleName (String): name of added rule.
-	 * - fn (Function): rule function.
-	 * - options (Object): rule options (not mandatory).
-	 *
-	 * Add new rule to chain before one with given name. See also
-	 * [[Ruler.after]], [[Ruler.push]].
-	 *
-	 * ##### Options:
-	 *
-	 * - __alt__ - array with names of "alternate" chains.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')();
-	 *
-	 * md.block.ruler.before('paragraph', 'my_rule', function replace(state) {
-	 *   //...
-	 * });
-	 * ```
-	 **/  Ruler.prototype.before = function(beforeName, ruleName, fn, options) {
-    var index = this.__find__(beforeName);
-    var opt = options || {};
-    if (index === -1) {
-      throw new Error("Parser rule not found: " + beforeName);
-    }
-    this.__rules__.splice(index, 0, {
-      name: ruleName,
-      enabled: true,
-      fn: fn,
-      alt: opt.alt || []
-    });
-    this.__cache__ = null;
-  };
-  /**
-	 * Ruler.after(afterName, ruleName, fn [, options])
-	 * - afterName (String): new rule will be added after this one.
-	 * - ruleName (String): name of added rule.
-	 * - fn (Function): rule function.
-	 * - options (Object): rule options (not mandatory).
-	 *
-	 * Add new rule to chain after one with given name. See also
-	 * [[Ruler.before]], [[Ruler.push]].
-	 *
-	 * ##### Options:
-	 *
-	 * - __alt__ - array with names of "alternate" chains.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')();
-	 *
-	 * md.inline.ruler.after('text', 'my_rule', function replace(state) {
-	 *   //...
-	 * });
-	 * ```
-	 **/  Ruler.prototype.after = function(afterName, ruleName, fn, options) {
-    var index = this.__find__(afterName);
-    var opt = options || {};
-    if (index === -1) {
-      throw new Error("Parser rule not found: " + afterName);
-    }
-    this.__rules__.splice(index + 1, 0, {
-      name: ruleName,
-      enabled: true,
-      fn: fn,
-      alt: opt.alt || []
-    });
-    this.__cache__ = null;
-  };
-  /**
-	 * Ruler.push(ruleName, fn [, options])
-	 * - ruleName (String): name of added rule.
-	 * - fn (Function): rule function.
-	 * - options (Object): rule options (not mandatory).
-	 *
-	 * Push new rule to the end of chain. See also
-	 * [[Ruler.before]], [[Ruler.after]].
-	 *
-	 * ##### Options:
-	 *
-	 * - __alt__ - array with names of "alternate" chains.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')();
-	 *
-	 * md.core.ruler.push('my_rule', function replace(state) {
-	 *   //...
-	 * });
-	 * ```
-	 **/  Ruler.prototype.push = function(ruleName, fn, options) {
-    var opt = options || {};
-    this.__rules__.push({
-      name: ruleName,
-      enabled: true,
-      fn: fn,
-      alt: opt.alt || []
-    });
-    this.__cache__ = null;
-  };
-  /**
-	 * Ruler.enable(list [, ignoreInvalid]) -> Array
-	 * - list (String|Array): list of rule names to enable.
-	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
-	 *
-	 * Enable rules with given names. If any rule name not found - throw Error.
-	 * Errors can be disabled by second param.
-	 *
-	 * Returns list of found rule names (if no exception happened).
-	 *
-	 * See also [[Ruler.disable]], [[Ruler.enableOnly]].
-	 **/  Ruler.prototype.enable = function(list, ignoreInvalid) {
-    if (!Array.isArray(list)) {
-      list = [ list ];
-    }
-    var result = [];
-    // Search by name and enable
-        list.forEach((function(name) {
-      var idx = this.__find__(name);
-      if (idx < 0) {
-        if (ignoreInvalid) {
-          return;
-        }
-        throw new Error("Rules manager: invalid rule name " + name);
-      }
-      this.__rules__[idx].enabled = true;
-      result.push(name);
-    }), this);
-    this.__cache__ = null;
-    return result;
-  };
-  /**
-	 * Ruler.enableOnly(list [, ignoreInvalid])
-	 * - list (String|Array): list of rule names to enable (whitelist).
-	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
-	 *
-	 * Enable rules with given names, and disable everything else. If any rule name
-	 * not found - throw Error. Errors can be disabled by second param.
-	 *
-	 * See also [[Ruler.disable]], [[Ruler.enable]].
-	 **/  Ruler.prototype.enableOnly = function(list, ignoreInvalid) {
-    if (!Array.isArray(list)) {
-      list = [ list ];
-    }
-    this.__rules__.forEach((function(rule) {
-      rule.enabled = false;
-    }));
-    this.enable(list, ignoreInvalid);
-  };
-  /**
-	 * Ruler.disable(list [, ignoreInvalid]) -> Array
-	 * - list (String|Array): list of rule names to disable.
-	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
-	 *
-	 * Disable rules with given names. If any rule name not found - throw Error.
-	 * Errors can be disabled by second param.
-	 *
-	 * Returns list of found rule names (if no exception happened).
-	 *
-	 * See also [[Ruler.enable]], [[Ruler.enableOnly]].
-	 **/  Ruler.prototype.disable = function(list, ignoreInvalid) {
-    if (!Array.isArray(list)) {
-      list = [ list ];
-    }
-    var result = [];
-    // Search by name and disable
-        list.forEach((function(name) {
-      var idx = this.__find__(name);
-      if (idx < 0) {
-        if (ignoreInvalid) {
-          return;
-        }
-        throw new Error("Rules manager: invalid rule name " + name);
-      }
-      this.__rules__[idx].enabled = false;
-      result.push(name);
-    }), this);
-    this.__cache__ = null;
-    return result;
-  };
-  /**
-	 * Ruler.getRules(chainName) -> Array
-	 *
-	 * Return array of active functions (rules) for given chain name. It analyzes
-	 * rules configuration, compiles caches if not exists and returns result.
-	 *
-	 * Default chain name is `''` (empty string). It can't be skipped. That's
-	 * done intentionally, to keep signature monomorphic for high speed.
-	 **/  Ruler.prototype.getRules = function(chainName) {
-    if (this.__cache__ === null) {
-      this.__compile__();
-    }
-    // Chain can be empty, if rules disabled. But we still have to return Array.
-        return this.__cache__[chainName] || [];
-  };
-  var ruler = Ruler;
-  // Normalize input string
-  // https://spec.commonmark.org/0.29/#line-ending
-    var NEWLINES_RE = /\r\n?|\n/g;
-  var NULL_RE = /\0/g;
-  var normalize = function normalize(state) {
-    var str;
-    // Normalize newlines
-        str = state.src.replace(NEWLINES_RE, "\n");
-    // Replace NULL characters
-        str = str.replace(NULL_RE, "\ufffd");
-    state.src = str;
-  };
-  var block = function block(state) {
-    var token;
-    if (state.inlineMode) {
-      token = new state.Token("inline", "", 0);
-      token.content = state.src;
-      token.map = [ 0, 1 ];
-      token.children = [];
-      state.tokens.push(token);
-    } else {
-      state.md.block.parse(state.src, state.md, state.env, state.tokens);
-    }
-  };
-  var inline = function inline(state) {
-    var tokens = state.tokens, tok, i, l;
-    // Parse inlines
-        for (i = 0, l = tokens.length; i < l; i++) {
-      tok = tokens[i];
-      if (tok.type === "inline") {
-        state.md.inline.parse(tok.content, state.md, state.env, tok.children);
-      }
-    }
-  };
-  var arrayReplaceAt = utils.arrayReplaceAt;
-  function isLinkOpen$1(str) {
-    return /^<a[>\s]/i.test(str);
-  }
-  function isLinkClose$1(str) {
-    return /^<\/a\s*>/i.test(str);
-  }
-  var linkify$1 = function linkify(state) {
-    var i, j, l, tokens, token, currentToken, nodes, ln, text, pos, lastPos, level, htmlLinkLevel, url, fullUrl, urlText, blockTokens = state.tokens, links;
-    if (!state.md.options.linkify) {
-      return;
-    }
-    for (j = 0, l = blockTokens.length; j < l; j++) {
-      if (blockTokens[j].type !== "inline" || !state.md.linkify.pretest(blockTokens[j].content)) {
-        continue;
-      }
-      tokens = blockTokens[j].children;
-      htmlLinkLevel = 0;
-      // We scan from the end, to keep position when new tags added.
-      // Use reversed logic in links start/end match
-            for (i = tokens.length - 1; i >= 0; i--) {
-        currentToken = tokens[i];
-        // Skip content of markdown links
-                if (currentToken.type === "link_close") {
-          i--;
-          while (tokens[i].level !== currentToken.level && tokens[i].type !== "link_open") {
-            i--;
-          }
-          continue;
-        }
-        // Skip content of html tag links
-                if (currentToken.type === "html_inline") {
-          if (isLinkOpen$1(currentToken.content) && htmlLinkLevel > 0) {
-            htmlLinkLevel--;
-          }
-          if (isLinkClose$1(currentToken.content)) {
-            htmlLinkLevel++;
-          }
-        }
-        if (htmlLinkLevel > 0) {
-          continue;
-        }
-        if (currentToken.type === "text" && state.md.linkify.test(currentToken.content)) {
-          text = currentToken.content;
-          links = state.md.linkify.match(text);
-          // Now split string to nodes
-                    nodes = [];
-          level = currentToken.level;
-          lastPos = 0;
-          // forbid escape sequence at the start of the string,
-          // this avoids http\://example.com/ from being linkified as
-          // http:<a href="//example.com/">//example.com/</a>
-                    if (links.length > 0 && links[0].index === 0 && i > 0 && tokens[i - 1].type === "text_special") {
-            links = links.slice(1);
-          }
-          for (ln = 0; ln < links.length; ln++) {
-            url = links[ln].url;
-            fullUrl = state.md.normalizeLink(url);
-            if (!state.md.validateLink(fullUrl)) {
-              continue;
-            }
-            urlText = links[ln].text;
-            // Linkifier might send raw hostnames like "example.com", where url
-            // starts with domain name. So we prepend http:// in those cases,
-            // and remove it afterwards.
-            
-                        if (!links[ln].schema) {
-              urlText = state.md.normalizeLinkText("http://" + urlText).replace(/^http:\/\//, "");
-            } else if (links[ln].schema === "mailto:" && !/^mailto:/i.test(urlText)) {
-              urlText = state.md.normalizeLinkText("mailto:" + urlText).replace(/^mailto:/, "");
-            } else {
-              urlText = state.md.normalizeLinkText(urlText);
-            }
-            pos = links[ln].index;
-            if (pos > lastPos) {
-              token = new state.Token("text", "", 0);
-              token.content = text.slice(lastPos, pos);
-              token.level = level;
-              nodes.push(token);
-            }
-            token = new state.Token("link_open", "a", 1);
-            token.attrs = [ [ "href", fullUrl ] ];
-            token.level = level++;
-            token.markup = "linkify";
-            token.info = "auto";
-            nodes.push(token);
-            token = new state.Token("text", "", 0);
-            token.content = urlText;
-            token.level = level;
-            nodes.push(token);
-            token = new state.Token("link_close", "a", -1);
-            token.level = --level;
-            token.markup = "linkify";
-            token.info = "auto";
-            nodes.push(token);
-            lastPos = links[ln].lastIndex;
-          }
-          if (lastPos < text.length) {
-            token = new state.Token("text", "", 0);
-            token.content = text.slice(lastPos);
-            token.level = level;
-            nodes.push(token);
-          }
-          // replace current node
-                    blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes);
-        }
-      }
-    }
-  };
-  // Simple typographic replacements
-  // TODO:
-  // - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
-  // - multiplications 2 x 4 -> 2 × 4
-    var RARE_RE = /\+-|\.\.|\?\?\?\?|!!!!|,,|--/;
-  // Workaround for phantomjs - need regex without /g flag,
-  // or root check will fail every second time
-    var SCOPED_ABBR_TEST_RE = /\((c|tm|r)\)/i;
-  var SCOPED_ABBR_RE = /\((c|tm|r)\)/gi;
-  var SCOPED_ABBR = {
-    c: "\xa9",
-    r: "\xae",
-    tm: "\u2122"
-  };
-  function replaceFn(match, name) {
-    return SCOPED_ABBR[name.toLowerCase()];
-  }
-  function replace_scoped(inlineTokens) {
-    var i, token, inside_autolink = 0;
-    for (i = inlineTokens.length - 1; i >= 0; i--) {
-      token = inlineTokens[i];
-      if (token.type === "text" && !inside_autolink) {
-        token.content = token.content.replace(SCOPED_ABBR_RE, replaceFn);
-      }
-      if (token.type === "link_open" && token.info === "auto") {
-        inside_autolink--;
-      }
-      if (token.type === "link_close" && token.info === "auto") {
-        inside_autolink++;
-      }
-    }
-  }
-  function replace_rare(inlineTokens) {
-    var i, token, inside_autolink = 0;
-    for (i = inlineTokens.length - 1; i >= 0; i--) {
-      token = inlineTokens[i];
-      if (token.type === "text" && !inside_autolink) {
-        if (RARE_RE.test(token.content)) {
-          token.content = token.content.replace(/\+-/g, "\xb1").replace(/\.{2,}/g, "\u2026").replace(/([?!])\u2026/g, "$1..").replace(/([?!]){4,}/g, "$1$1$1").replace(/,{2,}/g, ",").replace(/(^|[^-])---(?=[^-]|$)/gm, "$1\u2014").replace(/(^|\s)--(?=\s|$)/gm, "$1\u2013").replace(/(^|[^-\s])--(?=[^-\s]|$)/gm, "$1\u2013");
-        }
-      }
-      if (token.type === "link_open" && token.info === "auto") {
-        inside_autolink--;
-      }
-      if (token.type === "link_close" && token.info === "auto") {
-        inside_autolink++;
-      }
-    }
-  }
-  var replacements = function replace(state) {
-    var blkIdx;
-    if (!state.md.options.typographer) {
-      return;
-    }
-    for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) {
-      if (state.tokens[blkIdx].type !== "inline") {
-        continue;
-      }
-      if (SCOPED_ABBR_TEST_RE.test(state.tokens[blkIdx].content)) {
-        replace_scoped(state.tokens[blkIdx].children);
-      }
-      if (RARE_RE.test(state.tokens[blkIdx].content)) {
-        replace_rare(state.tokens[blkIdx].children);
-      }
-    }
-  };
-  var isWhiteSpace$1 = utils.isWhiteSpace;
-  var isPunctChar$1 = utils.isPunctChar;
-  var isMdAsciiPunct$1 = utils.isMdAsciiPunct;
-  var QUOTE_TEST_RE = /['"]/;
-  var QUOTE_RE = /['"]/g;
-  var APOSTROPHE = "\u2019";
- /* ’ */  function replaceAt(str, index, ch) {
-    return str.slice(0, index) + ch + str.slice(index + 1);
-  }
-  function process_inlines(tokens, state) {
-    var i, token, text, t, pos, max, thisLevel, item, lastChar, nextChar, isLastPunctChar, isNextPunctChar, isLastWhiteSpace, isNextWhiteSpace, canOpen, canClose, j, isSingle, stack, openQuote, closeQuote;
-    stack = [];
-    for (i = 0; i < tokens.length; i++) {
-      token = tokens[i];
-      thisLevel = tokens[i].level;
-      for (j = stack.length - 1; j >= 0; j--) {
-        if (stack[j].level <= thisLevel) {
-          break;
-        }
-      }
-      stack.length = j + 1;
-      if (token.type !== "text") {
-        continue;
-      }
-      text = token.content;
-      pos = 0;
-      max = text.length;
-      /*eslint no-labels:0,block-scoped-var:0*/      OUTER: while (pos < max) {
-        QUOTE_RE.lastIndex = pos;
-        t = QUOTE_RE.exec(text);
-        if (!t) {
-          break;
-        }
-        canOpen = canClose = true;
-        pos = t.index + 1;
-        isSingle = t[0] === "'";
-        // Find previous character,
-        // default to space if it's the beginning of the line
-        
-                lastChar = 32;
-        if (t.index - 1 >= 0) {
-          lastChar = text.charCodeAt(t.index - 1);
-        } else {
-          for (j = i - 1; j >= 0; j--) {
-            if (tokens[j].type === "softbreak" || tokens[j].type === "hardbreak") break;
- // lastChar defaults to 0x20
-                        if (!tokens[j].content) continue;
- // should skip all tokens except 'text', 'html_inline' or 'code_inline'
-                        lastChar = tokens[j].content.charCodeAt(tokens[j].content.length - 1);
-            break;
-          }
-        }
-        // Find next character,
-        // default to space if it's the end of the line
-        
-                nextChar = 32;
-        if (pos < max) {
-          nextChar = text.charCodeAt(pos);
-        } else {
-          for (j = i + 1; j < tokens.length; j++) {
-            if (tokens[j].type === "softbreak" || tokens[j].type === "hardbreak") break;
- // nextChar defaults to 0x20
-                        if (!tokens[j].content) continue;
- // should skip all tokens except 'text', 'html_inline' or 'code_inline'
-                        nextChar = tokens[j].content.charCodeAt(0);
-            break;
-          }
-        }
-        isLastPunctChar = isMdAsciiPunct$1(lastChar) || isPunctChar$1(String.fromCharCode(lastChar));
-        isNextPunctChar = isMdAsciiPunct$1(nextChar) || isPunctChar$1(String.fromCharCode(nextChar));
-        isLastWhiteSpace = isWhiteSpace$1(lastChar);
-        isNextWhiteSpace = isWhiteSpace$1(nextChar);
-        if (isNextWhiteSpace) {
-          canOpen = false;
-        } else if (isNextPunctChar) {
-          if (!(isLastWhiteSpace || isLastPunctChar)) {
-            canOpen = false;
-          }
-        }
-        if (isLastWhiteSpace) {
-          canClose = false;
-        } else if (isLastPunctChar) {
-          if (!(isNextWhiteSpace || isNextPunctChar)) {
-            canClose = false;
-          }
-        }
-        if (nextChar === 34 /* " */ && t[0] === '"') {
-          if (lastChar >= 48 /* 0 */ && lastChar <= 57 /* 9 */) {
-            // special case: 1"" - count first quote as an inch
-            canClose = canOpen = false;
-          }
-        }
-        if (canOpen && canClose) {
-          // Replace quotes in the middle of punctuation sequence, but not
-          // in the middle of the words, i.e.:
-          // 1. foo " bar " baz - not replaced
-          // 2. foo-"-bar-"-baz - replaced
-          // 3. foo"bar"baz     - not replaced
-          canOpen = isLastPunctChar;
-          canClose = isNextPunctChar;
-        }
-        if (!canOpen && !canClose) {
-          // middle of word
-          if (isSingle) {
-            token.content = replaceAt(token.content, t.index, APOSTROPHE);
-          }
-          continue;
-        }
-        if (canClose) {
-          // this could be a closing quote, rewind the stack to get a match
-          for (j = stack.length - 1; j >= 0; j--) {
-            item = stack[j];
-            if (stack[j].level < thisLevel) {
-              break;
-            }
-            if (item.single === isSingle && stack[j].level === thisLevel) {
-              item = stack[j];
-              if (isSingle) {
-                openQuote = state.md.options.quotes[2];
-                closeQuote = state.md.options.quotes[3];
-              } else {
-                openQuote = state.md.options.quotes[0];
-                closeQuote = state.md.options.quotes[1];
-              }
-              // replace token.content *before* tokens[item.token].content,
-              // because, if they are pointing at the same token, replaceAt
-              // could mess up indices when quote length != 1
-                            token.content = replaceAt(token.content, t.index, closeQuote);
-              tokens[item.token].content = replaceAt(tokens[item.token].content, item.pos, openQuote);
-              pos += closeQuote.length - 1;
-              if (item.token === i) {
-                pos += openQuote.length - 1;
-              }
-              text = token.content;
-              max = text.length;
-              stack.length = j;
-              continue OUTER;
-            }
-          }
-        }
-        if (canOpen) {
-          stack.push({
-            token: i,
-            pos: t.index,
-            single: isSingle,
-            level: thisLevel
-          });
-        } else if (canClose && isSingle) {
-          token.content = replaceAt(token.content, t.index, APOSTROPHE);
-        }
-      }
-    }
-  }
-  var smartquotes = function smartquotes(state) {
-    /*eslint max-depth:0*/
-    var blkIdx;
-    if (!state.md.options.typographer) {
-      return;
-    }
-    for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) {
-      if (state.tokens[blkIdx].type !== "inline" || !QUOTE_TEST_RE.test(state.tokens[blkIdx].content)) {
-        continue;
-      }
-      process_inlines(state.tokens[blkIdx].children, state);
-    }
-  };
-  // Join raw text tokens with the rest of the text
-    var text_join = function text_join(state) {
-    var j, l, tokens, curr, max, last, blockTokens = state.tokens;
-    for (j = 0, l = blockTokens.length; j < l; j++) {
-      if (blockTokens[j].type !== "inline") continue;
-      tokens = blockTokens[j].children;
-      max = tokens.length;
-      for (curr = 0; curr < max; curr++) {
-        if (tokens[curr].type === "text_special") {
-          tokens[curr].type = "text";
-        }
-      }
-      for (curr = last = 0; curr < max; curr++) {
-        if (tokens[curr].type === "text" && curr + 1 < max && tokens[curr + 1].type === "text") {
-          // collapse two adjacent text nodes
-          tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content;
-        } else {
-          if (curr !== last) {
-            tokens[last] = tokens[curr];
-          }
-          last++;
-        }
-      }
-      if (curr !== last) {
-        tokens.length = last;
-      }
-    }
-  };
-  // Token class
-  /**
-	 * class Token
-	 **/
-  /**
-	 * new Token(type, tag, nesting)
-	 *
-	 * Create new token and fill passed properties.
-	 **/  function Token(type, tag, nesting) {
-    /**
-	   * Token#type -> String
-	   *
-	   * Type of the token (string, e.g. "paragraph_open")
-	   **/
-    this.type = type;
-    /**
-	   * Token#tag -> String
-	   *
-	   * html tag name, e.g. "p"
-	   **/    this.tag = tag;
-    /**
-	   * Token#attrs -> Array
-	   *
-	   * Html attributes. Format: `[ [ name1, value1 ], [ name2, value2 ] ]`
-	   **/    this.attrs = null;
-    /**
-	   * Token#map -> Array
-	   *
-	   * Source map info. Format: `[ line_begin, line_end ]`
-	   **/    this.map = null;
-    /**
-	   * Token#nesting -> Number
-	   *
-	   * Level change (number in {-1, 0, 1} set), where:
-	   *
-	   * -  `1` means the tag is opening
-	   * -  `0` means the tag is self-closing
-	   * - `-1` means the tag is closing
-	   **/    this.nesting = nesting;
-    /**
-	   * Token#level -> Number
-	   *
-	   * nesting level, the same as `state.level`
-	   **/    this.level = 0;
-    /**
-	   * Token#children -> Array
-	   *
-	   * An array of child nodes (inline and img tokens)
-	   **/    this.children = null;
-    /**
-	   * Token#content -> String
-	   *
-	   * In a case of self-closing tag (code, html, fence, etc.),
-	   * it has contents of this tag.
-	   **/    this.content = "";
-    /**
-	   * Token#markup -> String
-	   *
-	   * '*' or '_' for emphasis, fence string for fence, etc.
-	   **/    this.markup = "";
-    /**
-	   * Token#info -> String
-	   *
-	   * Additional information:
-	   *
-	   * - Info string for "fence" tokens
-	   * - The value "auto" for autolink "link_open" and "link_close" tokens
-	   * - The string value of the item marker for ordered-list "list_item_open" tokens
-	   **/    this.info = "";
-    /**
-	   * Token#meta -> Object
-	   *
-	   * A place for plugins to store an arbitrary data
-	   **/    this.meta = null;
-    /**
-	   * Token#block -> Boolean
-	   *
-	   * True for block-level tokens, false for inline tokens.
-	   * Used in renderer to calculate line breaks
-	   **/    this.block = false;
-    /**
-	   * Token#hidden -> Boolean
-	   *
-	   * If it's true, ignore this element when rendering. Used for tight lists
-	   * to hide paragraphs.
-	   **/    this.hidden = false;
-  }
-  /**
-	 * Token.attrIndex(name) -> Number
-	 *
-	 * Search attribute index by name.
-	 **/  Token.prototype.attrIndex = function attrIndex(name) {
-    var attrs, i, len;
-    if (!this.attrs) {
-      return -1;
-    }
-    attrs = this.attrs;
-    for (i = 0, len = attrs.length; i < len; i++) {
-      if (attrs[i][0] === name) {
-        return i;
-      }
-    }
-    return -1;
-  };
-  /**
-	 * Token.attrPush(attrData)
-	 *
-	 * Add `[ name, value ]` attribute to list. Init attrs if necessary
-	 **/  Token.prototype.attrPush = function attrPush(attrData) {
-    if (this.attrs) {
-      this.attrs.push(attrData);
-    } else {
-      this.attrs = [ attrData ];
-    }
-  };
-  /**
-	 * Token.attrSet(name, value)
-	 *
-	 * Set `name` attribute to `value`. Override old value if exists.
-	 **/  Token.prototype.attrSet = function attrSet(name, value) {
-    var idx = this.attrIndex(name), attrData = [ name, value ];
-    if (idx < 0) {
-      this.attrPush(attrData);
-    } else {
-      this.attrs[idx] = attrData;
-    }
-  };
-  /**
-	 * Token.attrGet(name)
-	 *
-	 * Get the value of attribute `name`, or null if it does not exist.
-	 **/  Token.prototype.attrGet = function attrGet(name) {
-    var idx = this.attrIndex(name), value = null;
-    if (idx >= 0) {
-      value = this.attrs[idx][1];
-    }
-    return value;
-  };
-  /**
-	 * Token.attrJoin(name, value)
-	 *
-	 * Join value to existing attribute via space. Or create new attribute if not
-	 * exists. Useful to operate with token classes.
-	 **/  Token.prototype.attrJoin = function attrJoin(name, value) {
-    var idx = this.attrIndex(name);
-    if (idx < 0) {
-      this.attrPush([ name, value ]);
-    } else {
-      this.attrs[idx][1] = this.attrs[idx][1] + " " + value;
-    }
-  };
-  var token = Token;
-  function StateCore(src, md, env) {
-    this.src = src;
-    this.env = env;
-    this.tokens = [];
-    this.inlineMode = false;
-    this.md = md;
- // link to parser instance
-    }
-  // re-export Token class to use in core rules
-    StateCore.prototype.Token = token;
-  var state_core = StateCore;
-  var _rules$2 = [ [ "normalize", normalize ], [ "block", block ], [ "inline", inline ], [ "linkify", linkify$1 ], [ "replacements", replacements ], [ "smartquotes", smartquotes ], 
-  // `text_join` finds `text_special` tokens (for escape sequences)
-  // and joins them with the rest of the text
-  [ "text_join", text_join ] ];
-  /**
-	 * new Core()
-	 **/  function Core() {
-    /**
-	   * Core#ruler -> Ruler
-	   *
-	   * [[Ruler]] instance. Keep configuration of core rules.
-	   **/
-    this.ruler = new ruler;
-    for (var i = 0; i < _rules$2.length; i++) {
-      this.ruler.push(_rules$2[i][0], _rules$2[i][1]);
-    }
-  }
-  /**
-	 * Core.process(state)
-	 *
-	 * Executes core chain rules.
-	 **/  Core.prototype.process = function(state) {
-    var i, l, rules;
-    rules = this.ruler.getRules("");
-    for (i = 0, l = rules.length; i < l; i++) {
-      rules[i](state);
-    }
-  };
-  Core.prototype.State = state_core;
-  var parser_core = Core;
-  var isSpace$a = utils.isSpace;
-  function getLine(state, line) {
-    var pos = state.bMarks[line] + state.tShift[line], max = state.eMarks[line];
-    return state.src.slice(pos, max);
-  }
-  function escapedSplit(str) {
-    var result = [], pos = 0, max = str.length, ch, isEscaped = false, lastPos = 0, current = "";
-    ch = str.charCodeAt(pos);
-    while (pos < max) {
-      if (ch === 124 /* | */) {
-        if (!isEscaped) {
-          // pipe separating cells, '|'
-          result.push(current + str.substring(lastPos, pos));
-          current = "";
-          lastPos = pos + 1;
-        } else {
-          // escaped pipe, '\|'
-          current += str.substring(lastPos, pos - 1);
-          lastPos = pos;
-        }
-      }
-      isEscaped = ch === 92 /* \ */;
-      pos++;
-      ch = str.charCodeAt(pos);
-    }
-    result.push(current + str.substring(lastPos));
-    return result;
-  }
-  var table = function table(state, startLine, endLine, silent) {
-    var ch, lineText, pos, i, l, nextLine, columns, columnCount, token, aligns, t, tableLines, tbodyLines, oldParentType, terminate, terminatorRules, firstCh, secondCh;
-    // should have at least two lines
-        if (startLine + 2 > endLine) {
-      return false;
-    }
-    nextLine = startLine + 1;
-    if (state.sCount[nextLine] < state.blkIndent) {
-      return false;
-    }
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[nextLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    // first character of the second line should be '|', '-', ':',
-    // and no other characters are allowed but spaces;
-    // basically, this is the equivalent of /^[-:|][-:|\s]*$/ regexp
-        pos = state.bMarks[nextLine] + state.tShift[nextLine];
-    if (pos >= state.eMarks[nextLine]) {
-      return false;
-    }
-    firstCh = state.src.charCodeAt(pos++);
-    if (firstCh !== 124 /* | */ && firstCh !== 45 /* - */ && firstCh !== 58 /* : */) {
-      return false;
-    }
-    if (pos >= state.eMarks[nextLine]) {
-      return false;
-    }
-    secondCh = state.src.charCodeAt(pos++);
-    if (secondCh !== 124 /* | */ && secondCh !== 45 /* - */ && secondCh !== 58 /* : */ && !isSpace$a(secondCh)) {
-      return false;
-    }
-    // if first character is '-', then second character must not be a space
-    // (due to parsing ambiguity with list)
-        if (firstCh === 45 /* - */ && isSpace$a(secondCh)) {
-      return false;
-    }
-    while (pos < state.eMarks[nextLine]) {
-      ch = state.src.charCodeAt(pos);
-      if (ch !== 124 /* | */ && ch !== 45 /* - */ && ch !== 58 /* : */ && !isSpace$a(ch)) {
-        return false;
-      }
-      pos++;
-    }
-    lineText = getLine(state, startLine + 1);
-    columns = lineText.split("|");
-    aligns = [];
-    for (i = 0; i < columns.length; i++) {
-      t = columns[i].trim();
-      if (!t) {
-        // allow empty columns before and after table, but not in between columns;
-        // e.g. allow ` |---| `, disallow ` ---||--- `
-        if (i === 0 || i === columns.length - 1) {
-          continue;
-        } else {
-          return false;
-        }
-      }
-      if (!/^:?-+:?$/.test(t)) {
-        return false;
-      }
-      if (t.charCodeAt(t.length - 1) === 58 /* : */) {
-        aligns.push(t.charCodeAt(0) === 58 /* : */ ? "center" : "right");
-      } else if (t.charCodeAt(0) === 58 /* : */) {
-        aligns.push("left");
-      } else {
-        aligns.push("");
-      }
-    }
-    lineText = getLine(state, startLine).trim();
-    if (lineText.indexOf("|") === -1) {
-      return false;
-    }
-    if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    columns = escapedSplit(lineText);
-    if (columns.length && columns[0] === "") columns.shift();
-    if (columns.length && columns[columns.length - 1] === "") columns.pop();
-    // header row will define an amount of columns in the entire table,
-    // and align row should be exactly the same (the rest of the rows can differ)
-        columnCount = columns.length;
-    if (columnCount === 0 || columnCount !== aligns.length) {
-      return false;
-    }
-    if (silent) {
-      return true;
-    }
-    oldParentType = state.parentType;
-    state.parentType = "table";
-    // use 'blockquote' lists for termination because it's
-    // the most similar to tables
-        terminatorRules = state.md.block.ruler.getRules("blockquote");
-    token = state.push("table_open", "table", 1);
-    token.map = tableLines = [ startLine, 0 ];
-    token = state.push("thead_open", "thead", 1);
-    token.map = [ startLine, startLine + 1 ];
-    token = state.push("tr_open", "tr", 1);
-    token.map = [ startLine, startLine + 1 ];
-    for (i = 0; i < columns.length; i++) {
-      token = state.push("th_open", "th", 1);
-      if (aligns[i]) {
-        token.attrs = [ [ "style", "text-align:" + aligns[i] ] ];
-      }
-      token = state.push("inline", "", 0);
-      token.content = columns[i].trim();
-      token.children = [];
-      token = state.push("th_close", "th", -1);
-    }
-    token = state.push("tr_close", "tr", -1);
-    token = state.push("thead_close", "thead", -1);
-    for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
-      if (state.sCount[nextLine] < state.blkIndent) {
-        break;
-      }
-      terminate = false;
-      for (i = 0, l = terminatorRules.length; i < l; i++) {
-        if (terminatorRules[i](state, nextLine, endLine, true)) {
-          terminate = true;
-          break;
-        }
-      }
-      if (terminate) {
-        break;
-      }
-      lineText = getLine(state, nextLine).trim();
-      if (!lineText) {
-        break;
-      }
-      if (state.sCount[nextLine] - state.blkIndent >= 4) {
-        break;
-      }
-      columns = escapedSplit(lineText);
-      if (columns.length && columns[0] === "") columns.shift();
-      if (columns.length && columns[columns.length - 1] === "") columns.pop();
-      if (nextLine === startLine + 2) {
-        token = state.push("tbody_open", "tbody", 1);
-        token.map = tbodyLines = [ startLine + 2, 0 ];
-      }
-      token = state.push("tr_open", "tr", 1);
-      token.map = [ nextLine, nextLine + 1 ];
-      for (i = 0; i < columnCount; i++) {
-        token = state.push("td_open", "td", 1);
-        if (aligns[i]) {
-          token.attrs = [ [ "style", "text-align:" + aligns[i] ] ];
-        }
-        token = state.push("inline", "", 0);
-        token.content = columns[i] ? columns[i].trim() : "";
-        token.children = [];
-        token = state.push("td_close", "td", -1);
-      }
-      token = state.push("tr_close", "tr", -1);
-    }
-    if (tbodyLines) {
-      token = state.push("tbody_close", "tbody", -1);
-      tbodyLines[1] = nextLine;
-    }
-    token = state.push("table_close", "table", -1);
-    tableLines[1] = nextLine;
-    state.parentType = oldParentType;
-    state.line = nextLine;
-    return true;
-  };
-  // Code block (4 spaces padded)
-    var code = function code(state, startLine, endLine /*, silent*/) {
-    var nextLine, last, token;
-    if (state.sCount[startLine] - state.blkIndent < 4) {
-      return false;
-    }
-    last = nextLine = startLine + 1;
-    while (nextLine < endLine) {
-      if (state.isEmpty(nextLine)) {
-        nextLine++;
-        continue;
-      }
-      if (state.sCount[nextLine] - state.blkIndent >= 4) {
-        nextLine++;
-        last = nextLine;
-        continue;
-      }
-      break;
-    }
-    state.line = last;
-    token = state.push("code_block", "code", 0);
-    token.content = state.getLines(startLine, last, 4 + state.blkIndent, false) + "\n";
-    token.map = [ startLine, state.line ];
-    return true;
-  };
-  // fences (``` lang, ~~~ lang)
-    var fence = function fence(state, startLine, endLine, silent) {
-    var marker, len, params, nextLine, mem, token, markup, haveEndMarker = false, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    if (pos + 3 > max) {
-      return false;
-    }
-    marker = state.src.charCodeAt(pos);
-    if (marker !== 126 /* ~ */ && marker !== 96 /* ` */) {
-      return false;
-    }
-    // scan marker length
-        mem = pos;
-    pos = state.skipChars(pos, marker);
-    len = pos - mem;
-    if (len < 3) {
-      return false;
-    }
-    markup = state.src.slice(mem, pos);
-    params = state.src.slice(pos, max);
-    if (marker === 96 /* ` */) {
-      if (params.indexOf(String.fromCharCode(marker)) >= 0) {
-        return false;
-      }
-    }
-    // Since start is found, we can report success here in validation mode
-        if (silent) {
-      return true;
-    }
-    // search end of block
-        nextLine = startLine;
-    for (;;) {
-      nextLine++;
-      if (nextLine >= endLine) {
-        // unclosed block should be autoclosed by end of document.
-        // also block seems to be autoclosed by end of parent
-        break;
-      }
-      pos = mem = state.bMarks[nextLine] + state.tShift[nextLine];
-      max = state.eMarks[nextLine];
-      if (pos < max && state.sCount[nextLine] < state.blkIndent) {
-        // non-empty line with negative indent should stop the list:
-        // - ```
-        //  test
-        break;
-      }
-      if (state.src.charCodeAt(pos) !== marker) {
-        continue;
-      }
-      if (state.sCount[nextLine] - state.blkIndent >= 4) {
-        // closing fence should be indented less than 4 spaces
-        continue;
-      }
-      pos = state.skipChars(pos, marker);
-      // closing code fence must be at least as long as the opening one
-            if (pos - mem < len) {
-        continue;
-      }
-      // make sure tail has spaces only
-            pos = state.skipSpaces(pos);
-      if (pos < max) {
-        continue;
-      }
-      haveEndMarker = true;
-      // found!
-            break;
-    }
-    // If a fence has heading spaces, they should be removed from its inner block
-        len = state.sCount[startLine];
-    state.line = nextLine + (haveEndMarker ? 1 : 0);
-    token = state.push("fence", "code", 0);
-    token.info = params;
-    token.content = state.getLines(startLine + 1, nextLine, len, true);
-    token.markup = markup;
-    token.map = [ startLine, state.line ];
-    return true;
-  };
-  var isSpace$9 = utils.isSpace;
-  var blockquote = function blockquote(state, startLine, endLine, silent) {
-    var adjustTab, ch, i, initial, l, lastLineEmpty, lines, nextLine, offset, oldBMarks, oldBSCount, oldIndent, oldParentType, oldSCount, oldTShift, spaceAfterMarker, terminate, terminatorRules, token, isOutdented, oldLineMax = state.lineMax, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    // check the block quote marker
-        if (state.src.charCodeAt(pos) !== 62 /* > */) {
-      return false;
-    }
-    // we know that it's going to be a valid blockquote,
-    // so no point trying to find the end of it in silent mode
-        if (silent) {
-      return true;
-    }
-    oldBMarks = [];
-    oldBSCount = [];
-    oldSCount = [];
-    oldTShift = [];
-    terminatorRules = state.md.block.ruler.getRules("blockquote");
-    oldParentType = state.parentType;
-    state.parentType = "blockquote";
-    // Search the end of the block
-    
-    // Block ends with either:
-    //  1. an empty line outside:
-    //     ```
-    //     > test
-    
-    //     ```
-    //  2. an empty line inside:
-    //     ```
-    //     >
-    //     test
-    //     ```
-    //  3. another tag:
-    //     ```
-    //     > test
-    //      - - -
-    //     ```
-        for (nextLine = startLine; nextLine < endLine; nextLine++) {
-      // check if it's outdented, i.e. it's inside list item and indented
-      // less than said list item:
-      // ```
-      // 1. anything
-      //    > current blockquote
-      // 2. checking this line
-      // ```
-      isOutdented = state.sCount[nextLine] < state.blkIndent;
-      pos = state.bMarks[nextLine] + state.tShift[nextLine];
-      max = state.eMarks[nextLine];
-      if (pos >= max) {
-        // Case 1: line is not inside the blockquote, and this line is empty.
-        break;
-      }
-      if (state.src.charCodeAt(pos++) === 62 /* > */ && !isOutdented) {
-        // This line is inside the blockquote.
-        // set offset past spaces and ">"
-        initial = state.sCount[nextLine] + 1;
-        // skip one optional space after '>'
-                if (state.src.charCodeAt(pos) === 32 /* space */) {
-          // ' >   test '
-          //     ^ -- position start of line here:
-          pos++;
-          initial++;
-          adjustTab = false;
-          spaceAfterMarker = true;
-        } else if (state.src.charCodeAt(pos) === 9 /* tab */) {
-          spaceAfterMarker = true;
-          if ((state.bsCount[nextLine] + initial) % 4 === 3) {
-            // '  >\t  test '
-            //       ^ -- position start of line here (tab has width===1)
-            pos++;
-            initial++;
-            adjustTab = false;
-          } else {
-            // ' >\t  test '
-            //    ^ -- position start of line here + shift bsCount slightly
-            //         to make extra space appear
-            adjustTab = true;
-          }
-        } else {
-          spaceAfterMarker = false;
-        }
-        offset = initial;
-        oldBMarks.push(state.bMarks[nextLine]);
-        state.bMarks[nextLine] = pos;
-        while (pos < max) {
-          ch = state.src.charCodeAt(pos);
-          if (isSpace$9(ch)) {
-            if (ch === 9) {
-              offset += 4 - (offset + state.bsCount[nextLine] + (adjustTab ? 1 : 0)) % 4;
-            } else {
-              offset++;
-            }
-          } else {
-            break;
-          }
-          pos++;
-        }
-        lastLineEmpty = pos >= max;
-        oldBSCount.push(state.bsCount[nextLine]);
-        state.bsCount[nextLine] = state.sCount[nextLine] + 1 + (spaceAfterMarker ? 1 : 0);
-        oldSCount.push(state.sCount[nextLine]);
-        state.sCount[nextLine] = offset - initial;
-        oldTShift.push(state.tShift[nextLine]);
-        state.tShift[nextLine] = pos - state.bMarks[nextLine];
-        continue;
-      }
-      // Case 2: line is not inside the blockquote, and the last line was empty.
-            if (lastLineEmpty) {
-        break;
-      }
-      // Case 3: another tag found.
-            terminate = false;
-      for (i = 0, l = terminatorRules.length; i < l; i++) {
-        if (terminatorRules[i](state, nextLine, endLine, true)) {
-          terminate = true;
-          break;
-        }
-      }
-      if (terminate) {
-        // Quirk to enforce "hard termination mode" for paragraphs;
-        // normally if you call `tokenize(state, startLine, nextLine)`,
-        // paragraphs will look below nextLine for paragraph continuation,
-        // but if blockquote is terminated by another tag, they shouldn't
-        state.lineMax = nextLine;
-        if (state.blkIndent !== 0) {
-          // state.blkIndent was non-zero, we now set it to zero,
-          // so we need to re-calculate all offsets to appear as
-          // if indent wasn't changed
-          oldBMarks.push(state.bMarks[nextLine]);
-          oldBSCount.push(state.bsCount[nextLine]);
-          oldTShift.push(state.tShift[nextLine]);
-          oldSCount.push(state.sCount[nextLine]);
-          state.sCount[nextLine] -= state.blkIndent;
-        }
-        break;
-      }
-      oldBMarks.push(state.bMarks[nextLine]);
-      oldBSCount.push(state.bsCount[nextLine]);
-      oldTShift.push(state.tShift[nextLine]);
-      oldSCount.push(state.sCount[nextLine]);
-      // A negative indentation means that this is a paragraph continuation
-      
-            state.sCount[nextLine] = -1;
-    }
-    oldIndent = state.blkIndent;
-    state.blkIndent = 0;
-    token = state.push("blockquote_open", "blockquote", 1);
-    token.markup = ">";
-    token.map = lines = [ startLine, 0 ];
-    state.md.block.tokenize(state, startLine, nextLine);
-    token = state.push("blockquote_close", "blockquote", -1);
-    token.markup = ">";
-    state.lineMax = oldLineMax;
-    state.parentType = oldParentType;
-    lines[1] = state.line;
-    // Restore original tShift; this might not be necessary since the parser
-    // has already been here, but just to make sure we can do that.
-        for (i = 0; i < oldTShift.length; i++) {
-      state.bMarks[i + startLine] = oldBMarks[i];
-      state.tShift[i + startLine] = oldTShift[i];
-      state.sCount[i + startLine] = oldSCount[i];
-      state.bsCount[i + startLine] = oldBSCount[i];
-    }
-    state.blkIndent = oldIndent;
-    return true;
-  };
-  var isSpace$8 = utils.isSpace;
-  var hr = function hr(state, startLine, endLine, silent) {
-    var marker, cnt, ch, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    marker = state.src.charCodeAt(pos++);
-    // Check hr marker
-        if (marker !== 42 /* * */ && marker !== 45 /* - */ && marker !== 95 /* _ */) {
-      return false;
-    }
-    // markers can be mixed with spaces, but there should be at least 3 of them
-        cnt = 1;
-    while (pos < max) {
-      ch = state.src.charCodeAt(pos++);
-      if (ch !== marker && !isSpace$8(ch)) {
-        return false;
-      }
-      if (ch === marker) {
-        cnt++;
-      }
-    }
-    if (cnt < 3) {
-      return false;
-    }
-    if (silent) {
-      return true;
-    }
-    state.line = startLine + 1;
-    token = state.push("hr", "hr", 0);
-    token.map = [ startLine, state.line ];
-    token.markup = Array(cnt + 1).join(String.fromCharCode(marker));
-    return true;
-  };
-  var isSpace$7 = utils.isSpace;
-  // Search `[-+*][\n ]`, returns next pos after marker on success
-  // or -1 on fail.
-    function skipBulletListMarker(state, startLine) {
-    var marker, pos, max, ch;
-    pos = state.bMarks[startLine] + state.tShift[startLine];
-    max = state.eMarks[startLine];
-    marker = state.src.charCodeAt(pos++);
-    // Check bullet
-        if (marker !== 42 /* * */ && marker !== 45 /* - */ && marker !== 43 /* + */) {
-      return -1;
-    }
-    if (pos < max) {
-      ch = state.src.charCodeAt(pos);
-      if (!isSpace$7(ch)) {
-        // " -test " - is not a list item
-        return -1;
-      }
-    }
-    return pos;
-  }
-  // Search `\d+[.)][\n ]`, returns next pos after marker on success
-  // or -1 on fail.
-    function skipOrderedListMarker(state, startLine) {
-    var ch, start = state.bMarks[startLine] + state.tShift[startLine], pos = start, max = state.eMarks[startLine];
-    // List marker should have at least 2 chars (digit + dot)
-        if (pos + 1 >= max) {
-      return -1;
-    }
-    ch = state.src.charCodeAt(pos++);
-    if (ch < 48 /* 0 */ || ch > 57 /* 9 */) {
-      return -1;
-    }
-    for (;;) {
-      // EOL -> fail
-      if (pos >= max) {
-        return -1;
-      }
-      ch = state.src.charCodeAt(pos++);
-      if (ch >= 48 /* 0 */ && ch <= 57 /* 9 */) {
-        // List marker should have no more than 9 digits
-        // (prevents integer overflow in browsers)
-        if (pos - start >= 10) {
-          return -1;
-        }
-        continue;
-      }
-      // found valid marker
-            if (ch === 41 /* ) */ || ch === 46 /* . */) {
-        break;
-      }
-      return -1;
-    }
-    if (pos < max) {
-      ch = state.src.charCodeAt(pos);
-      if (!isSpace$7(ch)) {
-        // " 1.test " - is not a list item
-        return -1;
-      }
-    }
-    return pos;
-  }
-  function markTightParagraphs(state, idx) {
-    var i, l, level = state.level + 2;
-    for (i = idx + 2, l = state.tokens.length - 2; i < l; i++) {
-      if (state.tokens[i].level === level && state.tokens[i].type === "paragraph_open") {
-        state.tokens[i + 2].hidden = true;
-        state.tokens[i].hidden = true;
-        i += 2;
-      }
-    }
-  }
-  var list = function list(state, startLine, endLine, silent) {
-    var ch, contentStart, i, indent, indentAfterMarker, initial, isOrdered, itemLines, l, listLines, listTokIdx, markerCharCode, markerValue, max, offset, oldListIndent, oldParentType, oldSCount, oldTShift, oldTight, pos, posAfterMarker, prevEmptyEnd, start, terminate, terminatorRules, token, nextLine = startLine, isTerminatingParagraph = false, tight = true;
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[nextLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    // Special case:
-    //  - item 1
-    //   - item 2
-    //    - item 3
-    //     - item 4
-    //      - this one is a paragraph continuation
-        if (state.listIndent >= 0 && state.sCount[nextLine] - state.listIndent >= 4 && state.sCount[nextLine] < state.blkIndent) {
-      return false;
-    }
-    // limit conditions when list can interrupt
-    // a paragraph (validation mode only)
-        if (silent && state.parentType === "paragraph") {
-      // Next list item should still terminate previous list item;
-      // This code can fail if plugins use blkIndent as well as lists,
-      // but I hope the spec gets fixed long before that happens.
-      if (state.sCount[nextLine] >= state.blkIndent) {
-        isTerminatingParagraph = true;
-      }
-    }
-    // Detect list type and position after marker
-        if ((posAfterMarker = skipOrderedListMarker(state, nextLine)) >= 0) {
-      isOrdered = true;
-      start = state.bMarks[nextLine] + state.tShift[nextLine];
-      markerValue = Number(state.src.slice(start, posAfterMarker - 1));
-      // If we're starting a new ordered list right after
-      // a paragraph, it should start with 1.
-            if (isTerminatingParagraph && markerValue !== 1) return false;
-    } else if ((posAfterMarker = skipBulletListMarker(state, nextLine)) >= 0) {
-      isOrdered = false;
-    } else {
-      return false;
-    }
-    // If we're starting a new unordered list right after
-    // a paragraph, first line should not be empty.
-        if (isTerminatingParagraph) {
-      if (state.skipSpaces(posAfterMarker) >= state.eMarks[nextLine]) return false;
-    }
-    // For validation mode we can terminate immediately
-        if (silent) {
-      return true;
-    }
-    // We should terminate list on style change. Remember first one to compare.
-        markerCharCode = state.src.charCodeAt(posAfterMarker - 1);
-    // Start list
-        listTokIdx = state.tokens.length;
-    if (isOrdered) {
-      token = state.push("ordered_list_open", "ol", 1);
-      if (markerValue !== 1) {
-        token.attrs = [ [ "start", markerValue ] ];
-      }
-    } else {
-      token = state.push("bullet_list_open", "ul", 1);
-    }
-    token.map = listLines = [ nextLine, 0 ];
-    token.markup = String.fromCharCode(markerCharCode);
-    
-    // Iterate list items
-    
-        prevEmptyEnd = false;
-    terminatorRules = state.md.block.ruler.getRules("list");
-    oldParentType = state.parentType;
-    state.parentType = "list";
-    while (nextLine < endLine) {
-      pos = posAfterMarker;
-      max = state.eMarks[nextLine];
-      initial = offset = state.sCount[nextLine] + posAfterMarker - (state.bMarks[nextLine] + state.tShift[nextLine]);
-      while (pos < max) {
-        ch = state.src.charCodeAt(pos);
-        if (ch === 9) {
-          offset += 4 - (offset + state.bsCount[nextLine]) % 4;
-        } else if (ch === 32) {
-          offset++;
-        } else {
-          break;
-        }
-        pos++;
-      }
-      contentStart = pos;
-      if (contentStart >= max) {
-        // trimming space in "-    \n  3" case, indent is 1 here
-        indentAfterMarker = 1;
-      } else {
-        indentAfterMarker = offset - initial;
-      }
-      // If we have more than 4 spaces, the indent is 1
-      // (the rest is just indented code block)
-            if (indentAfterMarker > 4) {
-        indentAfterMarker = 1;
-      }
-      // "  -  test"
-      //  ^^^^^ - calculating total length of this thing
-            indent = initial + indentAfterMarker;
-      // Run subparser & write tokens
-            token = state.push("list_item_open", "li", 1);
-      token.markup = String.fromCharCode(markerCharCode);
-      token.map = itemLines = [ nextLine, 0 ];
-      if (isOrdered) {
-        token.info = state.src.slice(start, posAfterMarker - 1);
-      }
-      // change current state, then restore it after parser subcall
-            oldTight = state.tight;
-      oldTShift = state.tShift[nextLine];
-      oldSCount = state.sCount[nextLine];
-      //  - example list
-      // ^ listIndent position will be here
-      //   ^ blkIndent position will be here
-      
-            oldListIndent = state.listIndent;
-      state.listIndent = state.blkIndent;
-      state.blkIndent = indent;
-      state.tight = true;
-      state.tShift[nextLine] = contentStart - state.bMarks[nextLine];
-      state.sCount[nextLine] = offset;
-      if (contentStart >= max && state.isEmpty(nextLine + 1)) {
-        // workaround for this case
-        // (list item is empty, list terminates before "foo"):
-        // ~~~~~~~~
-        //   -
-        //     foo
-        // ~~~~~~~~
-        state.line = Math.min(state.line + 2, endLine);
-      } else {
-        state.md.block.tokenize(state, nextLine, endLine, true);
-      }
-      // If any of list item is tight, mark list as tight
-            if (!state.tight || prevEmptyEnd) {
-        tight = false;
-      }
-      // Item become loose if finish with empty line,
-      // but we should filter last element, because it means list finish
-            prevEmptyEnd = state.line - nextLine > 1 && state.isEmpty(state.line - 1);
-      state.blkIndent = state.listIndent;
-      state.listIndent = oldListIndent;
-      state.tShift[nextLine] = oldTShift;
-      state.sCount[nextLine] = oldSCount;
-      state.tight = oldTight;
-      token = state.push("list_item_close", "li", -1);
-      token.markup = String.fromCharCode(markerCharCode);
-      nextLine = state.line;
-      itemLines[1] = nextLine;
-      if (nextLine >= endLine) {
-        break;
-      }
-      
-      // Try to check if list is terminated or continued.
-      
-            if (state.sCount[nextLine] < state.blkIndent) {
-        break;
-      }
-      // if it's indented more than 3 spaces, it should be a code block
-            if (state.sCount[nextLine] - state.blkIndent >= 4) {
-        break;
-      }
-      // fail if terminating block found
-            terminate = false;
-      for (i = 0, l = terminatorRules.length; i < l; i++) {
-        if (terminatorRules[i](state, nextLine, endLine, true)) {
-          terminate = true;
-          break;
-        }
-      }
-      if (terminate) {
-        break;
-      }
-      // fail if list has another type
-            if (isOrdered) {
-        posAfterMarker = skipOrderedListMarker(state, nextLine);
-        if (posAfterMarker < 0) {
-          break;
-        }
-        start = state.bMarks[nextLine] + state.tShift[nextLine];
-      } else {
-        posAfterMarker = skipBulletListMarker(state, nextLine);
-        if (posAfterMarker < 0) {
-          break;
-        }
-      }
-      if (markerCharCode !== state.src.charCodeAt(posAfterMarker - 1)) {
-        break;
-      }
-    }
-    // Finalize list
-        if (isOrdered) {
-      token = state.push("ordered_list_close", "ol", -1);
-    } else {
-      token = state.push("bullet_list_close", "ul", -1);
-    }
-    token.markup = String.fromCharCode(markerCharCode);
-    listLines[1] = nextLine;
-    state.line = nextLine;
-    state.parentType = oldParentType;
-    // mark paragraphs tight if needed
-        if (tight) {
-      markTightParagraphs(state, listTokIdx);
-    }
-    return true;
-  };
-  var normalizeReference$2 = utils.normalizeReference;
-  var isSpace$6 = utils.isSpace;
-  var reference = function reference(state, startLine, _endLine, silent) {
-    var ch, destEndPos, destEndLineNo, endLine, href, i, l, label, labelEnd, oldParentType, res, start, str, terminate, terminatorRules, title, lines = 0, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine], nextLine = startLine + 1;
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    if (state.src.charCodeAt(pos) !== 91 /* [ */) {
-      return false;
-    }
-    // Simple check to quickly interrupt scan on [link](url) at the start of line.
-    // Can be useful on practice: https://github.com/markdown-it/markdown-it/issues/54
-        while (++pos < max) {
-      if (state.src.charCodeAt(pos) === 93 /* ] */ && state.src.charCodeAt(pos - 1) !== 92 /* \ */) {
-        if (pos + 1 === max) {
-          return false;
-        }
-        if (state.src.charCodeAt(pos + 1) !== 58 /* : */) {
-          return false;
-        }
-        break;
-      }
-    }
-    endLine = state.lineMax;
-    // jump line-by-line until empty one or EOF
-        terminatorRules = state.md.block.ruler.getRules("reference");
-    oldParentType = state.parentType;
-    state.parentType = "reference";
-    for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
-      // this would be a code block normally, but after paragraph
-      // it's considered a lazy continuation regardless of what's there
-      if (state.sCount[nextLine] - state.blkIndent > 3) {
-        continue;
-      }
-      // quirk for blockquotes, this line should already be checked by that rule
-            if (state.sCount[nextLine] < 0) {
-        continue;
-      }
-      // Some tags can terminate paragraph without empty line.
-            terminate = false;
-      for (i = 0, l = terminatorRules.length; i < l; i++) {
-        if (terminatorRules[i](state, nextLine, endLine, true)) {
-          terminate = true;
-          break;
-        }
-      }
-      if (terminate) {
-        break;
-      }
-    }
-    str = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
-    max = str.length;
-    for (pos = 1; pos < max; pos++) {
-      ch = str.charCodeAt(pos);
-      if (ch === 91 /* [ */) {
-        return false;
-      } else if (ch === 93 /* ] */) {
-        labelEnd = pos;
-        break;
-      } else if (ch === 10 /* \n */) {
-        lines++;
-      } else if (ch === 92 /* \ */) {
-        pos++;
-        if (pos < max && str.charCodeAt(pos) === 10) {
-          lines++;
-        }
-      }
-    }
-    if (labelEnd < 0 || str.charCodeAt(labelEnd + 1) !== 58 /* : */) {
-      return false;
-    }
-    // [label]:   destination   'title'
-    //         ^^^ skip optional whitespace here
-        for (pos = labelEnd + 2; pos < max; pos++) {
-      ch = str.charCodeAt(pos);
-      if (ch === 10) {
-        lines++;
-      } else if (isSpace$6(ch)) ; else {
-        break;
-      }
-    }
-    // [label]:   destination   'title'
-    //            ^^^^^^^^^^^ parse this
-        res = state.md.helpers.parseLinkDestination(str, pos, max);
-    if (!res.ok) {
-      return false;
-    }
-    href = state.md.normalizeLink(res.str);
-    if (!state.md.validateLink(href)) {
-      return false;
-    }
-    pos = res.pos;
-    lines += res.lines;
-    // save cursor state, we could require to rollback later
-        destEndPos = pos;
-    destEndLineNo = lines;
-    // [label]:   destination   'title'
-    //                       ^^^ skipping those spaces
-        start = pos;
-    for (;pos < max; pos++) {
-      ch = str.charCodeAt(pos);
-      if (ch === 10) {
-        lines++;
-      } else if (isSpace$6(ch)) ; else {
-        break;
-      }
-    }
-    // [label]:   destination   'title'
-    //                          ^^^^^^^ parse this
-        res = state.md.helpers.parseLinkTitle(str, pos, max);
-    if (pos < max && start !== pos && res.ok) {
-      title = res.str;
-      pos = res.pos;
-      lines += res.lines;
-    } else {
-      title = "";
-      pos = destEndPos;
-      lines = destEndLineNo;
-    }
-    // skip trailing spaces until the rest of the line
-        while (pos < max) {
-      ch = str.charCodeAt(pos);
-      if (!isSpace$6(ch)) {
-        break;
-      }
-      pos++;
-    }
-    if (pos < max && str.charCodeAt(pos) !== 10) {
-      if (title) {
-        // garbage at the end of the line after title,
-        // but it could still be a valid reference if we roll back
-        title = "";
-        pos = destEndPos;
-        lines = destEndLineNo;
-        while (pos < max) {
-          ch = str.charCodeAt(pos);
-          if (!isSpace$6(ch)) {
-            break;
-          }
-          pos++;
-        }
-      }
-    }
-    if (pos < max && str.charCodeAt(pos) !== 10) {
-      // garbage at the end of the line
-      return false;
-    }
-    label = normalizeReference$2(str.slice(1, labelEnd));
-    if (!label) {
-      // CommonMark 0.20 disallows empty labels
-      return false;
-    }
-    // Reference can not terminate anything. This check is for safety only.
-    /*istanbul ignore if*/    if (silent) {
-      return true;
-    }
-    if (typeof state.env.references === "undefined") {
-      state.env.references = {};
-    }
-    if (typeof state.env.references[label] === "undefined") {
-      state.env.references[label] = {
-        title: title,
-        href: href
-      };
-    }
-    state.parentType = oldParentType;
-    state.line = startLine + lines + 1;
-    return true;
-  };
-  // List of valid html blocks names, accorting to commonmark spec
-    var html_blocks = [ "address", "article", "aside", "base", "basefont", "blockquote", "body", "caption", "center", "col", "colgroup", "dd", "details", "dialog", "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html", "iframe", "legend", "li", "link", "main", "menu", "menuitem", "nav", "noframes", "ol", "optgroup", "option", "p", "param", "section", "source", "summary", "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track", "ul" ];
-  // Regexps to match html elements
-    var attr_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*";
-  var unquoted = "[^\"'=<>`\\x00-\\x20]+";
-  var single_quoted = "'[^']*'";
-  var double_quoted = '"[^"]*"';
-  var attr_value = "(?:" + unquoted + "|" + single_quoted + "|" + double_quoted + ")";
-  var attribute = "(?:\\s+" + attr_name + "(?:\\s*=\\s*" + attr_value + ")?)";
-  var open_tag = "<[A-Za-z][A-Za-z0-9\\-]*" + attribute + "*\\s*\\/?>";
-  var close_tag = "<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>";
-  var comment = "\x3c!----\x3e|\x3c!--(?:-?[^>-])(?:-?[^-])*--\x3e";
-  var processing = "<[?][\\s\\S]*?[?]>";
-  var declaration = "<![A-Z]+\\s+[^>]*>";
-  var cdata = "<!\\[CDATA\\[[\\s\\S]*?\\]\\]>";
-  var HTML_TAG_RE$1 = new RegExp("^(?:" + open_tag + "|" + close_tag + "|" + comment + "|" + processing + "|" + declaration + "|" + cdata + ")");
-  var HTML_OPEN_CLOSE_TAG_RE$1 = new RegExp("^(?:" + open_tag + "|" + close_tag + ")");
-  var HTML_TAG_RE_1 = HTML_TAG_RE$1;
-  var HTML_OPEN_CLOSE_TAG_RE_1 = HTML_OPEN_CLOSE_TAG_RE$1;
-  var html_re = {
-    HTML_TAG_RE: HTML_TAG_RE_1,
-    HTML_OPEN_CLOSE_TAG_RE: HTML_OPEN_CLOSE_TAG_RE_1
-  };
-  var HTML_OPEN_CLOSE_TAG_RE = html_re.HTML_OPEN_CLOSE_TAG_RE;
-  // An array of opening and corresponding closing sequences for html tags,
-  // last argument defines whether it can terminate a paragraph or not
-  
-    var HTML_SEQUENCES = [ [ /^<(script|pre|style|textarea)(?=(\s|>|$))/i, /<\/(script|pre|style|textarea)>/i, true ], [ /^<!--/, /-->/, true ], [ /^<\?/, /\?>/, true ], [ /^<![A-Z]/, />/, true ], [ /^<!\[CDATA\[/, /\]\]>/, true ], [ new RegExp("^</?(" + html_blocks.join("|") + ")(?=(\\s|/?>|$))", "i"), /^$/, true ], [ new RegExp(HTML_OPEN_CLOSE_TAG_RE.source + "\\s*$"), /^$/, false ] ];
-  var html_block = function html_block(state, startLine, endLine, silent) {
-    var i, nextLine, token, lineText, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    if (!state.md.options.html) {
-      return false;
-    }
-    if (state.src.charCodeAt(pos) !== 60 /* < */) {
-      return false;
-    }
-    lineText = state.src.slice(pos, max);
-    for (i = 0; i < HTML_SEQUENCES.length; i++) {
-      if (HTML_SEQUENCES[i][0].test(lineText)) {
-        break;
-      }
-    }
-    if (i === HTML_SEQUENCES.length) {
-      return false;
-    }
-    if (silent) {
-      // true if this sequence can be a terminator, false otherwise
-      return HTML_SEQUENCES[i][2];
-    }
-    nextLine = startLine + 1;
-    // If we are here - we detected HTML block.
-    // Let's roll down till block end.
-        if (!HTML_SEQUENCES[i][1].test(lineText)) {
-      for (;nextLine < endLine; nextLine++) {
-        if (state.sCount[nextLine] < state.blkIndent) {
-          break;
-        }
-        pos = state.bMarks[nextLine] + state.tShift[nextLine];
-        max = state.eMarks[nextLine];
-        lineText = state.src.slice(pos, max);
-        if (HTML_SEQUENCES[i][1].test(lineText)) {
-          if (lineText.length !== 0) {
-            nextLine++;
-          }
-          break;
-        }
-      }
-    }
-    state.line = nextLine;
-    token = state.push("html_block", "", 0);
-    token.map = [ startLine, nextLine ];
-    token.content = state.getLines(startLine, nextLine, state.blkIndent, true);
-    return true;
-  };
-  var isSpace$5 = utils.isSpace;
-  var heading = function heading(state, startLine, endLine, silent) {
-    var ch, level, tmp, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine];
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    ch = state.src.charCodeAt(pos);
-    if (ch !== 35 /* # */ || pos >= max) {
-      return false;
-    }
-    // count heading level
-        level = 1;
-    ch = state.src.charCodeAt(++pos);
-    while (ch === 35 /* # */ && pos < max && level <= 6) {
-      level++;
-      ch = state.src.charCodeAt(++pos);
-    }
-    if (level > 6 || pos < max && !isSpace$5(ch)) {
-      return false;
-    }
-    if (silent) {
-      return true;
-    }
-    // Let's cut tails like '    ###  ' from the end of string
-        max = state.skipSpacesBack(max, pos);
-    tmp = state.skipCharsBack(max, 35, pos);
- // #
-        if (tmp > pos && isSpace$5(state.src.charCodeAt(tmp - 1))) {
-      max = tmp;
-    }
-    state.line = startLine + 1;
-    token = state.push("heading_open", "h" + String(level), 1);
-    token.markup = "########".slice(0, level);
-    token.map = [ startLine, state.line ];
-    token = state.push("inline", "", 0);
-    token.content = state.src.slice(pos, max).trim();
-    token.map = [ startLine, state.line ];
-    token.children = [];
-    token = state.push("heading_close", "h" + String(level), -1);
-    token.markup = "########".slice(0, level);
-    return true;
-  };
-  // lheading (---, ===)
-    var lheading = function lheading(state, startLine, endLine /*, silent*/) {
-    var content, terminate, i, l, token, pos, max, level, marker, nextLine = startLine + 1, oldParentType, terminatorRules = state.md.block.ruler.getRules("paragraph");
-    // if it's indented more than 3 spaces, it should be a code block
-        if (state.sCount[startLine] - state.blkIndent >= 4) {
-      return false;
-    }
-    oldParentType = state.parentType;
-    state.parentType = "paragraph";
- // use paragraph to match terminatorRules
-    // jump line-by-line until empty one or EOF
-        for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
-      // this would be a code block normally, but after paragraph
-      // it's considered a lazy continuation regardless of what's there
-      if (state.sCount[nextLine] - state.blkIndent > 3) {
-        continue;
-      }
-      
-      // Check for underline in setext header
-      
-            if (state.sCount[nextLine] >= state.blkIndent) {
-        pos = state.bMarks[nextLine] + state.tShift[nextLine];
-        max = state.eMarks[nextLine];
-        if (pos < max) {
-          marker = state.src.charCodeAt(pos);
-          if (marker === 45 /* - */ || marker === 61 /* = */) {
-            pos = state.skipChars(pos, marker);
-            pos = state.skipSpaces(pos);
-            if (pos >= max) {
-              level = marker === 61 /* = */ ? 1 : 2;
-              break;
-            }
-          }
-        }
-      }
-      // quirk for blockquotes, this line should already be checked by that rule
-            if (state.sCount[nextLine] < 0) {
-        continue;
-      }
-      // Some tags can terminate paragraph without empty line.
-            terminate = false;
-      for (i = 0, l = terminatorRules.length; i < l; i++) {
-        if (terminatorRules[i](state, nextLine, endLine, true)) {
-          terminate = true;
-          break;
-        }
-      }
-      if (terminate) {
-        break;
-      }
-    }
-    if (!level) {
-      // Didn't find valid underline
-      return false;
-    }
-    content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
-    state.line = nextLine + 1;
-    token = state.push("heading_open", "h" + String(level), 1);
-    token.markup = String.fromCharCode(marker);
-    token.map = [ startLine, state.line ];
-    token = state.push("inline", "", 0);
-    token.content = content;
-    token.map = [ startLine, state.line - 1 ];
-    token.children = [];
-    token = state.push("heading_close", "h" + String(level), -1);
-    token.markup = String.fromCharCode(marker);
-    state.parentType = oldParentType;
-    return true;
-  };
-  // Paragraph
-    var paragraph = function paragraph(state, startLine, endLine) {
-    var content, terminate, i, l, token, oldParentType, nextLine = startLine + 1, terminatorRules = state.md.block.ruler.getRules("paragraph");
-    oldParentType = state.parentType;
-    state.parentType = "paragraph";
-    // jump line-by-line until empty one or EOF
-        for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
-      // this would be a code block normally, but after paragraph
-      // it's considered a lazy continuation regardless of what's there
-      if (state.sCount[nextLine] - state.blkIndent > 3) {
-        continue;
-      }
-      // quirk for blockquotes, this line should already be checked by that rule
-            if (state.sCount[nextLine] < 0) {
-        continue;
-      }
-      // Some tags can terminate paragraph without empty line.
-            terminate = false;
-      for (i = 0, l = terminatorRules.length; i < l; i++) {
-        if (terminatorRules[i](state, nextLine, endLine, true)) {
-          terminate = true;
-          break;
-        }
-      }
-      if (terminate) {
-        break;
-      }
-    }
-    content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
-    state.line = nextLine;
-    token = state.push("paragraph_open", "p", 1);
-    token.map = [ startLine, state.line ];
-    token = state.push("inline", "", 0);
-    token.content = content;
-    token.map = [ startLine, state.line ];
-    token.children = [];
-    token = state.push("paragraph_close", "p", -1);
-    state.parentType = oldParentType;
-    return true;
-  };
-  var isSpace$4 = utils.isSpace;
-  function StateBlock(src, md, env, tokens) {
-    var ch, s, start, pos, len, indent, offset, indent_found;
-    this.src = src;
-    // link to parser instance
-        this.md = md;
-    this.env = env;
-    
-    // Internal state vartiables
-    
-        this.tokens = tokens;
-    this.bMarks = [];
- // line begin offsets for fast jumps
-        this.eMarks = [];
- // line end offsets for fast jumps
-        this.tShift = [];
- // offsets of the first non-space characters (tabs not expanded)
-        this.sCount = [];
- // indents for each line (tabs expanded)
-    // An amount of virtual spaces (tabs expanded) between beginning
-    // of each line (bMarks) and real beginning of that line.
-    
-    // It exists only as a hack because blockquotes override bMarks
-    // losing information in the process.
-    
-    // It's used only when expanding tabs, you can think about it as
-    // an initial tab length, e.g. bsCount=21 applied to string `\t123`
-    // means first tab should be expanded to 4-21%4 === 3 spaces.
-    
-        this.bsCount = [];
-    // block parser variables
-        this.blkIndent = 0;
- // required block content indent (for example, if we are
-    // inside a list, it would be positioned after list marker)
-        this.line = 0;
- // line index in src
-        this.lineMax = 0;
- // lines count
-        this.tight = false;
- // loose/tight mode for lists
-        this.ddIndent = -1;
- // indent of the current dd block (-1 if there isn't any)
-        this.listIndent = -1;
- // indent of the current list block (-1 if there isn't any)
-    // can be 'blockquote', 'list', 'root', 'paragraph' or 'reference'
-    // used in lists to determine if they interrupt a paragraph
-        this.parentType = "root";
-    this.level = 0;
-    // renderer
-        this.result = "";
-    // Create caches
-    // Generate markers.
-        s = this.src;
-    indent_found = false;
-    for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) {
-      ch = s.charCodeAt(pos);
-      if (!indent_found) {
-        if (isSpace$4(ch)) {
-          indent++;
-          if (ch === 9) {
-            offset += 4 - offset % 4;
-          } else {
-            offset++;
-          }
-          continue;
-        } else {
-          indent_found = true;
-        }
-      }
-      if (ch === 10 || pos === len - 1) {
-        if (ch !== 10) {
-          pos++;
-        }
-        this.bMarks.push(start);
-        this.eMarks.push(pos);
-        this.tShift.push(indent);
-        this.sCount.push(offset);
-        this.bsCount.push(0);
-        indent_found = false;
-        indent = 0;
-        offset = 0;
-        start = pos + 1;
-      }
-    }
-    // Push fake entry to simplify cache bounds checks
-        this.bMarks.push(s.length);
-    this.eMarks.push(s.length);
-    this.tShift.push(0);
-    this.sCount.push(0);
-    this.bsCount.push(0);
-    this.lineMax = this.bMarks.length - 1;
- // don't count last fake line
-    }
-  // Push new token to "stream".
-  
-    StateBlock.prototype.push = function(type, tag, nesting) {
-    var token$1 = new token(type, tag, nesting);
-    token$1.block = true;
-    if (nesting < 0) this.level--;
- // closing tag
-        token$1.level = this.level;
-    if (nesting > 0) this.level++;
- // opening tag
-        this.tokens.push(token$1);
-    return token$1;
-  };
-  StateBlock.prototype.isEmpty = function isEmpty(line) {
-    return this.bMarks[line] + this.tShift[line] >= this.eMarks[line];
-  };
-  StateBlock.prototype.skipEmptyLines = function skipEmptyLines(from) {
-    for (var max = this.lineMax; from < max; from++) {
-      if (this.bMarks[from] + this.tShift[from] < this.eMarks[from]) {
-        break;
-      }
-    }
-    return from;
-  };
-  // Skip spaces from given position.
-    StateBlock.prototype.skipSpaces = function skipSpaces(pos) {
-    var ch;
-    for (var max = this.src.length; pos < max; pos++) {
-      ch = this.src.charCodeAt(pos);
-      if (!isSpace$4(ch)) {
-        break;
-      }
-    }
-    return pos;
-  };
-  // Skip spaces from given position in reverse.
-    StateBlock.prototype.skipSpacesBack = function skipSpacesBack(pos, min) {
-    if (pos <= min) {
-      return pos;
-    }
-    while (pos > min) {
-      if (!isSpace$4(this.src.charCodeAt(--pos))) {
-        return pos + 1;
-      }
-    }
-    return pos;
-  };
-  // Skip char codes from given position
-    StateBlock.prototype.skipChars = function skipChars(pos, code) {
-    for (var max = this.src.length; pos < max; pos++) {
-      if (this.src.charCodeAt(pos) !== code) {
-        break;
-      }
-    }
-    return pos;
-  };
-  // Skip char codes reverse from given position - 1
-    StateBlock.prototype.skipCharsBack = function skipCharsBack(pos, code, min) {
-    if (pos <= min) {
-      return pos;
-    }
-    while (pos > min) {
-      if (code !== this.src.charCodeAt(--pos)) {
-        return pos + 1;
-      }
-    }
-    return pos;
-  };
-  // cut lines range from source.
-    StateBlock.prototype.getLines = function getLines(begin, end, indent, keepLastLF) {
-    var i, lineIndent, ch, first, last, queue, lineStart, line = begin;
-    if (begin >= end) {
-      return "";
-    }
-    queue = new Array(end - begin);
-    for (i = 0; line < end; line++, i++) {
-      lineIndent = 0;
-      lineStart = first = this.bMarks[line];
-      if (line + 1 < end || keepLastLF) {
-        // No need for bounds check because we have fake entry on tail.
-        last = this.eMarks[line] + 1;
-      } else {
-        last = this.eMarks[line];
-      }
-      while (first < last && lineIndent < indent) {
-        ch = this.src.charCodeAt(first);
-        if (isSpace$4(ch)) {
-          if (ch === 9) {
-            lineIndent += 4 - (lineIndent + this.bsCount[line]) % 4;
-          } else {
-            lineIndent++;
-          }
-        } else if (first - lineStart < this.tShift[line]) {
-          // patched tShift masked characters to look like spaces (blockquotes, list markers)
-          lineIndent++;
-        } else {
-          break;
-        }
-        first++;
-      }
-      if (lineIndent > indent) {
-        // partially expanding tabs in code blocks, e.g '\t\tfoobar'
-        // with indent=2 becomes '  \tfoobar'
-        queue[i] = new Array(lineIndent - indent + 1).join(" ") + this.src.slice(first, last);
-      } else {
-        queue[i] = this.src.slice(first, last);
-      }
-    }
-    return queue.join("");
-  };
-  // re-export Token class to use in block rules
-    StateBlock.prototype.Token = token;
-  var state_block = StateBlock;
-  var _rules$1 = [ 
-  // First 2 params - rule name & source. Secondary array - list of rules,
-  // which can be terminated by this one.
-  [ "table", table, [ "paragraph", "reference" ] ], [ "code", code ], [ "fence", fence, [ "paragraph", "reference", "blockquote", "list" ] ], [ "blockquote", blockquote, [ "paragraph", "reference", "blockquote", "list" ] ], [ "hr", hr, [ "paragraph", "reference", "blockquote", "list" ] ], [ "list", list, [ "paragraph", "reference", "blockquote" ] ], [ "reference", reference ], [ "html_block", html_block, [ "paragraph", "reference", "blockquote" ] ], [ "heading", heading, [ "paragraph", "reference", "blockquote" ] ], [ "lheading", lheading ], [ "paragraph", paragraph ] ];
-  /**
-	 * new ParserBlock()
-	 **/  function ParserBlock() {
-    /**
-	   * ParserBlock#ruler -> Ruler
-	   *
-	   * [[Ruler]] instance. Keep configuration of block rules.
-	   **/
-    this.ruler = new ruler;
-    for (var i = 0; i < _rules$1.length; i++) {
-      this.ruler.push(_rules$1[i][0], _rules$1[i][1], {
-        alt: (_rules$1[i][2] || []).slice()
-      });
-    }
-  }
-  // Generate tokens for input range
-  
-    ParserBlock.prototype.tokenize = function(state, startLine, endLine) {
-    var ok, i, prevLine, rules = this.ruler.getRules(""), len = rules.length, line = startLine, hasEmptyLines = false, maxNesting = state.md.options.maxNesting;
-    while (line < endLine) {
-      state.line = line = state.skipEmptyLines(line);
-      if (line >= endLine) {
-        break;
-      }
-      // Termination condition for nested calls.
-      // Nested calls currently used for blockquotes & lists
-            if (state.sCount[line] < state.blkIndent) {
-        break;
-      }
-      // If nesting level exceeded - skip tail to the end. That's not ordinary
-      // situation and we should not care about content.
-            if (state.level >= maxNesting) {
-        state.line = endLine;
-        break;
-      }
-      // Try all possible rules.
-      // On success, rule should:
-      
-      // - update `state.line`
-      // - update `state.tokens`
-      // - return true
-            prevLine = state.line;
-      for (i = 0; i < len; i++) {
-        ok = rules[i](state, line, endLine, false);
-        if (ok) {
-          if (prevLine >= state.line) {
-            throw new Error("block rule didn't increment state.line");
-          }
-          break;
-        }
-      }
-      // this can only happen if user disables paragraph rule
-            if (!ok) throw new Error("none of the block rules matched");
-      // set state.tight if we had an empty line before current tag
-      // i.e. latest empty line should not count
-            state.tight = !hasEmptyLines;
-      // paragraph might "eat" one newline after it in nested lists
-            if (state.isEmpty(state.line - 1)) {
-        hasEmptyLines = true;
-      }
-      line = state.line;
-      if (line < endLine && state.isEmpty(line)) {
-        hasEmptyLines = true;
-        line++;
-        state.line = line;
-      }
-    }
-  };
-  /**
-	 * ParserBlock.parse(str, md, env, outTokens)
-	 *
-	 * Process input string and push block tokens into `outTokens`
-	 **/  ParserBlock.prototype.parse = function(src, md, env, outTokens) {
-    var state;
-    if (!src) {
-      return;
-    }
-    state = new this.State(src, md, env, outTokens);
-    this.tokenize(state, state.line, state.lineMax);
-  };
-  ParserBlock.prototype.State = state_block;
-  var parser_block = ParserBlock;
-  // Skip text characters for text token, place those to pending buffer
-  // Rule to skip pure text
-  // '{}$%@~+=:' reserved for extentions
-  // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~
-  // !!!! Don't confuse with "Markdown ASCII Punctuation" chars
-  // http://spec.commonmark.org/0.15/#ascii-punctuation-character
-    function isTerminatorChar(ch) {
-    switch (ch) {
-     case 10 /* \n */ :
-     case 33 /* ! */ :
-     case 35 /* # */ :
-     case 36 /* $ */ :
-     case 37 /* % */ :
-     case 38 /* & */ :
-     case 42 /* * */ :
-     case 43 /* + */ :
-     case 45 /* - */ :
-     case 58 /* : */ :
-     case 60 /* < */ :
-     case 61 /* = */ :
-     case 62 /* > */ :
-     case 64 /* @ */ :
-     case 91 /* [ */ :
-     case 92 /* \ */ :
-     case 93 /* ] */ :
-     case 94 /* ^ */ :
-     case 95 /* _ */ :
-     case 96 /* ` */ :
-     case 123 /* { */ :
-     case 125 /* } */ :
-     case 126 /* ~ */ :
-      return true;
-
-     default:
-      return false;
-    }
-  }
-  var text = function text(state, silent) {
-    var pos = state.pos;
-    while (pos < state.posMax && !isTerminatorChar(state.src.charCodeAt(pos))) {
-      pos++;
-    }
-    if (pos === state.pos) {
-      return false;
-    }
-    if (!silent) {
-      state.pending += state.src.slice(state.pos, pos);
-    }
-    state.pos = pos;
-    return true;
-  };
-  // Process links like https://example.org/
-  // RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
-    var SCHEME_RE = /(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$/i;
-  var linkify = function linkify(state, silent) {
-    var pos, max, match, proto, link, url, fullUrl, token;
-    if (!state.md.options.linkify) return false;
-    if (state.linkLevel > 0) return false;
-    pos = state.pos;
-    max = state.posMax;
-    if (pos + 3 > max) return false;
-    if (state.src.charCodeAt(pos) !== 58 /* : */) return false;
-    if (state.src.charCodeAt(pos + 1) !== 47 /* / */) return false;
-    if (state.src.charCodeAt(pos + 2) !== 47 /* / */) return false;
-    match = state.pending.match(SCHEME_RE);
-    if (!match) return false;
-    proto = match[1];
-    link = state.md.linkify.matchAtStart(state.src.slice(pos - proto.length));
-    if (!link) return false;
-    url = link.url;
-    // invalid link, but still detected by linkify somehow;
-    // need to check to prevent infinite loop below
-        if (url.length <= proto.length) return false;
-    // disallow '*' at the end of the link (conflicts with emphasis)
-        url = url.replace(/\*+$/, "");
-    fullUrl = state.md.normalizeLink(url);
-    if (!state.md.validateLink(fullUrl)) return false;
-    if (!silent) {
-      state.pending = state.pending.slice(0, -proto.length);
-      token = state.push("link_open", "a", 1);
-      token.attrs = [ [ "href", fullUrl ] ];
-      token.markup = "linkify";
-      token.info = "auto";
-      token = state.push("text", "", 0);
-      token.content = state.md.normalizeLinkText(url);
-      token = state.push("link_close", "a", -1);
-      token.markup = "linkify";
-      token.info = "auto";
-    }
-    state.pos += url.length - proto.length;
-    return true;
-  };
-  var isSpace$3 = utils.isSpace;
-  var newline = function newline(state, silent) {
-    var pmax, max, ws, pos = state.pos;
-    if (state.src.charCodeAt(pos) !== 10 /* \n */) {
-      return false;
-    }
-    pmax = state.pending.length - 1;
-    max = state.posMax;
-    // '  \n' -> hardbreak
-    // Lookup in pending chars is bad practice! Don't copy to other rules!
-    // Pending string is stored in concat mode, indexed lookups will cause
-    // convertion to flat mode.
-        if (!silent) {
-      if (pmax >= 0 && state.pending.charCodeAt(pmax) === 32) {
-        if (pmax >= 1 && state.pending.charCodeAt(pmax - 1) === 32) {
-          // Find whitespaces tail of pending chars.
-          ws = pmax - 1;
-          while (ws >= 1 && state.pending.charCodeAt(ws - 1) === 32) ws--;
-          state.pending = state.pending.slice(0, ws);
-          state.push("hardbreak", "br", 0);
-        } else {
-          state.pending = state.pending.slice(0, -1);
-          state.push("softbreak", "br", 0);
-        }
-      } else {
-        state.push("softbreak", "br", 0);
-      }
-    }
-    pos++;
-    // skip heading spaces for next line
-        while (pos < max && isSpace$3(state.src.charCodeAt(pos))) {
-      pos++;
-    }
-    state.pos = pos;
-    return true;
-  };
-  var isSpace$2 = utils.isSpace;
-  var ESCAPED = [];
-  for (var i = 0; i < 256; i++) {
-    ESCAPED.push(0);
-  }
-  "\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-".split("").forEach((function(ch) {
-    ESCAPED[ch.charCodeAt(0)] = 1;
-  }));
-  var _escape = function escape(state, silent) {
-    var ch1, ch2, origStr, escapedStr, token, pos = state.pos, max = state.posMax;
-    if (state.src.charCodeAt(pos) !== 92 /* \ */) return false;
-    pos++;
-    // '\' at the end of the inline block
-        if (pos >= max) return false;
-    ch1 = state.src.charCodeAt(pos);
-    if (ch1 === 10) {
-      if (!silent) {
-        state.push("hardbreak", "br", 0);
-      }
-      pos++;
-      // skip leading whitespaces from next line
-            while (pos < max) {
-        ch1 = state.src.charCodeAt(pos);
-        if (!isSpace$2(ch1)) break;
-        pos++;
-      }
-      state.pos = pos;
-      return true;
-    }
-    escapedStr = state.src[pos];
-    if (ch1 >= 55296 && ch1 <= 56319 && pos + 1 < max) {
-      ch2 = state.src.charCodeAt(pos + 1);
-      if (ch2 >= 56320 && ch2 <= 57343) {
-        escapedStr += state.src[pos + 1];
-        pos++;
-      }
-    }
-    origStr = "\\" + escapedStr;
-    if (!silent) {
-      token = state.push("text_special", "", 0);
-      if (ch1 < 256 && ESCAPED[ch1] !== 0) {
-        token.content = escapedStr;
-      } else {
-        token.content = origStr;
-      }
-      token.markup = origStr;
-      token.info = "escape";
-    }
-    state.pos = pos + 1;
-    return true;
-  };
-  // Parse backticks
-    var backticks = function backtick(state, silent) {
-    var start, max, marker, token, matchStart, matchEnd, openerLength, closerLength, pos = state.pos, ch = state.src.charCodeAt(pos);
-    if (ch !== 96 /* ` */) {
-      return false;
-    }
-    start = pos;
-    pos++;
-    max = state.posMax;
-    // scan marker length
-        while (pos < max && state.src.charCodeAt(pos) === 96 /* ` */) {
-      pos++;
-    }
-    marker = state.src.slice(start, pos);
-    openerLength = marker.length;
-    if (state.backticksScanned && (state.backticks[openerLength] || 0) <= start) {
-      if (!silent) state.pending += marker;
-      state.pos += openerLength;
-      return true;
-    }
-    matchEnd = pos;
-    // Nothing found in the cache, scan until the end of the line (or until marker is found)
-        while ((matchStart = state.src.indexOf("`", matchEnd)) !== -1) {
-      matchEnd = matchStart + 1;
-      // scan marker length
-            while (matchEnd < max && state.src.charCodeAt(matchEnd) === 96 /* ` */) {
-        matchEnd++;
-      }
-      closerLength = matchEnd - matchStart;
-      if (closerLength === openerLength) {
-        // Found matching closer length.
-        if (!silent) {
-          token = state.push("code_inline", "code", 0);
-          token.markup = marker;
-          token.content = state.src.slice(pos, matchStart).replace(/\n/g, " ").replace(/^ (.+) $/, "$1");
-        }
-        state.pos = matchEnd;
-        return true;
-      }
-      // Some different length found, put it in cache as upper limit of where closer can be found
-            state.backticks[closerLength] = matchStart;
-    }
-    // Scanned through the end, didn't find anything
-        state.backticksScanned = true;
-    if (!silent) state.pending += marker;
-    state.pos += openerLength;
-    return true;
-  };
-  // ~~strike through~~
-  // Insert each marker as a separate text token, and add it to delimiter list
-  
-    var tokenize$1 = function strikethrough(state, silent) {
-    var i, scanned, token, len, ch, start = state.pos, marker = state.src.charCodeAt(start);
-    if (silent) {
-      return false;
-    }
-    if (marker !== 126 /* ~ */) {
-      return false;
-    }
-    scanned = state.scanDelims(state.pos, true);
-    len = scanned.length;
-    ch = String.fromCharCode(marker);
-    if (len < 2) {
-      return false;
-    }
-    if (len % 2) {
-      token = state.push("text", "", 0);
-      token.content = ch;
-      len--;
-    }
-    for (i = 0; i < len; i += 2) {
-      token = state.push("text", "", 0);
-      token.content = ch + ch;
-      state.delimiters.push({
-        marker: marker,
-        length: 0,
-        // disable "rule of 3" length checks meant for emphasis
-        token: state.tokens.length - 1,
-        end: -1,
-        open: scanned.can_open,
-        close: scanned.can_close
-      });
-    }
-    state.pos += scanned.length;
-    return true;
-  };
-  function postProcess$1(state, delimiters) {
-    var i, j, startDelim, endDelim, token, loneMarkers = [], max = delimiters.length;
-    for (i = 0; i < max; i++) {
-      startDelim = delimiters[i];
-      if (startDelim.marker !== 126 /* ~ */) {
-        continue;
-      }
-      if (startDelim.end === -1) {
-        continue;
-      }
-      endDelim = delimiters[startDelim.end];
-      token = state.tokens[startDelim.token];
-      token.type = "s_open";
-      token.tag = "s";
-      token.nesting = 1;
-      token.markup = "~~";
-      token.content = "";
-      token = state.tokens[endDelim.token];
-      token.type = "s_close";
-      token.tag = "s";
-      token.nesting = -1;
-      token.markup = "~~";
-      token.content = "";
-      if (state.tokens[endDelim.token - 1].type === "text" && state.tokens[endDelim.token - 1].content === "~") {
-        loneMarkers.push(endDelim.token - 1);
-      }
-    }
-    // If a marker sequence has an odd number of characters, it's splitted
-    // like this: `~~~~~` -> `~` + `~~` + `~~`, leaving one marker at the
-    // start of the sequence.
-    
-    // So, we have to move all those markers after subsequent s_close tags.
-    
-        while (loneMarkers.length) {
-      i = loneMarkers.pop();
-      j = i + 1;
-      while (j < state.tokens.length && state.tokens[j].type === "s_close") {
-        j++;
-      }
-      j--;
-      if (i !== j) {
-        token = state.tokens[j];
-        state.tokens[j] = state.tokens[i];
-        state.tokens[i] = token;
-      }
-    }
-  }
-  // Walk through delimiter list and replace text tokens with tags
-  
-    var postProcess_1$1 = function strikethrough(state) {
-    var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length;
-    postProcess$1(state, state.delimiters);
-    for (curr = 0; curr < max; curr++) {
-      if (tokens_meta[curr] && tokens_meta[curr].delimiters) {
-        postProcess$1(state, tokens_meta[curr].delimiters);
-      }
-    }
-  };
-  var strikethrough = {
-    tokenize: tokenize$1,
-    postProcess: postProcess_1$1
-  };
-  // Process *this* and _that_
-  // Insert each marker as a separate text token, and add it to delimiter list
-  
-    var tokenize = function emphasis(state, silent) {
-    var i, scanned, token, start = state.pos, marker = state.src.charCodeAt(start);
-    if (silent) {
-      return false;
-    }
-    if (marker !== 95 /* _ */ && marker !== 42 /* * */) {
-      return false;
-    }
-    scanned = state.scanDelims(state.pos, marker === 42);
-    for (i = 0; i < scanned.length; i++) {
-      token = state.push("text", "", 0);
-      token.content = String.fromCharCode(marker);
-      state.delimiters.push({
-        // Char code of the starting marker (number).
-        marker: marker,
-        // Total length of these series of delimiters.
-        length: scanned.length,
-        // A position of the token this delimiter corresponds to.
-        token: state.tokens.length - 1,
-        // If this delimiter is matched as a valid opener, `end` will be
-        // equal to its position, otherwise it's `-1`.
-        end: -1,
-        // Boolean flags that determine if this delimiter could open or close
-        // an emphasis.
-        open: scanned.can_open,
-        close: scanned.can_close
-      });
-    }
-    state.pos += scanned.length;
-    return true;
-  };
-  function postProcess(state, delimiters) {
-    var i, startDelim, endDelim, token, ch, isStrong, max = delimiters.length;
-    for (i = max - 1; i >= 0; i--) {
-      startDelim = delimiters[i];
-      if (startDelim.marker !== 95 /* _ */ && startDelim.marker !== 42 /* * */) {
-        continue;
-      }
-      // Process only opening markers
-            if (startDelim.end === -1) {
-        continue;
-      }
-      endDelim = delimiters[startDelim.end];
-      // If the previous delimiter has the same marker and is adjacent to this one,
-      // merge those into one strong delimiter.
-      
-      // `<em><em>whatever</em></em>` -> `<strong>whatever</strong>`
-      
-            isStrong = i > 0 && delimiters[i - 1].end === startDelim.end + 1 && 
-      // check that first two markers match and adjacent
-      delimiters[i - 1].marker === startDelim.marker && delimiters[i - 1].token === startDelim.token - 1 && 
-      // check that last two markers are adjacent (we can safely assume they match)
-      delimiters[startDelim.end + 1].token === endDelim.token + 1;
-      ch = String.fromCharCode(startDelim.marker);
-      token = state.tokens[startDelim.token];
-      token.type = isStrong ? "strong_open" : "em_open";
-      token.tag = isStrong ? "strong" : "em";
-      token.nesting = 1;
-      token.markup = isStrong ? ch + ch : ch;
-      token.content = "";
-      token = state.tokens[endDelim.token];
-      token.type = isStrong ? "strong_close" : "em_close";
-      token.tag = isStrong ? "strong" : "em";
-      token.nesting = -1;
-      token.markup = isStrong ? ch + ch : ch;
-      token.content = "";
-      if (isStrong) {
-        state.tokens[delimiters[i - 1].token].content = "";
-        state.tokens[delimiters[startDelim.end + 1].token].content = "";
-        i--;
-      }
-    }
-  }
-  // Walk through delimiter list and replace text tokens with tags
-  
-    var postProcess_1 = function emphasis(state) {
-    var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length;
-    postProcess(state, state.delimiters);
-    for (curr = 0; curr < max; curr++) {
-      if (tokens_meta[curr] && tokens_meta[curr].delimiters) {
-        postProcess(state, tokens_meta[curr].delimiters);
-      }
-    }
-  };
-  var emphasis = {
-    tokenize: tokenize,
-    postProcess: postProcess_1
-  };
-  var normalizeReference$1 = utils.normalizeReference;
-  var isSpace$1 = utils.isSpace;
-  var link = function link(state, silent) {
-    var attrs, code, label, labelEnd, labelStart, pos, res, ref, token, href = "", title = "", oldPos = state.pos, max = state.posMax, start = state.pos, parseReference = true;
-    if (state.src.charCodeAt(state.pos) !== 91 /* [ */) {
-      return false;
-    }
-    labelStart = state.pos + 1;
-    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos, true);
-    // parser failed to find ']', so it's not a valid link
-        if (labelEnd < 0) {
-      return false;
-    }
-    pos = labelEnd + 1;
-    if (pos < max && state.src.charCodeAt(pos) === 40 /* ( */) {
-      // Inline link
-      // might have found a valid shortcut link, disable reference parsing
-      parseReference = false;
-      // [link](  <href>  "title"  )
-      //        ^^ skipping these spaces
-            pos++;
-      for (;pos < max; pos++) {
-        code = state.src.charCodeAt(pos);
-        if (!isSpace$1(code) && code !== 10) {
-          break;
-        }
-      }
-      if (pos >= max) {
-        return false;
-      }
-      // [link](  <href>  "title"  )
-      //          ^^^^^^ parsing link destination
-            start = pos;
-      res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax);
-      if (res.ok) {
-        href = state.md.normalizeLink(res.str);
-        if (state.md.validateLink(href)) {
-          pos = res.pos;
-        } else {
-          href = "";
-        }
-        // [link](  <href>  "title"  )
-        //                ^^ skipping these spaces
-                start = pos;
-        for (;pos < max; pos++) {
-          code = state.src.charCodeAt(pos);
-          if (!isSpace$1(code) && code !== 10) {
-            break;
-          }
-        }
-        // [link](  <href>  "title"  )
-        //                  ^^^^^^^ parsing link title
-                res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax);
-        if (pos < max && start !== pos && res.ok) {
-          title = res.str;
-          pos = res.pos;
-          // [link](  <href>  "title"  )
-          //                         ^^ skipping these spaces
-                    for (;pos < max; pos++) {
-            code = state.src.charCodeAt(pos);
-            if (!isSpace$1(code) && code !== 10) {
-              break;
-            }
-          }
-        }
-      }
-      if (pos >= max || state.src.charCodeAt(pos) !== 41 /* ) */) {
-        // parsing a valid shortcut link failed, fallback to reference
-        parseReference = true;
-      }
-      pos++;
-    }
-    if (parseReference) {
-      // Link reference
-      if (typeof state.env.references === "undefined") {
-        return false;
-      }
-      if (pos < max && state.src.charCodeAt(pos) === 91 /* [ */) {
-        start = pos + 1;
-        pos = state.md.helpers.parseLinkLabel(state, pos);
-        if (pos >= 0) {
-          label = state.src.slice(start, pos++);
-        } else {
-          pos = labelEnd + 1;
-        }
-      } else {
-        pos = labelEnd + 1;
-      }
-      // covers label === '' and label === undefined
-      // (collapsed reference link and shortcut reference link respectively)
-            if (!label) {
-        label = state.src.slice(labelStart, labelEnd);
-      }
-      ref = state.env.references[normalizeReference$1(label)];
-      if (!ref) {
-        state.pos = oldPos;
-        return false;
-      }
-      href = ref.href;
-      title = ref.title;
-    }
-    
-    // We found the end of the link, and know for a fact it's a valid link;
-    // so all that's left to do is to call tokenizer.
-    
-        if (!silent) {
-      state.pos = labelStart;
-      state.posMax = labelEnd;
-      token = state.push("link_open", "a", 1);
-      token.attrs = attrs = [ [ "href", href ] ];
-      if (title) {
-        attrs.push([ "title", title ]);
-      }
-      state.linkLevel++;
-      state.md.inline.tokenize(state);
-      state.linkLevel--;
-      token = state.push("link_close", "a", -1);
-    }
-    state.pos = pos;
-    state.posMax = max;
-    return true;
-  };
-  var normalizeReference = utils.normalizeReference;
-  var isSpace = utils.isSpace;
-  var image = function image(state, silent) {
-    var attrs, code, content, label, labelEnd, labelStart, pos, ref, res, title, token, tokens, start, href = "", oldPos = state.pos, max = state.posMax;
-    if (state.src.charCodeAt(state.pos) !== 33 /* ! */) {
-      return false;
-    }
-    if (state.src.charCodeAt(state.pos + 1) !== 91 /* [ */) {
-      return false;
-    }
-    labelStart = state.pos + 2;
-    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos + 1, false);
-    // parser failed to find ']', so it's not a valid link
-        if (labelEnd < 0) {
-      return false;
-    }
-    pos = labelEnd + 1;
-    if (pos < max && state.src.charCodeAt(pos) === 40 /* ( */) {
-      // Inline link
-      // [link](  <href>  "title"  )
-      //        ^^ skipping these spaces
-      pos++;
-      for (;pos < max; pos++) {
-        code = state.src.charCodeAt(pos);
-        if (!isSpace(code) && code !== 10) {
-          break;
-        }
-      }
-      if (pos >= max) {
-        return false;
-      }
-      // [link](  <href>  "title"  )
-      //          ^^^^^^ parsing link destination
-            start = pos;
-      res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax);
-      if (res.ok) {
-        href = state.md.normalizeLink(res.str);
-        if (state.md.validateLink(href)) {
-          pos = res.pos;
-        } else {
-          href = "";
-        }
-      }
-      // [link](  <href>  "title"  )
-      //                ^^ skipping these spaces
-            start = pos;
-      for (;pos < max; pos++) {
-        code = state.src.charCodeAt(pos);
-        if (!isSpace(code) && code !== 10) {
-          break;
-        }
-      }
-      // [link](  <href>  "title"  )
-      //                  ^^^^^^^ parsing link title
-            res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax);
-      if (pos < max && start !== pos && res.ok) {
-        title = res.str;
-        pos = res.pos;
-        // [link](  <href>  "title"  )
-        //                         ^^ skipping these spaces
-                for (;pos < max; pos++) {
-          code = state.src.charCodeAt(pos);
-          if (!isSpace(code) && code !== 10) {
-            break;
-          }
-        }
-      } else {
-        title = "";
-      }
-      if (pos >= max || state.src.charCodeAt(pos) !== 41 /* ) */) {
-        state.pos = oldPos;
-        return false;
-      }
-      pos++;
-    } else {
-      // Link reference
-      if (typeof state.env.references === "undefined") {
-        return false;
-      }
-      if (pos < max && state.src.charCodeAt(pos) === 91 /* [ */) {
-        start = pos + 1;
-        pos = state.md.helpers.parseLinkLabel(state, pos);
-        if (pos >= 0) {
-          label = state.src.slice(start, pos++);
-        } else {
-          pos = labelEnd + 1;
-        }
-      } else {
-        pos = labelEnd + 1;
-      }
-      // covers label === '' and label === undefined
-      // (collapsed reference link and shortcut reference link respectively)
-            if (!label) {
-        label = state.src.slice(labelStart, labelEnd);
-      }
-      ref = state.env.references[normalizeReference(label)];
-      if (!ref) {
-        state.pos = oldPos;
-        return false;
-      }
-      href = ref.href;
-      title = ref.title;
-    }
-    
-    // We found the end of the link, and know for a fact it's a valid link;
-    // so all that's left to do is to call tokenizer.
-    
-        if (!silent) {
-      content = state.src.slice(labelStart, labelEnd);
-      state.md.inline.parse(content, state.md, state.env, tokens = []);
-      token = state.push("image", "img", 0);
-      token.attrs = attrs = [ [ "src", href ], [ "alt", "" ] ];
-      token.children = tokens;
-      token.content = content;
-      if (title) {
-        attrs.push([ "title", title ]);
-      }
-    }
-    state.pos = pos;
-    state.posMax = max;
-    return true;
-  };
-  // Process autolinks '<protocol:...>'
-  /*eslint max-len:0*/  var EMAIL_RE = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$/;
-  var AUTOLINK_RE = /^([a-zA-Z][a-zA-Z0-9+.\-]{1,31}):([^<>\x00-\x20]*)$/;
-  var autolink = function autolink(state, silent) {
-    var url, fullUrl, token, ch, start, max, pos = state.pos;
-    if (state.src.charCodeAt(pos) !== 60 /* < */) {
-      return false;
-    }
-    start = state.pos;
-    max = state.posMax;
-    for (;;) {
-      if (++pos >= max) return false;
-      ch = state.src.charCodeAt(pos);
-      if (ch === 60 /* < */) return false;
-      if (ch === 62 /* > */) break;
-    }
-    url = state.src.slice(start + 1, pos);
-    if (AUTOLINK_RE.test(url)) {
-      fullUrl = state.md.normalizeLink(url);
-      if (!state.md.validateLink(fullUrl)) {
-        return false;
-      }
-      if (!silent) {
-        token = state.push("link_open", "a", 1);
-        token.attrs = [ [ "href", fullUrl ] ];
-        token.markup = "autolink";
-        token.info = "auto";
-        token = state.push("text", "", 0);
-        token.content = state.md.normalizeLinkText(url);
-        token = state.push("link_close", "a", -1);
-        token.markup = "autolink";
-        token.info = "auto";
-      }
-      state.pos += url.length + 2;
-      return true;
-    }
-    if (EMAIL_RE.test(url)) {
-      fullUrl = state.md.normalizeLink("mailto:" + url);
-      if (!state.md.validateLink(fullUrl)) {
-        return false;
-      }
-      if (!silent) {
-        token = state.push("link_open", "a", 1);
-        token.attrs = [ [ "href", fullUrl ] ];
-        token.markup = "autolink";
-        token.info = "auto";
-        token = state.push("text", "", 0);
-        token.content = state.md.normalizeLinkText(url);
-        token = state.push("link_close", "a", -1);
-        token.markup = "autolink";
-        token.info = "auto";
-      }
-      state.pos += url.length + 2;
-      return true;
-    }
-    return false;
-  };
-  var HTML_TAG_RE = html_re.HTML_TAG_RE;
-  function isLinkOpen(str) {
-    return /^<a[>\s]/i.test(str);
-  }
-  function isLinkClose(str) {
-    return /^<\/a\s*>/i.test(str);
-  }
-  function isLetter(ch) {
-    /*eslint no-bitwise:0*/
-    var lc = ch | 32;
- // to lower case
-        return lc >= 97 /* a */ && lc <= 122 /* z */;
-  }
-  var html_inline = function html_inline(state, silent) {
-    var ch, match, max, token, pos = state.pos;
-    if (!state.md.options.html) {
-      return false;
-    }
-    // Check start
-        max = state.posMax;
-    if (state.src.charCodeAt(pos) !== 60 /* < */ || pos + 2 >= max) {
-      return false;
-    }
-    // Quick fail on second char
-        ch = state.src.charCodeAt(pos + 1);
-    if (ch !== 33 /* ! */ && ch !== 63 /* ? */ && ch !== 47 /* / */ && !isLetter(ch)) {
-      return false;
-    }
-    match = state.src.slice(pos).match(HTML_TAG_RE);
-    if (!match) {
-      return false;
-    }
-    if (!silent) {
-      token = state.push("html_inline", "", 0);
-      token.content = match[0];
-      if (isLinkOpen(token.content)) state.linkLevel++;
-      if (isLinkClose(token.content)) state.linkLevel--;
-    }
-    state.pos += match[0].length;
-    return true;
-  };
-  var has = utils.has;
-  var isValidEntityCode = utils.isValidEntityCode;
-  var fromCodePoint = utils.fromCodePoint;
-  var DIGITAL_RE = /^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));/i;
-  var NAMED_RE = /^&([a-z][a-z0-9]{1,31});/i;
-  var entity = function entity(state, silent) {
-    var ch, code, match, token, pos = state.pos, max = state.posMax;
-    if (state.src.charCodeAt(pos) !== 38 /* & */) return false;
-    if (pos + 1 >= max) return false;
-    ch = state.src.charCodeAt(pos + 1);
-    if (ch === 35 /* # */) {
-      match = state.src.slice(pos).match(DIGITAL_RE);
-      if (match) {
-        if (!silent) {
-          code = match[1][0].toLowerCase() === "x" ? parseInt(match[1].slice(1), 16) : parseInt(match[1], 10);
-          token = state.push("text_special", "", 0);
-          token.content = isValidEntityCode(code) ? fromCodePoint(code) : fromCodePoint(65533);
-          token.markup = match[0];
-          token.info = "entity";
-        }
-        state.pos += match[0].length;
-        return true;
-      }
-    } else {
-      match = state.src.slice(pos).match(NAMED_RE);
-      if (match) {
-        if (has(entities, match[1])) {
-          if (!silent) {
-            token = state.push("text_special", "", 0);
-            token.content = entities[match[1]];
-            token.markup = match[0];
-            token.info = "entity";
-          }
-          state.pos += match[0].length;
-          return true;
-        }
-      }
-    }
-    return false;
-  };
-  // For each opening emphasis-like marker find a matching closing one
-    function processDelimiters(delimiters) {
-    var closerIdx, openerIdx, closer, opener, minOpenerIdx, newMinOpenerIdx, isOddMatch, lastJump, openersBottom = {}, max = delimiters.length;
-    if (!max) return;
-    // headerIdx is the first delimiter of the current (where closer is) delimiter run
-        var headerIdx = 0;
-    var lastTokenIdx = -2;
- // needs any value lower than -1
-        var jumps = [];
-    for (closerIdx = 0; closerIdx < max; closerIdx++) {
-      closer = delimiters[closerIdx];
-      jumps.push(0);
-      // markers belong to same delimiter run if:
-      //  - they have adjacent tokens
-      //  - AND markers are the same
-      
-            if (delimiters[headerIdx].marker !== closer.marker || lastTokenIdx !== closer.token - 1) {
-        headerIdx = closerIdx;
-      }
-      lastTokenIdx = closer.token;
-      // Length is only used for emphasis-specific "rule of 3",
-      // if it's not defined (in strikethrough or 3rd party plugins),
-      // we can default it to 0 to disable those checks.
-      
-            closer.length = closer.length || 0;
-      if (!closer.close) continue;
-      // Previously calculated lower bounds (previous fails)
-      // for each marker, each delimiter length modulo 3,
-      // and for whether this closer can be an opener;
-      // https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460
-            if (!openersBottom.hasOwnProperty(closer.marker)) {
-        openersBottom[closer.marker] = [ -1, -1, -1, -1, -1, -1 ];
-      }
-      minOpenerIdx = openersBottom[closer.marker][(closer.open ? 3 : 0) + closer.length % 3];
-      openerIdx = headerIdx - jumps[headerIdx] - 1;
-      newMinOpenerIdx = openerIdx;
-      for (;openerIdx > minOpenerIdx; openerIdx -= jumps[openerIdx] + 1) {
-        opener = delimiters[openerIdx];
-        if (opener.marker !== closer.marker) continue;
-        if (opener.open && opener.end < 0) {
-          isOddMatch = false;
-          // from spec:
-          
-          // If one of the delimiters can both open and close emphasis, then the
-          // sum of the lengths of the delimiter runs containing the opening and
-          // closing delimiters must not be a multiple of 3 unless both lengths
-          // are multiples of 3.
-          
-                    if (opener.close || closer.open) {
-            if ((opener.length + closer.length) % 3 === 0) {
-              if (opener.length % 3 !== 0 || closer.length % 3 !== 0) {
-                isOddMatch = true;
-              }
-            }
-          }
-          if (!isOddMatch) {
-            // If previous delimiter cannot be an opener, we can safely skip
-            // the entire sequence in future checks. This is required to make
-            // sure algorithm has linear complexity (see *_*_*_*_*_... case).
-            lastJump = openerIdx > 0 && !delimiters[openerIdx - 1].open ? jumps[openerIdx - 1] + 1 : 0;
-            jumps[closerIdx] = closerIdx - openerIdx + lastJump;
-            jumps[openerIdx] = lastJump;
-            closer.open = false;
-            opener.end = closerIdx;
-            opener.close = false;
-            newMinOpenerIdx = -1;
-            // treat next token as start of run,
-            // it optimizes skips in **<...>**a**<...>** pathological case
-                        lastTokenIdx = -2;
-            break;
-          }
-        }
-      }
-      if (newMinOpenerIdx !== -1) {
-        // If match for this delimiter run failed, we want to set lower bound for
-        // future lookups. This is required to make sure algorithm has linear
-        // complexity.
-        // See details here:
-        // https://github.com/commonmark/cmark/issues/178#issuecomment-270417442
-        openersBottom[closer.marker][(closer.open ? 3 : 0) + (closer.length || 0) % 3] = newMinOpenerIdx;
-      }
-    }
-  }
-  var balance_pairs = function link_pairs(state) {
-    var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length;
-    processDelimiters(state.delimiters);
-    for (curr = 0; curr < max; curr++) {
-      if (tokens_meta[curr] && tokens_meta[curr].delimiters) {
-        processDelimiters(tokens_meta[curr].delimiters);
-      }
-    }
-  };
-  // Clean up tokens after emphasis and strikethrough postprocessing:
-    var fragments_join = function fragments_join(state) {
-    var curr, last, level = 0, tokens = state.tokens, max = state.tokens.length;
-    for (curr = last = 0; curr < max; curr++) {
-      // re-calculate levels after emphasis/strikethrough turns some text nodes
-      // into opening/closing tags
-      if (tokens[curr].nesting < 0) level--;
- // closing tag
-            tokens[curr].level = level;
-      if (tokens[curr].nesting > 0) level++;
- // opening tag
-            if (tokens[curr].type === "text" && curr + 1 < max && tokens[curr + 1].type === "text") {
-        // collapse two adjacent text nodes
-        tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content;
-      } else {
-        if (curr !== last) {
-          tokens[last] = tokens[curr];
-        }
-        last++;
-      }
-    }
-    if (curr !== last) {
-      tokens.length = last;
-    }
-  };
-  var isWhiteSpace = utils.isWhiteSpace;
-  var isPunctChar = utils.isPunctChar;
-  var isMdAsciiPunct = utils.isMdAsciiPunct;
-  function StateInline(src, md, env, outTokens) {
-    this.src = src;
-    this.env = env;
-    this.md = md;
-    this.tokens = outTokens;
-    this.tokens_meta = Array(outTokens.length);
-    this.pos = 0;
-    this.posMax = this.src.length;
-    this.level = 0;
-    this.pending = "";
-    this.pendingLevel = 0;
-    // Stores { start: end } pairs. Useful for backtrack
-    // optimization of pairs parse (emphasis, strikes).
-        this.cache = {};
-    // List of emphasis-like delimiters for current tag
-        this.delimiters = [];
-    // Stack of delimiter lists for upper level tags
-        this._prev_delimiters = [];
-    // backtick length => last seen position
-        this.backticks = {};
-    this.backticksScanned = false;
-    // Counter used to disable inline linkify-it execution
-    // inside <a> and markdown links
-        this.linkLevel = 0;
-  }
-  // Flush pending text
-  
-    StateInline.prototype.pushPending = function() {
-    var token$1 = new token("text", "", 0);
-    token$1.content = this.pending;
-    token$1.level = this.pendingLevel;
-    this.tokens.push(token$1);
-    this.pending = "";
-    return token$1;
-  };
-  // Push new token to "stream".
-  // If pending text exists - flush it as text token
-  
-    StateInline.prototype.push = function(type, tag, nesting) {
-    if (this.pending) {
-      this.pushPending();
-    }
-    var token$1 = new token(type, tag, nesting);
-    var token_meta = null;
-    if (nesting < 0) {
-      // closing tag
-      this.level--;
-      this.delimiters = this._prev_delimiters.pop();
-    }
-    token$1.level = this.level;
-    if (nesting > 0) {
-      // opening tag
-      this.level++;
-      this._prev_delimiters.push(this.delimiters);
-      this.delimiters = [];
-      token_meta = {
-        delimiters: this.delimiters
-      };
-    }
-    this.pendingLevel = this.level;
-    this.tokens.push(token$1);
-    this.tokens_meta.push(token_meta);
-    return token$1;
-  };
-  // Scan a sequence of emphasis-like markers, and determine whether
-  // it can start an emphasis sequence or end an emphasis sequence.
-  
-  //  - start - position to scan from (it should point at a valid marker);
-  //  - canSplitWord - determine if these markers can be found inside a word
-  
-    StateInline.prototype.scanDelims = function(start, canSplitWord) {
-    var pos = start, lastChar, nextChar, count, can_open, can_close, isLastWhiteSpace, isLastPunctChar, isNextWhiteSpace, isNextPunctChar, left_flanking = true, right_flanking = true, max = this.posMax, marker = this.src.charCodeAt(start);
-    // treat beginning of the line as a whitespace
-        lastChar = start > 0 ? this.src.charCodeAt(start - 1) : 32;
-    while (pos < max && this.src.charCodeAt(pos) === marker) {
-      pos++;
-    }
-    count = pos - start;
-    // treat end of the line as a whitespace
-        nextChar = pos < max ? this.src.charCodeAt(pos) : 32;
-    isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCharCode(lastChar));
-    isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCharCode(nextChar));
-    isLastWhiteSpace = isWhiteSpace(lastChar);
-    isNextWhiteSpace = isWhiteSpace(nextChar);
-    if (isNextWhiteSpace) {
-      left_flanking = false;
-    } else if (isNextPunctChar) {
-      if (!(isLastWhiteSpace || isLastPunctChar)) {
-        left_flanking = false;
-      }
-    }
-    if (isLastWhiteSpace) {
-      right_flanking = false;
-    } else if (isLastPunctChar) {
-      if (!(isNextWhiteSpace || isNextPunctChar)) {
-        right_flanking = false;
-      }
-    }
-    if (!canSplitWord) {
-      can_open = left_flanking && (!right_flanking || isLastPunctChar);
-      can_close = right_flanking && (!left_flanking || isNextPunctChar);
-    } else {
-      can_open = left_flanking;
-      can_close = right_flanking;
-    }
-    return {
-      can_open: can_open,
-      can_close: can_close,
-      length: count
-    };
-  };
-  // re-export Token class to use in block rules
-    StateInline.prototype.Token = token;
-  var state_inline = StateInline;
-  ////////////////////////////////////////////////////////////////////////////////
-  // Parser rules
-    var _rules = [ [ "text", text ], [ "linkify", linkify ], [ "newline", newline ], [ "escape", _escape ], [ "backticks", backticks ], [ "strikethrough", strikethrough.tokenize ], [ "emphasis", emphasis.tokenize ], [ "link", link ], [ "image", image ], [ "autolink", autolink ], [ "html_inline", html_inline ], [ "entity", entity ] ];
-  // `rule2` ruleset was created specifically for emphasis/strikethrough
-  // post-processing and may be changed in the future.
-  
-  // Don't use this for anything except pairs (plugins working with `balance_pairs`).
-  
-    var _rules2 = [ [ "balance_pairs", balance_pairs ], [ "strikethrough", strikethrough.postProcess ], [ "emphasis", emphasis.postProcess ], 
-  // rules for pairs separate '**' into its own text tokens, which may be left unused,
-  // rule below merges unused segments back with the rest of the text
-  [ "fragments_join", fragments_join ] ];
-  /**
-	 * new ParserInline()
-	 **/  function ParserInline() {
-    var i;
-    /**
-	   * ParserInline#ruler -> Ruler
-	   *
-	   * [[Ruler]] instance. Keep configuration of inline rules.
-	   **/    this.ruler = new ruler;
-    for (i = 0; i < _rules.length; i++) {
-      this.ruler.push(_rules[i][0], _rules[i][1]);
-    }
-    /**
-	   * ParserInline#ruler2 -> Ruler
-	   *
-	   * [[Ruler]] instance. Second ruler used for post-processing
-	   * (e.g. in emphasis-like rules).
-	   **/    this.ruler2 = new ruler;
-    for (i = 0; i < _rules2.length; i++) {
-      this.ruler2.push(_rules2[i][0], _rules2[i][1]);
-    }
-  }
-  // Skip single token by running all rules in validation mode;
-  // returns `true` if any rule reported success
-  
-    ParserInline.prototype.skipToken = function(state) {
-    var ok, i, pos = state.pos, rules = this.ruler.getRules(""), len = rules.length, maxNesting = state.md.options.maxNesting, cache = state.cache;
-    if (typeof cache[pos] !== "undefined") {
-      state.pos = cache[pos];
-      return;
-    }
-    if (state.level < maxNesting) {
-      for (i = 0; i < len; i++) {
-        // Increment state.level and decrement it later to limit recursion.
-        // It's harmless to do here, because no tokens are created. But ideally,
-        // we'd need a separate private state variable for this purpose.
-        state.level++;
-        ok = rules[i](state, true);
-        state.level--;
-        if (ok) {
-          if (pos >= state.pos) {
-            throw new Error("inline rule didn't increment state.pos");
-          }
-          break;
-        }
-      }
-    } else {
-      // Too much nesting, just skip until the end of the paragraph.
-      // NOTE: this will cause links to behave incorrectly in the following case,
-      //       when an amount of `[` is exactly equal to `maxNesting + 1`:
-      //       [[[[[[[[[[[[[[[[[[[[[foo]()
-      // TODO: remove this workaround when CM standard will allow nested links
-      //       (we can replace it by preventing links from being parsed in
-      //       validation mode)
-      state.pos = state.posMax;
-    }
-    if (!ok) {
-      state.pos++;
-    }
-    cache[pos] = state.pos;
-  };
-  // Generate tokens for input range
-  
-    ParserInline.prototype.tokenize = function(state) {
-    var ok, i, prevPos, rules = this.ruler.getRules(""), len = rules.length, end = state.posMax, maxNesting = state.md.options.maxNesting;
-    while (state.pos < end) {
-      // Try all possible rules.
-      // On success, rule should:
-      // - update `state.pos`
-      // - update `state.tokens`
-      // - return true
-      prevPos = state.pos;
-      if (state.level < maxNesting) {
-        for (i = 0; i < len; i++) {
-          ok = rules[i](state, false);
-          if (ok) {
-            if (prevPos >= state.pos) {
-              throw new Error("inline rule didn't increment state.pos");
-            }
-            break;
-          }
-        }
-      }
-      if (ok) {
-        if (state.pos >= end) {
-          break;
-        }
-        continue;
-      }
-      state.pending += state.src[state.pos++];
-    }
-    if (state.pending) {
-      state.pushPending();
-    }
-  };
-  /**
-	 * ParserInline.parse(str, md, env, outTokens)
-	 *
-	 * Process input string and push inline tokens into `outTokens`
-	 **/  ParserInline.prototype.parse = function(str, md, env, outTokens) {
-    var i, rules, len;
-    var state = new this.State(str, md, env, outTokens);
-    this.tokenize(state);
-    rules = this.ruler2.getRules("");
-    len = rules.length;
-    for (i = 0; i < len; i++) {
-      rules[i](state);
-    }
-  };
-  ParserInline.prototype.State = state_inline;
-  var parser_inline = ParserInline;
-  var re = function(opts) {
-    var re = {};
-    opts = opts || {};
-    // Use direct extract instead of `regenerate` to reduse browserified size
-        re.src_Any = regex$3.source;
-    re.src_Cc = regex$2.source;
-    re.src_Z = regex.source;
-    re.src_P = regex$4.source;
-    // \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation)
-        re.src_ZPCc = [ re.src_Z, re.src_P, re.src_Cc ].join("|");
-    // \p{\Z\Cc} (white spaces + control)
-        re.src_ZCc = [ re.src_Z, re.src_Cc ].join("|");
-    // Experimental. List of chars, completely prohibited in links
-    // because can separate it from other part of text
-        var text_separators = "[><\uff5c]";
-    // All possible word characters (everything without punctuation, spaces & controls)
-    // Defined via punctuation & spaces to save space
-    // Should be something like \p{\L\N\S\M} (\w but without `_`)
-        re.src_pseudo_letter = "(?:(?!" + text_separators + "|" + re.src_ZPCc + ")" + re.src_Any + ")";
-    // The same as abothe but without [0-9]
-    // var src_pseudo_letter_non_d = '(?:(?![0-9]|' + src_ZPCc + ')' + src_Any + ')';
-    ////////////////////////////////////////////////////////////////////////////////
-        re.src_ip4 = "(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";
-    // Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch.
-        re.src_auth = "(?:(?:(?!" + re.src_ZCc + "|[@/\\[\\]()]).)+@)?";
-    re.src_port = "(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?";
-    re.src_host_terminator = "(?=$|" + text_separators + "|" + re.src_ZPCc + ")" + "(?!" + (opts["---"] ? "-(?!--)|" : "-|") + "_|:\\d|\\.-|\\.(?!$|" + re.src_ZPCc + "))";
-    re.src_path = "(?:" + "[/?#]" + "(?:" + "(?!" + re.src_ZCc + "|" + text_separators + "|[()[\\]{}.,\"'?!\\-;]).|" + "\\[(?:(?!" + re.src_ZCc + "|\\]).)*\\]|" + "\\((?:(?!" + re.src_ZCc + "|[)]).)*\\)|" + "\\{(?:(?!" + re.src_ZCc + "|[}]).)*\\}|" + '\\"(?:(?!' + re.src_ZCc + '|["]).)+\\"|' + "\\'(?:(?!" + re.src_ZCc + "|[']).)+\\'|" + "\\'(?=" + re.src_pseudo_letter + "|[-])|" + // allow `I'm_king` if no pair found
-    "\\.{2,}[a-zA-Z0-9%/&]|" + // google has many dots in "google search" links (#66, #81).
-    // github has ... in commit range links,
-    // Restrict to
-    // - english
-    // - percent-encoded
-    // - parts of file path
-    // - params separator
-    // until more examples found.
-    "\\.(?!" + re.src_ZCc + "|[.]|$)|" + (opts["---"] ? "\\-(?!--(?:[^-]|$))(?:-*)|" : "\\-+|") + ",(?!" + re.src_ZCc + "|$)|" + // allow `,,,` in paths
-    ";(?!" + re.src_ZCc + "|$)|" + // allow `;` if not followed by space-like char
-    "\\!+(?!" + re.src_ZCc + "|[!]|$)|" + // allow `!!!` in paths, but not at the end
-    "\\?(?!" + re.src_ZCc + "|[?]|$)" + ")+" + "|\\/" + ")?";
-    // Allow anything in markdown spec, forbid quote (") at the first position
-    // because emails enclosed in quotes are far more common
-        re.src_email_name = '[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*';
-    re.src_xn = "xn--[a-z0-9\\-]{1,59}";
-    // More to read about domain names
-    // http://serverfault.com/questions/638260/
-        re.src_domain_root = 
-    // Allow letters & digits (http://test1)
-    "(?:" + re.src_xn + "|" + re.src_pseudo_letter + "{1,63}" + ")";
-    re.src_domain = "(?:" + re.src_xn + "|" + "(?:" + re.src_pseudo_letter + ")" + "|" + "(?:" + re.src_pseudo_letter + "(?:-|" + re.src_pseudo_letter + "){0,61}" + re.src_pseudo_letter + ")" + ")";
-    re.src_host = "(?:" + 
-    // Don't need IP check, because digits are already allowed in normal domain names
-    //   src_ip4 +
-    // '|' +
-    "(?:(?:(?:" + re.src_domain + ")\\.)*" + re.src_domain /*_root*/ + ")" + ")";
-    re.tpl_host_fuzzy = "(?:" + re.src_ip4 + "|" + "(?:(?:(?:" + re.src_domain + ")\\.)+(?:%TLDS%))" + ")";
-    re.tpl_host_no_ip_fuzzy = "(?:(?:(?:" + re.src_domain + ")\\.)+(?:%TLDS%))";
-    re.src_host_strict = re.src_host + re.src_host_terminator;
-    re.tpl_host_fuzzy_strict = re.tpl_host_fuzzy + re.src_host_terminator;
-    re.src_host_port_strict = re.src_host + re.src_port + re.src_host_terminator;
-    re.tpl_host_port_fuzzy_strict = re.tpl_host_fuzzy + re.src_port + re.src_host_terminator;
-    re.tpl_host_port_no_ip_fuzzy_strict = re.tpl_host_no_ip_fuzzy + re.src_port + re.src_host_terminator;
-    ////////////////////////////////////////////////////////////////////////////////
-    // Main rules
-    // Rude test fuzzy links by host, for quick deny
-        re.tpl_host_fuzzy_test = "localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + re.src_ZPCc + "|>|$))";
-    re.tpl_email_fuzzy = "(^|" + text_separators + '|"|\\(|' + re.src_ZCc + ")" + "(" + re.src_email_name + "@" + re.tpl_host_fuzzy_strict + ")";
-    re.tpl_link_fuzzy = 
-    // Fuzzy link can't be prepended with .:/\- and non punctuation.
-    // but can start with > (markdown blockquote)
-    "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + re.src_ZPCc + "))" + "((?![$+<=>^`|\uff5c])" + re.tpl_host_port_fuzzy_strict + re.src_path + ")";
-    re.tpl_link_no_ip_fuzzy = 
-    // Fuzzy link can't be prepended with .:/\- and non punctuation.
-    // but can start with > (markdown blockquote)
-    "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + re.src_ZPCc + "))" + "((?![$+<=>^`|\uff5c])" + re.tpl_host_port_no_ip_fuzzy_strict + re.src_path + ")";
-    return re;
-  };
-  ////////////////////////////////////////////////////////////////////////////////
-  // Helpers
-  // Merge objects
-  
-    function assign(obj /*from1, from2, from3, ...*/) {
-    var sources = Array.prototype.slice.call(arguments, 1);
-    sources.forEach((function(source) {
-      if (!source) {
-        return;
-      }
-      Object.keys(source).forEach((function(key) {
-        obj[key] = source[key];
-      }));
-    }));
-    return obj;
-  }
-  function _class(obj) {
-    return Object.prototype.toString.call(obj);
-  }
-  function isString(obj) {
-    return _class(obj) === "[object String]";
-  }
-  function isObject(obj) {
-    return _class(obj) === "[object Object]";
-  }
-  function isRegExp(obj) {
-    return _class(obj) === "[object RegExp]";
-  }
-  function isFunction(obj) {
-    return _class(obj) === "[object Function]";
-  }
-  function escapeRE(str) {
-    return str.replace(/[.?*+^$[\]\\(){}|-]/g, "\\$&");
-  }
-  ////////////////////////////////////////////////////////////////////////////////
-    var defaultOptions = {
-    fuzzyLink: true,
-    fuzzyEmail: true,
-    fuzzyIP: false
-  };
-  function isOptionsObj(obj) {
-    return Object.keys(obj || {}).reduce((function(acc, k) {
-      return acc || defaultOptions.hasOwnProperty(k);
-    }), false);
-  }
-  var defaultSchemas = {
-    "http:": {
-      validate: function(text, pos, self) {
-        var tail = text.slice(pos);
-        if (!self.re.http) {
-          // compile lazily, because "host"-containing variables can change on tlds update.
-          self.re.http = new RegExp("^\\/\\/" + self.re.src_auth + self.re.src_host_port_strict + self.re.src_path, "i");
-        }
-        if (self.re.http.test(tail)) {
-          return tail.match(self.re.http)[0].length;
-        }
-        return 0;
-      }
-    },
-    "https:": "http:",
-    "ftp:": "http:",
-    "//": {
-      validate: function(text, pos, self) {
-        var tail = text.slice(pos);
-        if (!self.re.no_http) {
-          // compile lazily, because "host"-containing variables can change on tlds update.
-          self.re.no_http = new RegExp("^" + self.re.src_auth + 
-          // Don't allow single-level domains, because of false positives like '//test'
-          // with code comments
-          "(?:localhost|(?:(?:" + self.re.src_domain + ")\\.)+" + self.re.src_domain_root + ")" + self.re.src_port + self.re.src_host_terminator + self.re.src_path, "i");
-        }
-        if (self.re.no_http.test(tail)) {
-          // should not be `://` & `///`, that protects from errors in protocol name
-          if (pos >= 3 && text[pos - 3] === ":") {
-            return 0;
-          }
-          if (pos >= 3 && text[pos - 3] === "/") {
-            return 0;
-          }
-          return tail.match(self.re.no_http)[0].length;
-        }
-        return 0;
-      }
-    },
-    "mailto:": {
-      validate: function(text, pos, self) {
-        var tail = text.slice(pos);
-        if (!self.re.mailto) {
-          self.re.mailto = new RegExp("^" + self.re.src_email_name + "@" + self.re.src_host_strict, "i");
-        }
-        if (self.re.mailto.test(tail)) {
-          return tail.match(self.re.mailto)[0].length;
-        }
-        return 0;
-      }
-    }
-  };
-  /*eslint-disable max-len*/
-  // RE pattern for 2-character tlds (autogenerated by ./support/tlds_2char_gen.js)
-    var tlds_2ch_src_re = "a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]";
-  // DON'T try to make PRs with changes. Extend TLDs with LinkifyIt.tlds() instead
-    var tlds_default = "biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|\u0440\u0444".split("|");
-  /*eslint-enable max-len*/
-  ////////////////////////////////////////////////////////////////////////////////
-    function resetScanCache(self) {
-    self.__index__ = -1;
-    self.__text_cache__ = "";
-  }
-  function createValidator(re) {
-    return function(text, pos) {
-      var tail = text.slice(pos);
-      if (re.test(tail)) {
-        return tail.match(re)[0].length;
-      }
-      return 0;
-    };
-  }
-  function createNormalizer() {
-    return function(match, self) {
-      self.normalize(match);
-    };
-  }
-  // Schemas compiler. Build regexps.
-  
-    function compile(self) {
-    // Load & clone RE patterns.
-    var re$1 = self.re = re(self.__opts__);
-    // Define dynamic patterns
-        var tlds = self.__tlds__.slice();
-    self.onCompile();
-    if (!self.__tlds_replaced__) {
-      tlds.push(tlds_2ch_src_re);
-    }
-    tlds.push(re$1.src_xn);
-    re$1.src_tlds = tlds.join("|");
-    function untpl(tpl) {
-      return tpl.replace("%TLDS%", re$1.src_tlds);
-    }
-    re$1.email_fuzzy = RegExp(untpl(re$1.tpl_email_fuzzy), "i");
-    re$1.link_fuzzy = RegExp(untpl(re$1.tpl_link_fuzzy), "i");
-    re$1.link_no_ip_fuzzy = RegExp(untpl(re$1.tpl_link_no_ip_fuzzy), "i");
-    re$1.host_fuzzy_test = RegExp(untpl(re$1.tpl_host_fuzzy_test), "i");
-    
-    // Compile each schema
-    
-        var aliases = [];
-    self.__compiled__ = {};
- // Reset compiled data
-        function schemaError(name, val) {
-      throw new Error('(LinkifyIt) Invalid schema "' + name + '": ' + val);
-    }
-    Object.keys(self.__schemas__).forEach((function(name) {
-      var val = self.__schemas__[name];
-      // skip disabled methods
-            if (val === null) {
-        return;
-      }
-      var compiled = {
-        validate: null,
-        link: null
-      };
-      self.__compiled__[name] = compiled;
-      if (isObject(val)) {
-        if (isRegExp(val.validate)) {
-          compiled.validate = createValidator(val.validate);
-        } else if (isFunction(val.validate)) {
-          compiled.validate = val.validate;
-        } else {
-          schemaError(name, val);
-        }
-        if (isFunction(val.normalize)) {
-          compiled.normalize = val.normalize;
-        } else if (!val.normalize) {
-          compiled.normalize = createNormalizer();
-        } else {
-          schemaError(name, val);
-        }
-        return;
-      }
-      if (isString(val)) {
-        aliases.push(name);
-        return;
-      }
-      schemaError(name, val);
-    }));
-    
-    // Compile postponed aliases
-    
-        aliases.forEach((function(alias) {
-      if (!self.__compiled__[self.__schemas__[alias]]) {
-        // Silently fail on missed schemas to avoid errons on disable.
-        // schemaError(alias, self.__schemas__[alias]);
-        return;
-      }
-      self.__compiled__[alias].validate = self.__compiled__[self.__schemas__[alias]].validate;
-      self.__compiled__[alias].normalize = self.__compiled__[self.__schemas__[alias]].normalize;
-    }));
-    
-    // Fake record for guessed links
-    
-        self.__compiled__[""] = {
-      validate: null,
-      normalize: createNormalizer()
-    };
-    
-    // Build schema condition
-    
-        var slist = Object.keys(self.__compiled__).filter((function(name) {
-      // Filter disabled & fake schemas
-      return name.length > 0 && self.__compiled__[name];
-    })).map(escapeRE).join("|");
-    // (?!_) cause 1.5x slowdown
-        self.re.schema_test = RegExp("(^|(?!_)(?:[><\uff5c]|" + re$1.src_ZPCc + "))(" + slist + ")", "i");
-    self.re.schema_search = RegExp("(^|(?!_)(?:[><\uff5c]|" + re$1.src_ZPCc + "))(" + slist + ")", "ig");
-    self.re.schema_at_start = RegExp("^" + self.re.schema_search.source, "i");
-    self.re.pretest = RegExp("(" + self.re.schema_test.source + ")|(" + self.re.host_fuzzy_test.source + ")|@", "i");
-    
-    // Cleanup
-    
-        resetScanCache(self);
-  }
-  /**
-	 * class Match
-	 *
-	 * Match result. Single element of array, returned by [[LinkifyIt#match]]
-	 **/  function Match(self, shift) {
-    var start = self.__index__, end = self.__last_index__, text = self.__text_cache__.slice(start, end);
-    /**
-	   * Match#schema -> String
-	   *
-	   * Prefix (protocol) for matched string.
-	   **/    this.schema = self.__schema__.toLowerCase();
-    /**
-	   * Match#index -> Number
-	   *
-	   * First position of matched string.
-	   **/    this.index = start + shift;
-    /**
-	   * Match#lastIndex -> Number
-	   *
-	   * Next position after matched string.
-	   **/    this.lastIndex = end + shift;
-    /**
-	   * Match#raw -> String
-	   *
-	   * Matched string.
-	   **/    this.raw = text;
-    /**
-	   * Match#text -> String
-	   *
-	   * Notmalized text of matched string.
-	   **/    this.text = text;
-    /**
-	   * Match#url -> String
-	   *
-	   * Normalized url of matched string.
-	   **/    this.url = text;
-  }
-  function createMatch(self, shift) {
-    var match = new Match(self, shift);
-    self.__compiled__[match.schema].normalize(match, self);
-    return match;
-  }
-  /**
-	 * class LinkifyIt
-	 **/
-  /**
-	 * new LinkifyIt(schemas, options)
-	 * - schemas (Object): Optional. Additional schemas to validate (prefix/validator)
-	 * - options (Object): { fuzzyLink|fuzzyEmail|fuzzyIP: true|false }
-	 *
-	 * Creates new linkifier instance with optional additional schemas.
-	 * Can be called without `new` keyword for convenience.
-	 *
-	 * By default understands:
-	 *
-	 * - `http(s)://...` , `ftp://...`, `mailto:...` & `//...` links
-	 * - "fuzzy" links and emails (example.com, foo@bar.com).
-	 *
-	 * `schemas` is an object, where each key/value describes protocol/rule:
-	 *
-	 * - __key__ - link prefix (usually, protocol name with `:` at the end, `skype:`
-	 *   for example). `linkify-it` makes shure that prefix is not preceeded with
-	 *   alphanumeric char and symbols. Only whitespaces and punctuation allowed.
-	 * - __value__ - rule to check tail after link prefix
-	 *   - _String_ - just alias to existing rule
-	 *   - _Object_
-	 *     - _validate_ - validator function (should return matched length on success),
-	 *       or `RegExp`.
-	 *     - _normalize_ - optional function to normalize text & url of matched result
-	 *       (for example, for @twitter mentions).
-	 *
-	 * `options`:
-	 *
-	 * - __fuzzyLink__ - recognige URL-s without `http(s):` prefix. Default `true`.
-	 * - __fuzzyIP__ - allow IPs in fuzzy links above. Can conflict with some texts
-	 *   like version numbers. Default `false`.
-	 * - __fuzzyEmail__ - recognize emails without `mailto:` prefix.
-	 *
-	 **/  function LinkifyIt(schemas, options) {
-    if (!(this instanceof LinkifyIt)) {
-      return new LinkifyIt(schemas, options);
-    }
-    if (!options) {
-      if (isOptionsObj(schemas)) {
-        options = schemas;
-        schemas = {};
-      }
-    }
-    this.__opts__ = assign({}, defaultOptions, options);
-    // Cache last tested result. Used to skip repeating steps on next `match` call.
-        this.__index__ = -1;
-    this.__last_index__ = -1;
- // Next scan position
-        this.__schema__ = "";
-    this.__text_cache__ = "";
-    this.__schemas__ = assign({}, defaultSchemas, schemas);
-    this.__compiled__ = {};
-    this.__tlds__ = tlds_default;
-    this.__tlds_replaced__ = false;
-    this.re = {};
-    compile(this);
-  }
-  /** chainable
-	 * LinkifyIt#add(schema, definition)
-	 * - schema (String): rule name (fixed pattern prefix)
-	 * - definition (String|RegExp|Object): schema definition
-	 *
-	 * Add new rule definition. See constructor description for details.
-	 **/  LinkifyIt.prototype.add = function add(schema, definition) {
-    this.__schemas__[schema] = definition;
-    compile(this);
-    return this;
-  };
-  /** chainable
-	 * LinkifyIt#set(options)
-	 * - options (Object): { fuzzyLink|fuzzyEmail|fuzzyIP: true|false }
-	 *
-	 * Set recognition options for links without schema.
-	 **/  LinkifyIt.prototype.set = function set(options) {
-    this.__opts__ = assign(this.__opts__, options);
-    return this;
-  };
-  /**
-	 * LinkifyIt#test(text) -> Boolean
-	 *
-	 * Searches linkifiable pattern and returns `true` on success or `false` on fail.
-	 **/  LinkifyIt.prototype.test = function test(text) {
-    // Reset scan cache
-    this.__text_cache__ = text;
-    this.__index__ = -1;
-    if (!text.length) {
-      return false;
-    }
-    var m, ml, me, len, shift, next, re, tld_pos, at_pos;
-    // try to scan for link with schema - that's the most simple rule
-        if (this.re.schema_test.test(text)) {
-      re = this.re.schema_search;
-      re.lastIndex = 0;
-      while ((m = re.exec(text)) !== null) {
-        len = this.testSchemaAt(text, m[2], re.lastIndex);
-        if (len) {
-          this.__schema__ = m[2];
-          this.__index__ = m.index + m[1].length;
-          this.__last_index__ = m.index + m[0].length + len;
-          break;
-        }
-      }
-    }
-    if (this.__opts__.fuzzyLink && this.__compiled__["http:"]) {
-      // guess schemaless links
-      tld_pos = text.search(this.re.host_fuzzy_test);
-      if (tld_pos >= 0) {
-        // if tld is located after found link - no need to check fuzzy pattern
-        if (this.__index__ < 0 || tld_pos < this.__index__) {
-          if ((ml = text.match(this.__opts__.fuzzyIP ? this.re.link_fuzzy : this.re.link_no_ip_fuzzy)) !== null) {
-            shift = ml.index + ml[1].length;
-            if (this.__index__ < 0 || shift < this.__index__) {
-              this.__schema__ = "";
-              this.__index__ = shift;
-              this.__last_index__ = ml.index + ml[0].length;
-            }
-          }
-        }
-      }
-    }
-    if (this.__opts__.fuzzyEmail && this.__compiled__["mailto:"]) {
-      // guess schemaless emails
-      at_pos = text.indexOf("@");
-      if (at_pos >= 0) {
-        // We can't skip this check, because this cases are possible:
-        // 192.168.1.1@gmail.com, my.in@example.com
-        if ((me = text.match(this.re.email_fuzzy)) !== null) {
-          shift = me.index + me[1].length;
-          next = me.index + me[0].length;
-          if (this.__index__ < 0 || shift < this.__index__ || shift === this.__index__ && next > this.__last_index__) {
-            this.__schema__ = "mailto:";
-            this.__index__ = shift;
-            this.__last_index__ = next;
-          }
-        }
-      }
-    }
-    return this.__index__ >= 0;
-  };
-  /**
-	 * LinkifyIt#pretest(text) -> Boolean
-	 *
-	 * Very quick check, that can give false positives. Returns true if link MAY BE
-	 * can exists. Can be used for speed optimization, when you need to check that
-	 * link NOT exists.
-	 **/  LinkifyIt.prototype.pretest = function pretest(text) {
-    return this.re.pretest.test(text);
-  };
-  /**
-	 * LinkifyIt#testSchemaAt(text, name, position) -> Number
-	 * - text (String): text to scan
-	 * - name (String): rule (schema) name
-	 * - position (Number): text offset to check from
-	 *
-	 * Similar to [[LinkifyIt#test]] but checks only specific protocol tail exactly
-	 * at given position. Returns length of found pattern (0 on fail).
-	 **/  LinkifyIt.prototype.testSchemaAt = function testSchemaAt(text, schema, pos) {
-    // If not supported schema check requested - terminate
-    if (!this.__compiled__[schema.toLowerCase()]) {
-      return 0;
-    }
-    return this.__compiled__[schema.toLowerCase()].validate(text, pos, this);
-  };
-  /**
-	 * LinkifyIt#match(text) -> Array|null
-	 *
-	 * Returns array of found link descriptions or `null` on fail. We strongly
-	 * recommend to use [[LinkifyIt#test]] first, for best speed.
-	 *
-	 * ##### Result match description
-	 *
-	 * - __schema__ - link schema, can be empty for fuzzy links, or `//` for
-	 *   protocol-neutral  links.
-	 * - __index__ - offset of matched text
-	 * - __lastIndex__ - index of next char after mathch end
-	 * - __raw__ - matched text
-	 * - __text__ - normalized text
-	 * - __url__ - link, generated from matched text
-	 **/  LinkifyIt.prototype.match = function match(text) {
-    var shift = 0, result = [];
-    // Try to take previous element from cache, if .test() called before
-        if (this.__index__ >= 0 && this.__text_cache__ === text) {
-      result.push(createMatch(this, shift));
-      shift = this.__last_index__;
-    }
-    // Cut head if cache was used
-        var tail = shift ? text.slice(shift) : text;
-    // Scan string until end reached
-        while (this.test(tail)) {
-      result.push(createMatch(this, shift));
-      tail = tail.slice(this.__last_index__);
-      shift += this.__last_index__;
-    }
-    if (result.length) {
-      return result;
-    }
-    return null;
-  };
-  /**
-	 * LinkifyIt#matchAtStart(text) -> Match|null
-	 *
-	 * Returns fully-formed (not fuzzy) link if it starts at the beginning
-	 * of the string, and null otherwise.
-	 **/  LinkifyIt.prototype.matchAtStart = function matchAtStart(text) {
-    // Reset scan cache
-    this.__text_cache__ = text;
-    this.__index__ = -1;
-    if (!text.length) return null;
-    var m = this.re.schema_at_start.exec(text);
-    if (!m) return null;
-    var len = this.testSchemaAt(text, m[2], m[0].length);
-    if (!len) return null;
-    this.__schema__ = m[2];
-    this.__index__ = m.index + m[1].length;
-    this.__last_index__ = m.index + m[0].length + len;
-    return createMatch(this, 0);
-  };
-  /** chainable
-	 * LinkifyIt#tlds(list [, keepOld]) -> this
-	 * - list (Array): list of tlds
-	 * - keepOld (Boolean): merge with current list if `true` (`false` by default)
-	 *
-	 * Load (or merge) new tlds list. Those are user for fuzzy links (without prefix)
-	 * to avoid false positives. By default this algorythm used:
-	 *
-	 * - hostname with any 2-letter root zones are ok.
-	 * - biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф
-	 *   are ok.
-	 * - encoded (`xn--...`) root zones are ok.
-	 *
-	 * If list is replaced, then exact match for 2-chars root zones will be checked.
-	 **/  LinkifyIt.prototype.tlds = function tlds(list, keepOld) {
-    list = Array.isArray(list) ? list : [ list ];
-    if (!keepOld) {
-      this.__tlds__ = list.slice();
-      this.__tlds_replaced__ = true;
-      compile(this);
-      return this;
-    }
-    this.__tlds__ = this.__tlds__.concat(list).sort().filter((function(el, idx, arr) {
-      return el !== arr[idx - 1];
-    })).reverse();
-    compile(this);
-    return this;
-  };
-  /**
-	 * LinkifyIt#normalize(match)
-	 *
-	 * Default normalizer (if schema does not define it's own).
-	 **/  LinkifyIt.prototype.normalize = function normalize(match) {
-    // Do minimal possible changes by default. Need to collect feedback prior
-    // to move forward https://github.com/markdown-it/linkify-it/issues/1
-    if (!match.schema) {
-      match.url = "http://" + match.url;
-    }
-    if (match.schema === "mailto:" && !/^mailto:/i.test(match.url)) {
-      match.url = "mailto:" + match.url;
-    }
-  };
-  /**
-	 * LinkifyIt#onCompile()
-	 *
-	 * Override to modify basic RegExp-s.
-	 **/  LinkifyIt.prototype.onCompile = function onCompile() {};
-  var linkifyIt = LinkifyIt;
-  /*! https://mths.be/punycode v1.4.1 by @mathias */
-  /** Highest positive signed 32-bit float value */  var maxInt = 2147483647;
- // aka. 0x7FFFFFFF or 2^31-1
-  /** Bootstring parameters */  var base = 36;
-  var tMin = 1;
-  var tMax = 26;
-  var skew = 38;
-  var damp = 700;
-  var initialBias = 72;
-  var initialN = 128;
- // 0x80
-    var delimiter = "-";
- // '\x2D'
-  /** Regular expressions */  var regexPunycode = /^xn--/;
-  var regexNonASCII = /[^\x20-\x7E]/;
- // unprintable ASCII chars + non-ASCII chars
-    var regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g;
- // RFC 3490 separators
-  /** Error messages */  var errors = {
-    overflow: "Overflow: input needs wider integers to process",
-    "not-basic": "Illegal input >= 0x80 (not a basic code point)",
-    "invalid-input": "Invalid input"
-  };
-  /** Convenience shortcuts */  var baseMinusTMin = base - tMin;
-  var floor = Math.floor;
-  var stringFromCharCode = String.fromCharCode;
-  /*--------------------------------------------------------------------------*/
-  /**
-	 * A generic error utility function.
-	 * @private
-	 * @param {String} type The error type.
-	 * @returns {Error} Throws a `RangeError` with the applicable error message.
-	 */  function error(type) {
-    throw new RangeError(errors[type]);
-  }
-  /**
-	 * A generic `Array#map` utility function.
-	 * @private
-	 * @param {Array} array The array to iterate over.
-	 * @param {Function} callback The function that gets called for every array
-	 * item.
-	 * @returns {Array} A new array of values returned by the callback function.
-	 */  function map(array, fn) {
-    var length = array.length;
-    var result = [];
-    while (length--) {
-      result[length] = fn(array[length]);
-    }
-    return result;
-  }
-  /**
-	 * A simple `Array#map`-like wrapper to work with domain name strings or email
-	 * addresses.
-	 * @private
-	 * @param {String} domain The domain name or email address.
-	 * @param {Function} callback The function that gets called for every
-	 * character.
-	 * @returns {Array} A new string of characters returned by the callback
-	 * function.
-	 */  function mapDomain(string, fn) {
-    var parts = string.split("@");
-    var result = "";
-    if (parts.length > 1) {
-      // In email addresses, only the domain name should be punycoded. Leave
-      // the local part (i.e. everything up to `@`) intact.
-      result = parts[0] + "@";
-      string = parts[1];
-    }
-    // Avoid `split(regex)` for IE8 compatibility. See #17.
-        string = string.replace(regexSeparators, ".");
-    var labels = string.split(".");
-    var encoded = map(labels, fn).join(".");
-    return result + encoded;
-  }
-  /**
-	 * Creates an array containing the numeric code points of each Unicode
-	 * character in the string. While JavaScript uses UCS-2 internally,
-	 * this function will convert a pair of surrogate halves (each of which
-	 * UCS-2 exposes as separate characters) into a single code point,
-	 * matching UTF-16.
-	 * @see `punycode.ucs2.encode`
-	 * @see <https://mathiasbynens.be/notes/javascript-encoding>
-	 * @memberOf punycode.ucs2
-	 * @name decode
-	 * @param {String} string The Unicode input string (UCS-2).
-	 * @returns {Array} The new array of code points.
-	 */  function ucs2decode(string) {
-    var output = [], counter = 0, length = string.length, value, extra;
-    while (counter < length) {
-      value = string.charCodeAt(counter++);
-      if (value >= 55296 && value <= 56319 && counter < length) {
-        // high surrogate, and there is a next character
-        extra = string.charCodeAt(counter++);
-        if ((extra & 64512) == 56320) {
-          // low surrogate
-          output.push(((value & 1023) << 10) + (extra & 1023) + 65536);
-        } else {
-          // unmatched surrogate; only append this code unit, in case the next
-          // code unit is the high surrogate of a surrogate pair
-          output.push(value);
-          counter--;
-        }
-      } else {
-        output.push(value);
-      }
-    }
-    return output;
-  }
-  /**
-	 * Creates a string based on an array of numeric code points.
-	 * @see `punycode.ucs2.decode`
-	 * @memberOf punycode.ucs2
-	 * @name encode
-	 * @param {Array} codePoints The array of numeric code points.
-	 * @returns {String} The new Unicode string (UCS-2).
-	 */  function ucs2encode(array) {
-    return map(array, (function(value) {
-      var output = "";
-      if (value > 65535) {
-        value -= 65536;
-        output += stringFromCharCode(value >>> 10 & 1023 | 55296);
-        value = 56320 | value & 1023;
-      }
-      output += stringFromCharCode(value);
-      return output;
-    })).join("");
-  }
-  /**
-	 * Converts a basic code point into a digit/integer.
-	 * @see `digitToBasic()`
-	 * @private
-	 * @param {Number} codePoint The basic numeric code point value.
-	 * @returns {Number} The numeric value of a basic code point (for use in
-	 * representing integers) in the range `0` to `base - 1`, or `base` if
-	 * the code point does not represent a value.
-	 */  function basicToDigit(codePoint) {
-    if (codePoint - 48 < 10) {
-      return codePoint - 22;
-    }
-    if (codePoint - 65 < 26) {
-      return codePoint - 65;
-    }
-    if (codePoint - 97 < 26) {
-      return codePoint - 97;
-    }
-    return base;
-  }
-  /**
-	 * Converts a digit/integer into a basic code point.
-	 * @see `basicToDigit()`
-	 * @private
-	 * @param {Number} digit The numeric value of a basic code point.
-	 * @returns {Number} The basic code point whose value (when used for
-	 * representing integers) is `digit`, which needs to be in the range
-	 * `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
-	 * used; else, the lowercase form is used. The behavior is undefined
-	 * if `flag` is non-zero and `digit` has no uppercase form.
-	 */  function digitToBasic(digit, flag) {
-    //  0..25 map to ASCII a..z or A..Z
-    // 26..35 map to ASCII 0..9
-    return digit + 22 + 75 * (digit < 26) - ((flag != 0) << 5);
-  }
-  /**
-	 * Bias adaptation function as per section 3.4 of RFC 3492.
-	 * https://tools.ietf.org/html/rfc3492#section-3.4
-	 * @private
-	 */  function adapt(delta, numPoints, firstTime) {
-    var k = 0;
-    delta = firstTime ? floor(delta / damp) : delta >> 1;
-    delta += floor(delta / numPoints);
-    for (;delta > baseMinusTMin * tMax >> 1; k += base) {
-      delta = floor(delta / baseMinusTMin);
-    }
-    return floor(k + (baseMinusTMin + 1) * delta / (delta + skew));
-  }
-  /**
-	 * Converts a Punycode string of ASCII-only symbols to a string of Unicode
-	 * symbols.
-	 * @memberOf punycode
-	 * @param {String} input The Punycode string of ASCII-only symbols.
-	 * @returns {String} The resulting string of Unicode symbols.
-	 */  function decode(input) {
-    // Don't use UCS-2
-    var output = [], inputLength = input.length, out, i = 0, n = initialN, bias = initialBias, basic, j, index, oldi, w, k, digit, t, 
-    /** Cached calculation results */
-    baseMinusT;
-    // Handle the basic code points: let `basic` be the number of input code
-    // points before the last delimiter, or `0` if there is none, then copy
-    // the first basic code points to the output.
-        basic = input.lastIndexOf(delimiter);
-    if (basic < 0) {
-      basic = 0;
-    }
-    for (j = 0; j < basic; ++j) {
-      // if it's not a basic code point
-      if (input.charCodeAt(j) >= 128) {
-        error("not-basic");
-      }
-      output.push(input.charCodeAt(j));
-    }
-    // Main decoding loop: start just after the last delimiter if any basic code
-    // points were copied; start at the beginning otherwise.
-        for (index = basic > 0 ? basic + 1 : 0; index < inputLength; ) {
-      // `index` is the index of the next character to be consumed.
-      // Decode a generalized variable-length integer into `delta`,
-      // which gets added to `i`. The overflow checking is easier
-      // if we increase `i` as we go, then subtract off its starting
-      // value at the end to obtain `delta`.
-      for (oldi = i, w = 1, k = base; ;k += base) {
-        if (index >= inputLength) {
-          error("invalid-input");
-        }
-        digit = basicToDigit(input.charCodeAt(index++));
-        if (digit >= base || digit > floor((maxInt - i) / w)) {
-          error("overflow");
-        }
-        i += digit * w;
-        t = k <= bias ? tMin : k >= bias + tMax ? tMax : k - bias;
-        if (digit < t) {
-          break;
-        }
-        baseMinusT = base - t;
-        if (w > floor(maxInt / baseMinusT)) {
-          error("overflow");
-        }
-        w *= baseMinusT;
-      }
-      out = output.length + 1;
-      bias = adapt(i - oldi, out, oldi == 0);
-      // `i` was supposed to wrap around from `out` to `0`,
-      // incrementing `n` each time, so we'll fix that now:
-            if (floor(i / out) > maxInt - n) {
-        error("overflow");
-      }
-      n += floor(i / out);
-      i %= out;
-      // Insert `n` at position `i` of the output
-            output.splice(i++, 0, n);
-    }
-    return ucs2encode(output);
-  }
-  /**
-	 * Converts a string of Unicode symbols (e.g. a domain name label) to a
-	 * Punycode string of ASCII-only symbols.
-	 * @memberOf punycode
-	 * @param {String} input The string of Unicode symbols.
-	 * @returns {String} The resulting Punycode string of ASCII-only symbols.
-	 */  function encode(input) {
-    var n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, currentValue, output = [], 
-    /** `inputLength` will hold the number of code points in `input`. */
-    inputLength, 
-    /** Cached calculation results */
-    handledCPCountPlusOne, baseMinusT, qMinusT;
-    // Convert the input in UCS-2 to Unicode
-        input = ucs2decode(input);
-    // Cache the length
-        inputLength = input.length;
-    // Initialize the state
-        n = initialN;
-    delta = 0;
-    bias = initialBias;
-    // Handle the basic code points
-        for (j = 0; j < inputLength; ++j) {
-      currentValue = input[j];
-      if (currentValue < 128) {
-        output.push(stringFromCharCode(currentValue));
-      }
-    }
-    handledCPCount = basicLength = output.length;
-    // `handledCPCount` is the number of code points that have been handled;
-    // `basicLength` is the number of basic code points.
-    // Finish the basic string - if it is not empty - with a delimiter
-        if (basicLength) {
-      output.push(delimiter);
-    }
-    // Main encoding loop:
-        while (handledCPCount < inputLength) {
-      // All non-basic code points < n have been handled already. Find the next
-      // larger one:
-      for (m = maxInt, j = 0; j < inputLength; ++j) {
-        currentValue = input[j];
-        if (currentValue >= n && currentValue < m) {
-          m = currentValue;
-        }
-      }
-      // Increase `delta` enough to advance the decoder's <n,i> state to <m,0>,
-      // but guard against overflow
-            handledCPCountPlusOne = handledCPCount + 1;
-      if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) {
-        error("overflow");
-      }
-      delta += (m - n) * handledCPCountPlusOne;
-      n = m;
-      for (j = 0; j < inputLength; ++j) {
-        currentValue = input[j];
-        if (currentValue < n && ++delta > maxInt) {
-          error("overflow");
-        }
-        if (currentValue == n) {
-          // Represent delta as a generalized variable-length integer
-          for (q = delta, k = base; ;k += base) {
-            t = k <= bias ? tMin : k >= bias + tMax ? tMax : k - bias;
-            if (q < t) {
-              break;
-            }
-            qMinusT = q - t;
-            baseMinusT = base - t;
-            output.push(stringFromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0)));
-            q = floor(qMinusT / baseMinusT);
-          }
-          output.push(stringFromCharCode(digitToBasic(q, 0)));
-          bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength);
-          delta = 0;
-          ++handledCPCount;
-        }
-      }
-      ++delta;
-      ++n;
-    }
-    return output.join("");
-  }
-  /**
-	 * Converts a Punycode string representing a domain name or an email address
-	 * to Unicode. Only the Punycoded parts of the input will be converted, i.e.
-	 * it doesn't matter if you call it on a string that has already been
-	 * converted to Unicode.
-	 * @memberOf punycode
-	 * @param {String} input The Punycoded domain name or email address to
-	 * convert to Unicode.
-	 * @returns {String} The Unicode representation of the given Punycode
-	 * string.
-	 */  function toUnicode(input) {
-    return mapDomain(input, (function(string) {
-      return regexPunycode.test(string) ? decode(string.slice(4).toLowerCase()) : string;
-    }));
-  }
-  /**
-	 * Converts a Unicode string representing a domain name or an email address to
-	 * Punycode. Only the non-ASCII parts of the domain name will be converted,
-	 * i.e. it doesn't matter if you call it with a domain that's already in
-	 * ASCII.
-	 * @memberOf punycode
-	 * @param {String} input The domain name or email address to convert, as a
-	 * Unicode string.
-	 * @returns {String} The Punycode representation of the given domain name or
-	 * email address.
-	 */  function toASCII(input) {
-    return mapDomain(input, (function(string) {
-      return regexNonASCII.test(string) ? "xn--" + encode(string) : string;
-    }));
-  }
-  var version = "1.4.1";
-  /**
-	 * An object of methods to convert from JavaScript's internal character
-	 * representation (UCS-2) to Unicode code points, and back.
-	 * @see <https://mathiasbynens.be/notes/javascript-encoding>
-	 * @memberOf punycode
-	 * @type Object
-	 */  var ucs2 = {
-    decode: ucs2decode,
-    encode: ucs2encode
-  };
-  var punycode$1 = {
-    version: version,
-    ucs2: ucs2,
-    toASCII: toASCII,
-    toUnicode: toUnicode,
-    encode: encode,
-    decode: decode
-  };
-  var punycode$2 =  Object.freeze({
-    __proto__: null,
-    decode: decode,
-    encode: encode,
-    toUnicode: toUnicode,
-    toASCII: toASCII,
-    version: version,
-    ucs2: ucs2,
-    default: punycode$1
-  });
-  // markdown-it default options
-    var _default = {
-    options: {
-      html: false,
-      // Enable HTML tags in source
-      xhtmlOut: false,
-      // Use '/' to close single tags ()
-      breaks: false,
-      // Convert '\n' in paragraphs into 
-      langPrefix: "language-",
-      // CSS language prefix for fenced blocks
-      linkify: false,
-      // autoconvert URL-like texts to links
-      // Enable some language-neutral replacements + quotes beautification
-      typographer: false,
-      // Double + single quotes replacement pairs, when typographer enabled,
-      // and smartquotes on. Could be either a String or an Array.
-      // For example, you can use '«»„“' for Russian, '„“‚‘' for German,
-      // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
-      quotes: "\u201c\u201d\u2018\u2019",
-      /* “”‘’ */
-      // Highlighter function. Should return escaped HTML,
-      // or '' if the source string is not changed and should be escaped externaly.
-      // If result starts with <pre... internal wrapper is skipped.
-      // function (/*str, lang*/) { return ''; }
-      highlight: null,
-      maxNesting: 100
-    },
-    components: {
-      core: {},
-      block: {},
-      inline: {}
-    }
-  };
-  // "Zero" preset, with nothing enabled. Useful for manual configuring of simple
-    var zero = {
-    options: {
-      html: false,
-      // Enable HTML tags in source
-      xhtmlOut: false,
-      // Use '/' to close single tags ()
-      breaks: false,
-      // Convert '\n' in paragraphs into 
-      langPrefix: "language-",
-      // CSS language prefix for fenced blocks
-      linkify: false,
-      // autoconvert URL-like texts to links
-      // Enable some language-neutral replacements + quotes beautification
-      typographer: false,
-      // Double + single quotes replacement pairs, when typographer enabled,
-      // and smartquotes on. Could be either a String or an Array.
-      // For example, you can use '«»„“' for Russian, '„“‚‘' for German,
-      // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
-      quotes: "\u201c\u201d\u2018\u2019",
-      /* “”‘’ */
-      // Highlighter function. Should return escaped HTML,
-      // or '' if the source string is not changed and should be escaped externaly.
-      // If result starts with <pre... internal wrapper is skipped.
-      // function (/*str, lang*/) { return ''; }
-      highlight: null,
-      maxNesting: 20
-    },
-    components: {
-      core: {
-        rules: [ "normalize", "block", "inline", "text_join" ]
-      },
-      block: {
-        rules: [ "paragraph" ]
-      },
-      inline: {
-        rules: [ "text" ],
-        rules2: [ "balance_pairs", "fragments_join" ]
-      }
-    }
-  };
-  // Commonmark default options
-    var commonmark = {
-    options: {
-      html: true,
-      // Enable HTML tags in source
-      xhtmlOut: true,
-      // Use '/' to close single tags ()
-      breaks: false,
-      // Convert '\n' in paragraphs into 
-      langPrefix: "language-",
-      // CSS language prefix for fenced blocks
-      linkify: false,
-      // autoconvert URL-like texts to links
-      // Enable some language-neutral replacements + quotes beautification
-      typographer: false,
-      // Double + single quotes replacement pairs, when typographer enabled,
-      // and smartquotes on. Could be either a String or an Array.
-      // For example, you can use '«»„“' for Russian, '„“‚‘' for German,
-      // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
-      quotes: "\u201c\u201d\u2018\u2019",
-      /* “”‘’ */
-      // Highlighter function. Should return escaped HTML,
-      // or '' if the source string is not changed and should be escaped externaly.
-      // If result starts with <pre... internal wrapper is skipped.
-      // function (/*str, lang*/) { return ''; }
-      highlight: null,
-      maxNesting: 20
-    },
-    components: {
-      core: {
-        rules: [ "normalize", "block", "inline", "text_join" ]
-      },
-      block: {
-        rules: [ "blockquote", "code", "fence", "heading", "hr", "html_block", "lheading", "list", "reference", "paragraph" ]
-      },
-      inline: {
-        rules: [ "autolink", "backticks", "emphasis", "entity", "escape", "html_inline", "image", "link", "newline", "text" ],
-        rules2: [ "balance_pairs", "emphasis", "fragments_join" ]
-      }
-    }
-  };
-  var punycode =  getAugmentedNamespace(punycode$2);
-  var config = {
-    default: _default,
-    zero: zero,
-    commonmark: commonmark
-  };
-  ////////////////////////////////////////////////////////////////////////////////
-  
-  // This validator can prohibit more than really needed to prevent XSS. It's a
-  // tradeoff to keep code simple and to be secure by default.
-  
-  // If you need different setup - override validator method as you wish. Or
-  // replace it with dummy function and use external sanitizer.
-  
-    var BAD_PROTO_RE = /^(vbscript|javascript|file|data):/;
-  var GOOD_DATA_RE = /^data:image\/(gif|png|jpeg|webp);/;
-  function validateLink(url) {
-    // url should be normalized at this point, and existing entities are decoded
-    var str = url.trim().toLowerCase();
-    return BAD_PROTO_RE.test(str) ? GOOD_DATA_RE.test(str) ? true : false : true;
-  }
-  ////////////////////////////////////////////////////////////////////////////////
-    var RECODE_HOSTNAME_FOR = [ "http:", "https:", "mailto:" ];
-  function normalizeLink(url) {
-    var parsed = mdurl.parse(url, true);
-    if (parsed.hostname) {
-      // Encode hostnames in urls like:
-      // `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
-      // We don't encode unknown schemas, because it's likely that we encode
-      // something we shouldn't (e.g. `skype:name` treated as `skype:host`)
-      if (!parsed.protocol || RECODE_HOSTNAME_FOR.indexOf(parsed.protocol) >= 0) {
-        try {
-          parsed.hostname = punycode.toASCII(parsed.hostname);
-        } catch (er) {}
-      }
-    }
-    return mdurl.encode(mdurl.format(parsed));
-  }
-  function normalizeLinkText(url) {
-    var parsed = mdurl.parse(url, true);
-    if (parsed.hostname) {
-      // Encode hostnames in urls like:
-      // `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
-      // We don't encode unknown schemas, because it's likely that we encode
-      // something we shouldn't (e.g. `skype:name` treated as `skype:host`)
-      if (!parsed.protocol || RECODE_HOSTNAME_FOR.indexOf(parsed.protocol) >= 0) {
-        try {
-          parsed.hostname = punycode.toUnicode(parsed.hostname);
-        } catch (er) {}
-      }
-    }
-    // add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
-        return mdurl.decode(mdurl.format(parsed), mdurl.decode.defaultChars + "%");
-  }
-  /**
-	 * class MarkdownIt
-	 *
-	 * Main parser/renderer class.
-	 *
-	 * ##### Usage
-	 *
-	 * ```javascript
-	 * // node.js, "classic" way:
-	 * var MarkdownIt = require('markdown-it'),
-	 *     md = new MarkdownIt();
-	 * var result = md.render('# markdown-it rulezz!');
-	 *
-	 * // node.js, the same, but with sugar:
-	 * var md = require('markdown-it')();
-	 * var result = md.render('# markdown-it rulezz!');
-	 *
-	 * // browser without AMD, added to "window" on script load
-	 * // Note, there are no dash.
-	 * var md = window.markdownit();
-	 * var result = md.render('# markdown-it rulezz!');
-	 * ```
-	 *
-	 * Single line rendering, without paragraph wrap:
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')();
-	 * var result = md.renderInline('__markdown-it__ rulezz!');
-	 * ```
-	 **/
-  /**
-	 * new MarkdownIt([presetName, options])
-	 * - presetName (String): optional, `commonmark` / `zero`
-	 * - options (Object)
-	 *
-	 * Creates parser instanse with given config. Can be called without `new`.
-	 *
-	 * ##### presetName
-	 *
-	 * MarkdownIt provides named presets as a convenience to quickly
-	 * enable/disable active syntax rules and options for common use cases.
-	 *
-	 * - ["commonmark"](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/commonmark.js) -
-	 *   configures parser to strict [CommonMark](http://commonmark.org/) mode.
-	 * - [default](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/default.js) -
-	 *   similar to GFM, used when no preset name given. Enables all available rules,
-	 *   but still without html, typographer & autolinker.
-	 * - ["zero"](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/zero.js) -
-	 *   all rules disabled. Useful to quickly setup your config via `.enable()`.
-	 *   For example, when you need only `bold` and `italic` markup and nothing else.
-	 *
-	 * ##### options:
-	 *
-	 * - __html__ - `false`. Set `true` to enable HTML tags in source. Be careful!
-	 *   That's not safe! You may need external sanitizer to protect output from XSS.
-	 *   It's better to extend features via plugins, instead of enabling HTML.
-	 * - __xhtmlOut__ - `false`. Set `true` to add '/' when closing single tags
-	 *   (``). This is needed only for full CommonMark compatibility. In real
-	 *   world you will need HTML output.
-	 * - __breaks__ - `false`. Set `true` to convert `\n` in paragraphs into ``.
-	 * - __langPrefix__ - `language-`. CSS language class prefix for fenced blocks.
-	 *   Can be useful for external highlighters.
-	 * - __linkify__ - `false`. Set `true` to autoconvert URL-like text to links.
-	 * - __typographer__  - `false`. Set `true` to enable [some language-neutral
-	 *   replacement](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/replacements.js) +
-	 *   quotes beautification (smartquotes).
-	 * - __quotes__ - `“”‘’`, String or Array. Double + single quotes replacement
-	 *   pairs, when typographer enabled and smartquotes on. For example, you can
-	 *   use `'«»„“'` for Russian, `'„“‚‘'` for German, and
-	 *   `['«\xA0', '\xA0»', '‹\xA0', '\xA0›']` for French (including nbsp).
-	 * - __highlight__ - `null`. Highlighter function for fenced code blocks.
-	 *   Highlighter `function (str, lang)` should return escaped HTML. It can also
-	 *   return empty string if the source was not changed and should be escaped
-	 *   externaly. If result starts with <pre... internal wrapper is skipped.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * // commonmark mode
-	 * var md = require('markdown-it')('commonmark');
-	 *
-	 * // default mode
-	 * var md = require('markdown-it')();
-	 *
-	 * // enable everything
-	 * var md = require('markdown-it')({
-	 *   html: true,
-	 *   linkify: true,
-	 *   typographer: true
-	 * });
-	 * ```
-	 *
-	 * ##### Syntax highlighting
-	 *
-	 * ```js
-	 * var hljs = require('highlight.js') // https://highlightjs.org/
-	 *
-	 * var md = require('markdown-it')({
-	 *   highlight: function (str, lang) {
-	 *     if (lang && hljs.getLanguage(lang)) {
-	 *       try {
-	 *         return hljs.highlight(str, { language: lang, ignoreIllegals: true }).value;
-	 *       } catch (__) {}
-	 *     }
-	 *
-	 *     return ''; // use external default escaping
-	 *   }
-	 * });
-	 * ```
-	 *
-	 * Or with full wrapper override (if you need assign class to `<pre>`):
-	 *
-	 * ```javascript
-	 * var hljs = require('highlight.js') // https://highlightjs.org/
-	 *
-	 * // Actual default values
-	 * var md = require('markdown-it')({
-	 *   highlight: function (str, lang) {
-	 *     if (lang && hljs.getLanguage(lang)) {
-	 *       try {
-	 *         return '<pre class="hljs"><code>' +
-	 *                hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
-	 *                '</code></pre>';
-	 *       } catch (__) {}
-	 *     }
-	 *
-	 *     return '<pre class="hljs"><code>' + md.utils.escapeHtml(str) + '</code></pre>';
-	 *   }
-	 * });
-	 * ```
-	 *
-	 **/  function MarkdownIt(presetName, options) {
-    if (!(this instanceof MarkdownIt)) {
-      return new MarkdownIt(presetName, options);
-    }
-    if (!options) {
-      if (!utils.isString(presetName)) {
-        options = presetName || {};
-        presetName = "default";
-      }
-    }
-    /**
-	   * MarkdownIt#inline -> ParserInline
-	   *
-	   * Instance of [[ParserInline]]. You may need it to add new rules when
-	   * writing plugins. For simple rules control use [[MarkdownIt.disable]] and
-	   * [[MarkdownIt.enable]].
-	   **/    this.inline = new parser_inline;
-    /**
-	   * MarkdownIt#block -> ParserBlock
-	   *
-	   * Instance of [[ParserBlock]]. You may need it to add new rules when
-	   * writing plugins. For simple rules control use [[MarkdownIt.disable]] and
-	   * [[MarkdownIt.enable]].
-	   **/    this.block = new parser_block;
-    /**
-	   * MarkdownIt#core -> Core
-	   *
-	   * Instance of [[Core]] chain executor. You may need it to add new rules when
-	   * writing plugins. For simple rules control use [[MarkdownIt.disable]] and
-	   * [[MarkdownIt.enable]].
-	   **/    this.core = new parser_core;
-    /**
-	   * MarkdownIt#renderer -> Renderer
-	   *
-	   * Instance of [[Renderer]]. Use it to modify output look. Or to add rendering
-	   * rules for new token types, generated by plugins.
-	   *
-	   * ##### Example
-	   *
-	   * ```javascript
-	   * var md = require('markdown-it')();
-	   *
-	   * function myToken(tokens, idx, options, env, self) {
-	   *   //...
-	   *   return result;
-	   * };
-	   *
-	   * md.renderer.rules['my_token'] = myToken
-	   * ```
-	   *
-	   * See [[Renderer]] docs and [source code](https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js).
-	   **/    this.renderer = new renderer;
-    /**
-	   * MarkdownIt#linkify -> LinkifyIt
-	   *
-	   * [linkify-it](https://github.com/markdown-it/linkify-it) instance.
-	   * Used by [linkify](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/linkify.js)
-	   * rule.
-	   **/    this.linkify = new linkifyIt;
-    /**
-	   * MarkdownIt#validateLink(url) -> Boolean
-	   *
-	   * Link validation function. CommonMark allows too much in links. By default
-	   * we disable `javascript:`, `vbscript:`, `file:` schemas, and almost all `data:...` schemas
-	   * except some embedded image types.
-	   *
-	   * You can change this behaviour:
-	   *
-	   * ```javascript
-	   * var md = require('markdown-it')();
-	   * // enable everything
-	   * md.validateLink = function () { return true; }
-	   * ```
-	   **/    this.validateLink = validateLink;
-    /**
-	   * MarkdownIt#normalizeLink(url) -> String
-	   *
-	   * Function used to encode link url to a machine-readable format,
-	   * which includes url-encoding, punycode, etc.
-	   **/    this.normalizeLink = normalizeLink;
-    /**
-	   * MarkdownIt#normalizeLinkText(url) -> String
-	   *
-	   * Function used to decode link url to a human-readable format`
-	   **/    this.normalizeLinkText = normalizeLinkText;
-    // Expose utils & helpers for easy acces from plugins
-    /**
-	   * MarkdownIt#utils -> utils
-	   *
-	   * Assorted utility functions, useful to write plugins. See details
-	   * [here](https://github.com/markdown-it/markdown-it/blob/master/lib/common/utils.js).
-	   **/    this.utils = utils;
-    /**
-	   * MarkdownIt#helpers -> helpers
-	   *
-	   * Link components parser functions, useful to write plugins. See details
-	   * [here](https://github.com/markdown-it/markdown-it/blob/master/lib/helpers).
-	   **/    this.helpers = utils.assign({}, helpers);
-    this.options = {};
-    this.configure(presetName);
-    if (options) {
-      this.set(options);
-    }
-  }
-  /** chainable
-	 * MarkdownIt.set(options)
-	 *
-	 * Set parser options (in the same format as in constructor). Probably, you
-	 * will never need it, but you can change options after constructor call.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')()
-	 *             .set({ html: true, breaks: true })
-	 *             .set({ typographer, true });
-	 * ```
-	 *
-	 * __Note:__ To achieve the best possible performance, don't modify a
-	 * `markdown-it` instance options on the fly. If you need multiple configurations
-	 * it's best to create multiple instances and initialize each with separate
-	 * config.
-	 **/  MarkdownIt.prototype.set = function(options) {
-    utils.assign(this.options, options);
-    return this;
-  };
-  /** chainable, internal
-	 * MarkdownIt.configure(presets)
-	 *
-	 * Batch load of all options and compenent settings. This is internal method,
-	 * and you probably will not need it. But if you will - see available presets
-	 * and data structure [here](https://github.com/markdown-it/markdown-it/tree/master/lib/presets)
-	 *
-	 * We strongly recommend to use presets instead of direct config loads. That
-	 * will give better compatibility with next versions.
-	 **/  MarkdownIt.prototype.configure = function(presets) {
-    var self = this, presetName;
-    if (utils.isString(presets)) {
-      presetName = presets;
-      presets = config[presetName];
-      if (!presets) {
-        throw new Error('Wrong `markdown-it` preset "' + presetName + '", check name');
-      }
-    }
-    if (!presets) {
-      throw new Error("Wrong `markdown-it` preset, can't be empty");
-    }
-    if (presets.options) {
-      self.set(presets.options);
-    }
-    if (presets.components) {
-      Object.keys(presets.components).forEach((function(name) {
-        if (presets.components[name].rules) {
-          self[name].ruler.enableOnly(presets.components[name].rules);
-        }
-        if (presets.components[name].rules2) {
-          self[name].ruler2.enableOnly(presets.components[name].rules2);
-        }
-      }));
-    }
-    return this;
-  };
-  /** chainable
-	 * MarkdownIt.enable(list, ignoreInvalid)
-	 * - list (String|Array): rule name or list of rule names to enable
-	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
-	 *
-	 * Enable list or rules. It will automatically find appropriate components,
-	 * containing rules with given names. If rule not found, and `ignoreInvalid`
-	 * not set - throws exception.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * var md = require('markdown-it')()
-	 *             .enable(['sub', 'sup'])
-	 *             .disable('smartquotes');
-	 * ```
-	 **/  MarkdownIt.prototype.enable = function(list, ignoreInvalid) {
-    var result = [];
-    if (!Array.isArray(list)) {
-      list = [ list ];
-    }
-    [ "core", "block", "inline" ].forEach((function(chain) {
-      result = result.concat(this[chain].ruler.enable(list, true));
-    }), this);
-    result = result.concat(this.inline.ruler2.enable(list, true));
-    var missed = list.filter((function(name) {
-      return result.indexOf(name) < 0;
-    }));
-    if (missed.length && !ignoreInvalid) {
-      throw new Error("MarkdownIt. Failed to enable unknown rule(s): " + missed);
-    }
-    return this;
-  };
-  /** chainable
-	 * MarkdownIt.disable(list, ignoreInvalid)
-	 * - list (String|Array): rule name or list of rule names to disable.
-	 * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found.
-	 *
-	 * The same as [[MarkdownIt.enable]], but turn specified rules off.
-	 **/  MarkdownIt.prototype.disable = function(list, ignoreInvalid) {
-    var result = [];
-    if (!Array.isArray(list)) {
-      list = [ list ];
-    }
-    [ "core", "block", "inline" ].forEach((function(chain) {
-      result = result.concat(this[chain].ruler.disable(list, true));
-    }), this);
-    result = result.concat(this.inline.ruler2.disable(list, true));
-    var missed = list.filter((function(name) {
-      return result.indexOf(name) < 0;
-    }));
-    if (missed.length && !ignoreInvalid) {
-      throw new Error("MarkdownIt. Failed to disable unknown rule(s): " + missed);
-    }
-    return this;
-  };
-  /** chainable
-	 * MarkdownIt.use(plugin, params)
-	 *
-	 * Load specified plugin with given params into current parser instance.
-	 * It's just a sugar to call `plugin(md, params)` with curring.
-	 *
-	 * ##### Example
-	 *
-	 * ```javascript
-	 * var iterator = require('markdown-it-for-inline');
-	 * var md = require('markdown-it')()
-	 *             .use(iterator, 'foo_replace', 'text', function (tokens, idx) {
-	 *               tokens[idx].content = tokens[idx].content.replace(/foo/g, 'bar');
-	 *             });
-	 * ```
-	 **/  MarkdownIt.prototype.use = function(plugin /*, params, ... */) {
-    var args = [ this ].concat(Array.prototype.slice.call(arguments, 1));
-    plugin.apply(plugin, args);
-    return this;
-  };
-  /** internal
-	 * MarkdownIt.parse(src, env) -> Array
-	 * - src (String): source string
-	 * - env (Object): environment sandbox
-	 *
-	 * Parse input string and return list of block tokens (special token type
-	 * "inline" will contain list of inline tokens). You should not call this
-	 * method directly, until you write custom renderer (for example, to produce
-	 * AST).
-	 *
-	 * `env` is used to pass data between "distributed" rules and return additional
-	 * metadata like reference info, needed for the renderer. It also can be used to
-	 * inject data in specific cases. Usually, you will be ok to pass `{}`,
-	 * and then pass updated object to renderer.
-	 **/  MarkdownIt.prototype.parse = function(src, env) {
-    if (typeof src !== "string") {
-      throw new Error("Input data should be a String");
-    }
-    var state = new this.core.State(src, this, env);
-    this.core.process(state);
-    return state.tokens;
-  };
-  /**
-	 * MarkdownIt.render(src [, env]) -> String
-	 * - src (String): source string
-	 * - env (Object): environment sandbox
-	 *
-	 * Render markdown string into html. It does all magic for you :).
-	 *
-	 * `env` can be used to inject additional metadata (`{}` by default).
-	 * But you will not need it with high probability. See also comment
-	 * in [[MarkdownIt.parse]].
-	 **/  MarkdownIt.prototype.render = function(src, env) {
-    env = env || {};
-    return this.renderer.render(this.parse(src, env), this.options, env);
-  };
-  /** internal
-	 * MarkdownIt.parseInline(src, env) -> Array
-	 * - src (String): source string
-	 * - env (Object): environment sandbox
-	 *
-	 * The same as [[MarkdownIt.parse]] but skip all block rules. It returns the
-	 * block tokens list with the single `inline` element, containing parsed inline
-	 * tokens in `children` property. Also updates `env` object.
-	 **/  MarkdownIt.prototype.parseInline = function(src, env) {
-    var state = new this.core.State(src, this, env);
-    state.inlineMode = true;
-    this.core.process(state);
-    return state.tokens;
-  };
-  /**
-	 * MarkdownIt.renderInline(src [, env]) -> String
-	 * - src (String): source string
-	 * - env (Object): environment sandbox
-	 *
-	 * Similar to [[MarkdownIt.render]] but for single paragraph content. Result
-	 * will NOT be wrapped into `` tags.
-	 **/  MarkdownIt.prototype.renderInline = function(src, env) {
-    env = env || {};
-    return this.renderer.render(this.parseInline(src, env), this.options, env);
-  };
-  var lib = MarkdownIt;
-  var markdownIt = lib;
-  return markdownIt;
-}));
-
diff --git a/examples/server/public/deps_tailwindcss.js b/examples/server/public/deps_tailwindcss.js
deleted file mode 100644
index 6736cb8ca..000000000
--- a/examples/server/public/deps_tailwindcss.js
+++ /dev/null
@@ -1,82 +0,0 @@
-(()=>{var Iv=Object.create;var Ui=Object.defineProperty;var Dv=Object.getOwnPropertyDescriptor;var qv=Object.getOwnPropertyNames;var $v=Object.getPrototypeOf,Lv=Object.prototype.hasOwnProperty;var cf=r=>Ui(r,"__esModule",{value:!0});var pf=r=>{if(typeof require!="undefined")return require(r);throw new Error('Dynamic require of "'+r+'" is not supported')};var R=(r,e)=>()=>(r&&(e=r(r=0)),e);var x=(r,e)=>()=>(e||r((e={exports:{}}).exports,e),e.exports),Ge=(r,e)=>{cf(r);for(var t in e)Ui(r,t,{get:e[t],enumerable:!0})},Mv=(r,e,t)=>{if(e&&typeof e=="object"||typeof e=="function")for(let i of qv(e))!Lv.call(r,i)&&i!=="default"&&Ui(r,i,{get:()=>e[i],enumerable:!(t=Dv(e,i))||t.enumerable});return r},pe=r=>Mv(cf(Ui(r!=null?Iv($v(r)):{},"default",r&&r.__esModule&&"default"in r?{get:()=>r.default,enumerable:!0}:{value:r,enumerable:!0})),r);var m,u=R(()=>{m={platform:"",env:{},versions:{node:"14.17.6"}}});var Nv,be,ft=R(()=>{u();Nv=0,be={readFileSync:r=>self[r]||"",statSync:()=>({mtimeMs:Nv++}),promises:{readFile:r=>Promise.resolve(self[r]||"")}}});var Ns=x((sP,hf)=>{u();"use strict";var df=class{constructor(e={}){if(!(e.maxSize&&e.maxSize>0))throw new TypeError("`maxSize` must be a number greater than 0");if(typeof e.maxAge=="number"&&e.maxAge===0)throw new TypeError("`maxAge` must be a number greater than 0");this.maxSize=e.maxSize,this.maxAge=e.maxAge||1/0,this.onEviction=e.onEviction,this.cache=new Map,this.oldCache=new Map,this._size=0}_emitEvictions(e){if(typeof this.onEviction=="function")for(let[t,i]of e)this.onEviction(t,i.value)}_deleteIfExpired(e,t){return typeof t.expiry=="number"&&t.expiry<=Date.now()?(typeof this.onEviction=="function"&&this.onEviction(e,t.value),this.delete(e)):!1}_getOrDeleteIfExpired(e,t){if(this._deleteIfExpired(e,t)===!1)return t.value}_getItemValue(e,t){return t.expiry?this._getOrDeleteIfExpired(e,t):t.value}_peek(e,t){let i=t.get(e);return this._getItemValue(e,i)}_set(e,t){this.cache.set(e,t),this._size++,this._size>=this.maxSize&&(this._size=0,this._emitEvictions(this.oldCache),this.oldCache=this.cache,this.cache=new Map)}_moveToRecent(e,t){this.oldCache.delete(e),this._set(e,t)}*_entriesAscending(){for(let e of this.oldCache){let[t,i]=e;this.cache.has(t)||this._deleteIfExpired(t,i)===!1&&(yield e)}for(let e of this.cache){let[t,i]=e;this._deleteIfExpired(t,i)===!1&&(yield e)}}get(e){if(this.cache.has(e)){let t=this.cache.get(e);return this._getItemValue(e,t)}if(this.oldCache.has(e)){let t=this.oldCache.get(e);if(this._deleteIfExpired(e,t)===!1)return this._moveToRecent(e,t),t.value}}set(e,t,{maxAge:i=this.maxAge===1/0?void 0:Date.now()+this.maxAge}={}){this.cache.has(e)?this.cache.set(e,{value:t,maxAge:i}):this._set(e,{value:t,expiry:i})}has(e){return this.cache.has(e)?!this._deleteIfExpired(e,this.cache.get(e)):this.oldCache.has(e)?!this._deleteIfExpired(e,this.oldCache.get(e)):!1}peek(e){if(this.cache.has(e))return this._peek(e,this.cache);if(this.oldCache.has(e))return this._peek(e,this.oldCache)}delete(e){let t=this.cache.delete(e);return t&&this._size--,this.oldCache.delete(e)||t}clear(){this.cache.clear(),this.oldCache.clear(),this._size=0}resize(e){if(!(e&&e>0))throw new TypeError("`maxSize` must be a number greater than 0");let t=[...this._entriesAscending()],i=t.length-e;i<0?(this.cache=new Map(t),this.oldCache=new Map,this._size=t.length):(i>0&&this._emitEvictions(t.slice(0,i)),this.oldCache=new Map(t.slice(i)),this.cache=new Map,this._size=0),this.maxSize=e}*keys(){for(let[e]of this)yield e}*values(){for(let[,e]of this)yield e}*[Symbol.iterator](){for(let e of this.cache){let[t,i]=e;this._deleteIfExpired(t,i)===!1&&(yield[t,i.value])}for(let e of this.oldCache){let[t,i]=e;this.cache.has(t)||this._deleteIfExpired(t,i)===!1&&(yield[t,i.value])}}*entriesDescending(){let e=[...this.cache];for(let t=e.length-1;t>=0;--t){let i=e[t],[n,a]=i;this._deleteIfExpired(n,a)===!1&&(yield[n,a.value])}e=[...this.oldCache];for(let t=e.length-1;t>=0;--t){let i=e[t],[n,a]=i;this.cache.has(n)||this._deleteIfExpired(n,a)===!1&&(yield[n,a.value])}}*entriesAscending(){for(let[e,t]of this._entriesAscending())yield[e,t.value]}get size(){if(!this._size)return this.oldCache.size;let e=0;for(let t of this.oldCache.keys())this.cache.has(t)||e++;return Math.min(this._size+e,this.maxSize)}};hf.exports=df});var mf,gf=R(()=>{u();mf=r=>r&&r._hash});function Vi(r){return mf(r,{ignoreUnknown:!0})}var yf=R(()=>{u();gf()});function xt(r){if(r=`${r}`,r==="0")return"0";if(/^[+-]?(\d+|\d*\.\d+)(e[+-]?\d+)?(%|\w+)?$/.test(r))return r.replace(/^[+-]?/,t=>t==="-"?"":"-");let e=["var","calc","min","max","clamp"];for(let t of e)if(r.includes(`${t}(`))return`calc(${r} * -1)`}var Hi=R(()=>{u()});var bf,wf=R(()=>{u();bf=["preflight","container","accessibility","pointerEvents","visibility","position","inset","isolation","zIndex","order","gridColumn","gridColumnStart","gridColumnEnd","gridRow","gridRowStart","gridRowEnd","float","clear","margin","boxSizing","lineClamp","display","aspectRatio","size","height","maxHeight","minHeight","width","minWidth","maxWidth","flex","flexShrink","flexGrow","flexBasis","tableLayout","captionSide","borderCollapse","borderSpacing","transformOrigin","translate","rotate","skew","scale","transform","animation","cursor","touchAction","userSelect","resize","scrollSnapType","scrollSnapAlign","scrollSnapStop","scrollMargin","scrollPadding","listStylePosition","listStyleType","listStyleImage","appearance","columns","breakBefore","breakInside","breakAfter","gridAutoColumns","gridAutoFlow","gridAutoRows","gridTemplateColumns","gridTemplateRows","flexDirection","flexWrap","placeContent","placeItems","alignContent","alignItems","justifyContent","justifyItems","gap","space","divideWidth","divideStyle","divideColor","divideOpacity","placeSelf","alignSelf","justifySelf","overflow","overscrollBehavior","scrollBehavior","textOverflow","hyphens","whitespace","textWrap","wordBreak","borderRadius","borderWidth","borderStyle","borderColor","borderOpacity","backgroundColor","backgroundOpacity","backgroundImage","gradientColorStops","boxDecorationBreak","backgroundSize","backgroundAttachment","backgroundClip","backgroundPosition","backgroundRepeat","backgroundOrigin","fill","stroke","strokeWidth","objectFit","objectPosition","padding","textAlign","textIndent","verticalAlign","fontFamily","fontSize","fontWeight","textTransform","fontStyle","fontVariantNumeric","lineHeight","letterSpacing","textColor","textOpacity","textDecoration","textDecorationColor","textDecorationStyle","textDecorationThickness","textUnderlineOffset","fontSmoothing","placeholderColor","placeholderOpacity","caretColor","accentColor","opacity","backgroundBlendMode","mixBlendMode","boxShadow","boxShadowColor","outlineStyle","outlineWidth","outlineOffset","outlineColor","ringWidth","ringColor","ringOpacity","ringOffsetWidth","ringOffsetColor","blur","brightness","contrast","dropShadow","grayscale","hueRotate","invert","saturate","sepia","filter","backdropBlur","backdropBrightness","backdropContrast","backdropGrayscale","backdropHueRotate","backdropInvert","backdropOpacity","backdropSaturate","backdropSepia","backdropFilter","transitionProperty","transitionDelay","transitionDuration","transitionTimingFunction","willChange","contain","content","forcedColorAdjust"]});function vf(r,e){return r===void 0?e:Array.isArray(r)?r:[...new Set(e.filter(i=>r!==!1&&r[i]!==!1).concat(Object.keys(r).filter(i=>r[i]!==!1)))]}var xf=R(()=>{u()});var kf={};Ge(kf,{default:()=>Qe});var Qe,Wi=R(()=>{u();Qe=new Proxy({},{get:()=>String})});function Bs(r,e,t){typeof m!="undefined"&&m.env.JEST_WORKER_ID||t&&Sf.has(t)||(t&&Sf.add(t),console.warn(""),e.forEach(i=>console.warn(r,"-",i)))}function Fs(r){return Qe.dim(r)}var Sf,G,Be=R(()=>{u();Wi();Sf=new Set;G={info(r,e){Bs(Qe.bold(Qe.cyan("info")),...Array.isArray(r)?[r]:[e,r])},warn(r,e){["content-problems"].includes(r)||Bs(Qe.bold(Qe.yellow("warn")),...Array.isArray(r)?[r]:[e,r])},risk(r,e){Bs(Qe.bold(Qe.magenta("risk")),...Array.isArray(r)?[r]:[e,r])}}});var Af={};Ge(Af,{default:()=>js});function qr({version:r,from:e,to:t}){G.warn(`${e}-color-renamed`,[`As of Tailwind CSS ${r}, \`${e}\` has been renamed to \`${t}\`.`,"Update your configuration file to silence this warning."])}var js,zs=R(()=>{u();Be();js={inherit:"inherit",current:"currentColor",transparent:"transparent",black:"#000",white:"#fff",slate:{50:"#f8fafc",100:"#f1f5f9",200:"#e2e8f0",300:"#cbd5e1",400:"#94a3b8",500:"#64748b",600:"#475569",700:"#334155",800:"#1e293b",900:"#0f172a",950:"#020617"},gray:{50:"#f9fafb",100:"#f3f4f6",200:"#e5e7eb",300:"#d1d5db",400:"#9ca3af",500:"#6b7280",600:"#4b5563",700:"#374151",800:"#1f2937",900:"#111827",950:"#030712"},zinc:{50:"#fafafa",100:"#f4f4f5",200:"#e4e4e7",300:"#d4d4d8",400:"#a1a1aa",500:"#71717a",600:"#52525b",700:"#3f3f46",800:"#27272a",900:"#18181b",950:"#09090b"},neutral:{50:"#fafafa",100:"#f5f5f5",200:"#e5e5e5",300:"#d4d4d4",400:"#a3a3a3",500:"#737373",600:"#525252",700:"#404040",800:"#262626",900:"#171717",950:"#0a0a0a"},stone:{50:"#fafaf9",100:"#f5f5f4",200:"#e7e5e4",300:"#d6d3d1",400:"#a8a29e",500:"#78716c",600:"#57534e",700:"#44403c",800:"#292524",900:"#1c1917",950:"#0c0a09"},red:{50:"#fef2f2",100:"#fee2e2",200:"#fecaca",300:"#fca5a5",400:"#f87171",500:"#ef4444",600:"#dc2626",700:"#b91c1c",800:"#991b1b",900:"#7f1d1d",950:"#450a0a"},orange:{50:"#fff7ed",100:"#ffedd5",200:"#fed7aa",300:"#fdba74",400:"#fb923c",500:"#f97316",600:"#ea580c",700:"#c2410c",800:"#9a3412",900:"#7c2d12",950:"#431407"},amber:{50:"#fffbeb",100:"#fef3c7",200:"#fde68a",300:"#fcd34d",400:"#fbbf24",500:"#f59e0b",600:"#d97706",700:"#b45309",800:"#92400e",900:"#78350f",950:"#451a03"},yellow:{50:"#fefce8",100:"#fef9c3",200:"#fef08a",300:"#fde047",400:"#facc15",500:"#eab308",600:"#ca8a04",700:"#a16207",800:"#854d0e",900:"#713f12",950:"#422006"},lime:{50:"#f7fee7",100:"#ecfccb",200:"#d9f99d",300:"#bef264",400:"#a3e635",500:"#84cc16",600:"#65a30d",700:"#4d7c0f",800:"#3f6212",900:"#365314",950:"#1a2e05"},green:{50:"#f0fdf4",100:"#dcfce7",200:"#bbf7d0",300:"#86efac",400:"#4ade80",500:"#22c55e",600:"#16a34a",700:"#15803d",800:"#166534",900:"#14532d",950:"#052e16"},emerald:{50:"#ecfdf5",100:"#d1fae5",200:"#a7f3d0",300:"#6ee7b7",400:"#34d399",500:"#10b981",600:"#059669",700:"#047857",800:"#065f46",900:"#064e3b",950:"#022c22"},teal:{50:"#f0fdfa",100:"#ccfbf1",200:"#99f6e4",300:"#5eead4",400:"#2dd4bf",500:"#14b8a6",600:"#0d9488",700:"#0f766e",800:"#115e59",900:"#134e4a",950:"#042f2e"},cyan:{50:"#ecfeff",100:"#cffafe",200:"#a5f3fc",300:"#67e8f9",400:"#22d3ee",500:"#06b6d4",600:"#0891b2",700:"#0e7490",800:"#155e75",900:"#164e63",950:"#083344"},sky:{50:"#f0f9ff",100:"#e0f2fe",200:"#bae6fd",300:"#7dd3fc",400:"#38bdf8",500:"#0ea5e9",600:"#0284c7",700:"#0369a1",800:"#075985",900:"#0c4a6e",950:"#082f49"},blue:{50:"#eff6ff",100:"#dbeafe",200:"#bfdbfe",300:"#93c5fd",400:"#60a5fa",500:"#3b82f6",600:"#2563eb",700:"#1d4ed8",800:"#1e40af",900:"#1e3a8a",950:"#172554"},indigo:{50:"#eef2ff",100:"#e0e7ff",200:"#c7d2fe",300:"#a5b4fc",400:"#818cf8",500:"#6366f1",600:"#4f46e5",700:"#4338ca",800:"#3730a3",900:"#312e81",950:"#1e1b4b"},violet:{50:"#f5f3ff",100:"#ede9fe",200:"#ddd6fe",300:"#c4b5fd",400:"#a78bfa",500:"#8b5cf6",600:"#7c3aed",700:"#6d28d9",800:"#5b21b6",900:"#4c1d95",950:"#2e1065"},purple:{50:"#faf5ff",100:"#f3e8ff",200:"#e9d5ff",300:"#d8b4fe",400:"#c084fc",500:"#a855f7",600:"#9333ea",700:"#7e22ce",800:"#6b21a8",900:"#581c87",950:"#3b0764"},fuchsia:{50:"#fdf4ff",100:"#fae8ff",200:"#f5d0fe",300:"#f0abfc",400:"#e879f9",500:"#d946ef",600:"#c026d3",700:"#a21caf",800:"#86198f",900:"#701a75",950:"#4a044e"},pink:{50:"#fdf2f8",100:"#fce7f3",200:"#fbcfe8",300:"#f9a8d4",400:"#f472b6",500:"#ec4899",600:"#db2777",700:"#be185d",800:"#9d174d",900:"#831843",950:"#500724"},rose:{50:"#fff1f2",100:"#ffe4e6",200:"#fecdd3",300:"#fda4af",400:"#fb7185",500:"#f43f5e",600:"#e11d48",700:"#be123c",800:"#9f1239",900:"#881337",950:"#4c0519"},get lightBlue(){return qr({version:"v2.2",from:"lightBlue",to:"sky"}),this.sky},get warmGray(){return qr({version:"v3.0",from:"warmGray",to:"stone"}),this.stone},get trueGray(){return qr({version:"v3.0",from:"trueGray",to:"neutral"}),this.neutral},get coolGray(){return qr({version:"v3.0",from:"coolGray",to:"gray"}),this.gray},get blueGray(){return qr({version:"v3.0",from:"blueGray",to:"slate"}),this.slate}}});function Us(r,...e){for(let t of e){for(let i in t)r?.hasOwnProperty?.(i)||(r[i]=t[i]);for(let i of Object.getOwnPropertySymbols(t))r?.hasOwnProperty?.(i)||(r[i]=t[i])}return r}var Cf=R(()=>{u()});function kt(r){if(Array.isArray(r))return r;let e=r.split("[").length-1,t=r.split("]").length-1;if(e!==t)throw new Error(`Path is invalid. Has unbalanced brackets: ${r}`);return r.split(/\.(?![^\[]*\])|[\[\]]/g).filter(Boolean)}var Gi=R(()=>{u()});function we(r,e){return Qi.future.includes(e)?r.future==="all"||(r?.future?.[e]??_f[e]??!1):Qi.experimental.includes(e)?r.experimental==="all"||(r?.experimental?.[e]??_f[e]??!1):!1}function Ef(r){return r.experimental==="all"?Qi.experimental:Object.keys(r?.experimental??{}).filter(e=>Qi.experimental.includes(e)&&r.experimental[e])}function Of(r){if(m.env.JEST_WORKER_ID===void 0&&Ef(r).length>0){let e=Ef(r).map(t=>Qe.yellow(t)).join(", ");G.warn("experimental-flags-enabled",[`You have enabled experimental features: ${e}`,"Experimental features in Tailwind CSS are not covered by semver, may introduce breaking changes, and can change at any time."])}}var _f,Qi,ct=R(()=>{u();Wi();Be();_f={optimizeUniversalDefaults:!1,generalizedModifiers:!0,disableColorOpacityUtilitiesByDefault:!1,relativeContentPathsByDefault:!1},Qi={future:["hoverOnlyWhenSupported","respectDefaultRingColorOpacity","disableColorOpacityUtilitiesByDefault","relativeContentPathsByDefault"],experimental:["optimizeUniversalDefaults","generalizedModifiers"]}});function Tf(r){(()=>{if(r.purge||!r.content||!Array.isArray(r.content)&&!(typeof r.content=="object"&&r.content!==null))return!1;if(Array.isArray(r.content))return r.content.every(t=>typeof t=="string"?!0:!(typeof t?.raw!="string"||t?.extension&&typeof t?.extension!="string"));if(typeof r.content=="object"&&r.content!==null){if(Object.keys(r.content).some(t=>!["files","relative","extract","transform"].includes(t)))return!1;if(Array.isArray(r.content.files)){if(!r.content.files.every(t=>typeof t=="string"?!0:!(typeof t?.raw!="string"||t?.extension&&typeof t?.extension!="string")))return!1;if(typeof r.content.extract=="object"){for(let t of Object.values(r.content.extract))if(typeof t!="function")return!1}else if(!(r.content.extract===void 0||typeof r.content.extract=="function"))return!1;if(typeof r.content.transform=="object"){for(let t of Object.values(r.content.transform))if(typeof t!="function")return!1}else if(!(r.content.transform===void 0||typeof r.content.transform=="function"))return!1;if(typeof r.content.relative!="boolean"&&typeof r.content.relative!="undefined")return!1}return!0}return!1})()||G.warn("purge-deprecation",["The `purge`/`content` options have changed in Tailwind CSS v3.0.","Update your configuration file to eliminate this warning.","https://tailwindcss.com/docs/upgrade-guide#configure-content-sources"]),r.safelist=(()=>{let{content:t,purge:i,safelist:n}=r;return Array.isArray(n)?n:Array.isArray(t?.safelist)?t.safelist:Array.isArray(i?.safelist)?i.safelist:Array.isArray(i?.options?.safelist)?i.options.safelist:[]})(),r.blocklist=(()=>{let{blocklist:t}=r;if(Array.isArray(t)){if(t.every(i=>typeof i=="string"))return t;G.warn("blocklist-invalid",["The `blocklist` option must be an array of strings.","https://tailwindcss.com/docs/content-configuration#discarding-classes"])}return[]})(),typeof r.prefix=="function"?(G.warn("prefix-function",["As of Tailwind CSS v3.0, `prefix` cannot be a function.","Update `prefix` in your configuration to be a string to eliminate this warning.","https://tailwindcss.com/docs/upgrade-guide#prefix-cannot-be-a-function"]),r.prefix=""):r.prefix=r.prefix??"",r.content={relative:(()=>{let{content:t}=r;return t?.relative?t.relative:we(r,"relativeContentPathsByDefault")})(),files:(()=>{let{content:t,purge:i}=r;return Array.isArray(i)?i:Array.isArray(i?.content)?i.content:Array.isArray(t)?t:Array.isArray(t?.content)?t.content:Array.isArray(t?.files)?t.files:[]})(),extract:(()=>{let t=(()=>r.purge?.extract?r.purge.extract:r.content?.extract?r.content.extract:r.purge?.extract?.DEFAULT?r.purge.extract.DEFAULT:r.content?.extract?.DEFAULT?r.content.extract.DEFAULT:r.purge?.options?.extractors?r.purge.options.extractors:r.content?.options?.extractors?r.content.options.extractors:{})(),i={},n=(()=>{if(r.purge?.options?.defaultExtractor)return r.purge.options.defaultExtractor;if(r.content?.options?.defaultExtractor)return r.content.options.defaultExtractor})();if(n!==void 0&&(i.DEFAULT=n),typeof t=="function")i.DEFAULT=t;else if(Array.isArray(t))for(let{extensions:a,extractor:s}of t??[])for(let o of a)i[o]=s;else typeof t=="object"&&t!==null&&Object.assign(i,t);return i})(),transform:(()=>{let t=(()=>r.purge?.transform?r.purge.transform:r.content?.transform?r.content.transform:r.purge?.transform?.DEFAULT?r.purge.transform.DEFAULT:r.content?.transform?.DEFAULT?r.content.transform.DEFAULT:{})(),i={};return typeof t=="function"?i.DEFAULT=t:typeof t=="object"&&t!==null&&Object.assign(i,t),i})()};for(let t of r.content.files)if(typeof t=="string"&&/{([^,]*?)}/g.test(t)){G.warn("invalid-glob-braces",[`The glob pattern ${Fs(t)} in your Tailwind CSS configuration is invalid.`,`Update it to ${Fs(t.replace(/{([^,]*?)}/g,"$1"))} to silence this warning.`]);break}return r}var Rf=R(()=>{u();ct();Be()});function ke(r){if(Object.prototype.toString.call(r)!=="[object Object]")return!1;let e=Object.getPrototypeOf(r);return e===null||Object.getPrototypeOf(e)===null}var Kt=R(()=>{u()});function St(r){return Array.isArray(r)?r.map(e=>St(e)):typeof r=="object"&&r!==null?Object.fromEntries(Object.entries(r).map(([e,t])=>[e,St(t)])):r}var Yi=R(()=>{u()});function jt(r){return r.replace(/\\,/g,"\\2c ")}var Ki=R(()=>{u()});var Vs,Pf=R(()=>{u();Vs={aliceblue:[240,248,255],antiquewhite:[250,235,215],aqua:[0,255,255],aquamarine:[127,255,212],azure:[240,255,255],beige:[245,245,220],bisque:[255,228,196],black:[0,0,0],blanchedalmond:[255,235,205],blue:[0,0,255],blueviolet:[138,43,226],brown:[165,42,42],burlywood:[222,184,135],cadetblue:[95,158,160],chartreuse:[127,255,0],chocolate:[210,105,30],coral:[255,127,80],cornflowerblue:[100,149,237],cornsilk:[255,248,220],crimson:[220,20,60],cyan:[0,255,255],darkblue:[0,0,139],darkcyan:[0,139,139],darkgoldenrod:[184,134,11],darkgray:[169,169,169],darkgreen:[0,100,0],darkgrey:[169,169,169],darkkhaki:[189,183,107],darkmagenta:[139,0,139],darkolivegreen:[85,107,47],darkorange:[255,140,0],darkorchid:[153,50,204],darkred:[139,0,0],darksalmon:[233,150,122],darkseagreen:[143,188,143],darkslateblue:[72,61,139],darkslategray:[47,79,79],darkslategrey:[47,79,79],darkturquoise:[0,206,209],darkviolet:[148,0,211],deeppink:[255,20,147],deepskyblue:[0,191,255],dimgray:[105,105,105],dimgrey:[105,105,105],dodgerblue:[30,144,255],firebrick:[178,34,34],floralwhite:[255,250,240],forestgreen:[34,139,34],fuchsia:[255,0,255],gainsboro:[220,220,220],ghostwhite:[248,248,255],gold:[255,215,0],goldenrod:[218,165,32],gray:[128,128,128],green:[0,128,0],greenyellow:[173,255,47],grey:[128,128,128],honeydew:[240,255,240],hotpink:[255,105,180],indianred:[205,92,92],indigo:[75,0,130],ivory:[255,255,240],khaki:[240,230,140],lavender:[230,230,250],lavenderblush:[255,240,245],lawngreen:[124,252,0],lemonchiffon:[255,250,205],lightblue:[173,216,230],lightcoral:[240,128,128],lightcyan:[224,255,255],lightgoldenrodyellow:[250,250,210],lightgray:[211,211,211],lightgreen:[144,238,144],lightgrey:[211,211,211],lightpink:[255,182,193],lightsalmon:[255,160,122],lightseagreen:[32,178,170],lightskyblue:[135,206,250],lightslategray:[119,136,153],lightslategrey:[119,136,153],lightsteelblue:[176,196,222],lightyellow:[255,255,224],lime:[0,255,0],limegreen:[50,205,50],linen:[250,240,230],magenta:[255,0,255],maroon:[128,0,0],mediumaquamarine:[102,205,170],mediumblue:[0,0,205],mediumorchid:[186,85,211],mediumpurple:[147,112,219],mediumseagreen:[60,179,113],mediumslateblue:[123,104,238],mediumspringgreen:[0,250,154],mediumturquoise:[72,209,204],mediumvioletred:[199,21,133],midnightblue:[25,25,112],mintcream:[245,255,250],mistyrose:[255,228,225],moccasin:[255,228,181],navajowhite:[255,222,173],navy:[0,0,128],oldlace:[253,245,230],olive:[128,128,0],olivedrab:[107,142,35],orange:[255,165,0],orangered:[255,69,0],orchid:[218,112,214],palegoldenrod:[238,232,170],palegreen:[152,251,152],paleturquoise:[175,238,238],palevioletred:[219,112,147],papayawhip:[255,239,213],peachpuff:[255,218,185],peru:[205,133,63],pink:[255,192,203],plum:[221,160,221],powderblue:[176,224,230],purple:[128,0,128],rebeccapurple:[102,51,153],red:[255,0,0],rosybrown:[188,143,143],royalblue:[65,105,225],saddlebrown:[139,69,19],salmon:[250,128,114],sandybrown:[244,164,96],seagreen:[46,139,87],seashell:[255,245,238],sienna:[160,82,45],silver:[192,192,192],skyblue:[135,206,235],slateblue:[106,90,205],slategray:[112,128,144],slategrey:[112,128,144],snow:[255,250,250],springgreen:[0,255,127],steelblue:[70,130,180],tan:[210,180,140],teal:[0,128,128],thistle:[216,191,216],tomato:[255,99,71],turquoise:[64,224,208],violet:[238,130,238],wheat:[245,222,179],white:[255,255,255],whitesmoke:[245,245,245],yellow:[255,255,0],yellowgreen:[154,205,50]}});function $r(r,{loose:e=!1}={}){if(typeof r!="string")return null;if(r=r.trim(),r==="transparent")return{mode:"rgb",color:["0","0","0"],alpha:"0"};if(r in Vs)return{mode:"rgb",color:Vs[r].map(a=>a.toString())};let t=r.replace(Fv,(a,s,o,l,c)=>["#",s,s,o,o,l,l,c?c+c:""].join("")).match(Bv);if(t!==null)return{mode:"rgb",color:[parseInt(t[1],16),parseInt(t[2],16),parseInt(t[3],16)].map(a=>a.toString()),alpha:t[4]?(parseInt(t[4],16)/255).toString():void 0};let i=r.match(jv)??r.match(zv);if(i===null)return null;let n=[i[2],i[3],i[4]].filter(Boolean).map(a=>a.toString());return n.length===2&&n[0].startsWith("var(")?{mode:i[1],color:[n[0]],alpha:n[1]}:!e&&n.length!==3||n.length<3&&!n.some(a=>/^var\(.*?\)$/.test(a))?null:{mode:i[1],color:n,alpha:i[5]?.toString?.()}}function Hs({mode:r,color:e,alpha:t}){let i=t!==void 0;return r==="rgba"||r==="hsla"?`${r}(${e.join(", ")}${i?`, ${t}`:""})`:`${r}(${e.join(" ")}${i?` / ${t}`:""})`}var Bv,Fv,At,Xi,If,Ct,jv,zv,Ws=R(()=>{u();Pf();Bv=/^#([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})?$/i,Fv=/^#([a-f\d])([a-f\d])([a-f\d])([a-f\d])?$/i,At=/(?:\d+|\d*\.\d+)%?/,Xi=/(?:\s*,\s*|\s+)/,If=/\s*[,/]\s*/,Ct=/var\(--(?:[^ )]*?)(?:,(?:[^ )]*?|var\(--[^ )]*?\)))?\)/,jv=new RegExp(`^(rgba?)\\(\\s*(${At.source}|${Ct.source})(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${If.source}(${At.source}|${Ct.source}))?\\s*\\)$`),zv=new RegExp(`^(hsla?)\\(\\s*((?:${At.source})(?:deg|rad|grad|turn)?|${Ct.source})(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${If.source}(${At.source}|${Ct.source}))?\\s*\\)$`)});function Ze(r,e,t){if(typeof r=="function")return r({opacityValue:e});let i=$r(r,{loose:!0});return i===null?t:Hs({...i,alpha:e})}function Ae({color:r,property:e,variable:t}){let i=[].concat(e);if(typeof r=="function")return{[t]:"1",...Object.fromEntries(i.map(a=>[a,r({opacityVariable:t,opacityValue:`var(${t})`})]))};let n=$r(r);return n===null?Object.fromEntries(i.map(a=>[a,r])):n.alpha!==void 0?Object.fromEntries(i.map(a=>[a,r])):{[t]:"1",...Object.fromEntries(i.map(a=>[a,Hs({...n,alpha:`var(${t})`})]))}}var Lr=R(()=>{u();Ws()});function ve(r,e){let t=[],i=[],n=0,a=!1;for(let s=0;s<r.length;s++){let o=r[s];t.length===0&&o===e[0]&&!a&&(e.length===1||r.slice(s,s+e.length)===e)&&(i.push(r.slice(n,s)),n=s+e.length),a=a?!1:o==="\\",o==="("||o==="["||o==="{"?t.push(o):(o===")"&&t[t.length-1]==="("||o==="]"&&t[t.length-1]==="["||o==="}"&&t[t.length-1]==="{")&&t.pop()}return i.push(r.slice(n)),i}var zt=R(()=>{u()});function Ji(r){return ve(r,",").map(t=>{let i=t.trim(),n={raw:i},a=i.split(Vv),s=new Set;for(let o of a)Df.lastIndex=0,!s.has("KEYWORD")&&Uv.has(o)?(n.keyword=o,s.add("KEYWORD")):Df.test(o)?s.has("X")?s.has("Y")?s.has("BLUR")?s.has("SPREAD")||(n.spread=o,s.add("SPREAD")):(n.blur=o,s.add("BLUR")):(n.y=o,s.add("Y")):(n.x=o,s.add("X")):n.color?(n.unknown||(n.unknown=[]),n.unknown.push(o)):n.color=o;return n.valid=n.x!==void 0&&n.y!==void 0,n})}function qf(r){return r.map(e=>e.valid?[e.keyword,e.x,e.y,e.blur,e.spread,e.color].filter(Boolean).join(" "):e.raw).join(", ")}var Uv,Vv,Df,Gs=R(()=>{u();zt();Uv=new Set(["inset","inherit","initial","revert","unset"]),Vv=/\ +(?![^(]*\))/g,Df=/^-?(\d+|\.\d+)(.*?)$/g});function Qs(r){return Hv.some(e=>new RegExp(`^${e}\\(.*\\)`).test(r))}function K(r,e=null,t=!0){let i=e&&Wv.has(e.property);return r.startsWith("--")&&!i?`var(${r})`:r.includes("url(")?r.split(/(url\(.*?\))/g).filter(Boolean).map(n=>/^url\(.*?\)$/.test(n)?n:K(n,e,!1)).join(""):(r=r.replace(/([^\\])_+/g,(n,a)=>a+" ".repeat(n.length-1)).replace(/^_/g," ").replace(/\\_/g,"_"),t&&(r=r.trim()),r=Gv(r),r)}function Ye(r){return r.includes("=")&&(r=r.replace(/(=.*)/g,(e,t)=>{if(t[1]==="'"||t[1]==='"')return t;if(t.length>2){let i=t[t.length-1];if(t[t.length-2]===" "&&(i==="i"||i==="I"||i==="s"||i==="S"))return`="${t.slice(1,-2)}" ${t[t.length-1]}`}return`="${t.slice(1)}"`})),r}function Gv(r){let e=["theme"],t=["min-content","max-content","fit-content","safe-area-inset-top","safe-area-inset-right","safe-area-inset-bottom","safe-area-inset-left","titlebar-area-x","titlebar-area-y","titlebar-area-width","titlebar-area-height","keyboard-inset-top","keyboard-inset-right","keyboard-inset-bottom","keyboard-inset-left","keyboard-inset-width","keyboard-inset-height","radial-gradient","linear-gradient","conic-gradient","repeating-radial-gradient","repeating-linear-gradient","repeating-conic-gradient","anchor-size"];return r.replace(/(calc|min|max|clamp)\(.+\)/g,i=>{let n="";function a(){let s=n.trimEnd();return s[s.length-1]}for(let s=0;s<i.length;s++){let o=function(f){return f.split("").every((d,p)=>i[s+p]===d)},l=function(f){let d=1/0;for(let h of f){let b=i.indexOf(h,s);b!==-1&&b<d&&(d=b)}let p=i.slice(s,d);return s+=p.length-1,p},c=i[s];if(o("var"))n+=l([")",","]);else if(t.some(f=>o(f))){let f=t.find(d=>o(d));n+=f,s+=f.length-1}else e.some(f=>o(f))?n+=l([")"]):o("[")?n+=l(["]"]):["+","-","*","/"].includes(c)&&!["(","+","-","*","/",","].includes(a())?n+=` ${c} `:n+=c}return n.replace(/\s+/g," ")})}function Ys(r){return r.startsWith("url(")}function Ks(r){return!isNaN(Number(r))||Qs(r)}function Mr(r){return r.endsWith("%")&&Ks(r.slice(0,-1))||Qs(r)}function Nr(r){return r==="0"||new RegExp(`^[+-]?[0-9]*.?[0-9]+(?:[eE][+-]?[0-9]+)?${Yv}$`).test(r)||Qs(r)}function $f(r){return Kv.has(r)}function Lf(r){let e=Ji(K(r));for(let t of e)if(!t.valid)return!1;return!0}function Mf(r){let e=0;return ve(r,"_").every(i=>(i=K(i),i.startsWith("var(")?!0:$r(i,{loose:!0})!==null?(e++,!0):!1))?e>0:!1}function Nf(r){let e=0;return ve(r,",").every(i=>(i=K(i),i.startsWith("var(")?!0:Ys(i)||Jv(i)||["element(","image(","cross-fade(","image-set("].some(n=>i.startsWith(n))?(e++,!0):!1))?e>0:!1}function Jv(r){r=K(r);for(let e of Xv)if(r.startsWith(`${e}(`))return!0;return!1}function Bf(r){let e=0;return ve(r,"_").every(i=>(i=K(i),i.startsWith("var(")?!0:Zv.has(i)||Nr(i)||Mr(i)?(e++,!0):!1))?e>0:!1}function Ff(r){let e=0;return ve(r,",").every(i=>(i=K(i),i.startsWith("var(")?!0:i.includes(" ")&&!/(['"])([^"']+)\1/g.test(i)||/^\d/g.test(i)?!1:(e++,!0)))?e>0:!1}function jf(r){return ex.has(r)}function zf(r){return tx.has(r)}function Uf(r){return rx.has(r)}var Hv,Wv,Qv,Yv,Kv,Xv,Zv,ex,tx,rx,Br=R(()=>{u();Ws();Gs();zt();Hv=["min","max","clamp","calc"];Wv=new Set(["scroll-timeline-name","timeline-scope","view-timeline-name","font-palette","anchor-name","anchor-scope","position-anchor","position-try-options","scroll-timeline","animation-timeline","view-timeline","position-try"]);Qv=["cm","mm","Q","in","pc","pt","px","em","ex","ch","rem","lh","rlh","vw","vh","vmin","vmax","vb","vi","svw","svh","lvw","lvh","dvw","dvh","cqw","cqh","cqi","cqb","cqmin","cqmax"],Yv=`(?:${Qv.join("|")})`;Kv=new Set(["thin","medium","thick"]);Xv=new Set(["conic-gradient","linear-gradient","radial-gradient","repeating-conic-gradient","repeating-linear-gradient","repeating-radial-gradient"]);Zv=new Set(["center","top","right","bottom","left"]);ex=new Set(["serif","sans-serif","monospace","cursive","fantasy","system-ui","ui-serif","ui-sans-serif","ui-monospace","ui-rounded","math","emoji","fangsong"]);tx=new Set(["xx-small","x-small","small","medium","large","x-large","xx-large","xxx-large"]);rx=new Set(["larger","smaller"])});function Vf(r){let e=["cover","contain"];return ve(r,",").every(t=>{let i=ve(t,"_").filter(Boolean);return i.length===1&&e.includes(i[0])?!0:i.length!==1&&i.length!==2?!1:i.every(n=>Nr(n)||Mr(n)||n==="auto")})}var Hf=R(()=>{u();Br();zt()});function Wf(r,e){r.walkClasses(t=>{t.value=e(t.value),t.raws&&t.raws.value&&(t.raws.value=jt(t.raws.value))})}function Gf(r,e){if(!_t(r))return;let t=r.slice(1,-1);if(!!e(t))return K(t)}function ix(r,e={},t){let i=e[r];if(i!==void 0)return xt(i);if(_t(r)){let n=Gf(r,t);return n===void 0?void 0:xt(n)}}function Zi(r,e={},{validate:t=()=>!0}={}){let i=e.values?.[r];return i!==void 0?i:e.supportsNegativeValues&&r.startsWith("-")?ix(r.slice(1),e.values,t):Gf(r,t)}function _t(r){return r.startsWith("[")&&r.endsWith("]")}function Qf(r){let e=r.lastIndexOf("/"),t=r.lastIndexOf("[",e),i=r.indexOf("]",e);return r[e-1]==="]"||r[e+1]==="["||t!==-1&&i!==-1&&t<e&&e<i&&(e=r.lastIndexOf("/",t)),e===-1||e===r.length-1?[r,void 0]:_t(r)&&!r.includes("]/[")?[r,void 0]:[r.slice(0,e),r.slice(e+1)]}function Xt(r){if(typeof r=="string"&&r.includes("<alpha-value>")){let e=r;return({opacityValue:t=1})=>e.replace(/<alpha-value>/g,t)}return r}function Yf(r){return K(r.slice(1,-1))}function nx(r,e={},{tailwindConfig:t={}}={}){if(e.values?.[r]!==void 0)return Xt(e.values?.[r]);let[i,n]=Qf(r);if(n!==void 0){let a=e.values?.[i]??(_t(i)?i.slice(1,-1):void 0);return a===void 0?void 0:(a=Xt(a),_t(n)?Ze(a,Yf(n)):t.theme?.opacity?.[n]===void 0?void 0:Ze(a,t.theme.opacity[n]))}return Zi(r,e,{validate:Mf})}function sx(r,e={}){return e.values?.[r]}function qe(r){return(e,t)=>Zi(e,t,{validate:r})}function ax(r,e){let t=r.indexOf(e);return t===-1?[void 0,r]:[r.slice(0,t),r.slice(t+1)]}function Js(r,e,t,i){if(t.values&&e in t.values)for(let{type:a}of r??[]){let s=Xs[a](e,t,{tailwindConfig:i});if(s!==void 0)return[s,a,null]}if(_t(e)){let a=e.slice(1,-1),[s,o]=ax(a,":");if(!/^[\w-_]+$/g.test(s))o=a;else if(s!==void 0&&!Kf.includes(s))return[];if(o.length>0&&Kf.includes(s))return[Zi(`[${o}]`,t),s,null]}let n=Zs(r,e,t,i);for(let a of n)return a;return[]}function*Zs(r,e,t,i){let n=we(i,"generalizedModifiers"),[a,s]=Qf(e);if(n&&t.modifiers!=null&&(t.modifiers==="any"||typeof t.modifiers=="object"&&(s&&_t(s)||s in t.modifiers))||(a=e,s=void 0),s!==void 0&&a===""&&(a="DEFAULT"),s!==void 0&&typeof t.modifiers=="object"){let l=t.modifiers?.[s]??null;l!==null?s=l:_t(s)&&(s=Yf(s))}for(let{type:l}of r??[]){let c=Xs[l](a,t,{tailwindConfig:i});c!==void 0&&(yield[c,l,s??null])}}var Xs,Kf,Fr=R(()=>{u();Ki();Lr();Br();Hi();Hf();ct();Xs={any:Zi,color:nx,url:qe(Ys),image:qe(Nf),length:qe(Nr),percentage:qe(Mr),position:qe(Bf),lookup:sx,"generic-name":qe(jf),"family-name":qe(Ff),number:qe(Ks),"line-width":qe($f),"absolute-size":qe(zf),"relative-size":qe(Uf),shadow:qe(Lf),size:qe(Vf)},Kf=Object.keys(Xs)});function X(r){return typeof r=="function"?r({}):r}var ea=R(()=>{u()});function Jt(r){return typeof r=="function"}function jr(r,...e){let t=e.pop();for(let i of e)for(let n in i){let a=t(r[n],i[n]);a===void 0?ke(r[n])&&ke(i[n])?r[n]=jr({},r[n],i[n],t):r[n]=i[n]:r[n]=a}return r}function ox(r,...e){return Jt(r)?r(...e):r}function lx(r){return r.reduce((e,{extend:t})=>jr(e,t,(i,n)=>i===void 0?[n]:Array.isArray(i)?[n,...i]:[n,i]),{})}function ux(r){return{...r.reduce((e,t)=>Us(e,t),{}),extend:lx(r)}}function Xf(r,e){if(Array.isArray(r)&&ke(r[0]))return r.concat(e);if(Array.isArray(e)&&ke(e[0])&&ke(r))return[r,...e];if(Array.isArray(e))return e}function fx({extend:r,...e}){return jr(e,r,(t,i)=>!Jt(t)&&!i.some(Jt)?jr({},t,...i,Xf):(n,a)=>jr({},...[t,...i].map(s=>ox(s,n,a)),Xf))}function*cx(r){let e=kt(r);if(e.length===0||(yield e,Array.isArray(r)))return;let t=/^(.*?)\s*\/\s*([^/]+)$/,i=r.match(t);if(i!==null){let[,n,a]=i,s=kt(n);s.alpha=a,yield s}}function px(r){let e=(t,i)=>{for(let n of cx(t)){let a=0,s=r;for(;s!=null&&a<n.length;)s=s[n[a++]],s=Jt(s)&&(n.alpha===void 0||a<=n.length-1)?s(e,ta):s;if(s!==void 0){if(n.alpha!==void 0){let o=Xt(s);return Ze(o,n.alpha,X(o))}return ke(s)?St(s):s}}return i};return Object.assign(e,{theme:e,...ta}),Object.keys(r).reduce((t,i)=>(t[i]=Jt(r[i])?r[i](e,ta):r[i],t),{})}function Jf(r){let e=[];return r.forEach(t=>{e=[...e,t];let i=t?.plugins??[];i.length!==0&&i.forEach(n=>{n.__isOptionsFunction&&(n=n()),e=[...e,...Jf([n?.config??{}])]})}),e}function dx(r){return[...r].reduceRight((t,i)=>Jt(i)?i({corePlugins:t}):vf(i,t),bf)}function hx(r){return[...r].reduceRight((t,i)=>[...t,...i],[])}function ra(r){let e=[...Jf(r),{prefix:"",important:!1,separator:":"}];return Tf(Us({theme:px(fx(ux(e.map(t=>t?.theme??{})))),corePlugins:dx(e.map(t=>t.corePlugins)),plugins:hx(r.map(t=>t?.plugins??[]))},...e))}var ta,Zf=R(()=>{u();Hi();wf();xf();zs();Cf();Gi();Rf();Kt();Yi();Fr();Lr();ea();ta={colors:js,negative(r){return Object.keys(r).filter(e=>r[e]!=="0").reduce((e,t)=>{let i=xt(r[t]);return i!==void 0&&(e[`-${t}`]=i),e},{})},breakpoints(r){return Object.keys(r).filter(e=>typeof r[e]=="string").reduce((e,t)=>({...e,[`screen-${t}`]:r[t]}),{})}}});var en=x((l3,ec)=>{u();ec.exports={content:[],presets:[],darkMode:"media",theme:{accentColor:({theme:r})=>({...r("colors"),auto:"auto"}),animation:{none:"none",spin:"spin 1s linear infinite",ping:"ping 1s cubic-bezier(0, 0, 0.2, 1) infinite",pulse:"pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite",bounce:"bounce 1s infinite"},aria:{busy:'busy="true"',checked:'checked="true"',disabled:'disabled="true"',expanded:'expanded="true"',hidden:'hidden="true"',pressed:'pressed="true"',readonly:'readonly="true"',required:'required="true"',selected:'selected="true"'},aspectRatio:{auto:"auto",square:"1 / 1",video:"16 / 9"},backdropBlur:({theme:r})=>r("blur"),backdropBrightness:({theme:r})=>r("brightness"),backdropContrast:({theme:r})=>r("contrast"),backdropGrayscale:({theme:r})=>r("grayscale"),backdropHueRotate:({theme:r})=>r("hueRotate"),backdropInvert:({theme:r})=>r("invert"),backdropOpacity:({theme:r})=>r("opacity"),backdropSaturate:({theme:r})=>r("saturate"),backdropSepia:({theme:r})=>r("sepia"),backgroundColor:({theme:r})=>r("colors"),backgroundImage:{none:"none","gradient-to-t":"linear-gradient(to top, var(--tw-gradient-stops))","gradient-to-tr":"linear-gradient(to top right, var(--tw-gradient-stops))","gradient-to-r":"linear-gradient(to right, var(--tw-gradient-stops))","gradient-to-br":"linear-gradient(to bottom right, var(--tw-gradient-stops))","gradient-to-b":"linear-gradient(to bottom, var(--tw-gradient-stops))","gradient-to-bl":"linear-gradient(to bottom left, var(--tw-gradient-stops))","gradient-to-l":"linear-gradient(to left, var(--tw-gradient-stops))","gradient-to-tl":"linear-gradient(to top left, var(--tw-gradient-stops))"},backgroundOpacity:({theme:r})=>r("opacity"),backgroundPosition:{bottom:"bottom",center:"center",left:"left","left-bottom":"left bottom","left-top":"left top",right:"right","right-bottom":"right bottom","right-top":"right top",top:"top"},backgroundSize:{auto:"auto",cover:"cover",contain:"contain"},blur:{0:"0",none:"",sm:"4px",DEFAULT:"8px",md:"12px",lg:"16px",xl:"24px","2xl":"40px","3xl":"64px"},borderColor:({theme:r})=>({...r("colors"),DEFAULT:r("colors.gray.200","currentColor")}),borderOpacity:({theme:r})=>r("opacity"),borderRadius:{none:"0px",sm:"0.125rem",DEFAULT:"0.25rem",md:"0.375rem",lg:"0.5rem",xl:"0.75rem","2xl":"1rem","3xl":"1.5rem",full:"9999px"},borderSpacing:({theme:r})=>({...r("spacing")}),borderWidth:{DEFAULT:"1px",0:"0px",2:"2px",4:"4px",8:"8px"},boxShadow:{sm:"0 1px 2px 0 rgb(0 0 0 / 0.05)",DEFAULT:"0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",md:"0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",lg:"0 10px 15px -3px rgb(0 0 0 / 0.1), 0 4px 6px -4px rgb(0 0 0 / 0.1)",xl:"0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1)","2xl":"0 25px 50px -12px rgb(0 0 0 / 0.25)",inner:"inset 0 2px 4px 0 rgb(0 0 0 / 0.05)",none:"none"},boxShadowColor:({theme:r})=>r("colors"),brightness:{0:"0",50:".5",75:".75",90:".9",95:".95",100:"1",105:"1.05",110:"1.1",125:"1.25",150:"1.5",200:"2"},caretColor:({theme:r})=>r("colors"),colors:({colors:r})=>({inherit:r.inherit,current:r.current,transparent:r.transparent,black:r.black,white:r.white,slate:r.slate,gray:r.gray,zinc:r.zinc,neutral:r.neutral,stone:r.stone,red:r.red,orange:r.orange,amber:r.amber,yellow:r.yellow,lime:r.lime,green:r.green,emerald:r.emerald,teal:r.teal,cyan:r.cyan,sky:r.sky,blue:r.blue,indigo:r.indigo,violet:r.violet,purple:r.purple,fuchsia:r.fuchsia,pink:r.pink,rose:r.rose}),columns:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12","3xs":"16rem","2xs":"18rem",xs:"20rem",sm:"24rem",md:"28rem",lg:"32rem",xl:"36rem","2xl":"42rem","3xl":"48rem","4xl":"56rem","5xl":"64rem","6xl":"72rem","7xl":"80rem"},container:{},content:{none:"none"},contrast:{0:"0",50:".5",75:".75",100:"1",125:"1.25",150:"1.5",200:"2"},cursor:{auto:"auto",default:"default",pointer:"pointer",wait:"wait",text:"text",move:"move",help:"help","not-allowed":"not-allowed",none:"none","context-menu":"context-menu",progress:"progress",cell:"cell",crosshair:"crosshair","vertical-text":"vertical-text",alias:"alias",copy:"copy","no-drop":"no-drop",grab:"grab",grabbing:"grabbing","all-scroll":"all-scroll","col-resize":"col-resize","row-resize":"row-resize","n-resize":"n-resize","e-resize":"e-resize","s-resize":"s-resize","w-resize":"w-resize","ne-resize":"ne-resize","nw-resize":"nw-resize","se-resize":"se-resize","sw-resize":"sw-resize","ew-resize":"ew-resize","ns-resize":"ns-resize","nesw-resize":"nesw-resize","nwse-resize":"nwse-resize","zoom-in":"zoom-in","zoom-out":"zoom-out"},divideColor:({theme:r})=>r("borderColor"),divideOpacity:({theme:r})=>r("borderOpacity"),divideWidth:({theme:r})=>r("borderWidth"),dropShadow:{sm:"0 1px 1px rgb(0 0 0 / 0.05)",DEFAULT:["0 1px 2px rgb(0 0 0 / 0.1)","0 1px 1px rgb(0 0 0 / 0.06)"],md:["0 4px 3px rgb(0 0 0 / 0.07)","0 2px 2px rgb(0 0 0 / 0.06)"],lg:["0 10px 8px rgb(0 0 0 / 0.04)","0 4px 3px rgb(0 0 0 / 0.1)"],xl:["0 20px 13px rgb(0 0 0 / 0.03)","0 8px 5px rgb(0 0 0 / 0.08)"],"2xl":"0 25px 25px rgb(0 0 0 / 0.15)",none:"0 0 #0000"},fill:({theme:r})=>({none:"none",...r("colors")}),flex:{1:"1 1 0%",auto:"1 1 auto",initial:"0 1 auto",none:"none"},flexBasis:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%"}),flexGrow:{0:"0",DEFAULT:"1"},flexShrink:{0:"0",DEFAULT:"1"},fontFamily:{sans:["ui-sans-serif","system-ui","sans-serif",'"Apple Color Emoji"','"Segoe UI Emoji"','"Segoe UI Symbol"','"Noto Color Emoji"'],serif:["ui-serif","Georgia","Cambria",'"Times New Roman"',"Times","serif"],mono:["ui-monospace","SFMono-Regular","Menlo","Monaco","Consolas",'"Liberation Mono"','"Courier New"',"monospace"]},fontSize:{xs:["0.75rem",{lineHeight:"1rem"}],sm:["0.875rem",{lineHeight:"1.25rem"}],base:["1rem",{lineHeight:"1.5rem"}],lg:["1.125rem",{lineHeight:"1.75rem"}],xl:["1.25rem",{lineHeight:"1.75rem"}],"2xl":["1.5rem",{lineHeight:"2rem"}],"3xl":["1.875rem",{lineHeight:"2.25rem"}],"4xl":["2.25rem",{lineHeight:"2.5rem"}],"5xl":["3rem",{lineHeight:"1"}],"6xl":["3.75rem",{lineHeight:"1"}],"7xl":["4.5rem",{lineHeight:"1"}],"8xl":["6rem",{lineHeight:"1"}],"9xl":["8rem",{lineHeight:"1"}]},fontWeight:{thin:"100",extralight:"200",light:"300",normal:"400",medium:"500",semibold:"600",bold:"700",extrabold:"800",black:"900"},gap:({theme:r})=>r("spacing"),gradientColorStops:({theme:r})=>r("colors"),gradientColorStopPositions:{"0%":"0%","5%":"5%","10%":"10%","15%":"15%","20%":"20%","25%":"25%","30%":"30%","35%":"35%","40%":"40%","45%":"45%","50%":"50%","55%":"55%","60%":"60%","65%":"65%","70%":"70%","75%":"75%","80%":"80%","85%":"85%","90%":"90%","95%":"95%","100%":"100%"},grayscale:{0:"0",DEFAULT:"100%"},gridAutoColumns:{auto:"auto",min:"min-content",max:"max-content",fr:"minmax(0, 1fr)"},gridAutoRows:{auto:"auto",min:"min-content",max:"max-content",fr:"minmax(0, 1fr)"},gridColumn:{auto:"auto","span-1":"span 1 / span 1","span-2":"span 2 / span 2","span-3":"span 3 / span 3","span-4":"span 4 / span 4","span-5":"span 5 / span 5","span-6":"span 6 / span 6","span-7":"span 7 / span 7","span-8":"span 8 / span 8","span-9":"span 9 / span 9","span-10":"span 10 / span 10","span-11":"span 11 / span 11","span-12":"span 12 / span 12","span-full":"1 / -1"},gridColumnEnd:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridColumnStart:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridRow:{auto:"auto","span-1":"span 1 / span 1","span-2":"span 2 / span 2","span-3":"span 3 / span 3","span-4":"span 4 / span 4","span-5":"span 5 / span 5","span-6":"span 6 / span 6","span-7":"span 7 / span 7","span-8":"span 8 / span 8","span-9":"span 9 / span 9","span-10":"span 10 / span 10","span-11":"span 11 / span 11","span-12":"span 12 / span 12","span-full":"1 / -1"},gridRowEnd:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridRowStart:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridTemplateColumns:{none:"none",subgrid:"subgrid",1:"repeat(1, minmax(0, 1fr))",2:"repeat(2, minmax(0, 1fr))",3:"repeat(3, minmax(0, 1fr))",4:"repeat(4, minmax(0, 1fr))",5:"repeat(5, minmax(0, 1fr))",6:"repeat(6, minmax(0, 1fr))",7:"repeat(7, minmax(0, 1fr))",8:"repeat(8, minmax(0, 1fr))",9:"repeat(9, minmax(0, 1fr))",10:"repeat(10, minmax(0, 1fr))",11:"repeat(11, minmax(0, 1fr))",12:"repeat(12, minmax(0, 1fr))"},gridTemplateRows:{none:"none",subgrid:"subgrid",1:"repeat(1, minmax(0, 1fr))",2:"repeat(2, minmax(0, 1fr))",3:"repeat(3, minmax(0, 1fr))",4:"repeat(4, minmax(0, 1fr))",5:"repeat(5, minmax(0, 1fr))",6:"repeat(6, minmax(0, 1fr))",7:"repeat(7, minmax(0, 1fr))",8:"repeat(8, minmax(0, 1fr))",9:"repeat(9, minmax(0, 1fr))",10:"repeat(10, minmax(0, 1fr))",11:"repeat(11, minmax(0, 1fr))",12:"repeat(12, minmax(0, 1fr))"},height:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%",full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),hueRotate:{0:"0deg",15:"15deg",30:"30deg",60:"60deg",90:"90deg",180:"180deg"},inset:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%",full:"100%"}),invert:{0:"0",DEFAULT:"100%"},keyframes:{spin:{to:{transform:"rotate(360deg)"}},ping:{"75%, 100%":{transform:"scale(2)",opacity:"0"}},pulse:{"50%":{opacity:".5"}},bounce:{"0%, 100%":{transform:"translateY(-25%)",animationTimingFunction:"cubic-bezier(0.8,0,1,1)"},"50%":{transform:"none",animationTimingFunction:"cubic-bezier(0,0,0.2,1)"}}},letterSpacing:{tighter:"-0.05em",tight:"-0.025em",normal:"0em",wide:"0.025em",wider:"0.05em",widest:"0.1em"},lineHeight:{none:"1",tight:"1.25",snug:"1.375",normal:"1.5",relaxed:"1.625",loose:"2",3:".75rem",4:"1rem",5:"1.25rem",6:"1.5rem",7:"1.75rem",8:"2rem",9:"2.25rem",10:"2.5rem"},listStyleType:{none:"none",disc:"disc",decimal:"decimal"},listStyleImage:{none:"none"},margin:({theme:r})=>({auto:"auto",...r("spacing")}),lineClamp:{1:"1",2:"2",3:"3",4:"4",5:"5",6:"6"},maxHeight:({theme:r})=>({...r("spacing"),none:"none",full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),maxWidth:({theme:r,breakpoints:e})=>({...r("spacing"),none:"none",xs:"20rem",sm:"24rem",md:"28rem",lg:"32rem",xl:"36rem","2xl":"42rem","3xl":"48rem","4xl":"56rem","5xl":"64rem","6xl":"72rem","7xl":"80rem",full:"100%",min:"min-content",max:"max-content",fit:"fit-content",prose:"65ch",...e(r("screens"))}),minHeight:({theme:r})=>({...r("spacing"),full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),minWidth:({theme:r})=>({...r("spacing"),full:"100%",min:"min-content",max:"max-content",fit:"fit-content"}),objectPosition:{bottom:"bottom",center:"center",left:"left","left-bottom":"left bottom","left-top":"left top",right:"right","right-bottom":"right bottom","right-top":"right top",top:"top"},opacity:{0:"0",5:"0.05",10:"0.1",15:"0.15",20:"0.2",25:"0.25",30:"0.3",35:"0.35",40:"0.4",45:"0.45",50:"0.5",55:"0.55",60:"0.6",65:"0.65",70:"0.7",75:"0.75",80:"0.8",85:"0.85",90:"0.9",95:"0.95",100:"1"},order:{first:"-9999",last:"9999",none:"0",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12"},outlineColor:({theme:r})=>r("colors"),outlineOffset:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},outlineWidth:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},padding:({theme:r})=>r("spacing"),placeholderColor:({theme:r})=>r("colors"),placeholderOpacity:({theme:r})=>r("opacity"),ringColor:({theme:r})=>({DEFAULT:r("colors.blue.500","#3b82f6"),...r("colors")}),ringOffsetColor:({theme:r})=>r("colors"),ringOffsetWidth:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},ringOpacity:({theme:r})=>({DEFAULT:"0.5",...r("opacity")}),ringWidth:{DEFAULT:"3px",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},rotate:{0:"0deg",1:"1deg",2:"2deg",3:"3deg",6:"6deg",12:"12deg",45:"45deg",90:"90deg",180:"180deg"},saturate:{0:"0",50:".5",100:"1",150:"1.5",200:"2"},scale:{0:"0",50:".5",75:".75",90:".9",95:".95",100:"1",105:"1.05",110:"1.1",125:"1.25",150:"1.5"},screens:{sm:"640px",md:"768px",lg:"1024px",xl:"1280px","2xl":"1536px"},scrollMargin:({theme:r})=>({...r("spacing")}),scrollPadding:({theme:r})=>r("spacing"),sepia:{0:"0",DEFAULT:"100%"},skew:{0:"0deg",1:"1deg",2:"2deg",3:"3deg",6:"6deg",12:"12deg"},space:({theme:r})=>({...r("spacing")}),spacing:{px:"1px",0:"0px",.5:"0.125rem",1:"0.25rem",1.5:"0.375rem",2:"0.5rem",2.5:"0.625rem",3:"0.75rem",3.5:"0.875rem",4:"1rem",5:"1.25rem",6:"1.5rem",7:"1.75rem",8:"2rem",9:"2.25rem",10:"2.5rem",11:"2.75rem",12:"3rem",14:"3.5rem",16:"4rem",20:"5rem",24:"6rem",28:"7rem",32:"8rem",36:"9rem",40:"10rem",44:"11rem",48:"12rem",52:"13rem",56:"14rem",60:"15rem",64:"16rem",72:"18rem",80:"20rem",96:"24rem"},stroke:({theme:r})=>({none:"none",...r("colors")}),strokeWidth:{0:"0",1:"1",2:"2"},supports:{},data:{},textColor:({theme:r})=>r("colors"),textDecorationColor:({theme:r})=>r("colors"),textDecorationThickness:{auto:"auto","from-font":"from-font",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},textIndent:({theme:r})=>({...r("spacing")}),textOpacity:({theme:r})=>r("opacity"),textUnderlineOffset:{auto:"auto",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},transformOrigin:{center:"center",top:"top","top-right":"top right",right:"right","bottom-right":"bottom right",bottom:"bottom","bottom-left":"bottom left",left:"left","top-left":"top left"},transitionDelay:{0:"0s",75:"75ms",100:"100ms",150:"150ms",200:"200ms",300:"300ms",500:"500ms",700:"700ms",1e3:"1000ms"},transitionDuration:{DEFAULT:"150ms",0:"0s",75:"75ms",100:"100ms",150:"150ms",200:"200ms",300:"300ms",500:"500ms",700:"700ms",1e3:"1000ms"},transitionProperty:{none:"none",all:"all",DEFAULT:"color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter",colors:"color, background-color, border-color, text-decoration-color, fill, stroke",opacity:"opacity",shadow:"box-shadow",transform:"transform"},transitionTimingFunction:{DEFAULT:"cubic-bezier(0.4, 0, 0.2, 1)",linear:"linear",in:"cubic-bezier(0.4, 0, 1, 1)",out:"cubic-bezier(0, 0, 0.2, 1)","in-out":"cubic-bezier(0.4, 0, 0.2, 1)"},translate:({theme:r})=>({...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%",full:"100%"}),size:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%",min:"min-content",max:"max-content",fit:"fit-content"}),width:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%",screen:"100vw",svw:"100svw",lvw:"100lvw",dvw:"100dvw",min:"min-content",max:"max-content",fit:"fit-content"}),willChange:{auto:"auto",scroll:"scroll-position",contents:"contents",transform:"transform"},zIndex:{auto:"auto",0:"0",10:"10",20:"20",30:"30",40:"40",50:"50"}},plugins:[]}});function tn(r){let e=(r?.presets??[tc.default]).slice().reverse().flatMap(n=>tn(n instanceof Function?n():n)),t={respectDefaultRingColorOpacity:{theme:{ringColor:({theme:n})=>({DEFAULT:"#3b82f67f",...n("colors")})}},disableColorOpacityUtilitiesByDefault:{corePlugins:{backgroundOpacity:!1,borderOpacity:!1,divideOpacity:!1,placeholderOpacity:!1,ringOpacity:!1,textOpacity:!1}}},i=Object.keys(t).filter(n=>we(r,n)).map(n=>t[n]);return[r,...i,...e]}var tc,rc=R(()=>{u();tc=pe(en());ct()});var ic={};Ge(ic,{default:()=>zr});function zr(...r){let[,...e]=tn(r[0]);return ra([...r,...e])}var ia=R(()=>{u();Zf();rc()});var Ur={};Ge(Ur,{default:()=>me});var me,et=R(()=>{u();me={resolve:r=>r,extname:r=>"."+r.split(".").pop()}});function rn(r){return typeof r=="object"&&r!==null}function gx(r){return Object.keys(r).length===0}function nc(r){return typeof r=="string"||r instanceof String}function na(r){return rn(r)&&r.config===void 0&&!gx(r)?null:rn(r)&&r.config!==void 0&&nc(r.config)?me.resolve(r.config):rn(r)&&r.config!==void 0&&rn(r.config)?null:nc(r)?me.resolve(r):yx()}function yx(){for(let r of mx)try{let e=me.resolve(r);return be.accessSync(e),e}catch(e){}return null}var mx,sc=R(()=>{u();ft();et();mx=["./tailwind.config.js","./tailwind.config.cjs","./tailwind.config.mjs","./tailwind.config.ts","./tailwind.config.cts","./tailwind.config.mts"]});var ac={};Ge(ac,{default:()=>sa});var sa,aa=R(()=>{u();sa={parse:r=>({href:r})}});var oa=x(()=>{u()});var nn=x((b3,uc)=>{u();"use strict";var oc=(Wi(),kf),lc=oa(),Zt=class extends Error{constructor(e,t,i,n,a,s){super(e);this.name="CssSyntaxError",this.reason=e,a&&(this.file=a),n&&(this.source=n),s&&(this.plugin=s),typeof t!="undefined"&&typeof i!="undefined"&&(typeof t=="number"?(this.line=t,this.column=i):(this.line=t.line,this.column=t.column,this.endLine=i.line,this.endColumn=i.column)),this.setMessage(),Error.captureStackTrace&&Error.captureStackTrace(this,Zt)}setMessage(){this.message=this.plugin?this.plugin+": ":"",this.message+=this.file?this.file:"<css input>",typeof this.line!="undefined"&&(this.message+=":"+this.line+":"+this.column),this.message+=": "+this.reason}showSourceCode(e){if(!this.source)return"";let t=this.source;e==null&&(e=oc.isColorSupported),lc&&e&&(t=lc(t));let i=t.split(/\r?\n/),n=Math.max(this.line-3,0),a=Math.min(this.line+2,i.length),s=String(a).length,o,l;if(e){let{bold:c,red:f,gray:d}=oc.createColors(!0);o=p=>c(f(p)),l=p=>d(p)}else o=l=c=>c;return i.slice(n,a).map((c,f)=>{let d=n+1+f,p=" "+(" "+d).slice(-s)+" | ";if(d===this.line){let h=l(p.replace(/\d/g," "))+c.slice(0,this.column-1).replace(/[^\t]/g," ");return o(">")+l(p)+c+`
- `+h+o("^")}return" "+l(p)+c}).join(`
-`)}toString(){let e=this.showSourceCode();return e&&(e=`
-
-`+e+`
-`),this.name+": "+this.message+e}};uc.exports=Zt;Zt.default=Zt});var sn=x((w3,la)=>{u();"use strict";la.exports.isClean=Symbol("isClean");la.exports.my=Symbol("my")});var ua=x((v3,cc)=>{u();"use strict";var fc={colon:": ",indent:"    ",beforeDecl:`
-`,beforeRule:`
-`,beforeOpen:" ",beforeClose:`
-`,beforeComment:`
-`,after:`
-`,emptyBody:"",commentLeft:" ",commentRight:" ",semicolon:!1};function bx(r){return r[0].toUpperCase()+r.slice(1)}var an=class{constructor(e){this.builder=e}stringify(e,t){if(!this[e.type])throw new Error("Unknown AST node type "+e.type+". Maybe you need to change PostCSS stringifier.");this[e.type](e,t)}document(e){this.body(e)}root(e){this.body(e),e.raws.after&&this.builder(e.raws.after)}comment(e){let t=this.raw(e,"left","commentLeft"),i=this.raw(e,"right","commentRight");this.builder("/*"+t+e.text+i+"*/",e)}decl(e,t){let i=this.raw(e,"between","colon"),n=e.prop+i+this.rawValue(e,"value");e.important&&(n+=e.raws.important||" !important"),t&&(n+=";"),this.builder(n,e)}rule(e){this.block(e,this.rawValue(e,"selector")),e.raws.ownSemicolon&&this.builder(e.raws.ownSemicolon,e,"end")}atrule(e,t){let i="@"+e.name,n=e.params?this.rawValue(e,"params"):"";if(typeof e.raws.afterName!="undefined"?i+=e.raws.afterName:n&&(i+=" "),e.nodes)this.block(e,i+n);else{let a=(e.raws.between||"")+(t?";":"");this.builder(i+n+a,e)}}body(e){let t=e.nodes.length-1;for(;t>0&&e.nodes[t].type==="comment";)t-=1;let i=this.raw(e,"semicolon");for(let n=0;n<e.nodes.length;n++){let a=e.nodes[n],s=this.raw(a,"before");s&&this.builder(s),this.stringify(a,t!==n||i)}}block(e,t){let i=this.raw(e,"between","beforeOpen");this.builder(t+i+"{",e,"start");let n;e.nodes&&e.nodes.length?(this.body(e),n=this.raw(e,"after")):n=this.raw(e,"after","emptyBody"),n&&this.builder(n),this.builder("}",e,"end")}raw(e,t,i){let n;if(i||(i=t),t&&(n=e.raws[t],typeof n!="undefined"))return n;let a=e.parent;if(i==="before"&&(!a||a.type==="root"&&a.first===e||a&&a.type==="document"))return"";if(!a)return fc[i];let s=e.root();if(s.rawCache||(s.rawCache={}),typeof s.rawCache[i]!="undefined")return s.rawCache[i];if(i==="before"||i==="after")return this.beforeAfter(e,i);{let o="raw"+bx(i);this[o]?n=this[o](s,e):s.walk(l=>{if(n=l.raws[t],typeof n!="undefined")return!1})}return typeof n=="undefined"&&(n=fc[i]),s.rawCache[i]=n,n}rawSemicolon(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length&&i.last.type==="decl"&&(t=i.raws.semicolon,typeof t!="undefined"))return!1}),t}rawEmptyBody(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length===0&&(t=i.raws.after,typeof t!="undefined"))return!1}),t}rawIndent(e){if(e.raws.indent)return e.raws.indent;let t;return e.walk(i=>{let n=i.parent;if(n&&n!==e&&n.parent&&n.parent===e&&typeof i.raws.before!="undefined"){let a=i.raws.before.split(`
-`);return t=a[a.length-1],t=t.replace(/\S/g,""),!1}}),t}rawBeforeComment(e,t){let i;return e.walkComments(n=>{if(typeof n.raws.before!="undefined")return i=n.raws.before,i.includes(`
-`)&&(i=i.replace(/[^\n]+$/,"")),!1}),typeof i=="undefined"?i=this.raw(t,null,"beforeDecl"):i&&(i=i.replace(/\S/g,"")),i}rawBeforeDecl(e,t){let i;return e.walkDecls(n=>{if(typeof n.raws.before!="undefined")return i=n.raws.before,i.includes(`
-`)&&(i=i.replace(/[^\n]+$/,"")),!1}),typeof i=="undefined"?i=this.raw(t,null,"beforeRule"):i&&(i=i.replace(/\S/g,"")),i}rawBeforeRule(e){let t;return e.walk(i=>{if(i.nodes&&(i.parent!==e||e.first!==i)&&typeof i.raws.before!="undefined")return t=i.raws.before,t.includes(`
-`)&&(t=t.replace(/[^\n]+$/,"")),!1}),t&&(t=t.replace(/\S/g,"")),t}rawBeforeClose(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length>0&&typeof i.raws.after!="undefined")return t=i.raws.after,t.includes(`
-`)&&(t=t.replace(/[^\n]+$/,"")),!1}),t&&(t=t.replace(/\S/g,"")),t}rawBeforeOpen(e){let t;return e.walk(i=>{if(i.type!=="decl"&&(t=i.raws.between,typeof t!="undefined"))return!1}),t}rawColon(e){let t;return e.walkDecls(i=>{if(typeof i.raws.between!="undefined")return t=i.raws.between.replace(/[^\s:]/g,""),!1}),t}beforeAfter(e,t){let i;e.type==="decl"?i=this.raw(e,null,"beforeDecl"):e.type==="comment"?i=this.raw(e,null,"beforeComment"):t==="before"?i=this.raw(e,null,"beforeRule"):i=this.raw(e,null,"beforeClose");let n=e.parent,a=0;for(;n&&n.type!=="root";)a+=1,n=n.parent;if(i.includes(`
-`)){let s=this.raw(e,null,"indent");if(s.length)for(let o=0;o<a;o++)i+=s}return i}rawValue(e,t){let i=e[t],n=e.raws[t];return n&&n.value===i?n.raw:i}};cc.exports=an;an.default=an});var Vr=x((x3,pc)=>{u();"use strict";var wx=ua();function fa(r,e){new wx(e).stringify(r)}pc.exports=fa;fa.default=fa});var Hr=x((k3,dc)=>{u();"use strict";var{isClean:on,my:vx}=sn(),xx=nn(),kx=ua(),Sx=Vr();function ca(r,e){let t=new r.constructor;for(let i in r){if(!Object.prototype.hasOwnProperty.call(r,i)||i==="proxyCache")continue;let n=r[i],a=typeof n;i==="parent"&&a==="object"?e&&(t[i]=e):i==="source"?t[i]=n:Array.isArray(n)?t[i]=n.map(s=>ca(s,t)):(a==="object"&&n!==null&&(n=ca(n)),t[i]=n)}return t}var ln=class{constructor(e={}){this.raws={},this[on]=!1,this[vx]=!0;for(let t in e)if(t==="nodes"){this.nodes=[];for(let i of e[t])typeof i.clone=="function"?this.append(i.clone()):this.append(i)}else this[t]=e[t]}error(e,t={}){if(this.source){let{start:i,end:n}=this.rangeBy(t);return this.source.input.error(e,{line:i.line,column:i.column},{line:n.line,column:n.column},t)}return new xx(e)}warn(e,t,i){let n={node:this};for(let a in i)n[a]=i[a];return e.warn(t,n)}remove(){return this.parent&&this.parent.removeChild(this),this.parent=void 0,this}toString(e=Sx){e.stringify&&(e=e.stringify);let t="";return e(this,i=>{t+=i}),t}assign(e={}){for(let t in e)this[t]=e[t];return this}clone(e={}){let t=ca(this);for(let i in e)t[i]=e[i];return t}cloneBefore(e={}){let t=this.clone(e);return this.parent.insertBefore(this,t),t}cloneAfter(e={}){let t=this.clone(e);return this.parent.insertAfter(this,t),t}replaceWith(...e){if(this.parent){let t=this,i=!1;for(let n of e)n===this?i=!0:i?(this.parent.insertAfter(t,n),t=n):this.parent.insertBefore(t,n);i||this.remove()}return this}next(){if(!this.parent)return;let e=this.parent.index(this);return this.parent.nodes[e+1]}prev(){if(!this.parent)return;let e=this.parent.index(this);return this.parent.nodes[e-1]}before(e){return this.parent.insertBefore(this,e),this}after(e){return this.parent.insertAfter(this,e),this}root(){let e=this;for(;e.parent&&e.parent.type!=="document";)e=e.parent;return e}raw(e,t){return new kx().raw(this,e,t)}cleanRaws(e){delete this.raws.before,delete this.raws.after,e||delete this.raws.between}toJSON(e,t){let i={},n=t==null;t=t||new Map;let a=0;for(let s in this){if(!Object.prototype.hasOwnProperty.call(this,s)||s==="parent"||s==="proxyCache")continue;let o=this[s];if(Array.isArray(o))i[s]=o.map(l=>typeof l=="object"&&l.toJSON?l.toJSON(null,t):l);else if(typeof o=="object"&&o.toJSON)i[s]=o.toJSON(null,t);else if(s==="source"){let l=t.get(o.input);l==null&&(l=a,t.set(o.input,a),a++),i[s]={inputId:l,start:o.start,end:o.end}}else i[s]=o}return n&&(i.inputs=[...t.keys()].map(s=>s.toJSON())),i}positionInside(e){let t=this.toString(),i=this.source.start.column,n=this.source.start.line;for(let a=0;a<e;a++)t[a]===`
-`?(i=1,n+=1):i+=1;return{line:n,column:i}}positionBy(e){let t=this.source.start;if(e.index)t=this.positionInside(e.index);else if(e.word){let i=this.toString().indexOf(e.word);i!==-1&&(t=this.positionInside(i))}return t}rangeBy(e){let t={line:this.source.start.line,column:this.source.start.column},i=this.source.end?{line:this.source.end.line,column:this.source.end.column+1}:{line:t.line,column:t.column+1};if(e.word){let n=this.toString().indexOf(e.word);n!==-1&&(t=this.positionInside(n),i=this.positionInside(n+e.word.length))}else e.start?t={line:e.start.line,column:e.start.column}:e.index&&(t=this.positionInside(e.index)),e.end?i={line:e.end.line,column:e.end.column}:e.endIndex?i=this.positionInside(e.endIndex):e.index&&(i=this.positionInside(e.index+1));return(i.line<t.line||i.line===t.line&&i.column<=t.column)&&(i={line:t.line,column:t.column+1}),{start:t,end:i}}getProxyProcessor(){return{set(e,t,i){return e[t]===i||(e[t]=i,(t==="prop"||t==="value"||t==="name"||t==="params"||t==="important"||t==="text")&&e.markDirty()),!0},get(e,t){return t==="proxyOf"?e:t==="root"?()=>e.root().toProxy():e[t]}}}toProxy(){return this.proxyCache||(this.proxyCache=new Proxy(this,this.getProxyProcessor())),this.proxyCache}addToError(e){if(e.postcssNode=this,e.stack&&this.source&&/\n\s{4}at /.test(e.stack)){let t=this.source;e.stack=e.stack.replace(/\n\s{4}at /,`$&${t.input.from}:${t.start.line}:${t.start.column}$&`)}return e}markDirty(){if(this[on]){this[on]=!1;let e=this;for(;e=e.parent;)e[on]=!1}}get proxyOf(){return this}};dc.exports=ln;ln.default=ln});var Wr=x((S3,hc)=>{u();"use strict";var Ax=Hr(),un=class extends Ax{constructor(e){e&&typeof e.value!="undefined"&&typeof e.value!="string"&&(e={...e,value:String(e.value)});super(e);this.type="decl"}get variable(){return this.prop.startsWith("--")||this.prop[0]==="$"}};hc.exports=un;un.default=un});var pa=x((A3,mc)=>{u();mc.exports=function(r,e){return{generate:()=>{let t="";return r(e,i=>{t+=i}),[t]}}}});var Gr=x((C3,gc)=>{u();"use strict";var Cx=Hr(),fn=class extends Cx{constructor(e){super(e);this.type="comment"}};gc.exports=fn;fn.default=fn});var Et=x((_3,Cc)=>{u();"use strict";var{isClean:yc,my:bc}=sn(),wc=Wr(),vc=Gr(),_x=Hr(),xc,da,ha,kc;function Sc(r){return r.map(e=>(e.nodes&&(e.nodes=Sc(e.nodes)),delete e.source,e))}function Ac(r){if(r[yc]=!1,r.proxyOf.nodes)for(let e of r.proxyOf.nodes)Ac(e)}var Fe=class extends _x{push(e){return e.parent=this,this.proxyOf.nodes.push(e),this}each(e){if(!this.proxyOf.nodes)return;let t=this.getIterator(),i,n;for(;this.indexes[t]<this.proxyOf.nodes.length&&(i=this.indexes[t],n=e(this.proxyOf.nodes[i],i),n!==!1);)this.indexes[t]+=1;return delete this.indexes[t],n}walk(e){return this.each((t,i)=>{let n;try{n=e(t,i)}catch(a){throw t.addToError(a)}return n!==!1&&t.walk&&(n=t.walk(e)),n})}walkDecls(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="decl"&&e.test(i.prop))return t(i,n)}):this.walk((i,n)=>{if(i.type==="decl"&&i.prop===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="decl")return t(i,n)}))}walkRules(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="rule"&&e.test(i.selector))return t(i,n)}):this.walk((i,n)=>{if(i.type==="rule"&&i.selector===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="rule")return t(i,n)}))}walkAtRules(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="atrule"&&e.test(i.name))return t(i,n)}):this.walk((i,n)=>{if(i.type==="atrule"&&i.name===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="atrule")return t(i,n)}))}walkComments(e){return this.walk((t,i)=>{if(t.type==="comment")return e(t,i)})}append(...e){for(let t of e){let i=this.normalize(t,this.last);for(let n of i)this.proxyOf.nodes.push(n)}return this.markDirty(),this}prepend(...e){e=e.reverse();for(let t of e){let i=this.normalize(t,this.first,"prepend").reverse();for(let n of i)this.proxyOf.nodes.unshift(n);for(let n in this.indexes)this.indexes[n]=this.indexes[n]+i.length}return this.markDirty(),this}cleanRaws(e){if(super.cleanRaws(e),this.nodes)for(let t of this.nodes)t.cleanRaws(e)}insertBefore(e,t){let i=this.index(e),n=i===0?"prepend":!1,a=this.normalize(t,this.proxyOf.nodes[i],n).reverse();i=this.index(e);for(let o of a)this.proxyOf.nodes.splice(i,0,o);let s;for(let o in this.indexes)s=this.indexes[o],i<=s&&(this.indexes[o]=s+a.length);return this.markDirty(),this}insertAfter(e,t){let i=this.index(e),n=this.normalize(t,this.proxyOf.nodes[i]).reverse();i=this.index(e);for(let s of n)this.proxyOf.nodes.splice(i+1,0,s);let a;for(let s in this.indexes)a=this.indexes[s],i<a&&(this.indexes[s]=a+n.length);return this.markDirty(),this}removeChild(e){e=this.index(e),this.proxyOf.nodes[e].parent=void 0,this.proxyOf.nodes.splice(e,1);let t;for(let i in this.indexes)t=this.indexes[i],t>=e&&(this.indexes[i]=t-1);return this.markDirty(),this}removeAll(){for(let e of this.proxyOf.nodes)e.parent=void 0;return this.proxyOf.nodes=[],this.markDirty(),this}replaceValues(e,t,i){return i||(i=t,t={}),this.walkDecls(n=>{t.props&&!t.props.includes(n.prop)||t.fast&&!n.value.includes(t.fast)||(n.value=n.value.replace(e,i))}),this.markDirty(),this}every(e){return this.nodes.every(e)}some(e){return this.nodes.some(e)}index(e){return typeof e=="number"?e:(e.proxyOf&&(e=e.proxyOf),this.proxyOf.nodes.indexOf(e))}get first(){if(!!this.proxyOf.nodes)return this.proxyOf.nodes[0]}get last(){if(!!this.proxyOf.nodes)return this.proxyOf.nodes[this.proxyOf.nodes.length-1]}normalize(e,t){if(typeof e=="string")e=Sc(xc(e).nodes);else if(Array.isArray(e)){e=e.slice(0);for(let n of e)n.parent&&n.parent.removeChild(n,"ignore")}else if(e.type==="root"&&this.type!=="document"){e=e.nodes.slice(0);for(let n of e)n.parent&&n.parent.removeChild(n,"ignore")}else if(e.type)e=[e];else if(e.prop){if(typeof e.value=="undefined")throw new Error("Value field is missed in node creation");typeof e.value!="string"&&(e.value=String(e.value)),e=[new wc(e)]}else if(e.selector)e=[new da(e)];else if(e.name)e=[new ha(e)];else if(e.text)e=[new vc(e)];else throw new Error("Unknown node type in node creation");return e.map(n=>(n[bc]||Fe.rebuild(n),n=n.proxyOf,n.parent&&n.parent.removeChild(n),n[yc]&&Ac(n),typeof n.raws.before=="undefined"&&t&&typeof t.raws.before!="undefined"&&(n.raws.before=t.raws.before.replace(/\S/g,"")),n.parent=this.proxyOf,n))}getProxyProcessor(){return{set(e,t,i){return e[t]===i||(e[t]=i,(t==="name"||t==="params"||t==="selector")&&e.markDirty()),!0},get(e,t){return t==="proxyOf"?e:e[t]?t==="each"||typeof t=="string"&&t.startsWith("walk")?(...i)=>e[t](...i.map(n=>typeof n=="function"?(a,s)=>n(a.toProxy(),s):n)):t==="every"||t==="some"?i=>e[t]((n,...a)=>i(n.toProxy(),...a)):t==="root"?()=>e.root().toProxy():t==="nodes"?e.nodes.map(i=>i.toProxy()):t==="first"||t==="last"?e[t].toProxy():e[t]:e[t]}}}getIterator(){this.lastEach||(this.lastEach=0),this.indexes||(this.indexes={}),this.lastEach+=1;let e=this.lastEach;return this.indexes[e]=0,e}};Fe.registerParse=r=>{xc=r};Fe.registerRule=r=>{da=r};Fe.registerAtRule=r=>{ha=r};Fe.registerRoot=r=>{kc=r};Cc.exports=Fe;Fe.default=Fe;Fe.rebuild=r=>{r.type==="atrule"?Object.setPrototypeOf(r,ha.prototype):r.type==="rule"?Object.setPrototypeOf(r,da.prototype):r.type==="decl"?Object.setPrototypeOf(r,wc.prototype):r.type==="comment"?Object.setPrototypeOf(r,vc.prototype):r.type==="root"&&Object.setPrototypeOf(r,kc.prototype),r[bc]=!0,r.nodes&&r.nodes.forEach(e=>{Fe.rebuild(e)})}});var cn=x((E3,Oc)=>{u();"use strict";var Ex=Et(),_c,Ec,er=class extends Ex{constructor(e){super({type:"document",...e});this.nodes||(this.nodes=[])}toResult(e={}){return new _c(new Ec,this,e).stringify()}};er.registerLazyResult=r=>{_c=r};er.registerProcessor=r=>{Ec=r};Oc.exports=er;er.default=er});var ma=x((O3,Rc)=>{u();"use strict";var Tc={};Rc.exports=function(e){Tc[e]||(Tc[e]=!0,typeof console!="undefined"&&console.warn&&console.warn(e))}});var ga=x((T3,Pc)=>{u();"use strict";var pn=class{constructor(e,t={}){if(this.type="warning",this.text=e,t.node&&t.node.source){let i=t.node.rangeBy(t);this.line=i.start.line,this.column=i.start.column,this.endLine=i.end.line,this.endColumn=i.end.column}for(let i in t)this[i]=t[i]}toString(){return this.node?this.node.error(this.text,{plugin:this.plugin,index:this.index,word:this.word}).message:this.plugin?this.plugin+": "+this.text:this.text}};Pc.exports=pn;pn.default=pn});var hn=x((R3,Ic)=>{u();"use strict";var Ox=ga(),dn=class{constructor(e,t,i){this.processor=e,this.messages=[],this.root=t,this.opts=i,this.css=void 0,this.map=void 0}toString(){return this.css}warn(e,t={}){t.plugin||this.lastPlugin&&this.lastPlugin.postcssPlugin&&(t.plugin=this.lastPlugin.postcssPlugin);let i=new Ox(e,t);return this.messages.push(i),i}warnings(){return this.messages.filter(e=>e.type==="warning")}get content(){return this.css}};Ic.exports=dn;dn.default=dn});var Mc=x((P3,Lc)=>{u();"use strict";var ya="'".charCodeAt(0),Dc='"'.charCodeAt(0),mn="\\".charCodeAt(0),qc="/".charCodeAt(0),gn=`
-`.charCodeAt(0),Qr=" ".charCodeAt(0),yn="\f".charCodeAt(0),bn="	".charCodeAt(0),wn="\r".charCodeAt(0),Tx="[".charCodeAt(0),Rx="]".charCodeAt(0),Px="(".charCodeAt(0),Ix=")".charCodeAt(0),Dx="{".charCodeAt(0),qx="}".charCodeAt(0),$x=";".charCodeAt(0),Lx="*".charCodeAt(0),Mx=":".charCodeAt(0),Nx="@".charCodeAt(0),vn=/[\t\n\f\r "#'()/;[\\\]{}]/g,xn=/[\t\n\f\r !"#'():;@[\\\]{}]|\/(?=\*)/g,Bx=/.[\n"'(/\\]/,$c=/[\da-f]/i;Lc.exports=function(e,t={}){let i=e.css.valueOf(),n=t.ignoreErrors,a,s,o,l,c,f,d,p,h,b,v=i.length,y=0,w=[],k=[];function S(){return y}function E(T){throw e.error("Unclosed "+T,y)}function O(){return k.length===0&&y>=v}function B(T){if(k.length)return k.pop();if(y>=v)return;let F=T?T.ignoreUnclosed:!1;switch(a=i.charCodeAt(y),a){case gn:case Qr:case bn:case wn:case yn:{s=y;do s+=1,a=i.charCodeAt(s);while(a===Qr||a===gn||a===bn||a===wn||a===yn);b=["space",i.slice(y,s)],y=s-1;break}case Tx:case Rx:case Dx:case qx:case Mx:case $x:case Ix:{let Y=String.fromCharCode(a);b=[Y,Y,y];break}case Px:{if(p=w.length?w.pop()[1]:"",h=i.charCodeAt(y+1),p==="url"&&h!==ya&&h!==Dc&&h!==Qr&&h!==gn&&h!==bn&&h!==yn&&h!==wn){s=y;do{if(f=!1,s=i.indexOf(")",s+1),s===-1)if(n||F){s=y;break}else E("bracket");for(d=s;i.charCodeAt(d-1)===mn;)d-=1,f=!f}while(f);b=["brackets",i.slice(y,s+1),y,s],y=s}else s=i.indexOf(")",y+1),l=i.slice(y,s+1),s===-1||Bx.test(l)?b=["(","(",y]:(b=["brackets",l,y,s],y=s);break}case ya:case Dc:{o=a===ya?"'":'"',s=y;do{if(f=!1,s=i.indexOf(o,s+1),s===-1)if(n||F){s=y+1;break}else E("string");for(d=s;i.charCodeAt(d-1)===mn;)d-=1,f=!f}while(f);b=["string",i.slice(y,s+1),y,s],y=s;break}case Nx:{vn.lastIndex=y+1,vn.test(i),vn.lastIndex===0?s=i.length-1:s=vn.lastIndex-2,b=["at-word",i.slice(y,s+1),y,s],y=s;break}case mn:{for(s=y,c=!0;i.charCodeAt(s+1)===mn;)s+=1,c=!c;if(a=i.charCodeAt(s+1),c&&a!==qc&&a!==Qr&&a!==gn&&a!==bn&&a!==wn&&a!==yn&&(s+=1,$c.test(i.charAt(s)))){for(;$c.test(i.charAt(s+1));)s+=1;i.charCodeAt(s+1)===Qr&&(s+=1)}b=["word",i.slice(y,s+1),y,s],y=s;break}default:{a===qc&&i.charCodeAt(y+1)===Lx?(s=i.indexOf("*/",y+2)+1,s===0&&(n||F?s=i.length:E("comment")),b=["comment",i.slice(y,s+1),y,s],y=s):(xn.lastIndex=y+1,xn.test(i),xn.lastIndex===0?s=i.length-1:s=xn.lastIndex-2,b=["word",i.slice(y,s+1),y,s],w.push(b),y=s);break}}return y++,b}function N(T){k.push(T)}return{back:N,nextToken:B,endOfFile:O,position:S}}});var kn=x((I3,Bc)=>{u();"use strict";var Nc=Et(),Yr=class extends Nc{constructor(e){super(e);this.type="atrule"}append(...e){return this.proxyOf.nodes||(this.nodes=[]),super.append(...e)}prepend(...e){return this.proxyOf.nodes||(this.nodes=[]),super.prepend(...e)}};Bc.exports=Yr;Yr.default=Yr;Nc.registerAtRule(Yr)});var tr=x((D3,Uc)=>{u();"use strict";var Fc=Et(),jc,zc,Ut=class extends Fc{constructor(e){super(e);this.type="root",this.nodes||(this.nodes=[])}removeChild(e,t){let i=this.index(e);return!t&&i===0&&this.nodes.length>1&&(this.nodes[1].raws.before=this.nodes[i].raws.before),super.removeChild(e)}normalize(e,t,i){let n=super.normalize(e);if(t){if(i==="prepend")this.nodes.length>1?t.raws.before=this.nodes[1].raws.before:delete t.raws.before;else if(this.first!==t)for(let a of n)a.raws.before=t.raws.before}return n}toResult(e={}){return new jc(new zc,this,e).stringify()}};Ut.registerLazyResult=r=>{jc=r};Ut.registerProcessor=r=>{zc=r};Uc.exports=Ut;Ut.default=Ut;Fc.registerRoot(Ut)});var ba=x((q3,Vc)=>{u();"use strict";var Kr={split(r,e,t){let i=[],n="",a=!1,s=0,o=!1,l="",c=!1;for(let f of r)c?c=!1:f==="\\"?c=!0:o?f===l&&(o=!1):f==='"'||f==="'"?(o=!0,l=f):f==="("?s+=1:f===")"?s>0&&(s-=1):s===0&&e.includes(f)&&(a=!0),a?(n!==""&&i.push(n.trim()),n="",a=!1):n+=f;return(t||n!=="")&&i.push(n.trim()),i},space(r){let e=[" ",`
-`,"	"];return Kr.split(r,e)},comma(r){return Kr.split(r,[","],!0)}};Vc.exports=Kr;Kr.default=Kr});var Sn=x(($3,Wc)=>{u();"use strict";var Hc=Et(),Fx=ba(),Xr=class extends Hc{constructor(e){super(e);this.type="rule",this.nodes||(this.nodes=[])}get selectors(){return Fx.comma(this.selector)}set selectors(e){let t=this.selector?this.selector.match(/,\s*/):null,i=t?t[0]:","+this.raw("between","beforeOpen");this.selector=e.join(i)}};Wc.exports=Xr;Xr.default=Xr;Hc.registerRule(Xr)});var Xc=x((L3,Kc)=>{u();"use strict";var jx=Wr(),zx=Mc(),Ux=Gr(),Vx=kn(),Hx=tr(),Gc=Sn(),Qc={empty:!0,space:!0};function Wx(r){for(let e=r.length-1;e>=0;e--){let t=r[e],i=t[3]||t[2];if(i)return i}}var Yc=class{constructor(e){this.input=e,this.root=new Hx,this.current=this.root,this.spaces="",this.semicolon=!1,this.customProperty=!1,this.createTokenizer(),this.root.source={input:e,start:{offset:0,line:1,column:1}}}createTokenizer(){this.tokenizer=zx(this.input)}parse(){let e;for(;!this.tokenizer.endOfFile();)switch(e=this.tokenizer.nextToken(),e[0]){case"space":this.spaces+=e[1];break;case";":this.freeSemicolon(e);break;case"}":this.end(e);break;case"comment":this.comment(e);break;case"at-word":this.atrule(e);break;case"{":this.emptyRule(e);break;default:this.other(e);break}this.endFile()}comment(e){let t=new Ux;this.init(t,e[2]),t.source.end=this.getPosition(e[3]||e[2]);let i=e[1].slice(2,-2);if(/^\s*$/.test(i))t.text="",t.raws.left=i,t.raws.right="";else{let n=i.match(/^(\s*)([^]*\S)(\s*)$/);t.text=n[2],t.raws.left=n[1],t.raws.right=n[3]}}emptyRule(e){let t=new Gc;this.init(t,e[2]),t.selector="",t.raws.between="",this.current=t}other(e){let t=!1,i=null,n=!1,a=null,s=[],o=e[1].startsWith("--"),l=[],c=e;for(;c;){if(i=c[0],l.push(c),i==="("||i==="[")a||(a=c),s.push(i==="("?")":"]");else if(o&&n&&i==="{")a||(a=c),s.push("}");else if(s.length===0)if(i===";")if(n){this.decl(l,o);return}else break;else if(i==="{"){this.rule(l);return}else if(i==="}"){this.tokenizer.back(l.pop()),t=!0;break}else i===":"&&(n=!0);else i===s[s.length-1]&&(s.pop(),s.length===0&&(a=null));c=this.tokenizer.nextToken()}if(this.tokenizer.endOfFile()&&(t=!0),s.length>0&&this.unclosedBracket(a),t&&n){if(!o)for(;l.length&&(c=l[l.length-1][0],!(c!=="space"&&c!=="comment"));)this.tokenizer.back(l.pop());this.decl(l,o)}else this.unknownWord(l)}rule(e){e.pop();let t=new Gc;this.init(t,e[0][2]),t.raws.between=this.spacesAndCommentsFromEnd(e),this.raw(t,"selector",e),this.current=t}decl(e,t){let i=new jx;this.init(i,e[0][2]);let n=e[e.length-1];for(n[0]===";"&&(this.semicolon=!0,e.pop()),i.source.end=this.getPosition(n[3]||n[2]||Wx(e));e[0][0]!=="word";)e.length===1&&this.unknownWord(e),i.raws.before+=e.shift()[1];for(i.source.start=this.getPosition(e[0][2]),i.prop="";e.length;){let c=e[0][0];if(c===":"||c==="space"||c==="comment")break;i.prop+=e.shift()[1]}i.raws.between="";let a;for(;e.length;)if(a=e.shift(),a[0]===":"){i.raws.between+=a[1];break}else a[0]==="word"&&/\w/.test(a[1])&&this.unknownWord([a]),i.raws.between+=a[1];(i.prop[0]==="_"||i.prop[0]==="*")&&(i.raws.before+=i.prop[0],i.prop=i.prop.slice(1));let s=[],o;for(;e.length&&(o=e[0][0],!(o!=="space"&&o!=="comment"));)s.push(e.shift());this.precheckMissedSemicolon(e);for(let c=e.length-1;c>=0;c--){if(a=e[c],a[1].toLowerCase()==="!important"){i.important=!0;let f=this.stringFrom(e,c);f=this.spacesFromEnd(e)+f,f!==" !important"&&(i.raws.important=f);break}else if(a[1].toLowerCase()==="important"){let f=e.slice(0),d="";for(let p=c;p>0;p--){let h=f[p][0];if(d.trim().indexOf("!")===0&&h!=="space")break;d=f.pop()[1]+d}d.trim().indexOf("!")===0&&(i.important=!0,i.raws.important=d,e=f)}if(a[0]!=="space"&&a[0]!=="comment")break}e.some(c=>c[0]!=="space"&&c[0]!=="comment")&&(i.raws.between+=s.map(c=>c[1]).join(""),s=[]),this.raw(i,"value",s.concat(e),t),i.value.includes(":")&&!t&&this.checkMissedSemicolon(e)}atrule(e){let t=new Vx;t.name=e[1].slice(1),t.name===""&&this.unnamedAtrule(t,e),this.init(t,e[2]);let i,n,a,s=!1,o=!1,l=[],c=[];for(;!this.tokenizer.endOfFile();){if(e=this.tokenizer.nextToken(),i=e[0],i==="("||i==="["?c.push(i==="("?")":"]"):i==="{"&&c.length>0?c.push("}"):i===c[c.length-1]&&c.pop(),c.length===0)if(i===";"){t.source.end=this.getPosition(e[2]),this.semicolon=!0;break}else if(i==="{"){o=!0;break}else if(i==="}"){if(l.length>0){for(a=l.length-1,n=l[a];n&&n[0]==="space";)n=l[--a];n&&(t.source.end=this.getPosition(n[3]||n[2]))}this.end(e);break}else l.push(e);else l.push(e);if(this.tokenizer.endOfFile()){s=!0;break}}t.raws.between=this.spacesAndCommentsFromEnd(l),l.length?(t.raws.afterName=this.spacesAndCommentsFromStart(l),this.raw(t,"params",l),s&&(e=l[l.length-1],t.source.end=this.getPosition(e[3]||e[2]),this.spaces=t.raws.between,t.raws.between="")):(t.raws.afterName="",t.params=""),o&&(t.nodes=[],this.current=t)}end(e){this.current.nodes&&this.current.nodes.length&&(this.current.raws.semicolon=this.semicolon),this.semicolon=!1,this.current.raws.after=(this.current.raws.after||"")+this.spaces,this.spaces="",this.current.parent?(this.current.source.end=this.getPosition(e[2]),this.current=this.current.parent):this.unexpectedClose(e)}endFile(){this.current.parent&&this.unclosedBlock(),this.current.nodes&&this.current.nodes.length&&(this.current.raws.semicolon=this.semicolon),this.current.raws.after=(this.current.raws.after||"")+this.spaces}freeSemicolon(e){if(this.spaces+=e[1],this.current.nodes){let t=this.current.nodes[this.current.nodes.length-1];t&&t.type==="rule"&&!t.raws.ownSemicolon&&(t.raws.ownSemicolon=this.spaces,this.spaces="")}}getPosition(e){let t=this.input.fromOffset(e);return{offset:e,line:t.line,column:t.col}}init(e,t){this.current.push(e),e.source={start:this.getPosition(t),input:this.input},e.raws.before=this.spaces,this.spaces="",e.type!=="comment"&&(this.semicolon=!1)}raw(e,t,i,n){let a,s,o=i.length,l="",c=!0,f,d;for(let p=0;p<o;p+=1)a=i[p],s=a[0],s==="space"&&p===o-1&&!n?c=!1:s==="comment"?(d=i[p-1]?i[p-1][0]:"empty",f=i[p+1]?i[p+1][0]:"empty",!Qc[d]&&!Qc[f]?l.slice(-1)===","?c=!1:l+=a[1]:c=!1):l+=a[1];if(!c){let p=i.reduce((h,b)=>h+b[1],"");e.raws[t]={value:l,raw:p}}e[t]=l}spacesAndCommentsFromEnd(e){let t,i="";for(;e.length&&(t=e[e.length-1][0],!(t!=="space"&&t!=="comment"));)i=e.pop()[1]+i;return i}spacesAndCommentsFromStart(e){let t,i="";for(;e.length&&(t=e[0][0],!(t!=="space"&&t!=="comment"));)i+=e.shift()[1];return i}spacesFromEnd(e){let t,i="";for(;e.length&&(t=e[e.length-1][0],t==="space");)i=e.pop()[1]+i;return i}stringFrom(e,t){let i="";for(let n=t;n<e.length;n++)i+=e[n][1];return e.splice(t,e.length-t),i}colon(e){let t=0,i,n,a;for(let[s,o]of e.entries()){if(i=o,n=i[0],n==="("&&(t+=1),n===")"&&(t-=1),t===0&&n===":")if(!a)this.doubleColon(i);else{if(a[0]==="word"&&a[1]==="progid")continue;return s}a=i}return!1}unclosedBracket(e){throw this.input.error("Unclosed bracket",{offset:e[2]},{offset:e[2]+1})}unknownWord(e){throw this.input.error("Unknown word",{offset:e[0][2]},{offset:e[0][2]+e[0][1].length})}unexpectedClose(e){throw this.input.error("Unexpected }",{offset:e[2]},{offset:e[2]+1})}unclosedBlock(){let e=this.current.source.start;throw this.input.error("Unclosed block",e.line,e.column)}doubleColon(e){throw this.input.error("Double colon",{offset:e[2]},{offset:e[2]+e[1].length})}unnamedAtrule(e,t){throw this.input.error("At-rule without name",{offset:t[2]},{offset:t[2]+t[1].length})}precheckMissedSemicolon(){}checkMissedSemicolon(e){let t=this.colon(e);if(t===!1)return;let i=0,n;for(let a=t-1;a>=0&&(n=e[a],!(n[0]!=="space"&&(i+=1,i===2)));a--);throw this.input.error("Missed semicolon",n[0]==="word"?n[3]+1:n[2])}};Kc.exports=Yc});var Jc=x(()=>{u()});var ep=x((B3,Zc)=>{u();var Gx="useandom-26T198340PX75pxJACKVERYMINDBUSHWOLF_GQZbfghjklqvwyzrict",Qx=(r,e=21)=>(t=e)=>{let i="",n=t;for(;n--;)i+=r[Math.random()*r.length|0];return i},Yx=(r=21)=>{let e="",t=r;for(;t--;)e+=Gx[Math.random()*64|0];return e};Zc.exports={nanoid:Yx,customAlphabet:Qx}});var wa=x((F3,tp)=>{u();tp.exports={}});var Cn=x((j3,sp)=>{u();"use strict";var{SourceMapConsumer:Kx,SourceMapGenerator:Xx}=Jc(),{fileURLToPath:rp,pathToFileURL:An}=(aa(),ac),{resolve:va,isAbsolute:xa}=(et(),Ur),{nanoid:Jx}=ep(),ka=oa(),ip=nn(),Zx=wa(),Sa=Symbol("fromOffsetCache"),e1=Boolean(Kx&&Xx),np=Boolean(va&&xa),Jr=class{constructor(e,t={}){if(e===null||typeof e=="undefined"||typeof e=="object"&&!e.toString)throw new Error(`PostCSS received ${e} instead of CSS string`);if(this.css=e.toString(),this.css[0]==="\uFEFF"||this.css[0]==="\uFFFE"?(this.hasBOM=!0,this.css=this.css.slice(1)):this.hasBOM=!1,t.from&&(!np||/^\w+:\/\//.test(t.from)||xa(t.from)?this.file=t.from:this.file=va(t.from)),np&&e1){let i=new Zx(this.css,t);if(i.text){this.map=i;let n=i.consumer().file;!this.file&&n&&(this.file=this.mapResolve(n))}}this.file||(this.id="<input css "+Jx(6)+">"),this.map&&(this.map.file=this.from)}fromOffset(e){let t,i;if(this[Sa])i=this[Sa];else{let a=this.css.split(`
-`);i=new Array(a.length);let s=0;for(let o=0,l=a.length;o<l;o++)i[o]=s,s+=a[o].length+1;this[Sa]=i}t=i[i.length-1];let n=0;if(e>=t)n=i.length-1;else{let a=i.length-2,s;for(;n<a;)if(s=n+(a-n>>1),e<i[s])a=s-1;else if(e>=i[s+1])n=s+1;else{n=s;break}}return{line:n+1,col:e-i[n]+1}}error(e,t,i,n={}){let a,s,o;if(t&&typeof t=="object"){let c=t,f=i;if(typeof c.offset=="number"){let d=this.fromOffset(c.offset);t=d.line,i=d.col}else t=c.line,i=c.column;if(typeof f.offset=="number"){let d=this.fromOffset(f.offset);s=d.line,o=d.col}else s=f.line,o=f.column}else if(!i){let c=this.fromOffset(t);t=c.line,i=c.col}let l=this.origin(t,i,s,o);return l?a=new ip(e,l.endLine===void 0?l.line:{line:l.line,column:l.column},l.endLine===void 0?l.column:{line:l.endLine,column:l.endColumn},l.source,l.file,n.plugin):a=new ip(e,s===void 0?t:{line:t,column:i},s===void 0?i:{line:s,column:o},this.css,this.file,n.plugin),a.input={line:t,column:i,endLine:s,endColumn:o,source:this.css},this.file&&(An&&(a.input.url=An(this.file).toString()),a.input.file=this.file),a}origin(e,t,i,n){if(!this.map)return!1;let a=this.map.consumer(),s=a.originalPositionFor({line:e,column:t});if(!s.source)return!1;let o;typeof i=="number"&&(o=a.originalPositionFor({line:i,column:n}));let l;xa(s.source)?l=An(s.source):l=new URL(s.source,this.map.consumer().sourceRoot||An(this.map.mapFile));let c={url:l.toString(),line:s.line,column:s.column,endLine:o&&o.line,endColumn:o&&o.column};if(l.protocol==="file:")if(rp)c.file=rp(l);else throw new Error("file: protocol is not available in this PostCSS build");let f=a.sourceContentFor(s.source);return f&&(c.source=f),c}mapResolve(e){return/^\w+:\/\//.test(e)?e:va(this.map.consumer().sourceRoot||this.map.root||".",e)}get from(){return this.file||this.id}toJSON(){let e={};for(let t of["hasBOM","css","file","id"])this[t]!=null&&(e[t]=this[t]);return this.map&&(e.map={...this.map},e.map.consumerCache&&(e.map.consumerCache=void 0)),e}};sp.exports=Jr;Jr.default=Jr;ka&&ka.registerInput&&ka.registerInput(Jr)});var En=x((z3,ap)=>{u();"use strict";var t1=Et(),r1=Xc(),i1=Cn();function _n(r,e){let t=new i1(r,e),i=new r1(t);try{i.parse()}catch(n){throw n}return i.root}ap.exports=_n;_n.default=_n;t1.registerParse(_n)});var _a=x((V3,fp)=>{u();"use strict";var{isClean:tt,my:n1}=sn(),s1=pa(),a1=Vr(),o1=Et(),l1=cn(),U3=ma(),op=hn(),u1=En(),f1=tr(),c1={document:"Document",root:"Root",atrule:"AtRule",rule:"Rule",decl:"Declaration",comment:"Comment"},p1={postcssPlugin:!0,prepare:!0,Once:!0,Document:!0,Root:!0,Declaration:!0,Rule:!0,AtRule:!0,Comment:!0,DeclarationExit:!0,RuleExit:!0,AtRuleExit:!0,CommentExit:!0,RootExit:!0,DocumentExit:!0,OnceExit:!0},d1={postcssPlugin:!0,prepare:!0,Once:!0},rr=0;function Zr(r){return typeof r=="object"&&typeof r.then=="function"}function lp(r){let e=!1,t=c1[r.type];return r.type==="decl"?e=r.prop.toLowerCase():r.type==="atrule"&&(e=r.name.toLowerCase()),e&&r.append?[t,t+"-"+e,rr,t+"Exit",t+"Exit-"+e]:e?[t,t+"-"+e,t+"Exit",t+"Exit-"+e]:r.append?[t,rr,t+"Exit"]:[t,t+"Exit"]}function up(r){let e;return r.type==="document"?e=["Document",rr,"DocumentExit"]:r.type==="root"?e=["Root",rr,"RootExit"]:e=lp(r),{node:r,events:e,eventIndex:0,visitors:[],visitorIndex:0,iterator:0}}function Aa(r){return r[tt]=!1,r.nodes&&r.nodes.forEach(e=>Aa(e)),r}var Ca={},pt=class{constructor(e,t,i){this.stringified=!1,this.processed=!1;let n;if(typeof t=="object"&&t!==null&&(t.type==="root"||t.type==="document"))n=Aa(t);else if(t instanceof pt||t instanceof op)n=Aa(t.root),t.map&&(typeof i.map=="undefined"&&(i.map={}),i.map.inline||(i.map.inline=!1),i.map.prev=t.map);else{let a=u1;i.syntax&&(a=i.syntax.parse),i.parser&&(a=i.parser),a.parse&&(a=a.parse);try{n=a(t,i)}catch(s){this.processed=!0,this.error=s}n&&!n[n1]&&o1.rebuild(n)}this.result=new op(e,n,i),this.helpers={...Ca,result:this.result,postcss:Ca},this.plugins=this.processor.plugins.map(a=>typeof a=="object"&&a.prepare?{...a,...a.prepare(this.result)}:a)}get[Symbol.toStringTag](){return"LazyResult"}get processor(){return this.result.processor}get opts(){return this.result.opts}get css(){return this.stringify().css}get content(){return this.stringify().content}get map(){return this.stringify().map}get root(){return this.sync().root}get messages(){return this.sync().messages}warnings(){return this.sync().warnings()}toString(){return this.css}then(e,t){return this.async().then(e,t)}catch(e){return this.async().catch(e)}finally(e){return this.async().then(e,e)}async(){return this.error?Promise.reject(this.error):this.processed?Promise.resolve(this.result):(this.processing||(this.processing=this.runAsync()),this.processing)}sync(){if(this.error)throw this.error;if(this.processed)return this.result;if(this.processed=!0,this.processing)throw this.getAsyncError();for(let e of this.plugins){let t=this.runOnRoot(e);if(Zr(t))throw this.getAsyncError()}if(this.prepareVisitors(),this.hasListener){let e=this.result.root;for(;!e[tt];)e[tt]=!0,this.walkSync(e);if(this.listeners.OnceExit)if(e.type==="document")for(let t of e.nodes)this.visitSync(this.listeners.OnceExit,t);else this.visitSync(this.listeners.OnceExit,e)}return this.result}stringify(){if(this.error)throw this.error;if(this.stringified)return this.result;this.stringified=!0,this.sync();let e=this.result.opts,t=a1;e.syntax&&(t=e.syntax.stringify),e.stringifier&&(t=e.stringifier),t.stringify&&(t=t.stringify);let n=new s1(t,this.result.root,this.result.opts).generate();return this.result.css=n[0],this.result.map=n[1],this.result}walkSync(e){e[tt]=!0;let t=lp(e);for(let i of t)if(i===rr)e.nodes&&e.each(n=>{n[tt]||this.walkSync(n)});else{let n=this.listeners[i];if(n&&this.visitSync(n,e.toProxy()))return}}visitSync(e,t){for(let[i,n]of e){this.result.lastPlugin=i;let a;try{a=n(t,this.helpers)}catch(s){throw this.handleError(s,t.proxyOf)}if(t.type!=="root"&&t.type!=="document"&&!t.parent)return!0;if(Zr(a))throw this.getAsyncError()}}runOnRoot(e){this.result.lastPlugin=e;try{if(typeof e=="object"&&e.Once){if(this.result.root.type==="document"){let t=this.result.root.nodes.map(i=>e.Once(i,this.helpers));return Zr(t[0])?Promise.all(t):t}return e.Once(this.result.root,this.helpers)}else if(typeof e=="function")return e(this.result.root,this.result)}catch(t){throw this.handleError(t)}}getAsyncError(){throw new Error("Use process(css).then(cb) to work with async plugins")}handleError(e,t){let i=this.result.lastPlugin;try{t&&t.addToError(e),this.error=e,e.name==="CssSyntaxError"&&!e.plugin?(e.plugin=i.postcssPlugin,e.setMessage()):i.postcssVersion}catch(n){console&&console.error&&console.error(n)}return e}async runAsync(){this.plugin=0;for(let e=0;e<this.plugins.length;e++){let t=this.plugins[e],i=this.runOnRoot(t);if(Zr(i))try{await i}catch(n){throw this.handleError(n)}}if(this.prepareVisitors(),this.hasListener){let e=this.result.root;for(;!e[tt];){e[tt]=!0;let t=[up(e)];for(;t.length>0;){let i=this.visitTick(t);if(Zr(i))try{await i}catch(n){let a=t[t.length-1].node;throw this.handleError(n,a)}}}if(this.listeners.OnceExit)for(let[t,i]of this.listeners.OnceExit){this.result.lastPlugin=t;try{if(e.type==="document"){let n=e.nodes.map(a=>i(a,this.helpers));await Promise.all(n)}else await i(e,this.helpers)}catch(n){throw this.handleError(n)}}}return this.processed=!0,this.stringify()}prepareVisitors(){this.listeners={};let e=(t,i,n)=>{this.listeners[i]||(this.listeners[i]=[]),this.listeners[i].push([t,n])};for(let t of this.plugins)if(typeof t=="object")for(let i in t){if(!p1[i]&&/^[A-Z]/.test(i))throw new Error(`Unknown event ${i} in ${t.postcssPlugin}. Try to update PostCSS (${this.processor.version} now).`);if(!d1[i])if(typeof t[i]=="object")for(let n in t[i])n==="*"?e(t,i,t[i][n]):e(t,i+"-"+n.toLowerCase(),t[i][n]);else typeof t[i]=="function"&&e(t,i,t[i])}this.hasListener=Object.keys(this.listeners).length>0}visitTick(e){let t=e[e.length-1],{node:i,visitors:n}=t;if(i.type!=="root"&&i.type!=="document"&&!i.parent){e.pop();return}if(n.length>0&&t.visitorIndex<n.length){let[s,o]=n[t.visitorIndex];t.visitorIndex+=1,t.visitorIndex===n.length&&(t.visitors=[],t.visitorIndex=0),this.result.lastPlugin=s;try{return o(i.toProxy(),this.helpers)}catch(l){throw this.handleError(l,i)}}if(t.iterator!==0){let s=t.iterator,o;for(;o=i.nodes[i.indexes[s]];)if(i.indexes[s]+=1,!o[tt]){o[tt]=!0,e.push(up(o));return}t.iterator=0,delete i.indexes[s]}let a=t.events;for(;t.eventIndex<a.length;){let s=a[t.eventIndex];if(t.eventIndex+=1,s===rr){i.nodes&&i.nodes.length&&(i[tt]=!0,t.iterator=i.getIterator());return}else if(this.listeners[s]){t.visitors=this.listeners[s];return}}e.pop()}};pt.registerPostcss=r=>{Ca=r};fp.exports=pt;pt.default=pt;f1.registerLazyResult(pt);l1.registerLazyResult(pt)});var pp=x((W3,cp)=>{u();"use strict";var h1=pa(),m1=Vr(),H3=ma(),g1=En(),y1=hn(),On=class{constructor(e,t,i){t=t.toString(),this.stringified=!1,this._processor=e,this._css=t,this._opts=i,this._map=void 0;let n,a=m1;this.result=new y1(this._processor,n,this._opts),this.result.css=t;let s=this;Object.defineProperty(this.result,"root",{get(){return s.root}});let o=new h1(a,n,this._opts,t);if(o.isMap()){let[l,c]=o.generate();l&&(this.result.css=l),c&&(this.result.map=c)}}get[Symbol.toStringTag](){return"NoWorkResult"}get processor(){return this.result.processor}get opts(){return this.result.opts}get css(){return this.result.css}get content(){return this.result.css}get map(){return this.result.map}get root(){if(this._root)return this._root;let e,t=g1;try{e=t(this._css,this._opts)}catch(i){this.error=i}if(this.error)throw this.error;return this._root=e,e}get messages(){return[]}warnings(){return[]}toString(){return this._css}then(e,t){return this.async().then(e,t)}catch(e){return this.async().catch(e)}finally(e){return this.async().then(e,e)}async(){return this.error?Promise.reject(this.error):Promise.resolve(this.result)}sync(){if(this.error)throw this.error;return this.result}};cp.exports=On;On.default=On});var hp=x((G3,dp)=>{u();"use strict";var b1=pp(),w1=_a(),v1=cn(),x1=tr(),ir=class{constructor(e=[]){this.version="8.4.24",this.plugins=this.normalize(e)}use(e){return this.plugins=this.plugins.concat(this.normalize([e])),this}process(e,t={}){return this.plugins.length===0&&typeof t.parser=="undefined"&&typeof t.stringifier=="undefined"&&typeof t.syntax=="undefined"?new b1(this,e,t):new w1(this,e,t)}normalize(e){let t=[];for(let i of e)if(i.postcss===!0?i=i():i.postcss&&(i=i.postcss),typeof i=="object"&&Array.isArray(i.plugins))t=t.concat(i.plugins);else if(typeof i=="object"&&i.postcssPlugin)t.push(i);else if(typeof i=="function")t.push(i);else if(!(typeof i=="object"&&(i.parse||i.stringify)))throw new Error(i+" is not a PostCSS plugin");return t}};dp.exports=ir;ir.default=ir;x1.registerProcessor(ir);v1.registerProcessor(ir)});var gp=x((Q3,mp)=>{u();"use strict";var k1=Wr(),S1=wa(),A1=Gr(),C1=kn(),_1=Cn(),E1=tr(),O1=Sn();function ei(r,e){if(Array.isArray(r))return r.map(n=>ei(n));let{inputs:t,...i}=r;if(t){e=[];for(let n of t){let a={...n,__proto__:_1.prototype};a.map&&(a.map={...a.map,__proto__:S1.prototype}),e.push(a)}}if(i.nodes&&(i.nodes=r.nodes.map(n=>ei(n,e))),i.source){let{inputId:n,...a}=i.source;i.source=a,n!=null&&(i.source.input=e[n])}if(i.type==="root")return new E1(i);if(i.type==="decl")return new k1(i);if(i.type==="rule")return new O1(i);if(i.type==="comment")return new A1(i);if(i.type==="atrule")return new C1(i);throw new Error("Unknown node type: "+r.type)}mp.exports=ei;ei.default=ei});var $e=x((Y3,Sp)=>{u();"use strict";var T1=nn(),yp=Wr(),R1=_a(),P1=Et(),Ea=hp(),I1=Vr(),D1=gp(),bp=cn(),q1=ga(),wp=Gr(),vp=kn(),$1=hn(),L1=Cn(),M1=En(),N1=ba(),xp=Sn(),kp=tr(),B1=Hr();function Z(...r){return r.length===1&&Array.isArray(r[0])&&(r=r[0]),new Ea(r)}Z.plugin=function(e,t){let i=!1;function n(...s){console&&console.warn&&!i&&(i=!0,console.warn(e+`: postcss.plugin was deprecated. Migration guide:
-https://evilmartians.com/chronicles/postcss-8-plugin-migration`),m.env.LANG&&m.env.LANG.startsWith("cn")&&console.warn(e+`: \u91CC\u9762 postcss.plugin \u88AB\u5F03\u7528. \u8FC1\u79FB\u6307\u5357:
-https://www.w3ctech.com/topic/2226`));let o=t(...s);return o.postcssPlugin=e,o.postcssVersion=new Ea().version,o}let a;return Object.defineProperty(n,"postcss",{get(){return a||(a=n()),a}}),n.process=function(s,o,l){return Z([n(l)]).process(s,o)},n};Z.stringify=I1;Z.parse=M1;Z.fromJSON=D1;Z.list=N1;Z.comment=r=>new wp(r);Z.atRule=r=>new vp(r);Z.decl=r=>new yp(r);Z.rule=r=>new xp(r);Z.root=r=>new kp(r);Z.document=r=>new bp(r);Z.CssSyntaxError=T1;Z.Declaration=yp;Z.Container=P1;Z.Processor=Ea;Z.Document=bp;Z.Comment=wp;Z.Warning=q1;Z.AtRule=vp;Z.Result=$1;Z.Input=L1;Z.Rule=xp;Z.Root=kp;Z.Node=B1;R1.registerPostcss(Z);Sp.exports=Z;Z.default=Z});var re,ee,K3,X3,J3,Z3,eI,tI,rI,iI,nI,sI,aI,oI,lI,uI,fI,cI,pI,dI,hI,mI,gI,yI,bI,wI,Ot=R(()=>{u();re=pe($e()),ee=re.default,K3=re.default.stringify,X3=re.default.fromJSON,J3=re.default.plugin,Z3=re.default.parse,eI=re.default.list,tI=re.default.document,rI=re.default.comment,iI=re.default.atRule,nI=re.default.rule,sI=re.default.decl,aI=re.default.root,oI=re.default.CssSyntaxError,lI=re.default.Declaration,uI=re.default.Container,fI=re.default.Processor,cI=re.default.Document,pI=re.default.Comment,dI=re.default.Warning,hI=re.default.AtRule,mI=re.default.Result,gI=re.default.Input,yI=re.default.Rule,bI=re.default.Root,wI=re.default.Node});var Oa=x((xI,Ap)=>{u();Ap.exports=function(r,e,t,i,n){for(e=e.split?e.split("."):e,i=0;i<e.length;i++)r=r?r[e[i]]:n;return r===n?t:r}});var Rn=x((Tn,Cp)=>{u();"use strict";Tn.__esModule=!0;Tn.default=z1;function F1(r){for(var e=r.toLowerCase(),t="",i=!1,n=0;n<6&&e[n]!==void 0;n++){var a=e.charCodeAt(n),s=a>=97&&a<=102||a>=48&&a<=57;if(i=a===32,!s)break;t+=e[n]}if(t.length!==0){var o=parseInt(t,16),l=o>=55296&&o<=57343;return l||o===0||o>1114111?["\uFFFD",t.length+(i?1:0)]:[String.fromCodePoint(o),t.length+(i?1:0)]}}var j1=/\\/;function z1(r){var e=j1.test(r);if(!e)return r;for(var t="",i=0;i<r.length;i++){if(r[i]==="\\"){var n=F1(r.slice(i+1,i+7));if(n!==void 0){t+=n[0],i+=n[1];continue}if(r[i+1]==="\\"){t+="\\",i++;continue}r.length===i+1&&(t+=r[i]);continue}t+=r[i]}return t}Cp.exports=Tn.default});var Ep=x((Pn,_p)=>{u();"use strict";Pn.__esModule=!0;Pn.default=U1;function U1(r){for(var e=arguments.length,t=new Array(e>1?e-1:0),i=1;i<e;i++)t[i-1]=arguments[i];for(;t.length>0;){var n=t.shift();if(!r[n])return;r=r[n]}return r}_p.exports=Pn.default});var Tp=x((In,Op)=>{u();"use strict";In.__esModule=!0;In.default=V1;function V1(r){for(var e=arguments.length,t=new Array(e>1?e-1:0),i=1;i<e;i++)t[i-1]=arguments[i];for(;t.length>0;){var n=t.shift();r[n]||(r[n]={}),r=r[n]}}Op.exports=In.default});var Pp=x((Dn,Rp)=>{u();"use strict";Dn.__esModule=!0;Dn.default=H1;function H1(r){for(var e="",t=r.indexOf("/*"),i=0;t>=0;){e=e+r.slice(i,t);var n=r.indexOf("*/",t+2);if(n<0)return e;i=n+2,t=r.indexOf("/*",i)}return e=e+r.slice(i),e}Rp.exports=Dn.default});var ti=x(rt=>{u();"use strict";rt.__esModule=!0;rt.unesc=rt.stripComments=rt.getProp=rt.ensureObject=void 0;var W1=qn(Rn());rt.unesc=W1.default;var G1=qn(Ep());rt.getProp=G1.default;var Q1=qn(Tp());rt.ensureObject=Q1.default;var Y1=qn(Pp());rt.stripComments=Y1.default;function qn(r){return r&&r.__esModule?r:{default:r}}});var dt=x((ri,qp)=>{u();"use strict";ri.__esModule=!0;ri.default=void 0;var Ip=ti();function Dp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function K1(r,e,t){return e&&Dp(r.prototype,e),t&&Dp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}var X1=function r(e,t){if(typeof e!="object"||e===null)return e;var i=new e.constructor;for(var n in e)if(!!e.hasOwnProperty(n)){var a=e[n],s=typeof a;n==="parent"&&s==="object"?t&&(i[n]=t):a instanceof Array?i[n]=a.map(function(o){return r(o,i)}):i[n]=r(a,i)}return i},J1=function(){function r(t){t===void 0&&(t={}),Object.assign(this,t),this.spaces=this.spaces||{},this.spaces.before=this.spaces.before||"",this.spaces.after=this.spaces.after||""}var e=r.prototype;return e.remove=function(){return this.parent&&this.parent.removeChild(this),this.parent=void 0,this},e.replaceWith=function(){if(this.parent){for(var i in arguments)this.parent.insertBefore(this,arguments[i]);this.remove()}return this},e.next=function(){return this.parent.at(this.parent.index(this)+1)},e.prev=function(){return this.parent.at(this.parent.index(this)-1)},e.clone=function(i){i===void 0&&(i={});var n=X1(this);for(var a in i)n[a]=i[a];return n},e.appendToPropertyAndEscape=function(i,n,a){this.raws||(this.raws={});var s=this[i],o=this.raws[i];this[i]=s+n,o||a!==n?this.raws[i]=(o||s)+a:delete this.raws[i]},e.setPropertyAndEscape=function(i,n,a){this.raws||(this.raws={}),this[i]=n,this.raws[i]=a},e.setPropertyWithoutEscape=function(i,n){this[i]=n,this.raws&&delete this.raws[i]},e.isAtPosition=function(i,n){if(this.source&&this.source.start&&this.source.end)return!(this.source.start.line>i||this.source.end.line<i||this.source.start.line===i&&this.source.start.column>n||this.source.end.line===i&&this.source.end.column<n)},e.stringifyProperty=function(i){return this.raws&&this.raws[i]||this[i]},e.valueToString=function(){return String(this.stringifyProperty("value"))},e.toString=function(){return[this.rawSpaceBefore,this.valueToString(),this.rawSpaceAfter].join("")},K1(r,[{key:"rawSpaceBefore",get:function(){var i=this.raws&&this.raws.spaces&&this.raws.spaces.before;return i===void 0&&(i=this.spaces&&this.spaces.before),i||""},set:function(i){(0,Ip.ensureObject)(this,"raws","spaces"),this.raws.spaces.before=i}},{key:"rawSpaceAfter",get:function(){var i=this.raws&&this.raws.spaces&&this.raws.spaces.after;return i===void 0&&(i=this.spaces.after),i||""},set:function(i){(0,Ip.ensureObject)(this,"raws","spaces"),this.raws.spaces.after=i}}]),r}();ri.default=J1;qp.exports=ri.default});var Se=x(ie=>{u();"use strict";ie.__esModule=!0;ie.UNIVERSAL=ie.TAG=ie.STRING=ie.SELECTOR=ie.ROOT=ie.PSEUDO=ie.NESTING=ie.ID=ie.COMMENT=ie.COMBINATOR=ie.CLASS=ie.ATTRIBUTE=void 0;var Z1="tag";ie.TAG=Z1;var ek="string";ie.STRING=ek;var tk="selector";ie.SELECTOR=tk;var rk="root";ie.ROOT=rk;var ik="pseudo";ie.PSEUDO=ik;var nk="nesting";ie.NESTING=nk;var sk="id";ie.ID=sk;var ak="comment";ie.COMMENT=ak;var ok="combinator";ie.COMBINATOR=ok;var lk="class";ie.CLASS=lk;var uk="attribute";ie.ATTRIBUTE=uk;var fk="universal";ie.UNIVERSAL=fk});var $n=x((ii,Np)=>{u();"use strict";ii.__esModule=!0;ii.default=void 0;var ck=dk(dt()),ht=pk(Se());function $p(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return($p=function(n){return n?t:e})(r)}function pk(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=$p(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function dk(r){return r&&r.__esModule?r:{default:r}}function hk(r,e){var t=typeof Symbol!="undefined"&&r[Symbol.iterator]||r["@@iterator"];if(t)return(t=t.call(r)).next.bind(t);if(Array.isArray(r)||(t=mk(r))||e&&r&&typeof r.length=="number"){t&&(r=t);var i=0;return function(){return i>=r.length?{done:!0}:{done:!1,value:r[i++]}}}throw new TypeError(`Invalid attempt to iterate non-iterable instance.
-In order to be iterable, non-array objects must have a [Symbol.iterator]() method.`)}function mk(r,e){if(!!r){if(typeof r=="string")return Lp(r,e);var t=Object.prototype.toString.call(r).slice(8,-1);if(t==="Object"&&r.constructor&&(t=r.constructor.name),t==="Map"||t==="Set")return Array.from(r);if(t==="Arguments"||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t))return Lp(r,e)}}function Lp(r,e){(e==null||e>r.length)&&(e=r.length);for(var t=0,i=new Array(e);t<e;t++)i[t]=r[t];return i}function Mp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function gk(r,e,t){return e&&Mp(r.prototype,e),t&&Mp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function yk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ta(r,e)}function Ta(r,e){return Ta=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ta(r,e)}var bk=function(r){yk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.nodes||(n.nodes=[]),n}var t=e.prototype;return t.append=function(n){return n.parent=this,this.nodes.push(n),this},t.prepend=function(n){return n.parent=this,this.nodes.unshift(n),this},t.at=function(n){return this.nodes[n]},t.index=function(n){return typeof n=="number"?n:this.nodes.indexOf(n)},t.removeChild=function(n){n=this.index(n),this.at(n).parent=void 0,this.nodes.splice(n,1);var a;for(var s in this.indexes)a=this.indexes[s],a>=n&&(this.indexes[s]=a-1);return this},t.removeAll=function(){for(var n=hk(this.nodes),a;!(a=n()).done;){var s=a.value;s.parent=void 0}return this.nodes=[],this},t.empty=function(){return this.removeAll()},t.insertAfter=function(n,a){a.parent=this;var s=this.index(n);this.nodes.splice(s+1,0,a),a.parent=this;var o;for(var l in this.indexes)o=this.indexes[l],s<=o&&(this.indexes[l]=o+1);return this},t.insertBefore=function(n,a){a.parent=this;var s=this.index(n);this.nodes.splice(s,0,a),a.parent=this;var o;for(var l in this.indexes)o=this.indexes[l],o<=s&&(this.indexes[l]=o+1);return this},t._findChildAtPosition=function(n,a){var s=void 0;return this.each(function(o){if(o.atPosition){var l=o.atPosition(n,a);if(l)return s=l,!1}else if(o.isAtPosition(n,a))return s=o,!1}),s},t.atPosition=function(n,a){if(this.isAtPosition(n,a))return this._findChildAtPosition(n,a)||this},t._inferEndPosition=function(){this.last&&this.last.source&&this.last.source.end&&(this.source=this.source||{},this.source.end=this.source.end||{},Object.assign(this.source.end,this.last.source.end))},t.each=function(n){this.lastEach||(this.lastEach=0),this.indexes||(this.indexes={}),this.lastEach++;var a=this.lastEach;if(this.indexes[a]=0,!!this.length){for(var s,o;this.indexes[a]<this.length&&(s=this.indexes[a],o=n(this.at(s),s),o!==!1);)this.indexes[a]+=1;if(delete this.indexes[a],o===!1)return!1}},t.walk=function(n){return this.each(function(a,s){var o=n(a,s);if(o!==!1&&a.length&&(o=a.walk(n)),o===!1)return!1})},t.walkAttributes=function(n){var a=this;return this.walk(function(s){if(s.type===ht.ATTRIBUTE)return n.call(a,s)})},t.walkClasses=function(n){var a=this;return this.walk(function(s){if(s.type===ht.CLASS)return n.call(a,s)})},t.walkCombinators=function(n){var a=this;return this.walk(function(s){if(s.type===ht.COMBINATOR)return n.call(a,s)})},t.walkComments=function(n){var a=this;return this.walk(function(s){if(s.type===ht.COMMENT)return n.call(a,s)})},t.walkIds=function(n){var a=this;return this.walk(function(s){if(s.type===ht.ID)return n.call(a,s)})},t.walkNesting=function(n){var a=this;return this.walk(function(s){if(s.type===ht.NESTING)return n.call(a,s)})},t.walkPseudos=function(n){var a=this;return this.walk(function(s){if(s.type===ht.PSEUDO)return n.call(a,s)})},t.walkTags=function(n){var a=this;return this.walk(function(s){if(s.type===ht.TAG)return n.call(a,s)})},t.walkUniversals=function(n){var a=this;return this.walk(function(s){if(s.type===ht.UNIVERSAL)return n.call(a,s)})},t.split=function(n){var a=this,s=[];return this.reduce(function(o,l,c){var f=n.call(a,l);return s.push(l),f?(o.push(s),s=[]):c===a.length-1&&o.push(s),o},[])},t.map=function(n){return this.nodes.map(n)},t.reduce=function(n,a){return this.nodes.reduce(n,a)},t.every=function(n){return this.nodes.every(n)},t.some=function(n){return this.nodes.some(n)},t.filter=function(n){return this.nodes.filter(n)},t.sort=function(n){return this.nodes.sort(n)},t.toString=function(){return this.map(String).join("")},gk(e,[{key:"first",get:function(){return this.at(0)}},{key:"last",get:function(){return this.at(this.length-1)}},{key:"length",get:function(){return this.nodes.length}}]),e}(ck.default);ii.default=bk;Np.exports=ii.default});var Pa=x((ni,Fp)=>{u();"use strict";ni.__esModule=!0;ni.default=void 0;var wk=xk($n()),vk=Se();function xk(r){return r&&r.__esModule?r:{default:r}}function Bp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function kk(r,e,t){return e&&Bp(r.prototype,e),t&&Bp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function Sk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ra(r,e)}function Ra(r,e){return Ra=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ra(r,e)}var Ak=function(r){Sk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=vk.ROOT,n}var t=e.prototype;return t.toString=function(){var n=this.reduce(function(a,s){return a.push(String(s)),a},[]).join(",");return this.trailingComma?n+",":n},t.error=function(n,a){return this._error?this._error(n,a):new Error(n)},kk(e,[{key:"errorGenerator",set:function(n){this._error=n}}]),e}(wk.default);ni.default=Ak;Fp.exports=ni.default});var Da=x((si,jp)=>{u();"use strict";si.__esModule=!0;si.default=void 0;var Ck=Ek($n()),_k=Se();function Ek(r){return r&&r.__esModule?r:{default:r}}function Ok(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ia(r,e)}function Ia(r,e){return Ia=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ia(r,e)}var Tk=function(r){Ok(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=_k.SELECTOR,i}return e}(Ck.default);si.default=Tk;jp.exports=si.default});var Ln=x((AI,zp)=>{u();"use strict";var Rk={},Pk=Rk.hasOwnProperty,Ik=function(e,t){if(!e)return t;var i={};for(var n in t)i[n]=Pk.call(e,n)?e[n]:t[n];return i},Dk=/[ -,\.\/:-@\[-\^`\{-~]/,qk=/[ -,\.\/:-@\[\]\^`\{-~]/,$k=/(^|\\+)?(\\[A-F0-9]{1,6})\x20(?![a-fA-F0-9\x20])/g,qa=function r(e,t){t=Ik(t,r.options),t.quotes!="single"&&t.quotes!="double"&&(t.quotes="single");for(var i=t.quotes=="double"?'"':"'",n=t.isIdentifier,a=e.charAt(0),s="",o=0,l=e.length;o<l;){var c=e.charAt(o++),f=c.charCodeAt(),d=void 0;if(f<32||f>126){if(f>=55296&&f<=56319&&o<l){var p=e.charCodeAt(o++);(p&64512)==56320?f=((f&1023)<<10)+(p&1023)+65536:o--}d="\\"+f.toString(16).toUpperCase()+" "}else t.escapeEverything?Dk.test(c)?d="\\"+c:d="\\"+f.toString(16).toUpperCase()+" ":/[\t\n\f\r\x0B]/.test(c)?d="\\"+f.toString(16).toUpperCase()+" ":c=="\\"||!n&&(c=='"'&&i==c||c=="'"&&i==c)||n&&qk.test(c)?d="\\"+c:d=c;s+=d}return n&&(/^-[-\d]/.test(s)?s="\\-"+s.slice(1):/\d/.test(a)&&(s="\\3"+a+" "+s.slice(1))),s=s.replace($k,function(h,b,v){return b&&b.length%2?h:(b||"")+v}),!n&&t.wrap?i+s+i:s};qa.options={escapeEverything:!1,isIdentifier:!1,quotes:"single",wrap:!1};qa.version="3.0.0";zp.exports=qa});var La=x((ai,Hp)=>{u();"use strict";ai.__esModule=!0;ai.default=void 0;var Lk=Up(Ln()),Mk=ti(),Nk=Up(dt()),Bk=Se();function Up(r){return r&&r.__esModule?r:{default:r}}function Vp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function Fk(r,e,t){return e&&Vp(r.prototype,e),t&&Vp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function jk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,$a(r,e)}function $a(r,e){return $a=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},$a(r,e)}var zk=function(r){jk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=Bk.CLASS,n._constructed=!0,n}var t=e.prototype;return t.valueToString=function(){return"."+r.prototype.valueToString.call(this)},Fk(e,[{key:"value",get:function(){return this._value},set:function(n){if(this._constructed){var a=(0,Lk.default)(n,{isIdentifier:!0});a!==n?((0,Mk.ensureObject)(this,"raws"),this.raws.value=a):this.raws&&delete this.raws.value}this._value=n}}]),e}(Nk.default);ai.default=zk;Hp.exports=ai.default});var Na=x((oi,Wp)=>{u();"use strict";oi.__esModule=!0;oi.default=void 0;var Uk=Hk(dt()),Vk=Se();function Hk(r){return r&&r.__esModule?r:{default:r}}function Wk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ma(r,e)}function Ma(r,e){return Ma=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ma(r,e)}var Gk=function(r){Wk(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=Vk.COMMENT,i}return e}(Uk.default);oi.default=Gk;Wp.exports=oi.default});var Fa=x((li,Gp)=>{u();"use strict";li.__esModule=!0;li.default=void 0;var Qk=Kk(dt()),Yk=Se();function Kk(r){return r&&r.__esModule?r:{default:r}}function Xk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ba(r,e)}function Ba(r,e){return Ba=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ba(r,e)}var Jk=function(r){Xk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=Yk.ID,n}var t=e.prototype;return t.valueToString=function(){return"#"+r.prototype.valueToString.call(this)},e}(Qk.default);li.default=Jk;Gp.exports=li.default});var Mn=x((ui,Kp)=>{u();"use strict";ui.__esModule=!0;ui.default=void 0;var Zk=Qp(Ln()),eS=ti(),tS=Qp(dt());function Qp(r){return r&&r.__esModule?r:{default:r}}function Yp(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function rS(r,e,t){return e&&Yp(r.prototype,e),t&&Yp(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function iS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,ja(r,e)}function ja(r,e){return ja=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},ja(r,e)}var nS=function(r){iS(e,r);function e(){return r.apply(this,arguments)||this}var t=e.prototype;return t.qualifiedName=function(n){return this.namespace?this.namespaceString+"|"+n:n},t.valueToString=function(){return this.qualifiedName(r.prototype.valueToString.call(this))},rS(e,[{key:"namespace",get:function(){return this._namespace},set:function(n){if(n===!0||n==="*"||n==="&"){this._namespace=n,this.raws&&delete this.raws.namespace;return}var a=(0,Zk.default)(n,{isIdentifier:!0});this._namespace=n,a!==n?((0,eS.ensureObject)(this,"raws"),this.raws.namespace=a):this.raws&&delete this.raws.namespace}},{key:"ns",get:function(){return this._namespace},set:function(n){this.namespace=n}},{key:"namespaceString",get:function(){if(this.namespace){var n=this.stringifyProperty("namespace");return n===!0?"":n}else return""}}]),e}(tS.default);ui.default=nS;Kp.exports=ui.default});var Ua=x((fi,Xp)=>{u();"use strict";fi.__esModule=!0;fi.default=void 0;var sS=oS(Mn()),aS=Se();function oS(r){return r&&r.__esModule?r:{default:r}}function lS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,za(r,e)}function za(r,e){return za=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},za(r,e)}var uS=function(r){lS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=aS.TAG,i}return e}(sS.default);fi.default=uS;Xp.exports=fi.default});var Ha=x((ci,Jp)=>{u();"use strict";ci.__esModule=!0;ci.default=void 0;var fS=pS(dt()),cS=Se();function pS(r){return r&&r.__esModule?r:{default:r}}function dS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Va(r,e)}function Va(r,e){return Va=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Va(r,e)}var hS=function(r){dS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=cS.STRING,i}return e}(fS.default);ci.default=hS;Jp.exports=ci.default});var Ga=x((pi,Zp)=>{u();"use strict";pi.__esModule=!0;pi.default=void 0;var mS=yS($n()),gS=Se();function yS(r){return r&&r.__esModule?r:{default:r}}function bS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Wa(r,e)}function Wa(r,e){return Wa=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Wa(r,e)}var wS=function(r){bS(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=gS.PSEUDO,n}var t=e.prototype;return t.toString=function(){var n=this.length?"("+this.map(String).join(",")+")":"";return[this.rawSpaceBefore,this.stringifyProperty("value"),n,this.rawSpaceAfter].join("")},e}(mS.default);pi.default=wS;Zp.exports=pi.default});var Nn={};Ge(Nn,{deprecate:()=>vS});function vS(r){return r}var Bn=R(()=>{u()});var td=x((CI,ed)=>{u();ed.exports=(Bn(),Nn).deprecate});var Za=x(mi=>{u();"use strict";mi.__esModule=!0;mi.default=void 0;mi.unescapeValue=Xa;var di=Ya(Ln()),xS=Ya(Rn()),kS=Ya(Mn()),SS=Se(),Qa;function Ya(r){return r&&r.__esModule?r:{default:r}}function rd(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function AS(r,e,t){return e&&rd(r.prototype,e),t&&rd(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}function CS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ka(r,e)}function Ka(r,e){return Ka=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ka(r,e)}var hi=td(),_S=/^('|")([^]*)\1$/,ES=hi(function(){},"Assigning an attribute a value containing characters that might need to be escaped is deprecated. Call attribute.setValue() instead."),OS=hi(function(){},"Assigning attr.quoted is deprecated and has no effect. Assign to attr.quoteMark instead."),TS=hi(function(){},"Constructing an Attribute selector with a value without specifying quoteMark is deprecated. Note: The value should be unescaped now.");function Xa(r){var e=!1,t=null,i=r,n=i.match(_S);return n&&(t=n[1],i=n[2]),i=(0,xS.default)(i),i!==r&&(e=!0),{deprecatedUsage:e,unescaped:i,quoteMark:t}}function RS(r){if(r.quoteMark!==void 0||r.value===void 0)return r;TS();var e=Xa(r.value),t=e.quoteMark,i=e.unescaped;return r.raws||(r.raws={}),r.raws.value===void 0&&(r.raws.value=r.value),r.value=i,r.quoteMark=t,r}var Fn=function(r){CS(e,r);function e(i){var n;return i===void 0&&(i={}),n=r.call(this,RS(i))||this,n.type=SS.ATTRIBUTE,n.raws=n.raws||{},Object.defineProperty(n.raws,"unquoted",{get:hi(function(){return n.value},"attr.raws.unquoted is deprecated. Call attr.value instead."),set:hi(function(){return n.value},"Setting attr.raws.unquoted is deprecated and has no effect. attr.value is unescaped by default now.")}),n._constructed=!0,n}var t=e.prototype;return t.getQuotedValue=function(n){n===void 0&&(n={});var a=this._determineQuoteMark(n),s=Ja[a],o=(0,di.default)(this._value,s);return o},t._determineQuoteMark=function(n){return n.smart?this.smartQuoteMark(n):this.preferredQuoteMark(n)},t.setValue=function(n,a){a===void 0&&(a={}),this._value=n,this._quoteMark=this._determineQuoteMark(a),this._syncRawValue()},t.smartQuoteMark=function(n){var a=this.value,s=a.replace(/[^']/g,"").length,o=a.replace(/[^"]/g,"").length;if(s+o===0){var l=(0,di.default)(a,{isIdentifier:!0});if(l===a)return e.NO_QUOTE;var c=this.preferredQuoteMark(n);if(c===e.NO_QUOTE){var f=this.quoteMark||n.quoteMark||e.DOUBLE_QUOTE,d=Ja[f],p=(0,di.default)(a,d);if(p.length<l.length)return f}return c}else return o===s?this.preferredQuoteMark(n):o<s?e.DOUBLE_QUOTE:e.SINGLE_QUOTE},t.preferredQuoteMark=function(n){var a=n.preferCurrentQuoteMark?this.quoteMark:n.quoteMark;return a===void 0&&(a=n.preferCurrentQuoteMark?n.quoteMark:this.quoteMark),a===void 0&&(a=e.DOUBLE_QUOTE),a},t._syncRawValue=function(){var n=(0,di.default)(this._value,Ja[this.quoteMark]);n===this._value?this.raws&&delete this.raws.value:this.raws.value=n},t._handleEscapes=function(n,a){if(this._constructed){var s=(0,di.default)(a,{isIdentifier:!0});s!==a?this.raws[n]=s:delete this.raws[n]}},t._spacesFor=function(n){var a={before:"",after:""},s=this.spaces[n]||{},o=this.raws.spaces&&this.raws.spaces[n]||{};return Object.assign(a,s,o)},t._stringFor=function(n,a,s){a===void 0&&(a=n),s===void 0&&(s=id);var o=this._spacesFor(a);return s(this.stringifyProperty(n),o)},t.offsetOf=function(n){var a=1,s=this._spacesFor("attribute");if(a+=s.before.length,n==="namespace"||n==="ns")return this.namespace?a:-1;if(n==="attributeNS"||(a+=this.namespaceString.length,this.namespace&&(a+=1),n==="attribute"))return a;a+=this.stringifyProperty("attribute").length,a+=s.after.length;var o=this._spacesFor("operator");a+=o.before.length;var l=this.stringifyProperty("operator");if(n==="operator")return l?a:-1;a+=l.length,a+=o.after.length;var c=this._spacesFor("value");a+=c.before.length;var f=this.stringifyProperty("value");if(n==="value")return f?a:-1;a+=f.length,a+=c.after.length;var d=this._spacesFor("insensitive");return a+=d.before.length,n==="insensitive"&&this.insensitive?a:-1},t.toString=function(){var n=this,a=[this.rawSpaceBefore,"["];return a.push(this._stringFor("qualifiedAttribute","attribute")),this.operator&&(this.value||this.value==="")&&(a.push(this._stringFor("operator")),a.push(this._stringFor("value")),a.push(this._stringFor("insensitiveFlag","insensitive",function(s,o){return s.length>0&&!n.quoted&&o.before.length===0&&!(n.spaces.value&&n.spaces.value.after)&&(o.before=" "),id(s,o)}))),a.push("]"),a.push(this.rawSpaceAfter),a.join("")},AS(e,[{key:"quoted",get:function(){var n=this.quoteMark;return n==="'"||n==='"'},set:function(n){OS()}},{key:"quoteMark",get:function(){return this._quoteMark},set:function(n){if(!this._constructed){this._quoteMark=n;return}this._quoteMark!==n&&(this._quoteMark=n,this._syncRawValue())}},{key:"qualifiedAttribute",get:function(){return this.qualifiedName(this.raws.attribute||this.attribute)}},{key:"insensitiveFlag",get:function(){return this.insensitive?"i":""}},{key:"value",get:function(){return this._value},set:function(n){if(this._constructed){var a=Xa(n),s=a.deprecatedUsage,o=a.unescaped,l=a.quoteMark;if(s&&ES(),o===this._value&&l===this._quoteMark)return;this._value=o,this._quoteMark=l,this._syncRawValue()}else this._value=n}},{key:"insensitive",get:function(){return this._insensitive},set:function(n){n||(this._insensitive=!1,this.raws&&(this.raws.insensitiveFlag==="I"||this.raws.insensitiveFlag==="i")&&(this.raws.insensitiveFlag=void 0)),this._insensitive=n}},{key:"attribute",get:function(){return this._attribute},set:function(n){this._handleEscapes("attribute",n),this._attribute=n}}]),e}(kS.default);mi.default=Fn;Fn.NO_QUOTE=null;Fn.SINGLE_QUOTE="'";Fn.DOUBLE_QUOTE='"';var Ja=(Qa={"'":{quotes:"single",wrap:!0},'"':{quotes:"double",wrap:!0}},Qa[null]={isIdentifier:!0},Qa);function id(r,e){return""+e.before+r+e.after}});var to=x((gi,nd)=>{u();"use strict";gi.__esModule=!0;gi.default=void 0;var PS=DS(Mn()),IS=Se();function DS(r){return r&&r.__esModule?r:{default:r}}function qS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,eo(r,e)}function eo(r,e){return eo=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},eo(r,e)}var $S=function(r){qS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=IS.UNIVERSAL,i.value="*",i}return e}(PS.default);gi.default=$S;nd.exports=gi.default});var io=x((yi,sd)=>{u();"use strict";yi.__esModule=!0;yi.default=void 0;var LS=NS(dt()),MS=Se();function NS(r){return r&&r.__esModule?r:{default:r}}function BS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,ro(r,e)}function ro(r,e){return ro=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},ro(r,e)}var FS=function(r){BS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=MS.COMBINATOR,i}return e}(LS.default);yi.default=FS;sd.exports=yi.default});var so=x((bi,ad)=>{u();"use strict";bi.__esModule=!0;bi.default=void 0;var jS=US(dt()),zS=Se();function US(r){return r&&r.__esModule?r:{default:r}}function VS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,no(r,e)}function no(r,e){return no=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},no(r,e)}var HS=function(r){VS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=zS.NESTING,i.value="&",i}return e}(jS.default);bi.default=HS;ad.exports=bi.default});var ld=x((jn,od)=>{u();"use strict";jn.__esModule=!0;jn.default=WS;function WS(r){return r.sort(function(e,t){return e-t})}od.exports=jn.default});var ao=x(M=>{u();"use strict";M.__esModule=!0;M.word=M.tilde=M.tab=M.str=M.space=M.slash=M.singleQuote=M.semicolon=M.plus=M.pipe=M.openSquare=M.openParenthesis=M.newline=M.greaterThan=M.feed=M.equals=M.doubleQuote=M.dollar=M.cr=M.comment=M.comma=M.combinator=M.colon=M.closeSquare=M.closeParenthesis=M.caret=M.bang=M.backslash=M.at=M.asterisk=M.ampersand=void 0;var GS=38;M.ampersand=GS;var QS=42;M.asterisk=QS;var YS=64;M.at=YS;var KS=44;M.comma=KS;var XS=58;M.colon=XS;var JS=59;M.semicolon=JS;var ZS=40;M.openParenthesis=ZS;var eA=41;M.closeParenthesis=eA;var tA=91;M.openSquare=tA;var rA=93;M.closeSquare=rA;var iA=36;M.dollar=iA;var nA=126;M.tilde=nA;var sA=94;M.caret=sA;var aA=43;M.plus=aA;var oA=61;M.equals=oA;var lA=124;M.pipe=lA;var uA=62;M.greaterThan=uA;var fA=32;M.space=fA;var ud=39;M.singleQuote=ud;var cA=34;M.doubleQuote=cA;var pA=47;M.slash=pA;var dA=33;M.bang=dA;var hA=92;M.backslash=hA;var mA=13;M.cr=mA;var gA=12;M.feed=gA;var yA=10;M.newline=yA;var bA=9;M.tab=bA;var wA=ud;M.str=wA;var vA=-1;M.comment=vA;var xA=-2;M.word=xA;var kA=-3;M.combinator=kA});var pd=x(wi=>{u();"use strict";wi.__esModule=!0;wi.FIELDS=void 0;wi.default=TA;var D=SA(ao()),nr,te;function fd(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(fd=function(n){return n?t:e})(r)}function SA(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=fd(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}var AA=(nr={},nr[D.tab]=!0,nr[D.newline]=!0,nr[D.cr]=!0,nr[D.feed]=!0,nr),CA=(te={},te[D.space]=!0,te[D.tab]=!0,te[D.newline]=!0,te[D.cr]=!0,te[D.feed]=!0,te[D.ampersand]=!0,te[D.asterisk]=!0,te[D.bang]=!0,te[D.comma]=!0,te[D.colon]=!0,te[D.semicolon]=!0,te[D.openParenthesis]=!0,te[D.closeParenthesis]=!0,te[D.openSquare]=!0,te[D.closeSquare]=!0,te[D.singleQuote]=!0,te[D.doubleQuote]=!0,te[D.plus]=!0,te[D.pipe]=!0,te[D.tilde]=!0,te[D.greaterThan]=!0,te[D.equals]=!0,te[D.dollar]=!0,te[D.caret]=!0,te[D.slash]=!0,te),oo={},cd="0123456789abcdefABCDEF";for(zn=0;zn<cd.length;zn++)oo[cd.charCodeAt(zn)]=!0;var zn;function _A(r,e){var t=e,i;do{if(i=r.charCodeAt(t),CA[i])return t-1;i===D.backslash?t=EA(r,t)+1:t++}while(t<r.length);return t-1}function EA(r,e){var t=e,i=r.charCodeAt(t+1);if(!AA[i])if(oo[i]){var n=0;do t++,n++,i=r.charCodeAt(t+1);while(oo[i]&&n<6);n<6&&i===D.space&&t++}else t++;return t}var OA={TYPE:0,START_LINE:1,START_COL:2,END_LINE:3,END_COL:4,START_POS:5,END_POS:6};wi.FIELDS=OA;function TA(r){var e=[],t=r.css.valueOf(),i=t,n=i.length,a=-1,s=1,o=0,l=0,c,f,d,p,h,b,v,y,w,k,S,E,O;function B(N,T){if(r.safe)t+=T,w=t.length-1;else throw r.error("Unclosed "+N,s,o-a,o)}for(;o<n;){switch(c=t.charCodeAt(o),c===D.newline&&(a=o,s+=1),c){case D.space:case D.tab:case D.newline:case D.cr:case D.feed:w=o;do w+=1,c=t.charCodeAt(w),c===D.newline&&(a=w,s+=1);while(c===D.space||c===D.newline||c===D.tab||c===D.cr||c===D.feed);O=D.space,p=s,d=w-a-1,l=w;break;case D.plus:case D.greaterThan:case D.tilde:case D.pipe:w=o;do w+=1,c=t.charCodeAt(w);while(c===D.plus||c===D.greaterThan||c===D.tilde||c===D.pipe);O=D.combinator,p=s,d=o-a,l=w;break;case D.asterisk:case D.ampersand:case D.bang:case D.comma:case D.equals:case D.dollar:case D.caret:case D.openSquare:case D.closeSquare:case D.colon:case D.semicolon:case D.openParenthesis:case D.closeParenthesis:w=o,O=c,p=s,d=o-a,l=w+1;break;case D.singleQuote:case D.doubleQuote:E=c===D.singleQuote?"'":'"',w=o;do for(h=!1,w=t.indexOf(E,w+1),w===-1&&B("quote",E),b=w;t.charCodeAt(b-1)===D.backslash;)b-=1,h=!h;while(h);O=D.str,p=s,d=o-a,l=w+1;break;default:c===D.slash&&t.charCodeAt(o+1)===D.asterisk?(w=t.indexOf("*/",o+2)+1,w===0&&B("comment","*/"),f=t.slice(o,w+1),y=f.split(`
-`),v=y.length-1,v>0?(k=s+v,S=w-y[v].length):(k=s,S=a),O=D.comment,s=k,p=k,d=w-S):c===D.slash?(w=o,O=c,p=s,d=o-a,l=w+1):(w=_A(t,o),O=D.word,p=s,d=w-a),l=w+1;break}e.push([O,s,o-a,p,d,o,l]),S&&(a=S,S=null),o=l}return e}});var vd=x((vi,wd)=>{u();"use strict";vi.__esModule=!0;vi.default=void 0;var RA=je(Pa()),lo=je(Da()),PA=je(La()),dd=je(Na()),IA=je(Fa()),DA=je(Ua()),uo=je(Ha()),qA=je(Ga()),hd=Un(Za()),$A=je(to()),fo=je(io()),LA=je(so()),MA=je(ld()),P=Un(pd()),$=Un(ao()),NA=Un(Se()),le=ti(),Vt,co;function md(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(md=function(n){return n?t:e})(r)}function Un(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=md(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function je(r){return r&&r.__esModule?r:{default:r}}function gd(r,e){for(var t=0;t<e.length;t++){var i=e[t];i.enumerable=i.enumerable||!1,i.configurable=!0,"value"in i&&(i.writable=!0),Object.defineProperty(r,i.key,i)}}function BA(r,e,t){return e&&gd(r.prototype,e),t&&gd(r,t),Object.defineProperty(r,"prototype",{writable:!1}),r}var po=(Vt={},Vt[$.space]=!0,Vt[$.cr]=!0,Vt[$.feed]=!0,Vt[$.newline]=!0,Vt[$.tab]=!0,Vt),FA=Object.assign({},po,(co={},co[$.comment]=!0,co));function yd(r){return{line:r[P.FIELDS.START_LINE],column:r[P.FIELDS.START_COL]}}function bd(r){return{line:r[P.FIELDS.END_LINE],column:r[P.FIELDS.END_COL]}}function Ht(r,e,t,i){return{start:{line:r,column:e},end:{line:t,column:i}}}function sr(r){return Ht(r[P.FIELDS.START_LINE],r[P.FIELDS.START_COL],r[P.FIELDS.END_LINE],r[P.FIELDS.END_COL])}function ho(r,e){if(!!r)return Ht(r[P.FIELDS.START_LINE],r[P.FIELDS.START_COL],e[P.FIELDS.END_LINE],e[P.FIELDS.END_COL])}function ar(r,e){var t=r[e];if(typeof t=="string")return t.indexOf("\\")!==-1&&((0,le.ensureObject)(r,"raws"),r[e]=(0,le.unesc)(t),r.raws[e]===void 0&&(r.raws[e]=t)),r}function mo(r,e){for(var t=-1,i=[];(t=r.indexOf(e,t+1))!==-1;)i.push(t);return i}function jA(){var r=Array.prototype.concat.apply([],arguments);return r.filter(function(e,t){return t===r.indexOf(e)})}var zA=function(){function r(t,i){i===void 0&&(i={}),this.rule=t,this.options=Object.assign({lossy:!1,safe:!1},i),this.position=0,this.css=typeof this.rule=="string"?this.rule:this.rule.selector,this.tokens=(0,P.default)({css:this.css,error:this._errorGenerator(),safe:this.options.safe});var n=ho(this.tokens[0],this.tokens[this.tokens.length-1]);this.root=new RA.default({source:n}),this.root.errorGenerator=this._errorGenerator();var a=new lo.default({source:{start:{line:1,column:1}}});this.root.append(a),this.current=a,this.loop()}var e=r.prototype;return e._errorGenerator=function(){var i=this;return function(n,a){return typeof i.rule=="string"?new Error(n):i.rule.error(n,a)}},e.attribute=function(){var i=[],n=this.currToken;for(this.position++;this.position<this.tokens.length&&this.currToken[P.FIELDS.TYPE]!==$.closeSquare;)i.push(this.currToken),this.position++;if(this.currToken[P.FIELDS.TYPE]!==$.closeSquare)return this.expected("closing square bracket",this.currToken[P.FIELDS.START_POS]);var a=i.length,s={source:Ht(n[1],n[2],this.currToken[3],this.currToken[4]),sourceIndex:n[P.FIELDS.START_POS]};if(a===1&&!~[$.word].indexOf(i[0][P.FIELDS.TYPE]))return this.expected("attribute",i[0][P.FIELDS.START_POS]);for(var o=0,l="",c="",f=null,d=!1;o<a;){var p=i[o],h=this.content(p),b=i[o+1];switch(p[P.FIELDS.TYPE]){case $.space:if(d=!0,this.options.lossy)break;if(f){(0,le.ensureObject)(s,"spaces",f);var v=s.spaces[f].after||"";s.spaces[f].after=v+h;var y=(0,le.getProp)(s,"raws","spaces",f,"after")||null;y&&(s.raws.spaces[f].after=y+h)}else l=l+h,c=c+h;break;case $.asterisk:if(b[P.FIELDS.TYPE]===$.equals)s.operator=h,f="operator";else if((!s.namespace||f==="namespace"&&!d)&&b){l&&((0,le.ensureObject)(s,"spaces","attribute"),s.spaces.attribute.before=l,l=""),c&&((0,le.ensureObject)(s,"raws","spaces","attribute"),s.raws.spaces.attribute.before=l,c=""),s.namespace=(s.namespace||"")+h;var w=(0,le.getProp)(s,"raws","namespace")||null;w&&(s.raws.namespace+=h),f="namespace"}d=!1;break;case $.dollar:if(f==="value"){var k=(0,le.getProp)(s,"raws","value");s.value+="$",k&&(s.raws.value=k+"$");break}case $.caret:b[P.FIELDS.TYPE]===$.equals&&(s.operator=h,f="operator"),d=!1;break;case $.combinator:if(h==="~"&&b[P.FIELDS.TYPE]===$.equals&&(s.operator=h,f="operator"),h!=="|"){d=!1;break}b[P.FIELDS.TYPE]===$.equals?(s.operator=h,f="operator"):!s.namespace&&!s.attribute&&(s.namespace=!0),d=!1;break;case $.word:if(b&&this.content(b)==="|"&&i[o+2]&&i[o+2][P.FIELDS.TYPE]!==$.equals&&!s.operator&&!s.namespace)s.namespace=h,f="namespace";else if(!s.attribute||f==="attribute"&&!d){l&&((0,le.ensureObject)(s,"spaces","attribute"),s.spaces.attribute.before=l,l=""),c&&((0,le.ensureObject)(s,"raws","spaces","attribute"),s.raws.spaces.attribute.before=c,c=""),s.attribute=(s.attribute||"")+h;var S=(0,le.getProp)(s,"raws","attribute")||null;S&&(s.raws.attribute+=h),f="attribute"}else if(!s.value&&s.value!==""||f==="value"&&!(d||s.quoteMark)){var E=(0,le.unesc)(h),O=(0,le.getProp)(s,"raws","value")||"",B=s.value||"";s.value=B+E,s.quoteMark=null,(E!==h||O)&&((0,le.ensureObject)(s,"raws"),s.raws.value=(O||B)+h),f="value"}else{var N=h==="i"||h==="I";(s.value||s.value==="")&&(s.quoteMark||d)?(s.insensitive=N,(!N||h==="I")&&((0,le.ensureObject)(s,"raws"),s.raws.insensitiveFlag=h),f="insensitive",l&&((0,le.ensureObject)(s,"spaces","insensitive"),s.spaces.insensitive.before=l,l=""),c&&((0,le.ensureObject)(s,"raws","spaces","insensitive"),s.raws.spaces.insensitive.before=c,c="")):(s.value||s.value==="")&&(f="value",s.value+=h,s.raws.value&&(s.raws.value+=h))}d=!1;break;case $.str:if(!s.attribute||!s.operator)return this.error("Expected an attribute followed by an operator preceding the string.",{index:p[P.FIELDS.START_POS]});var T=(0,hd.unescapeValue)(h),F=T.unescaped,Y=T.quoteMark;s.value=F,s.quoteMark=Y,f="value",(0,le.ensureObject)(s,"raws"),s.raws.value=h,d=!1;break;case $.equals:if(!s.attribute)return this.expected("attribute",p[P.FIELDS.START_POS],h);if(s.value)return this.error('Unexpected "=" found; an operator was already defined.',{index:p[P.FIELDS.START_POS]});s.operator=s.operator?s.operator+h:h,f="operator",d=!1;break;case $.comment:if(f)if(d||b&&b[P.FIELDS.TYPE]===$.space||f==="insensitive"){var _=(0,le.getProp)(s,"spaces",f,"after")||"",Q=(0,le.getProp)(s,"raws","spaces",f,"after")||_;(0,le.ensureObject)(s,"raws","spaces",f),s.raws.spaces[f].after=Q+h}else{var U=s[f]||"",oe=(0,le.getProp)(s,"raws",f)||U;(0,le.ensureObject)(s,"raws"),s.raws[f]=oe+h}else c=c+h;break;default:return this.error('Unexpected "'+h+'" found.',{index:p[P.FIELDS.START_POS]})}o++}ar(s,"attribute"),ar(s,"namespace"),this.newNode(new hd.default(s)),this.position++},e.parseWhitespaceEquivalentTokens=function(i){i<0&&(i=this.tokens.length);var n=this.position,a=[],s="",o=void 0;do if(po[this.currToken[P.FIELDS.TYPE]])this.options.lossy||(s+=this.content());else if(this.currToken[P.FIELDS.TYPE]===$.comment){var l={};s&&(l.before=s,s=""),o=new dd.default({value:this.content(),source:sr(this.currToken),sourceIndex:this.currToken[P.FIELDS.START_POS],spaces:l}),a.push(o)}while(++this.position<i);if(s){if(o)o.spaces.after=s;else if(!this.options.lossy){var c=this.tokens[n],f=this.tokens[this.position-1];a.push(new uo.default({value:"",source:Ht(c[P.FIELDS.START_LINE],c[P.FIELDS.START_COL],f[P.FIELDS.END_LINE],f[P.FIELDS.END_COL]),sourceIndex:c[P.FIELDS.START_POS],spaces:{before:s,after:""}}))}}return a},e.convertWhitespaceNodesToSpace=function(i,n){var a=this;n===void 0&&(n=!1);var s="",o="";i.forEach(function(c){var f=a.lossySpace(c.spaces.before,n),d=a.lossySpace(c.rawSpaceBefore,n);s+=f+a.lossySpace(c.spaces.after,n&&f.length===0),o+=f+c.value+a.lossySpace(c.rawSpaceAfter,n&&d.length===0)}),o===s&&(o=void 0);var l={space:s,rawSpace:o};return l},e.isNamedCombinator=function(i){return i===void 0&&(i=this.position),this.tokens[i+0]&&this.tokens[i+0][P.FIELDS.TYPE]===$.slash&&this.tokens[i+1]&&this.tokens[i+1][P.FIELDS.TYPE]===$.word&&this.tokens[i+2]&&this.tokens[i+2][P.FIELDS.TYPE]===$.slash},e.namedCombinator=function(){if(this.isNamedCombinator()){var i=this.content(this.tokens[this.position+1]),n=(0,le.unesc)(i).toLowerCase(),a={};n!==i&&(a.value="/"+i+"/");var s=new fo.default({value:"/"+n+"/",source:Ht(this.currToken[P.FIELDS.START_LINE],this.currToken[P.FIELDS.START_COL],this.tokens[this.position+2][P.FIELDS.END_LINE],this.tokens[this.position+2][P.FIELDS.END_COL]),sourceIndex:this.currToken[P.FIELDS.START_POS],raws:a});return this.position=this.position+3,s}else this.unexpected()},e.combinator=function(){var i=this;if(this.content()==="|")return this.namespace();var n=this.locateNextMeaningfulToken(this.position);if(n<0||this.tokens[n][P.FIELDS.TYPE]===$.comma){var a=this.parseWhitespaceEquivalentTokens(n);if(a.length>0){var s=this.current.last;if(s){var o=this.convertWhitespaceNodesToSpace(a),l=o.space,c=o.rawSpace;c!==void 0&&(s.rawSpaceAfter+=c),s.spaces.after+=l}else a.forEach(function(O){return i.newNode(O)})}return}var f=this.currToken,d=void 0;n>this.position&&(d=this.parseWhitespaceEquivalentTokens(n));var p;if(this.isNamedCombinator()?p=this.namedCombinator():this.currToken[P.FIELDS.TYPE]===$.combinator?(p=new fo.default({value:this.content(),source:sr(this.currToken),sourceIndex:this.currToken[P.FIELDS.START_POS]}),this.position++):po[this.currToken[P.FIELDS.TYPE]]||d||this.unexpected(),p){if(d){var h=this.convertWhitespaceNodesToSpace(d),b=h.space,v=h.rawSpace;p.spaces.before=b,p.rawSpaceBefore=v}}else{var y=this.convertWhitespaceNodesToSpace(d,!0),w=y.space,k=y.rawSpace;k||(k=w);var S={},E={spaces:{}};w.endsWith(" ")&&k.endsWith(" ")?(S.before=w.slice(0,w.length-1),E.spaces.before=k.slice(0,k.length-1)):w.startsWith(" ")&&k.startsWith(" ")?(S.after=w.slice(1),E.spaces.after=k.slice(1)):E.value=k,p=new fo.default({value:" ",source:ho(f,this.tokens[this.position-1]),sourceIndex:f[P.FIELDS.START_POS],spaces:S,raws:E})}return this.currToken&&this.currToken[P.FIELDS.TYPE]===$.space&&(p.spaces.after=this.optionalSpace(this.content()),this.position++),this.newNode(p)},e.comma=function(){if(this.position===this.tokens.length-1){this.root.trailingComma=!0,this.position++;return}this.current._inferEndPosition();var i=new lo.default({source:{start:yd(this.tokens[this.position+1])}});this.current.parent.append(i),this.current=i,this.position++},e.comment=function(){var i=this.currToken;this.newNode(new dd.default({value:this.content(),source:sr(i),sourceIndex:i[P.FIELDS.START_POS]})),this.position++},e.error=function(i,n){throw this.root.error(i,n)},e.missingBackslash=function(){return this.error("Expected a backslash preceding the semicolon.",{index:this.currToken[P.FIELDS.START_POS]})},e.missingParenthesis=function(){return this.expected("opening parenthesis",this.currToken[P.FIELDS.START_POS])},e.missingSquareBracket=function(){return this.expected("opening square bracket",this.currToken[P.FIELDS.START_POS])},e.unexpected=function(){return this.error("Unexpected '"+this.content()+"'. Escaping special characters with \\ may help.",this.currToken[P.FIELDS.START_POS])},e.unexpectedPipe=function(){return this.error("Unexpected '|'.",this.currToken[P.FIELDS.START_POS])},e.namespace=function(){var i=this.prevToken&&this.content(this.prevToken)||!0;if(this.nextToken[P.FIELDS.TYPE]===$.word)return this.position++,this.word(i);if(this.nextToken[P.FIELDS.TYPE]===$.asterisk)return this.position++,this.universal(i);this.unexpectedPipe()},e.nesting=function(){if(this.nextToken){var i=this.content(this.nextToken);if(i==="|"){this.position++;return}}var n=this.currToken;this.newNode(new LA.default({value:this.content(),source:sr(n),sourceIndex:n[P.FIELDS.START_POS]})),this.position++},e.parentheses=function(){var i=this.current.last,n=1;if(this.position++,i&&i.type===NA.PSEUDO){var a=new lo.default({source:{start:yd(this.tokens[this.position-1])}}),s=this.current;for(i.append(a),this.current=a;this.position<this.tokens.length&&n;)this.currToken[P.FIELDS.TYPE]===$.openParenthesis&&n++,this.currToken[P.FIELDS.TYPE]===$.closeParenthesis&&n--,n?this.parse():(this.current.source.end=bd(this.currToken),this.current.parent.source.end=bd(this.currToken),this.position++);this.current=s}else{for(var o=this.currToken,l="(",c;this.position<this.tokens.length&&n;)this.currToken[P.FIELDS.TYPE]===$.openParenthesis&&n++,this.currToken[P.FIELDS.TYPE]===$.closeParenthesis&&n--,c=this.currToken,l+=this.parseParenthesisToken(this.currToken),this.position++;i?i.appendToPropertyAndEscape("value",l,l):this.newNode(new uo.default({value:l,source:Ht(o[P.FIELDS.START_LINE],o[P.FIELDS.START_COL],c[P.FIELDS.END_LINE],c[P.FIELDS.END_COL]),sourceIndex:o[P.FIELDS.START_POS]}))}if(n)return this.expected("closing parenthesis",this.currToken[P.FIELDS.START_POS])},e.pseudo=function(){for(var i=this,n="",a=this.currToken;this.currToken&&this.currToken[P.FIELDS.TYPE]===$.colon;)n+=this.content(),this.position++;if(!this.currToken)return this.expected(["pseudo-class","pseudo-element"],this.position-1);if(this.currToken[P.FIELDS.TYPE]===$.word)this.splitWord(!1,function(s,o){n+=s,i.newNode(new qA.default({value:n,source:ho(a,i.currToken),sourceIndex:a[P.FIELDS.START_POS]})),o>1&&i.nextToken&&i.nextToken[P.FIELDS.TYPE]===$.openParenthesis&&i.error("Misplaced parenthesis.",{index:i.nextToken[P.FIELDS.START_POS]})});else return this.expected(["pseudo-class","pseudo-element"],this.currToken[P.FIELDS.START_POS])},e.space=function(){var i=this.content();this.position===0||this.prevToken[P.FIELDS.TYPE]===$.comma||this.prevToken[P.FIELDS.TYPE]===$.openParenthesis||this.current.nodes.every(function(n){return n.type==="comment"})?(this.spaces=this.optionalSpace(i),this.position++):this.position===this.tokens.length-1||this.nextToken[P.FIELDS.TYPE]===$.comma||this.nextToken[P.FIELDS.TYPE]===$.closeParenthesis?(this.current.last.spaces.after=this.optionalSpace(i),this.position++):this.combinator()},e.string=function(){var i=this.currToken;this.newNode(new uo.default({value:this.content(),source:sr(i),sourceIndex:i[P.FIELDS.START_POS]})),this.position++},e.universal=function(i){var n=this.nextToken;if(n&&this.content(n)==="|")return this.position++,this.namespace();var a=this.currToken;this.newNode(new $A.default({value:this.content(),source:sr(a),sourceIndex:a[P.FIELDS.START_POS]}),i),this.position++},e.splitWord=function(i,n){for(var a=this,s=this.nextToken,o=this.content();s&&~[$.dollar,$.caret,$.equals,$.word].indexOf(s[P.FIELDS.TYPE]);){this.position++;var l=this.content();if(o+=l,l.lastIndexOf("\\")===l.length-1){var c=this.nextToken;c&&c[P.FIELDS.TYPE]===$.space&&(o+=this.requiredSpace(this.content(c)),this.position++)}s=this.nextToken}var f=mo(o,".").filter(function(b){var v=o[b-1]==="\\",y=/^\d+\.\d+%$/.test(o);return!v&&!y}),d=mo(o,"#").filter(function(b){return o[b-1]!=="\\"}),p=mo(o,"#{");p.length&&(d=d.filter(function(b){return!~p.indexOf(b)}));var h=(0,MA.default)(jA([0].concat(f,d)));h.forEach(function(b,v){var y=h[v+1]||o.length,w=o.slice(b,y);if(v===0&&n)return n.call(a,w,h.length);var k,S=a.currToken,E=S[P.FIELDS.START_POS]+h[v],O=Ht(S[1],S[2]+b,S[3],S[2]+(y-1));if(~f.indexOf(b)){var B={value:w.slice(1),source:O,sourceIndex:E};k=new PA.default(ar(B,"value"))}else if(~d.indexOf(b)){var N={value:w.slice(1),source:O,sourceIndex:E};k=new IA.default(ar(N,"value"))}else{var T={value:w,source:O,sourceIndex:E};ar(T,"value"),k=new DA.default(T)}a.newNode(k,i),i=null}),this.position++},e.word=function(i){var n=this.nextToken;return n&&this.content(n)==="|"?(this.position++,this.namespace()):this.splitWord(i)},e.loop=function(){for(;this.position<this.tokens.length;)this.parse(!0);return this.current._inferEndPosition(),this.root},e.parse=function(i){switch(this.currToken[P.FIELDS.TYPE]){case $.space:this.space();break;case $.comment:this.comment();break;case $.openParenthesis:this.parentheses();break;case $.closeParenthesis:i&&this.missingParenthesis();break;case $.openSquare:this.attribute();break;case $.dollar:case $.caret:case $.equals:case $.word:this.word();break;case $.colon:this.pseudo();break;case $.comma:this.comma();break;case $.asterisk:this.universal();break;case $.ampersand:this.nesting();break;case $.slash:case $.combinator:this.combinator();break;case $.str:this.string();break;case $.closeSquare:this.missingSquareBracket();case $.semicolon:this.missingBackslash();default:this.unexpected()}},e.expected=function(i,n,a){if(Array.isArray(i)){var s=i.pop();i=i.join(", ")+" or "+s}var o=/^[aeiou]/.test(i[0])?"an":"a";return a?this.error("Expected "+o+" "+i+', found "'+a+'" instead.',{index:n}):this.error("Expected "+o+" "+i+".",{index:n})},e.requiredSpace=function(i){return this.options.lossy?" ":i},e.optionalSpace=function(i){return this.options.lossy?"":i},e.lossySpace=function(i,n){return this.options.lossy?n?" ":"":i},e.parseParenthesisToken=function(i){var n=this.content(i);return i[P.FIELDS.TYPE]===$.space?this.requiredSpace(n):n},e.newNode=function(i,n){return n&&(/^ +$/.test(n)&&(this.options.lossy||(this.spaces=(this.spaces||"")+n),n=!0),i.namespace=n,ar(i,"namespace")),this.spaces&&(i.spaces.before=this.spaces,this.spaces=""),this.current.append(i)},e.content=function(i){return i===void 0&&(i=this.currToken),this.css.slice(i[P.FIELDS.START_POS],i[P.FIELDS.END_POS])},e.locateNextMeaningfulToken=function(i){i===void 0&&(i=this.position+1);for(var n=i;n<this.tokens.length;)if(FA[this.tokens[n][P.FIELDS.TYPE]]){n++;continue}else return n;return-1},BA(r,[{key:"currToken",get:function(){return this.tokens[this.position]}},{key:"nextToken",get:function(){return this.tokens[this.position+1]}},{key:"prevToken",get:function(){return this.tokens[this.position-1]}}]),r}();vi.default=zA;wd.exports=vi.default});var kd=x((xi,xd)=>{u();"use strict";xi.__esModule=!0;xi.default=void 0;var UA=VA(vd());function VA(r){return r&&r.__esModule?r:{default:r}}var HA=function(){function r(t,i){this.func=t||function(){},this.funcRes=null,this.options=i}var e=r.prototype;return e._shouldUpdateSelector=function(i,n){n===void 0&&(n={});var a=Object.assign({},this.options,n);return a.updateSelector===!1?!1:typeof i!="string"},e._isLossy=function(i){i===void 0&&(i={});var n=Object.assign({},this.options,i);return n.lossless===!1},e._root=function(i,n){n===void 0&&(n={});var a=new UA.default(i,this._parseOptions(n));return a.root},e._parseOptions=function(i){return{lossy:this._isLossy(i)}},e._run=function(i,n){var a=this;return n===void 0&&(n={}),new Promise(function(s,o){try{var l=a._root(i,n);Promise.resolve(a.func(l)).then(function(c){var f=void 0;return a._shouldUpdateSelector(i,n)&&(f=l.toString(),i.selector=f),{transform:c,root:l,string:f}}).then(s,o)}catch(c){o(c);return}})},e._runSync=function(i,n){n===void 0&&(n={});var a=this._root(i,n),s=this.func(a);if(s&&typeof s.then=="function")throw new Error("Selector processor returned a promise to a synchronous call.");var o=void 0;return n.updateSelector&&typeof i!="string"&&(o=a.toString(),i.selector=o),{transform:s,root:a,string:o}},e.ast=function(i,n){return this._run(i,n).then(function(a){return a.root})},e.astSync=function(i,n){return this._runSync(i,n).root},e.transform=function(i,n){return this._run(i,n).then(function(a){return a.transform})},e.transformSync=function(i,n){return this._runSync(i,n).transform},e.process=function(i,n){return this._run(i,n).then(function(a){return a.string||a.root.toString()})},e.processSync=function(i,n){var a=this._runSync(i,n);return a.string||a.root.toString()},r}();xi.default=HA;xd.exports=xi.default});var Sd=x(ne=>{u();"use strict";ne.__esModule=!0;ne.universal=ne.tag=ne.string=ne.selector=ne.root=ne.pseudo=ne.nesting=ne.id=ne.comment=ne.combinator=ne.className=ne.attribute=void 0;var WA=ze(Za()),GA=ze(La()),QA=ze(io()),YA=ze(Na()),KA=ze(Fa()),XA=ze(so()),JA=ze(Ga()),ZA=ze(Pa()),eC=ze(Da()),tC=ze(Ha()),rC=ze(Ua()),iC=ze(to());function ze(r){return r&&r.__esModule?r:{default:r}}var nC=function(e){return new WA.default(e)};ne.attribute=nC;var sC=function(e){return new GA.default(e)};ne.className=sC;var aC=function(e){return new QA.default(e)};ne.combinator=aC;var oC=function(e){return new YA.default(e)};ne.comment=oC;var lC=function(e){return new KA.default(e)};ne.id=lC;var uC=function(e){return new XA.default(e)};ne.nesting=uC;var fC=function(e){return new JA.default(e)};ne.pseudo=fC;var cC=function(e){return new ZA.default(e)};ne.root=cC;var pC=function(e){return new eC.default(e)};ne.selector=pC;var dC=function(e){return new tC.default(e)};ne.string=dC;var hC=function(e){return new rC.default(e)};ne.tag=hC;var mC=function(e){return new iC.default(e)};ne.universal=mC});var Ed=x(J=>{u();"use strict";J.__esModule=!0;J.isComment=J.isCombinator=J.isClassName=J.isAttribute=void 0;J.isContainer=EC;J.isIdentifier=void 0;J.isNamespace=OC;J.isNesting=void 0;J.isNode=go;J.isPseudo=void 0;J.isPseudoClass=_C;J.isPseudoElement=_d;J.isUniversal=J.isTag=J.isString=J.isSelector=J.isRoot=void 0;var ue=Se(),Oe,gC=(Oe={},Oe[ue.ATTRIBUTE]=!0,Oe[ue.CLASS]=!0,Oe[ue.COMBINATOR]=!0,Oe[ue.COMMENT]=!0,Oe[ue.ID]=!0,Oe[ue.NESTING]=!0,Oe[ue.PSEUDO]=!0,Oe[ue.ROOT]=!0,Oe[ue.SELECTOR]=!0,Oe[ue.STRING]=!0,Oe[ue.TAG]=!0,Oe[ue.UNIVERSAL]=!0,Oe);function go(r){return typeof r=="object"&&gC[r.type]}function Ue(r,e){return go(e)&&e.type===r}var Ad=Ue.bind(null,ue.ATTRIBUTE);J.isAttribute=Ad;var yC=Ue.bind(null,ue.CLASS);J.isClassName=yC;var bC=Ue.bind(null,ue.COMBINATOR);J.isCombinator=bC;var wC=Ue.bind(null,ue.COMMENT);J.isComment=wC;var vC=Ue.bind(null,ue.ID);J.isIdentifier=vC;var xC=Ue.bind(null,ue.NESTING);J.isNesting=xC;var yo=Ue.bind(null,ue.PSEUDO);J.isPseudo=yo;var kC=Ue.bind(null,ue.ROOT);J.isRoot=kC;var SC=Ue.bind(null,ue.SELECTOR);J.isSelector=SC;var AC=Ue.bind(null,ue.STRING);J.isString=AC;var Cd=Ue.bind(null,ue.TAG);J.isTag=Cd;var CC=Ue.bind(null,ue.UNIVERSAL);J.isUniversal=CC;function _d(r){return yo(r)&&r.value&&(r.value.startsWith("::")||r.value.toLowerCase()===":before"||r.value.toLowerCase()===":after"||r.value.toLowerCase()===":first-letter"||r.value.toLowerCase()===":first-line")}function _C(r){return yo(r)&&!_d(r)}function EC(r){return!!(go(r)&&r.walk)}function OC(r){return Ad(r)||Cd(r)}});var Od=x(Ke=>{u();"use strict";Ke.__esModule=!0;var bo=Se();Object.keys(bo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===bo[r]||(Ke[r]=bo[r])});var wo=Sd();Object.keys(wo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===wo[r]||(Ke[r]=wo[r])});var vo=Ed();Object.keys(vo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===vo[r]||(Ke[r]=vo[r])})});var it=x((ki,Rd)=>{u();"use strict";ki.__esModule=!0;ki.default=void 0;var TC=IC(kd()),RC=PC(Od());function Td(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(Td=function(n){return n?t:e})(r)}function PC(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=Td(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function IC(r){return r&&r.__esModule?r:{default:r}}var xo=function(e){return new TC.default(e)};Object.assign(xo,RC);delete xo.__esModule;var DC=xo;ki.default=DC;Rd.exports=ki.default});function mt(r){return["fontSize","outline"].includes(r)?e=>(typeof e=="function"&&(e=e({})),Array.isArray(e)&&(e=e[0]),e):r==="fontFamily"?e=>{typeof e=="function"&&(e=e({}));let t=Array.isArray(e)&&ke(e[1])?e[0]:e;return Array.isArray(t)?t.join(", "):t}:["boxShadow","transitionProperty","transitionDuration","transitionDelay","transitionTimingFunction","backgroundImage","backgroundSize","backgroundColor","cursor","animation"].includes(r)?e=>(typeof e=="function"&&(e=e({})),Array.isArray(e)&&(e=e.join(", ")),e):["gridTemplateColumns","gridTemplateRows","objectPosition"].includes(r)?e=>(typeof e=="function"&&(e=e({})),typeof e=="string"&&(e=ee.list.comma(e).join(" ")),e):(e,t={})=>(typeof e=="function"&&(e=e(t)),e)}var Si=R(()=>{u();Ot();Kt()});var Md=x(($I,_o)=>{u();var{Rule:Pd,AtRule:qC}=$e(),Id=it();function ko(r,e){let t;try{Id(i=>{t=i}).processSync(r)}catch(i){throw r.includes(":")?e?e.error("Missed semicolon"):i:e?e.error(i.message):i}return t.at(0)}function Dd(r,e){let t=!1;return r.each(i=>{if(i.type==="nesting"){let n=e.clone({});i.value!=="&"?i.replaceWith(ko(i.value.replace("&",n.toString()))):i.replaceWith(n),t=!0}else"nodes"in i&&i.nodes&&Dd(i,e)&&(t=!0)}),t}function qd(r,e){let t=[];return r.selectors.forEach(i=>{let n=ko(i,r);e.selectors.forEach(a=>{if(!a)return;let s=ko(a,e);Dd(s,n)||(s.prepend(Id.combinator({value:" "})),s.prepend(n.clone({}))),t.push(s.toString())})}),t}function Vn(r,e){let t=r.prev();for(e.after(r);t&&t.type==="comment";){let i=t.prev();e.after(t),t=i}return r}function $C(r){return function e(t,i,n,a=n){let s=[];if(i.each(o=>{o.type==="rule"&&n?a&&(o.selectors=qd(t,o)):o.type==="atrule"&&o.nodes?r[o.name]?e(t,o,a):i[Ao]!==!1&&s.push(o):s.push(o)}),n&&s.length){let o=t.clone({nodes:[]});for(let l of s)o.append(l);i.prepend(o)}}}function So(r,e,t){let i=new Pd({selector:r,nodes:[]});return i.append(e),t.after(i),i}function $d(r,e){let t={};for(let i of r)t[i]=!0;if(e)for(let i of e)t[i.replace(/^@/,"")]=!0;return t}function LC(r){r=r.trim();let e=r.match(/^\((.*)\)$/);if(!e)return{type:"basic",selector:r};let t=e[1].match(/^(with(?:out)?):(.+)$/);if(t){let i=t[1]==="with",n=Object.fromEntries(t[2].trim().split(/\s+/).map(s=>[s,!0]));if(i&&n.all)return{type:"noop"};let a=s=>!!n[s];return n.all?a=()=>!0:i&&(a=s=>s==="all"?!1:!n[s]),{type:"withrules",escapes:a}}return{type:"unknown"}}function MC(r){let e=[],t=r.parent;for(;t&&t instanceof qC;)e.push(t),t=t.parent;return e}function NC(r){let e=r[Ld];if(!e)r.after(r.nodes);else{let t=r.nodes,i,n=-1,a,s,o,l=MC(r);if(l.forEach((c,f)=>{if(e(c.name))i=c,n=f,s=o;else{let d=o;o=c.clone({nodes:[]}),d&&o.append(d),a=a||o}}),i?s?(a.append(t),i.after(s)):i.after(t):r.after(t),r.next()&&i){let c;l.slice(0,n+1).forEach((f,d,p)=>{let h=c;c=f.clone({nodes:[]}),h&&c.append(h);let b=[],y=(p[d-1]||r).next();for(;y;)b.push(y),y=y.next();c.append(b)}),c&&(s||t[t.length-1]).after(c)}}r.remove()}var Ao=Symbol("rootRuleMergeSel"),Ld=Symbol("rootRuleEscapes");function BC(r){let{params:e}=r,{type:t,selector:i,escapes:n}=LC(e);if(t==="unknown")throw r.error(`Unknown @${r.name} parameter ${JSON.stringify(e)}`);if(t==="basic"&&i){let a=new Pd({selector:i,nodes:r.nodes});r.removeAll(),r.append(a)}r[Ld]=n,r[Ao]=n?!n("all"):t==="noop"}var Co=Symbol("hasRootRule");_o.exports=(r={})=>{let e=$d(["media","supports","layer","container"],r.bubble),t=$C(e),i=$d(["document","font-face","keyframes","-webkit-keyframes","-moz-keyframes"],r.unwrap),n=(r.rootRuleName||"at-root").replace(/^@/,""),a=r.preserveEmpty;return{postcssPlugin:"postcss-nested",Once(s){s.walkAtRules(n,o=>{BC(o),s[Co]=!0})},Rule(s){let o=!1,l=s,c=!1,f=[];s.each(d=>{d.type==="rule"?(f.length&&(l=So(s.selector,f,l),f=[]),c=!0,o=!0,d.selectors=qd(s,d),l=Vn(d,l)):d.type==="atrule"?(f.length&&(l=So(s.selector,f,l),f=[]),d.name===n?(o=!0,t(s,d,!0,d[Ao]),l=Vn(d,l)):e[d.name]?(c=!0,o=!0,t(s,d,!0),l=Vn(d,l)):i[d.name]?(c=!0,o=!0,t(s,d,!1),l=Vn(d,l)):c&&f.push(d)):d.type==="decl"&&c&&f.push(d)}),f.length&&(l=So(s.selector,f,l)),o&&a!==!0&&(s.raws.semicolon=!0,s.nodes.length===0&&s.remove())},RootExit(s){s[Co]&&(s.walkAtRules(n,NC),s[Co]=!1)}}};_o.exports.postcss=!0});var jd=x((LI,Fd)=>{u();"use strict";var Nd=/-(\w|$)/g,Bd=(r,e)=>e.toUpperCase(),FC=r=>(r=r.toLowerCase(),r==="float"?"cssFloat":r.startsWith("-ms-")?r.substr(1).replace(Nd,Bd):r.replace(Nd,Bd));Fd.exports=FC});var To=x((MI,zd)=>{u();var jC=jd(),zC={boxFlex:!0,boxFlexGroup:!0,columnCount:!0,flex:!0,flexGrow:!0,flexPositive:!0,flexShrink:!0,flexNegative:!0,fontWeight:!0,lineClamp:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,tabSize:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,strokeDashoffset:!0,strokeOpacity:!0,strokeWidth:!0};function Eo(r){return typeof r.nodes=="undefined"?!0:Oo(r)}function Oo(r){let e,t={};return r.each(i=>{if(i.type==="atrule")e="@"+i.name,i.params&&(e+=" "+i.params),typeof t[e]=="undefined"?t[e]=Eo(i):Array.isArray(t[e])?t[e].push(Eo(i)):t[e]=[t[e],Eo(i)];else if(i.type==="rule"){let n=Oo(i);if(t[i.selector])for(let a in n)t[i.selector][a]=n[a];else t[i.selector]=n}else if(i.type==="decl"){i.prop[0]==="-"&&i.prop[1]==="-"||i.parent&&i.parent.selector===":export"?e=i.prop:e=jC(i.prop);let n=i.value;!isNaN(i.value)&&zC[e]&&(n=parseFloat(i.value)),i.important&&(n+=" !important"),typeof t[e]=="undefined"?t[e]=n:Array.isArray(t[e])?t[e].push(n):t[e]=[t[e],n]}}),t}zd.exports=Oo});var Hn=x((NI,Wd)=>{u();var Ai=$e(),Ud=/\s*!important\s*$/i,UC={"box-flex":!0,"box-flex-group":!0,"column-count":!0,flex:!0,"flex-grow":!0,"flex-positive":!0,"flex-shrink":!0,"flex-negative":!0,"font-weight":!0,"line-clamp":!0,"line-height":!0,opacity:!0,order:!0,orphans:!0,"tab-size":!0,widows:!0,"z-index":!0,zoom:!0,"fill-opacity":!0,"stroke-dashoffset":!0,"stroke-opacity":!0,"stroke-width":!0};function VC(r){return r.replace(/([A-Z])/g,"-$1").replace(/^ms-/,"-ms-").toLowerCase()}function Vd(r,e,t){t===!1||t===null||(e.startsWith("--")||(e=VC(e)),typeof t=="number"&&(t===0||UC[e]?t=t.toString():t+="px"),e==="css-float"&&(e="float"),Ud.test(t)?(t=t.replace(Ud,""),r.push(Ai.decl({prop:e,value:t,important:!0}))):r.push(Ai.decl({prop:e,value:t})))}function Hd(r,e,t){let i=Ai.atRule({name:e[1],params:e[3]||""});typeof t=="object"&&(i.nodes=[],Ro(t,i)),r.push(i)}function Ro(r,e){let t,i,n;for(t in r)if(i=r[t],!(i===null||typeof i=="undefined"))if(t[0]==="@"){let a=t.match(/@(\S+)(\s+([\W\w]*)\s*)?/);if(Array.isArray(i))for(let s of i)Hd(e,a,s);else Hd(e,a,i)}else if(Array.isArray(i))for(let a of i)Vd(e,t,a);else typeof i=="object"?(n=Ai.rule({selector:t}),Ro(i,n),e.push(n)):Vd(e,t,i)}Wd.exports=function(r){let e=Ai.root();return Ro(r,e),e}});var Po=x((BI,Gd)=>{u();var HC=To();Gd.exports=function(e){return console&&console.warn&&e.warnings().forEach(t=>{let i=t.plugin||"PostCSS";console.warn(i+": "+t.text)}),HC(e.root)}});var Yd=x((FI,Qd)=>{u();var WC=$e(),GC=Po(),QC=Hn();Qd.exports=function(e){let t=WC(e);return async i=>{let n=await t.process(i,{parser:QC,from:void 0});return GC(n)}}});var Xd=x((jI,Kd)=>{u();var YC=$e(),KC=Po(),XC=Hn();Kd.exports=function(r){let e=YC(r);return t=>{let i=e.process(t,{parser:XC,from:void 0});return KC(i)}}});var Zd=x((zI,Jd)=>{u();var JC=To(),ZC=Hn(),e_=Yd(),t_=Xd();Jd.exports={objectify:JC,parse:ZC,async:e_,sync:t_}});var or,eh,UI,VI,HI,WI,th=R(()=>{u();or=pe(Zd()),eh=or.default,UI=or.default.objectify,VI=or.default.parse,HI=or.default.async,WI=or.default.sync});function lr(r){return Array.isArray(r)?r.flatMap(e=>ee([(0,rh.default)({bubble:["screen"]})]).process(e,{parser:eh}).root.nodes):lr([r])}var rh,Io=R(()=>{u();Ot();rh=pe(Md());th()});function ur(r,e,t=!1){if(r==="")return e;let i=typeof e=="string"?(0,ih.default)().astSync(e):e;return i.walkClasses(n=>{let a=n.value,s=t&&a.startsWith("-");n.value=s?`-${r}${a.slice(1)}`:`${r}${a}`}),typeof e=="string"?i.toString():i}var ih,Wn=R(()=>{u();ih=pe(it())});function Te(r){let e=nh.default.className();return e.value=r,jt(e?.raws?.value??e.value)}var nh,fr=R(()=>{u();nh=pe(it());Ki()});function Do(r){return jt(`.${Te(r)}`)}function Gn(r,e){return Do(Ci(r,e))}function Ci(r,e){return e==="DEFAULT"?r:e==="-"||e==="-DEFAULT"?`-${r}`:e.startsWith("-")?`-${r}${e}`:e.startsWith("/")?`${r}${e}`:`${r}-${e}`}var qo=R(()=>{u();fr();Ki()});function L(r,e=[[r,[r]]],{filterDefault:t=!1,...i}={}){let n=mt(r);return function({matchUtilities:a,theme:s}){for(let o of e){let l=Array.isArray(o[0])?o:[o];a(l.reduce((c,[f,d])=>Object.assign(c,{[f]:p=>d.reduce((h,b)=>Array.isArray(b)?Object.assign(h,{[b[0]]:b[1]}):Object.assign(h,{[b]:n(p)}),{})}),{}),{...i,values:t?Object.fromEntries(Object.entries(s(r)??{}).filter(([c])=>c!=="DEFAULT")):s(r)})}}}var sh=R(()=>{u();Si()});function Tt(r){return r=Array.isArray(r)?r:[r],r.map(e=>{let t=e.values.map(i=>i.raw!==void 0?i.raw:[i.min&&`(min-width: ${i.min})`,i.max&&`(max-width: ${i.max})`].filter(Boolean).join(" and "));return e.not?`not all and ${t}`:t}).join(", ")}var Qn=R(()=>{u()});function $o(r){return r.split(l_).map(t=>{let i=t.trim(),n={value:i},a=i.split(u_),s=new Set;for(let o of a)!s.has("DIRECTIONS")&&r_.has(o)?(n.direction=o,s.add("DIRECTIONS")):!s.has("PLAY_STATES")&&i_.has(o)?(n.playState=o,s.add("PLAY_STATES")):!s.has("FILL_MODES")&&n_.has(o)?(n.fillMode=o,s.add("FILL_MODES")):!s.has("ITERATION_COUNTS")&&(s_.has(o)||f_.test(o))?(n.iterationCount=o,s.add("ITERATION_COUNTS")):!s.has("TIMING_FUNCTION")&&a_.has(o)||!s.has("TIMING_FUNCTION")&&o_.some(l=>o.startsWith(`${l}(`))?(n.timingFunction=o,s.add("TIMING_FUNCTION")):!s.has("DURATION")&&ah.test(o)?(n.duration=o,s.add("DURATION")):!s.has("DELAY")&&ah.test(o)?(n.delay=o,s.add("DELAY")):s.has("NAME")?(n.unknown||(n.unknown=[]),n.unknown.push(o)):(n.name=o,s.add("NAME"));return n})}var r_,i_,n_,s_,a_,o_,l_,u_,ah,f_,oh=R(()=>{u();r_=new Set(["normal","reverse","alternate","alternate-reverse"]),i_=new Set(["running","paused"]),n_=new Set(["none","forwards","backwards","both"]),s_=new Set(["infinite"]),a_=new Set(["linear","ease","ease-in","ease-out","ease-in-out","step-start","step-end"]),o_=["cubic-bezier","steps"],l_=/\,(?![^(]*\))/g,u_=/\ +(?![^(]*\))/g,ah=/^(-?[\d.]+m?s)$/,f_=/^(\d+)$/});var lh,xe,uh=R(()=>{u();lh=r=>Object.assign({},...Object.entries(r??{}).flatMap(([e,t])=>typeof t=="object"?Object.entries(lh(t)).map(([i,n])=>({[e+(i==="DEFAULT"?"":`-${i}`)]:n})):[{[`${e}`]:t}])),xe=lh});var ch,fh=R(()=>{ch="3.4.14"});function Rt(r,e=!0){return Array.isArray(r)?r.map(t=>{if(e&&Array.isArray(t))throw new Error("The tuple syntax is not supported for `screens`.");if(typeof t=="string")return{name:t.toString(),not:!1,values:[{min:t,max:void 0}]};let[i,n]=t;return i=i.toString(),typeof n=="string"?{name:i,not:!1,values:[{min:n,max:void 0}]}:Array.isArray(n)?{name:i,not:!1,values:n.map(a=>dh(a))}:{name:i,not:!1,values:[dh(n)]}}):Rt(Object.entries(r??{}),!1)}function Yn(r){return r.values.length!==1?{result:!1,reason:"multiple-values"}:r.values[0].raw!==void 0?{result:!1,reason:"raw-values"}:r.values[0].min!==void 0&&r.values[0].max!==void 0?{result:!1,reason:"min-and-max"}:{result:!0,reason:null}}function ph(r,e,t){let i=Kn(e,r),n=Kn(t,r),a=Yn(i),s=Yn(n);if(a.reason==="multiple-values"||s.reason==="multiple-values")throw new Error("Attempted to sort a screen with multiple values. This should never happen. Please open a bug report.");if(a.reason==="raw-values"||s.reason==="raw-values")throw new Error("Attempted to sort a screen with raw values. This should never happen. Please open a bug report.");if(a.reason==="min-and-max"||s.reason==="min-and-max")throw new Error("Attempted to sort a screen with both min and max values. This should never happen. Please open a bug report.");let{min:o,max:l}=i.values[0],{min:c,max:f}=n.values[0];e.not&&([o,l]=[l,o]),t.not&&([c,f]=[f,c]),o=o===void 0?o:parseFloat(o),l=l===void 0?l:parseFloat(l),c=c===void 0?c:parseFloat(c),f=f===void 0?f:parseFloat(f);let[d,p]=r==="min"?[o,c]:[f,l];return d-p}function Kn(r,e){return typeof r=="object"?r:{name:"arbitrary-screen",values:[{[e]:r}]}}function dh({"min-width":r,min:e=r,max:t,raw:i}={}){return{min:e,max:t,raw:i}}var Xn=R(()=>{u()});function Jn(r,e){r.walkDecls(t=>{if(e.includes(t.prop)){t.remove();return}for(let i of e)t.value.includes(`/ var(${i})`)&&(t.value=t.value.replace(`/ var(${i})`,""))})}var hh=R(()=>{u()});var se,Xe,nt,ge,mh,gh=R(()=>{u();ft();et();Ot();sh();Qn();fr();oh();uh();Lr();ea();Kt();Si();fh();Be();Xn();Gs();hh();ct();Br();_i();se={childVariant:({addVariant:r})=>{r("*","& > *")},pseudoElementVariants:({addVariant:r})=>{r("first-letter","&::first-letter"),r("first-line","&::first-line"),r("marker",[({container:e})=>(Jn(e,["--tw-text-opacity"]),"& *::marker"),({container:e})=>(Jn(e,["--tw-text-opacity"]),"&::marker")]),r("selection",["& *::selection","&::selection"]),r("file","&::file-selector-button"),r("placeholder","&::placeholder"),r("backdrop","&::backdrop"),r("before",({container:e})=>(e.walkRules(t=>{let i=!1;t.walkDecls("content",()=>{i=!0}),i||t.prepend(ee.decl({prop:"content",value:"var(--tw-content)"}))}),"&::before")),r("after",({container:e})=>(e.walkRules(t=>{let i=!1;t.walkDecls("content",()=>{i=!0}),i||t.prepend(ee.decl({prop:"content",value:"var(--tw-content)"}))}),"&::after"))},pseudoClassVariants:({addVariant:r,matchVariant:e,config:t,prefix:i})=>{let n=[["first","&:first-child"],["last","&:last-child"],["only","&:only-child"],["odd","&:nth-child(odd)"],["even","&:nth-child(even)"],"first-of-type","last-of-type","only-of-type",["visited",({container:s})=>(Jn(s,["--tw-text-opacity","--tw-border-opacity","--tw-bg-opacity"]),"&:visited")],"target",["open","&[open]"],"default","checked","indeterminate","placeholder-shown","autofill","optional","required","valid","invalid","in-range","out-of-range","read-only","empty","focus-within",["hover",we(t(),"hoverOnlyWhenSupported")?"@media (hover: hover) and (pointer: fine) { &:hover }":"&:hover"],"focus","focus-visible","active","enabled","disabled"].map(s=>Array.isArray(s)?s:[s,`&:${s}`]);for(let[s,o]of n)r(s,l=>typeof o=="function"?o(l):o);let a={group:(s,{modifier:o})=>o?[`:merge(${i(".group")}\\/${Te(o)})`," &"]:[`:merge(${i(".group")})`," &"],peer:(s,{modifier:o})=>o?[`:merge(${i(".peer")}\\/${Te(o)})`," ~ &"]:[`:merge(${i(".peer")})`," ~ &"]};for(let[s,o]of Object.entries(a))e(s,(l="",c)=>{let f=K(typeof l=="function"?l(c):l);f.includes("&")||(f="&"+f);let[d,p]=o("",c),h=null,b=null,v=0;for(let y=0;y<f.length;++y){let w=f[y];w==="&"?h=y:w==="'"||w==='"'?v+=1:h!==null&&w===" "&&!v&&(b=y)}return h!==null&&b===null&&(b=f.length),f.slice(0,h)+d+f.slice(h+1,b)+p+f.slice(b)},{values:Object.fromEntries(n),[Pt]:{respectPrefix:!1}})},directionVariants:({addVariant:r})=>{r("ltr",'&:where([dir="ltr"], [dir="ltr"] *)'),r("rtl",'&:where([dir="rtl"], [dir="rtl"] *)')},reducedMotionVariants:({addVariant:r})=>{r("motion-safe","@media (prefers-reduced-motion: no-preference)"),r("motion-reduce","@media (prefers-reduced-motion: reduce)")},darkVariants:({config:r,addVariant:e})=>{let[t,i=".dark"]=[].concat(r("darkMode","media"));if(t===!1&&(t="media",G.warn("darkmode-false",["The `darkMode` option in your Tailwind CSS configuration is set to `false`, which now behaves the same as `media`.","Change `darkMode` to `media` or remove it entirely.","https://tailwindcss.com/docs/upgrade-guide#remove-dark-mode-configuration"])),t==="variant"){let n;if(Array.isArray(i)||typeof i=="function"?n=i:typeof i=="string"&&(n=[i]),Array.isArray(n))for(let a of n)a===".dark"?(t=!1,G.warn("darkmode-variant-without-selector",["When using `variant` for `darkMode`, you must provide a selector.",'Example: `darkMode: ["variant", ".your-selector &"]`'])):a.includes("&")||(t=!1,G.warn("darkmode-variant-without-ampersand",["When using `variant` for `darkMode`, your selector must contain `&`.",'Example `darkMode: ["variant", ".your-selector &"]`']));i=n}t==="selector"?e("dark",`&:where(${i}, ${i} *)`):t==="media"?e("dark","@media (prefers-color-scheme: dark)"):t==="variant"?e("dark",i):t==="class"&&e("dark",`&:is(${i} *)`)},printVariant:({addVariant:r})=>{r("print","@media print")},screenVariants:({theme:r,addVariant:e,matchVariant:t})=>{let i=r("screens")??{},n=Object.values(i).every(w=>typeof w=="string"),a=Rt(r("screens")),s=new Set([]);function o(w){return w.match(/(\D+)$/)?.[1]??"(none)"}function l(w){w!==void 0&&s.add(o(w))}function c(w){return l(w),s.size===1}for(let w of a)for(let k of w.values)l(k.min),l(k.max);let f=s.size<=1;function d(w){return Object.fromEntries(a.filter(k=>Yn(k).result).map(k=>{let{min:S,max:E}=k.values[0];if(w==="min"&&S!==void 0)return k;if(w==="min"&&E!==void 0)return{...k,not:!k.not};if(w==="max"&&E!==void 0)return k;if(w==="max"&&S!==void 0)return{...k,not:!k.not}}).map(k=>[k.name,k]))}function p(w){return(k,S)=>ph(w,k.value,S.value)}let h=p("max"),b=p("min");function v(w){return k=>{if(n)if(f){if(typeof k=="string"&&!c(k))return G.warn("minmax-have-mixed-units",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing mixed units."]),[]}else return G.warn("mixed-screen-units",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing mixed units."]),[];else return G.warn("complex-screen-config",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing objects."]),[];return[`@media ${Tt(Kn(k,w))}`]}}t("max",v("max"),{sort:h,values:n?d("max"):{}});let y="min-screens";for(let w of a)e(w.name,`@media ${Tt(w)}`,{id:y,sort:n&&f?b:void 0,value:w});t("min",v("min"),{id:y,sort:b})},supportsVariants:({matchVariant:r,theme:e})=>{r("supports",(t="")=>{let i=K(t),n=/^\w*\s*\(/.test(i);return i=n?i.replace(/\b(and|or|not)\b/g," $1 "):i,n?`@supports ${i}`:(i.includes(":")||(i=`${i}: var(--tw)`),i.startsWith("(")&&i.endsWith(")")||(i=`(${i})`),`@supports ${i}`)},{values:e("supports")??{}})},hasVariants:({matchVariant:r,prefix:e})=>{r("has",t=>`&:has(${K(t)})`,{values:{},[Pt]:{respectPrefix:!1}}),r("group-has",(t,{modifier:i})=>i?`:merge(${e(".group")}\\/${i}):has(${K(t)}) &`:`:merge(${e(".group")}):has(${K(t)}) &`,{values:{},[Pt]:{respectPrefix:!1}}),r("peer-has",(t,{modifier:i})=>i?`:merge(${e(".peer")}\\/${i}):has(${K(t)}) ~ &`:`:merge(${e(".peer")}):has(${K(t)}) ~ &`,{values:{},[Pt]:{respectPrefix:!1}})},ariaVariants:({matchVariant:r,theme:e})=>{r("aria",t=>`&[aria-${Ye(K(t))}]`,{values:e("aria")??{}}),r("group-aria",(t,{modifier:i})=>i?`:merge(.group\\/${i})[aria-${Ye(K(t))}] &`:`:merge(.group)[aria-${Ye(K(t))}] &`,{values:e("aria")??{}}),r("peer-aria",(t,{modifier:i})=>i?`:merge(.peer\\/${i})[aria-${Ye(K(t))}] ~ &`:`:merge(.peer)[aria-${Ye(K(t))}] ~ &`,{values:e("aria")??{}})},dataVariants:({matchVariant:r,theme:e})=>{r("data",t=>`&[data-${Ye(K(t))}]`,{values:e("data")??{}}),r("group-data",(t,{modifier:i})=>i?`:merge(.group\\/${i})[data-${Ye(K(t))}] &`:`:merge(.group)[data-${Ye(K(t))}] &`,{values:e("data")??{}}),r("peer-data",(t,{modifier:i})=>i?`:merge(.peer\\/${i})[data-${Ye(K(t))}] ~ &`:`:merge(.peer)[data-${Ye(K(t))}] ~ &`,{values:e("data")??{}})},orientationVariants:({addVariant:r})=>{r("portrait","@media (orientation: portrait)"),r("landscape","@media (orientation: landscape)")},prefersContrastVariants:({addVariant:r})=>{r("contrast-more","@media (prefers-contrast: more)"),r("contrast-less","@media (prefers-contrast: less)")},forcedColorsVariants:({addVariant:r})=>{r("forced-colors","@media (forced-colors: active)")}},Xe=["translate(var(--tw-translate-x), var(--tw-translate-y))","rotate(var(--tw-rotate))","skewX(var(--tw-skew-x))","skewY(var(--tw-skew-y))","scaleX(var(--tw-scale-x))","scaleY(var(--tw-scale-y))"].join(" "),nt=["var(--tw-blur)","var(--tw-brightness)","var(--tw-contrast)","var(--tw-grayscale)","var(--tw-hue-rotate)","var(--tw-invert)","var(--tw-saturate)","var(--tw-sepia)","var(--tw-drop-shadow)"].join(" "),ge=["var(--tw-backdrop-blur)","var(--tw-backdrop-brightness)","var(--tw-backdrop-contrast)","var(--tw-backdrop-grayscale)","var(--tw-backdrop-hue-rotate)","var(--tw-backdrop-invert)","var(--tw-backdrop-opacity)","var(--tw-backdrop-saturate)","var(--tw-backdrop-sepia)"].join(" "),mh={preflight:({addBase:r})=>{let e=ee.parse(`*,::after,::before{box-sizing:border-box;border-width:0;border-style:solid;border-color:theme('borderColor.DEFAULT', currentColor)}::after,::before{--tw-content:''}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;tab-size:4;font-family:theme('fontFamily.sans', ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji");font-feature-settings:theme('fontFamily.sans[1].fontFeatureSettings', normal);font-variation-settings:theme('fontFamily.sans[1].fontVariationSettings', normal);-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:theme('fontFamily.mono', ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace);font-feature-settings:theme('fontFamily.mono[1].fontFeatureSettings', normal);font-variation-settings:theme('fontFamily.mono[1].fontVariationSettings', normal);font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::placeholder,textarea::placeholder{opacity:1;color:theme('colors.gray.4', #9ca3af)}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}`);r([ee.comment({text:`! tailwindcss v${ch} | MIT License | https://tailwindcss.com`}),...e.nodes])},container:(()=>{function r(t=[]){return t.flatMap(i=>i.values.map(n=>n.min)).filter(i=>i!==void 0)}function e(t,i,n){if(typeof n=="undefined")return[];if(!(typeof n=="object"&&n!==null))return[{screen:"DEFAULT",minWidth:0,padding:n}];let a=[];n.DEFAULT&&a.push({screen:"DEFAULT",minWidth:0,padding:n.DEFAULT});for(let s of t)for(let o of i)for(let{min:l}of o.values)l===s&&a.push({minWidth:s,padding:n[o.name]});return a}return function({addComponents:t,theme:i}){let n=Rt(i("container.screens",i("screens"))),a=r(n),s=e(a,n,i("container.padding")),o=c=>{let f=s.find(d=>d.minWidth===c);return f?{paddingRight:f.padding,paddingLeft:f.padding}:{}},l=Array.from(new Set(a.slice().sort((c,f)=>parseInt(c)-parseInt(f)))).map(c=>({[`@media (min-width: ${c})`]:{".container":{"max-width":c,...o(c)}}}));t([{".container":Object.assign({width:"100%"},i("container.center",!1)?{marginRight:"auto",marginLeft:"auto"}:{},o(0))},...l])}})(),accessibility:({addUtilities:r})=>{r({".sr-only":{position:"absolute",width:"1px",height:"1px",padding:"0",margin:"-1px",overflow:"hidden",clip:"rect(0, 0, 0, 0)",whiteSpace:"nowrap",borderWidth:"0"},".not-sr-only":{position:"static",width:"auto",height:"auto",padding:"0",margin:"0",overflow:"visible",clip:"auto",whiteSpace:"normal"}})},pointerEvents:({addUtilities:r})=>{r({".pointer-events-none":{"pointer-events":"none"},".pointer-events-auto":{"pointer-events":"auto"}})},visibility:({addUtilities:r})=>{r({".visible":{visibility:"visible"},".invisible":{visibility:"hidden"},".collapse":{visibility:"collapse"}})},position:({addUtilities:r})=>{r({".static":{position:"static"},".fixed":{position:"fixed"},".absolute":{position:"absolute"},".relative":{position:"relative"},".sticky":{position:"sticky"}})},inset:L("inset",[["inset",["inset"]],[["inset-x",["left","right"]],["inset-y",["top","bottom"]]],[["start",["inset-inline-start"]],["end",["inset-inline-end"]],["top",["top"]],["right",["right"]],["bottom",["bottom"]],["left",["left"]]]],{supportsNegativeValues:!0}),isolation:({addUtilities:r})=>{r({".isolate":{isolation:"isolate"},".isolation-auto":{isolation:"auto"}})},zIndex:L("zIndex",[["z",["zIndex"]]],{supportsNegativeValues:!0}),order:L("order",void 0,{supportsNegativeValues:!0}),gridColumn:L("gridColumn",[["col",["gridColumn"]]]),gridColumnStart:L("gridColumnStart",[["col-start",["gridColumnStart"]]],{supportsNegativeValues:!0}),gridColumnEnd:L("gridColumnEnd",[["col-end",["gridColumnEnd"]]],{supportsNegativeValues:!0}),gridRow:L("gridRow",[["row",["gridRow"]]]),gridRowStart:L("gridRowStart",[["row-start",["gridRowStart"]]],{supportsNegativeValues:!0}),gridRowEnd:L("gridRowEnd",[["row-end",["gridRowEnd"]]],{supportsNegativeValues:!0}),float:({addUtilities:r})=>{r({".float-start":{float:"inline-start"},".float-end":{float:"inline-end"},".float-right":{float:"right"},".float-left":{float:"left"},".float-none":{float:"none"}})},clear:({addUtilities:r})=>{r({".clear-start":{clear:"inline-start"},".clear-end":{clear:"inline-end"},".clear-left":{clear:"left"},".clear-right":{clear:"right"},".clear-both":{clear:"both"},".clear-none":{clear:"none"}})},margin:L("margin",[["m",["margin"]],[["mx",["margin-left","margin-right"]],["my",["margin-top","margin-bottom"]]],[["ms",["margin-inline-start"]],["me",["margin-inline-end"]],["mt",["margin-top"]],["mr",["margin-right"]],["mb",["margin-bottom"]],["ml",["margin-left"]]]],{supportsNegativeValues:!0}),boxSizing:({addUtilities:r})=>{r({".box-border":{"box-sizing":"border-box"},".box-content":{"box-sizing":"content-box"}})},lineClamp:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"line-clamp":i=>({overflow:"hidden",display:"-webkit-box","-webkit-box-orient":"vertical","-webkit-line-clamp":`${i}`})},{values:t("lineClamp")}),e({".line-clamp-none":{overflow:"visible",display:"block","-webkit-box-orient":"horizontal","-webkit-line-clamp":"none"}})},display:({addUtilities:r})=>{r({".block":{display:"block"},".inline-block":{display:"inline-block"},".inline":{display:"inline"},".flex":{display:"flex"},".inline-flex":{display:"inline-flex"},".table":{display:"table"},".inline-table":{display:"inline-table"},".table-caption":{display:"table-caption"},".table-cell":{display:"table-cell"},".table-column":{display:"table-column"},".table-column-group":{display:"table-column-group"},".table-footer-group":{display:"table-footer-group"},".table-header-group":{display:"table-header-group"},".table-row-group":{display:"table-row-group"},".table-row":{display:"table-row"},".flow-root":{display:"flow-root"},".grid":{display:"grid"},".inline-grid":{display:"inline-grid"},".contents":{display:"contents"},".list-item":{display:"list-item"},".hidden":{display:"none"}})},aspectRatio:L("aspectRatio",[["aspect",["aspect-ratio"]]]),size:L("size",[["size",["width","height"]]]),height:L("height",[["h",["height"]]]),maxHeight:L("maxHeight",[["max-h",["maxHeight"]]]),minHeight:L("minHeight",[["min-h",["minHeight"]]]),width:L("width",[["w",["width"]]]),minWidth:L("minWidth",[["min-w",["minWidth"]]]),maxWidth:L("maxWidth",[["max-w",["maxWidth"]]]),flex:L("flex"),flexShrink:L("flexShrink",[["flex-shrink",["flex-shrink"]],["shrink",["flex-shrink"]]]),flexGrow:L("flexGrow",[["flex-grow",["flex-grow"]],["grow",["flex-grow"]]]),flexBasis:L("flexBasis",[["basis",["flex-basis"]]]),tableLayout:({addUtilities:r})=>{r({".table-auto":{"table-layout":"auto"},".table-fixed":{"table-layout":"fixed"}})},captionSide:({addUtilities:r})=>{r({".caption-top":{"caption-side":"top"},".caption-bottom":{"caption-side":"bottom"}})},borderCollapse:({addUtilities:r})=>{r({".border-collapse":{"border-collapse":"collapse"},".border-separate":{"border-collapse":"separate"}})},borderSpacing:({addDefaults:r,matchUtilities:e,theme:t})=>{r("border-spacing",{"--tw-border-spacing-x":0,"--tw-border-spacing-y":0}),e({"border-spacing":i=>({"--tw-border-spacing-x":i,"--tw-border-spacing-y":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"}),"border-spacing-x":i=>({"--tw-border-spacing-x":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"}),"border-spacing-y":i=>({"--tw-border-spacing-y":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"})},{values:t("borderSpacing")})},transformOrigin:L("transformOrigin",[["origin",["transformOrigin"]]]),translate:L("translate",[[["translate-x",[["@defaults transform",{}],"--tw-translate-x",["transform",Xe]]],["translate-y",[["@defaults transform",{}],"--tw-translate-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),rotate:L("rotate",[["rotate",[["@defaults transform",{}],"--tw-rotate",["transform",Xe]]]],{supportsNegativeValues:!0}),skew:L("skew",[[["skew-x",[["@defaults transform",{}],"--tw-skew-x",["transform",Xe]]],["skew-y",[["@defaults transform",{}],"--tw-skew-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),scale:L("scale",[["scale",[["@defaults transform",{}],"--tw-scale-x","--tw-scale-y",["transform",Xe]]],[["scale-x",[["@defaults transform",{}],"--tw-scale-x",["transform",Xe]]],["scale-y",[["@defaults transform",{}],"--tw-scale-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),transform:({addDefaults:r,addUtilities:e})=>{r("transform",{"--tw-translate-x":"0","--tw-translate-y":"0","--tw-rotate":"0","--tw-skew-x":"0","--tw-skew-y":"0","--tw-scale-x":"1","--tw-scale-y":"1"}),e({".transform":{"@defaults transform":{},transform:Xe},".transform-cpu":{transform:Xe},".transform-gpu":{transform:Xe.replace("translate(var(--tw-translate-x), var(--tw-translate-y))","translate3d(var(--tw-translate-x), var(--tw-translate-y), 0)")},".transform-none":{transform:"none"}})},animation:({matchUtilities:r,theme:e,config:t})=>{let i=a=>Te(t("prefix")+a),n=Object.fromEntries(Object.entries(e("keyframes")??{}).map(([a,s])=>[a,{[`@keyframes ${i(a)}`]:s}]));r({animate:a=>{let s=$o(a);return[...s.flatMap(o=>n[o.name]),{animation:s.map(({name:o,value:l})=>o===void 0||n[o]===void 0?l:l.replace(o,i(o))).join(", ")}]}},{values:e("animation")})},cursor:L("cursor"),touchAction:({addDefaults:r,addUtilities:e})=>{r("touch-action",{"--tw-pan-x":" ","--tw-pan-y":" ","--tw-pinch-zoom":" "});let t="var(--tw-pan-x) var(--tw-pan-y) var(--tw-pinch-zoom)";e({".touch-auto":{"touch-action":"auto"},".touch-none":{"touch-action":"none"},".touch-pan-x":{"@defaults touch-action":{},"--tw-pan-x":"pan-x","touch-action":t},".touch-pan-left":{"@defaults touch-action":{},"--tw-pan-x":"pan-left","touch-action":t},".touch-pan-right":{"@defaults touch-action":{},"--tw-pan-x":"pan-right","touch-action":t},".touch-pan-y":{"@defaults touch-action":{},"--tw-pan-y":"pan-y","touch-action":t},".touch-pan-up":{"@defaults touch-action":{},"--tw-pan-y":"pan-up","touch-action":t},".touch-pan-down":{"@defaults touch-action":{},"--tw-pan-y":"pan-down","touch-action":t},".touch-pinch-zoom":{"@defaults touch-action":{},"--tw-pinch-zoom":"pinch-zoom","touch-action":t},".touch-manipulation":{"touch-action":"manipulation"}})},userSelect:({addUtilities:r})=>{r({".select-none":{"user-select":"none"},".select-text":{"user-select":"text"},".select-all":{"user-select":"all"},".select-auto":{"user-select":"auto"}})},resize:({addUtilities:r})=>{r({".resize-none":{resize:"none"},".resize-y":{resize:"vertical"},".resize-x":{resize:"horizontal"},".resize":{resize:"both"}})},scrollSnapType:({addDefaults:r,addUtilities:e})=>{r("scroll-snap-type",{"--tw-scroll-snap-strictness":"proximity"}),e({".snap-none":{"scroll-snap-type":"none"},".snap-x":{"@defaults scroll-snap-type":{},"scroll-snap-type":"x var(--tw-scroll-snap-strictness)"},".snap-y":{"@defaults scroll-snap-type":{},"scroll-snap-type":"y var(--tw-scroll-snap-strictness)"},".snap-both":{"@defaults scroll-snap-type":{},"scroll-snap-type":"both var(--tw-scroll-snap-strictness)"},".snap-mandatory":{"--tw-scroll-snap-strictness":"mandatory"},".snap-proximity":{"--tw-scroll-snap-strictness":"proximity"}})},scrollSnapAlign:({addUtilities:r})=>{r({".snap-start":{"scroll-snap-align":"start"},".snap-end":{"scroll-snap-align":"end"},".snap-center":{"scroll-snap-align":"center"},".snap-align-none":{"scroll-snap-align":"none"}})},scrollSnapStop:({addUtilities:r})=>{r({".snap-normal":{"scroll-snap-stop":"normal"},".snap-always":{"scroll-snap-stop":"always"}})},scrollMargin:L("scrollMargin",[["scroll-m",["scroll-margin"]],[["scroll-mx",["scroll-margin-left","scroll-margin-right"]],["scroll-my",["scroll-margin-top","scroll-margin-bottom"]]],[["scroll-ms",["scroll-margin-inline-start"]],["scroll-me",["scroll-margin-inline-end"]],["scroll-mt",["scroll-margin-top"]],["scroll-mr",["scroll-margin-right"]],["scroll-mb",["scroll-margin-bottom"]],["scroll-ml",["scroll-margin-left"]]]],{supportsNegativeValues:!0}),scrollPadding:L("scrollPadding",[["scroll-p",["scroll-padding"]],[["scroll-px",["scroll-padding-left","scroll-padding-right"]],["scroll-py",["scroll-padding-top","scroll-padding-bottom"]]],[["scroll-ps",["scroll-padding-inline-start"]],["scroll-pe",["scroll-padding-inline-end"]],["scroll-pt",["scroll-padding-top"]],["scroll-pr",["scroll-padding-right"]],["scroll-pb",["scroll-padding-bottom"]],["scroll-pl",["scroll-padding-left"]]]]),listStylePosition:({addUtilities:r})=>{r({".list-inside":{"list-style-position":"inside"},".list-outside":{"list-style-position":"outside"}})},listStyleType:L("listStyleType",[["list",["listStyleType"]]]),listStyleImage:L("listStyleImage",[["list-image",["listStyleImage"]]]),appearance:({addUtilities:r})=>{r({".appearance-none":{appearance:"none"},".appearance-auto":{appearance:"auto"}})},columns:L("columns",[["columns",["columns"]]]),breakBefore:({addUtilities:r})=>{r({".break-before-auto":{"break-before":"auto"},".break-before-avoid":{"break-before":"avoid"},".break-before-all":{"break-before":"all"},".break-before-avoid-page":{"break-before":"avoid-page"},".break-before-page":{"break-before":"page"},".break-before-left":{"break-before":"left"},".break-before-right":{"break-before":"right"},".break-before-column":{"break-before":"column"}})},breakInside:({addUtilities:r})=>{r({".break-inside-auto":{"break-inside":"auto"},".break-inside-avoid":{"break-inside":"avoid"},".break-inside-avoid-page":{"break-inside":"avoid-page"},".break-inside-avoid-column":{"break-inside":"avoid-column"}})},breakAfter:({addUtilities:r})=>{r({".break-after-auto":{"break-after":"auto"},".break-after-avoid":{"break-after":"avoid"},".break-after-all":{"break-after":"all"},".break-after-avoid-page":{"break-after":"avoid-page"},".break-after-page":{"break-after":"page"},".break-after-left":{"break-after":"left"},".break-after-right":{"break-after":"right"},".break-after-column":{"break-after":"column"}})},gridAutoColumns:L("gridAutoColumns",[["auto-cols",["gridAutoColumns"]]]),gridAutoFlow:({addUtilities:r})=>{r({".grid-flow-row":{gridAutoFlow:"row"},".grid-flow-col":{gridAutoFlow:"column"},".grid-flow-dense":{gridAutoFlow:"dense"},".grid-flow-row-dense":{gridAutoFlow:"row dense"},".grid-flow-col-dense":{gridAutoFlow:"column dense"}})},gridAutoRows:L("gridAutoRows",[["auto-rows",["gridAutoRows"]]]),gridTemplateColumns:L("gridTemplateColumns",[["grid-cols",["gridTemplateColumns"]]]),gridTemplateRows:L("gridTemplateRows",[["grid-rows",["gridTemplateRows"]]]),flexDirection:({addUtilities:r})=>{r({".flex-row":{"flex-direction":"row"},".flex-row-reverse":{"flex-direction":"row-reverse"},".flex-col":{"flex-direction":"column"},".flex-col-reverse":{"flex-direction":"column-reverse"}})},flexWrap:({addUtilities:r})=>{r({".flex-wrap":{"flex-wrap":"wrap"},".flex-wrap-reverse":{"flex-wrap":"wrap-reverse"},".flex-nowrap":{"flex-wrap":"nowrap"}})},placeContent:({addUtilities:r})=>{r({".place-content-center":{"place-content":"center"},".place-content-start":{"place-content":"start"},".place-content-end":{"place-content":"end"},".place-content-between":{"place-content":"space-between"},".place-content-around":{"place-content":"space-around"},".place-content-evenly":{"place-content":"space-evenly"},".place-content-baseline":{"place-content":"baseline"},".place-content-stretch":{"place-content":"stretch"}})},placeItems:({addUtilities:r})=>{r({".place-items-start":{"place-items":"start"},".place-items-end":{"place-items":"end"},".place-items-center":{"place-items":"center"},".place-items-baseline":{"place-items":"baseline"},".place-items-stretch":{"place-items":"stretch"}})},alignContent:({addUtilities:r})=>{r({".content-normal":{"align-content":"normal"},".content-center":{"align-content":"center"},".content-start":{"align-content":"flex-start"},".content-end":{"align-content":"flex-end"},".content-between":{"align-content":"space-between"},".content-around":{"align-content":"space-around"},".content-evenly":{"align-content":"space-evenly"},".content-baseline":{"align-content":"baseline"},".content-stretch":{"align-content":"stretch"}})},alignItems:({addUtilities:r})=>{r({".items-start":{"align-items":"flex-start"},".items-end":{"align-items":"flex-end"},".items-center":{"align-items":"center"},".items-baseline":{"align-items":"baseline"},".items-stretch":{"align-items":"stretch"}})},justifyContent:({addUtilities:r})=>{r({".justify-normal":{"justify-content":"normal"},".justify-start":{"justify-content":"flex-start"},".justify-end":{"justify-content":"flex-end"},".justify-center":{"justify-content":"center"},".justify-between":{"justify-content":"space-between"},".justify-around":{"justify-content":"space-around"},".justify-evenly":{"justify-content":"space-evenly"},".justify-stretch":{"justify-content":"stretch"}})},justifyItems:({addUtilities:r})=>{r({".justify-items-start":{"justify-items":"start"},".justify-items-end":{"justify-items":"end"},".justify-items-center":{"justify-items":"center"},".justify-items-stretch":{"justify-items":"stretch"}})},gap:L("gap",[["gap",["gap"]],[["gap-x",["columnGap"]],["gap-y",["rowGap"]]]]),space:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"space-x":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"--tw-space-x-reverse":"0","margin-right":`calc(${i} * var(--tw-space-x-reverse))`,"margin-left":`calc(${i} * calc(1 - var(--tw-space-x-reverse)))`}}),"space-y":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"--tw-space-y-reverse":"0","margin-top":`calc(${i} * calc(1 - var(--tw-space-y-reverse)))`,"margin-bottom":`calc(${i} * var(--tw-space-y-reverse))`}})},{values:t("space"),supportsNegativeValues:!0}),e({".space-y-reverse > :not([hidden]) ~ :not([hidden])":{"--tw-space-y-reverse":"1"},".space-x-reverse > :not([hidden]) ~ :not([hidden])":{"--tw-space-x-reverse":"1"}})},divideWidth:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"divide-x":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-x-reverse":"0","border-right-width":`calc(${i} * var(--tw-divide-x-reverse))`,"border-left-width":`calc(${i} * calc(1 - var(--tw-divide-x-reverse)))`}}),"divide-y":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-y-reverse":"0","border-top-width":`calc(${i} * calc(1 - var(--tw-divide-y-reverse)))`,"border-bottom-width":`calc(${i} * var(--tw-divide-y-reverse))`}})},{values:t("divideWidth"),type:["line-width","length","any"]}),e({".divide-y-reverse > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-y-reverse":"1"},".divide-x-reverse > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-x-reverse":"1"}})},divideStyle:({addUtilities:r})=>{r({".divide-solid > :not([hidden]) ~ :not([hidden])":{"border-style":"solid"},".divide-dashed > :not([hidden]) ~ :not([hidden])":{"border-style":"dashed"},".divide-dotted > :not([hidden]) ~ :not([hidden])":{"border-style":"dotted"},".divide-double > :not([hidden]) ~ :not([hidden])":{"border-style":"double"},".divide-none > :not([hidden]) ~ :not([hidden])":{"border-style":"none"}})},divideColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({divide:i=>t("divideOpacity")?{["& > :not([hidden]) ~ :not([hidden])"]:Ae({color:i,property:"border-color",variable:"--tw-divide-opacity"})}:{["& > :not([hidden]) ~ :not([hidden])"]:{"border-color":X(i)}}},{values:(({DEFAULT:i,...n})=>n)(xe(e("divideColor"))),type:["color","any"]})},divideOpacity:({matchUtilities:r,theme:e})=>{r({"divide-opacity":t=>({["& > :not([hidden]) ~ :not([hidden])"]:{"--tw-divide-opacity":t}})},{values:e("divideOpacity")})},placeSelf:({addUtilities:r})=>{r({".place-self-auto":{"place-self":"auto"},".place-self-start":{"place-self":"start"},".place-self-end":{"place-self":"end"},".place-self-center":{"place-self":"center"},".place-self-stretch":{"place-self":"stretch"}})},alignSelf:({addUtilities:r})=>{r({".self-auto":{"align-self":"auto"},".self-start":{"align-self":"flex-start"},".self-end":{"align-self":"flex-end"},".self-center":{"align-self":"center"},".self-stretch":{"align-self":"stretch"},".self-baseline":{"align-self":"baseline"}})},justifySelf:({addUtilities:r})=>{r({".justify-self-auto":{"justify-self":"auto"},".justify-self-start":{"justify-self":"start"},".justify-self-end":{"justify-self":"end"},".justify-self-center":{"justify-self":"center"},".justify-self-stretch":{"justify-self":"stretch"}})},overflow:({addUtilities:r})=>{r({".overflow-auto":{overflow:"auto"},".overflow-hidden":{overflow:"hidden"},".overflow-clip":{overflow:"clip"},".overflow-visible":{overflow:"visible"},".overflow-scroll":{overflow:"scroll"},".overflow-x-auto":{"overflow-x":"auto"},".overflow-y-auto":{"overflow-y":"auto"},".overflow-x-hidden":{"overflow-x":"hidden"},".overflow-y-hidden":{"overflow-y":"hidden"},".overflow-x-clip":{"overflow-x":"clip"},".overflow-y-clip":{"overflow-y":"clip"},".overflow-x-visible":{"overflow-x":"visible"},".overflow-y-visible":{"overflow-y":"visible"},".overflow-x-scroll":{"overflow-x":"scroll"},".overflow-y-scroll":{"overflow-y":"scroll"}})},overscrollBehavior:({addUtilities:r})=>{r({".overscroll-auto":{"overscroll-behavior":"auto"},".overscroll-contain":{"overscroll-behavior":"contain"},".overscroll-none":{"overscroll-behavior":"none"},".overscroll-y-auto":{"overscroll-behavior-y":"auto"},".overscroll-y-contain":{"overscroll-behavior-y":"contain"},".overscroll-y-none":{"overscroll-behavior-y":"none"},".overscroll-x-auto":{"overscroll-behavior-x":"auto"},".overscroll-x-contain":{"overscroll-behavior-x":"contain"},".overscroll-x-none":{"overscroll-behavior-x":"none"}})},scrollBehavior:({addUtilities:r})=>{r({".scroll-auto":{"scroll-behavior":"auto"},".scroll-smooth":{"scroll-behavior":"smooth"}})},textOverflow:({addUtilities:r})=>{r({".truncate":{overflow:"hidden","text-overflow":"ellipsis","white-space":"nowrap"},".overflow-ellipsis":{"text-overflow":"ellipsis"},".text-ellipsis":{"text-overflow":"ellipsis"},".text-clip":{"text-overflow":"clip"}})},hyphens:({addUtilities:r})=>{r({".hyphens-none":{hyphens:"none"},".hyphens-manual":{hyphens:"manual"},".hyphens-auto":{hyphens:"auto"}})},whitespace:({addUtilities:r})=>{r({".whitespace-normal":{"white-space":"normal"},".whitespace-nowrap":{"white-space":"nowrap"},".whitespace-pre":{"white-space":"pre"},".whitespace-pre-line":{"white-space":"pre-line"},".whitespace-pre-wrap":{"white-space":"pre-wrap"},".whitespace-break-spaces":{"white-space":"break-spaces"}})},textWrap:({addUtilities:r})=>{r({".text-wrap":{"text-wrap":"wrap"},".text-nowrap":{"text-wrap":"nowrap"},".text-balance":{"text-wrap":"balance"},".text-pretty":{"text-wrap":"pretty"}})},wordBreak:({addUtilities:r})=>{r({".break-normal":{"overflow-wrap":"normal","word-break":"normal"},".break-words":{"overflow-wrap":"break-word"},".break-all":{"word-break":"break-all"},".break-keep":{"word-break":"keep-all"}})},borderRadius:L("borderRadius",[["rounded",["border-radius"]],[["rounded-s",["border-start-start-radius","border-end-start-radius"]],["rounded-e",["border-start-end-radius","border-end-end-radius"]],["rounded-t",["border-top-left-radius","border-top-right-radius"]],["rounded-r",["border-top-right-radius","border-bottom-right-radius"]],["rounded-b",["border-bottom-right-radius","border-bottom-left-radius"]],["rounded-l",["border-top-left-radius","border-bottom-left-radius"]]],[["rounded-ss",["border-start-start-radius"]],["rounded-se",["border-start-end-radius"]],["rounded-ee",["border-end-end-radius"]],["rounded-es",["border-end-start-radius"]],["rounded-tl",["border-top-left-radius"]],["rounded-tr",["border-top-right-radius"]],["rounded-br",["border-bottom-right-radius"]],["rounded-bl",["border-bottom-left-radius"]]]]),borderWidth:L("borderWidth",[["border",[["@defaults border-width",{}],"border-width"]],[["border-x",[["@defaults border-width",{}],"border-left-width","border-right-width"]],["border-y",[["@defaults border-width",{}],"border-top-width","border-bottom-width"]]],[["border-s",[["@defaults border-width",{}],"border-inline-start-width"]],["border-e",[["@defaults border-width",{}],"border-inline-end-width"]],["border-t",[["@defaults border-width",{}],"border-top-width"]],["border-r",[["@defaults border-width",{}],"border-right-width"]],["border-b",[["@defaults border-width",{}],"border-bottom-width"]],["border-l",[["@defaults border-width",{}],"border-left-width"]]]],{type:["line-width","length"]}),borderStyle:({addUtilities:r})=>{r({".border-solid":{"border-style":"solid"},".border-dashed":{"border-style":"dashed"},".border-dotted":{"border-style":"dotted"},".border-double":{"border-style":"double"},".border-hidden":{"border-style":"hidden"},".border-none":{"border-style":"none"}})},borderColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({border:i=>t("borderOpacity")?Ae({color:i,property:"border-color",variable:"--tw-border-opacity"}):{"border-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]}),r({"border-x":i=>t("borderOpacity")?Ae({color:i,property:["border-left-color","border-right-color"],variable:"--tw-border-opacity"}):{"border-left-color":X(i),"border-right-color":X(i)},"border-y":i=>t("borderOpacity")?Ae({color:i,property:["border-top-color","border-bottom-color"],variable:"--tw-border-opacity"}):{"border-top-color":X(i),"border-bottom-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]}),r({"border-s":i=>t("borderOpacity")?Ae({color:i,property:"border-inline-start-color",variable:"--tw-border-opacity"}):{"border-inline-start-color":X(i)},"border-e":i=>t("borderOpacity")?Ae({color:i,property:"border-inline-end-color",variable:"--tw-border-opacity"}):{"border-inline-end-color":X(i)},"border-t":i=>t("borderOpacity")?Ae({color:i,property:"border-top-color",variable:"--tw-border-opacity"}):{"border-top-color":X(i)},"border-r":i=>t("borderOpacity")?Ae({color:i,property:"border-right-color",variable:"--tw-border-opacity"}):{"border-right-color":X(i)},"border-b":i=>t("borderOpacity")?Ae({color:i,property:"border-bottom-color",variable:"--tw-border-opacity"}):{"border-bottom-color":X(i)},"border-l":i=>t("borderOpacity")?Ae({color:i,property:"border-left-color",variable:"--tw-border-opacity"}):{"border-left-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]})},borderOpacity:L("borderOpacity",[["border-opacity",["--tw-border-opacity"]]]),backgroundColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({bg:i=>t("backgroundOpacity")?Ae({color:i,property:"background-color",variable:"--tw-bg-opacity"}):{"background-color":X(i)}},{values:xe(e("backgroundColor")),type:["color","any"]})},backgroundOpacity:L("backgroundOpacity",[["bg-opacity",["--tw-bg-opacity"]]]),backgroundImage:L("backgroundImage",[["bg",["background-image"]]],{type:["lookup","image","url"]}),gradientColorStops:(()=>{function r(e){return Ze(e,0,"rgb(255 255 255 / 0)")}return function({matchUtilities:e,theme:t,addDefaults:i}){i("gradient-color-stops",{"--tw-gradient-from-position":" ","--tw-gradient-via-position":" ","--tw-gradient-to-position":" "});let n={values:xe(t("gradientColorStops")),type:["color","any"]},a={values:t("gradientColorStopPositions"),type:["length","percentage"]};e({from:s=>{let o=r(s);return{"@defaults gradient-color-stops":{},"--tw-gradient-from":`${X(s)} var(--tw-gradient-from-position)`,"--tw-gradient-to":`${o} var(--tw-gradient-to-position)`,"--tw-gradient-stops":"var(--tw-gradient-from), var(--tw-gradient-to)"}}},n),e({from:s=>({"--tw-gradient-from-position":s})},a),e({via:s=>{let o=r(s);return{"@defaults gradient-color-stops":{},"--tw-gradient-to":`${o}  var(--tw-gradient-to-position)`,"--tw-gradient-stops":`var(--tw-gradient-from), ${X(s)} var(--tw-gradient-via-position), var(--tw-gradient-to)`}}},n),e({via:s=>({"--tw-gradient-via-position":s})},a),e({to:s=>({"@defaults gradient-color-stops":{},"--tw-gradient-to":`${X(s)} var(--tw-gradient-to-position)`})},n),e({to:s=>({"--tw-gradient-to-position":s})},a)}})(),boxDecorationBreak:({addUtilities:r})=>{r({".decoration-slice":{"box-decoration-break":"slice"},".decoration-clone":{"box-decoration-break":"clone"},".box-decoration-slice":{"box-decoration-break":"slice"},".box-decoration-clone":{"box-decoration-break":"clone"}})},backgroundSize:L("backgroundSize",[["bg",["background-size"]]],{type:["lookup","length","percentage","size"]}),backgroundAttachment:({addUtilities:r})=>{r({".bg-fixed":{"background-attachment":"fixed"},".bg-local":{"background-attachment":"local"},".bg-scroll":{"background-attachment":"scroll"}})},backgroundClip:({addUtilities:r})=>{r({".bg-clip-border":{"background-clip":"border-box"},".bg-clip-padding":{"background-clip":"padding-box"},".bg-clip-content":{"background-clip":"content-box"},".bg-clip-text":{"background-clip":"text"}})},backgroundPosition:L("backgroundPosition",[["bg",["background-position"]]],{type:["lookup",["position",{preferOnConflict:!0}]]}),backgroundRepeat:({addUtilities:r})=>{r({".bg-repeat":{"background-repeat":"repeat"},".bg-no-repeat":{"background-repeat":"no-repeat"},".bg-repeat-x":{"background-repeat":"repeat-x"},".bg-repeat-y":{"background-repeat":"repeat-y"},".bg-repeat-round":{"background-repeat":"round"},".bg-repeat-space":{"background-repeat":"space"}})},backgroundOrigin:({addUtilities:r})=>{r({".bg-origin-border":{"background-origin":"border-box"},".bg-origin-padding":{"background-origin":"padding-box"},".bg-origin-content":{"background-origin":"content-box"}})},fill:({matchUtilities:r,theme:e})=>{r({fill:t=>({fill:X(t)})},{values:xe(e("fill")),type:["color","any"]})},stroke:({matchUtilities:r,theme:e})=>{r({stroke:t=>({stroke:X(t)})},{values:xe(e("stroke")),type:["color","url","any"]})},strokeWidth:L("strokeWidth",[["stroke",["stroke-width"]]],{type:["length","number","percentage"]}),objectFit:({addUtilities:r})=>{r({".object-contain":{"object-fit":"contain"},".object-cover":{"object-fit":"cover"},".object-fill":{"object-fit":"fill"},".object-none":{"object-fit":"none"},".object-scale-down":{"object-fit":"scale-down"}})},objectPosition:L("objectPosition",[["object",["object-position"]]]),padding:L("padding",[["p",["padding"]],[["px",["padding-left","padding-right"]],["py",["padding-top","padding-bottom"]]],[["ps",["padding-inline-start"]],["pe",["padding-inline-end"]],["pt",["padding-top"]],["pr",["padding-right"]],["pb",["padding-bottom"]],["pl",["padding-left"]]]]),textAlign:({addUtilities:r})=>{r({".text-left":{"text-align":"left"},".text-center":{"text-align":"center"},".text-right":{"text-align":"right"},".text-justify":{"text-align":"justify"},".text-start":{"text-align":"start"},".text-end":{"text-align":"end"}})},textIndent:L("textIndent",[["indent",["text-indent"]]],{supportsNegativeValues:!0}),verticalAlign:({addUtilities:r,matchUtilities:e})=>{r({".align-baseline":{"vertical-align":"baseline"},".align-top":{"vertical-align":"top"},".align-middle":{"vertical-align":"middle"},".align-bottom":{"vertical-align":"bottom"},".align-text-top":{"vertical-align":"text-top"},".align-text-bottom":{"vertical-align":"text-bottom"},".align-sub":{"vertical-align":"sub"},".align-super":{"vertical-align":"super"}}),e({align:t=>({"vertical-align":t})})},fontFamily:({matchUtilities:r,theme:e})=>{r({font:t=>{let[i,n={}]=Array.isArray(t)&&ke(t[1])?t:[t],{fontFeatureSettings:a,fontVariationSettings:s}=n;return{"font-family":Array.isArray(i)?i.join(", "):i,...a===void 0?{}:{"font-feature-settings":a},...s===void 0?{}:{"font-variation-settings":s}}}},{values:e("fontFamily"),type:["lookup","generic-name","family-name"]})},fontSize:({matchUtilities:r,theme:e})=>{r({text:(t,{modifier:i})=>{let[n,a]=Array.isArray(t)?t:[t];if(i)return{"font-size":n,"line-height":i};let{lineHeight:s,letterSpacing:o,fontWeight:l}=ke(a)?a:{lineHeight:a};return{"font-size":n,...s===void 0?{}:{"line-height":s},...o===void 0?{}:{"letter-spacing":o},...l===void 0?{}:{"font-weight":l}}}},{values:e("fontSize"),modifiers:e("lineHeight"),type:["absolute-size","relative-size","length","percentage"]})},fontWeight:L("fontWeight",[["font",["fontWeight"]]],{type:["lookup","number","any"]}),textTransform:({addUtilities:r})=>{r({".uppercase":{"text-transform":"uppercase"},".lowercase":{"text-transform":"lowercase"},".capitalize":{"text-transform":"capitalize"},".normal-case":{"text-transform":"none"}})},fontStyle:({addUtilities:r})=>{r({".italic":{"font-style":"italic"},".not-italic":{"font-style":"normal"}})},fontVariantNumeric:({addDefaults:r,addUtilities:e})=>{let t="var(--tw-ordinal) var(--tw-slashed-zero) var(--tw-numeric-figure) var(--tw-numeric-spacing) var(--tw-numeric-fraction)";r("font-variant-numeric",{"--tw-ordinal":" ","--tw-slashed-zero":" ","--tw-numeric-figure":" ","--tw-numeric-spacing":" ","--tw-numeric-fraction":" "}),e({".normal-nums":{"font-variant-numeric":"normal"},".ordinal":{"@defaults font-variant-numeric":{},"--tw-ordinal":"ordinal","font-variant-numeric":t},".slashed-zero":{"@defaults font-variant-numeric":{},"--tw-slashed-zero":"slashed-zero","font-variant-numeric":t},".lining-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-figure":"lining-nums","font-variant-numeric":t},".oldstyle-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-figure":"oldstyle-nums","font-variant-numeric":t},".proportional-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-spacing":"proportional-nums","font-variant-numeric":t},".tabular-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-spacing":"tabular-nums","font-variant-numeric":t},".diagonal-fractions":{"@defaults font-variant-numeric":{},"--tw-numeric-fraction":"diagonal-fractions","font-variant-numeric":t},".stacked-fractions":{"@defaults font-variant-numeric":{},"--tw-numeric-fraction":"stacked-fractions","font-variant-numeric":t}})},lineHeight:L("lineHeight",[["leading",["lineHeight"]]]),letterSpacing:L("letterSpacing",[["tracking",["letterSpacing"]]],{supportsNegativeValues:!0}),textColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({text:i=>t("textOpacity")?Ae({color:i,property:"color",variable:"--tw-text-opacity"}):{color:X(i)}},{values:xe(e("textColor")),type:["color","any"]})},textOpacity:L("textOpacity",[["text-opacity",["--tw-text-opacity"]]]),textDecoration:({addUtilities:r})=>{r({".underline":{"text-decoration-line":"underline"},".overline":{"text-decoration-line":"overline"},".line-through":{"text-decoration-line":"line-through"},".no-underline":{"text-decoration-line":"none"}})},textDecorationColor:({matchUtilities:r,theme:e})=>{r({decoration:t=>({"text-decoration-color":X(t)})},{values:xe(e("textDecorationColor")),type:["color","any"]})},textDecorationStyle:({addUtilities:r})=>{r({".decoration-solid":{"text-decoration-style":"solid"},".decoration-double":{"text-decoration-style":"double"},".decoration-dotted":{"text-decoration-style":"dotted"},".decoration-dashed":{"text-decoration-style":"dashed"},".decoration-wavy":{"text-decoration-style":"wavy"}})},textDecorationThickness:L("textDecorationThickness",[["decoration",["text-decoration-thickness"]]],{type:["length","percentage"]}),textUnderlineOffset:L("textUnderlineOffset",[["underline-offset",["text-underline-offset"]]],{type:["length","percentage","any"]}),fontSmoothing:({addUtilities:r})=>{r({".antialiased":{"-webkit-font-smoothing":"antialiased","-moz-osx-font-smoothing":"grayscale"},".subpixel-antialiased":{"-webkit-font-smoothing":"auto","-moz-osx-font-smoothing":"auto"}})},placeholderColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({placeholder:i=>t("placeholderOpacity")?{"&::placeholder":Ae({color:i,property:"color",variable:"--tw-placeholder-opacity"})}:{"&::placeholder":{color:X(i)}}},{values:xe(e("placeholderColor")),type:["color","any"]})},placeholderOpacity:({matchUtilities:r,theme:e})=>{r({"placeholder-opacity":t=>({["&::placeholder"]:{"--tw-placeholder-opacity":t}})},{values:e("placeholderOpacity")})},caretColor:({matchUtilities:r,theme:e})=>{r({caret:t=>({"caret-color":X(t)})},{values:xe(e("caretColor")),type:["color","any"]})},accentColor:({matchUtilities:r,theme:e})=>{r({accent:t=>({"accent-color":X(t)})},{values:xe(e("accentColor")),type:["color","any"]})},opacity:L("opacity",[["opacity",["opacity"]]]),backgroundBlendMode:({addUtilities:r})=>{r({".bg-blend-normal":{"background-blend-mode":"normal"},".bg-blend-multiply":{"background-blend-mode":"multiply"},".bg-blend-screen":{"background-blend-mode":"screen"},".bg-blend-overlay":{"background-blend-mode":"overlay"},".bg-blend-darken":{"background-blend-mode":"darken"},".bg-blend-lighten":{"background-blend-mode":"lighten"},".bg-blend-color-dodge":{"background-blend-mode":"color-dodge"},".bg-blend-color-burn":{"background-blend-mode":"color-burn"},".bg-blend-hard-light":{"background-blend-mode":"hard-light"},".bg-blend-soft-light":{"background-blend-mode":"soft-light"},".bg-blend-difference":{"background-blend-mode":"difference"},".bg-blend-exclusion":{"background-blend-mode":"exclusion"},".bg-blend-hue":{"background-blend-mode":"hue"},".bg-blend-saturation":{"background-blend-mode":"saturation"},".bg-blend-color":{"background-blend-mode":"color"},".bg-blend-luminosity":{"background-blend-mode":"luminosity"}})},mixBlendMode:({addUtilities:r})=>{r({".mix-blend-normal":{"mix-blend-mode":"normal"},".mix-blend-multiply":{"mix-blend-mode":"multiply"},".mix-blend-screen":{"mix-blend-mode":"screen"},".mix-blend-overlay":{"mix-blend-mode":"overlay"},".mix-blend-darken":{"mix-blend-mode":"darken"},".mix-blend-lighten":{"mix-blend-mode":"lighten"},".mix-blend-color-dodge":{"mix-blend-mode":"color-dodge"},".mix-blend-color-burn":{"mix-blend-mode":"color-burn"},".mix-blend-hard-light":{"mix-blend-mode":"hard-light"},".mix-blend-soft-light":{"mix-blend-mode":"soft-light"},".mix-blend-difference":{"mix-blend-mode":"difference"},".mix-blend-exclusion":{"mix-blend-mode":"exclusion"},".mix-blend-hue":{"mix-blend-mode":"hue"},".mix-blend-saturation":{"mix-blend-mode":"saturation"},".mix-blend-color":{"mix-blend-mode":"color"},".mix-blend-luminosity":{"mix-blend-mode":"luminosity"},".mix-blend-plus-darker":{"mix-blend-mode":"plus-darker"},".mix-blend-plus-lighter":{"mix-blend-mode":"plus-lighter"}})},boxShadow:(()=>{let r=mt("boxShadow"),e=["var(--tw-ring-offset-shadow, 0 0 #0000)","var(--tw-ring-shadow, 0 0 #0000)","var(--tw-shadow)"].join(", ");return function({matchUtilities:t,addDefaults:i,theme:n}){i("box-shadow",{"--tw-ring-offset-shadow":"0 0 #0000","--tw-ring-shadow":"0 0 #0000","--tw-shadow":"0 0 #0000","--tw-shadow-colored":"0 0 #0000"}),t({shadow:a=>{a=r(a);let s=Ji(a);for(let o of s)!o.valid||(o.color="var(--tw-shadow-color)");return{"@defaults box-shadow":{},"--tw-shadow":a==="none"?"0 0 #0000":a,"--tw-shadow-colored":a==="none"?"0 0 #0000":qf(s),"box-shadow":e}}},{values:n("boxShadow"),type:["shadow"]})}})(),boxShadowColor:({matchUtilities:r,theme:e})=>{r({shadow:t=>({"--tw-shadow-color":X(t),"--tw-shadow":"var(--tw-shadow-colored)"})},{values:xe(e("boxShadowColor")),type:["color","any"]})},outlineStyle:({addUtilities:r})=>{r({".outline-none":{outline:"2px solid transparent","outline-offset":"2px"},".outline":{"outline-style":"solid"},".outline-dashed":{"outline-style":"dashed"},".outline-dotted":{"outline-style":"dotted"},".outline-double":{"outline-style":"double"}})},outlineWidth:L("outlineWidth",[["outline",["outline-width"]]],{type:["length","number","percentage"]}),outlineOffset:L("outlineOffset",[["outline-offset",["outline-offset"]]],{type:["length","number","percentage","any"],supportsNegativeValues:!0}),outlineColor:({matchUtilities:r,theme:e})=>{r({outline:t=>({"outline-color":X(t)})},{values:xe(e("outlineColor")),type:["color","any"]})},ringWidth:({matchUtilities:r,addDefaults:e,addUtilities:t,theme:i,config:n})=>{let a=(()=>{if(we(n(),"respectDefaultRingColorOpacity"))return i("ringColor.DEFAULT");let s=i("ringOpacity.DEFAULT","0.5");return i("ringColor")?.DEFAULT?Ze(i("ringColor")?.DEFAULT,s,`rgb(147 197 253 / ${s})`):`rgb(147 197 253 / ${s})`})();e("ring-width",{"--tw-ring-inset":" ","--tw-ring-offset-width":i("ringOffsetWidth.DEFAULT","0px"),"--tw-ring-offset-color":i("ringOffsetColor.DEFAULT","#fff"),"--tw-ring-color":a,"--tw-ring-offset-shadow":"0 0 #0000","--tw-ring-shadow":"0 0 #0000","--tw-shadow":"0 0 #0000","--tw-shadow-colored":"0 0 #0000"}),r({ring:s=>({"@defaults ring-width":{},"--tw-ring-offset-shadow":"var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color)","--tw-ring-shadow":`var(--tw-ring-inset) 0 0 0 calc(${s} + var(--tw-ring-offset-width)) var(--tw-ring-color)`,"box-shadow":["var(--tw-ring-offset-shadow)","var(--tw-ring-shadow)","var(--tw-shadow, 0 0 #0000)"].join(", ")})},{values:i("ringWidth"),type:"length"}),t({".ring-inset":{"@defaults ring-width":{},"--tw-ring-inset":"inset"}})},ringColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({ring:i=>t("ringOpacity")?Ae({color:i,property:"--tw-ring-color",variable:"--tw-ring-opacity"}):{"--tw-ring-color":X(i)}},{values:Object.fromEntries(Object.entries(xe(e("ringColor"))).filter(([i])=>i!=="DEFAULT")),type:["color","any"]})},ringOpacity:r=>{let{config:e}=r;return L("ringOpacity",[["ring-opacity",["--tw-ring-opacity"]]],{filterDefault:!we(e(),"respectDefaultRingColorOpacity")})(r)},ringOffsetWidth:L("ringOffsetWidth",[["ring-offset",["--tw-ring-offset-width"]]],{type:"length"}),ringOffsetColor:({matchUtilities:r,theme:e})=>{r({"ring-offset":t=>({"--tw-ring-offset-color":X(t)})},{values:xe(e("ringOffsetColor")),type:["color","any"]})},blur:({matchUtilities:r,theme:e})=>{r({blur:t=>({"--tw-blur":t.trim()===""?" ":`blur(${t})`,"@defaults filter":{},filter:nt})},{values:e("blur")})},brightness:({matchUtilities:r,theme:e})=>{r({brightness:t=>({"--tw-brightness":`brightness(${t})`,"@defaults filter":{},filter:nt})},{values:e("brightness")})},contrast:({matchUtilities:r,theme:e})=>{r({contrast:t=>({"--tw-contrast":`contrast(${t})`,"@defaults filter":{},filter:nt})},{values:e("contrast")})},dropShadow:({matchUtilities:r,theme:e})=>{r({"drop-shadow":t=>({"--tw-drop-shadow":Array.isArray(t)?t.map(i=>`drop-shadow(${i})`).join(" "):`drop-shadow(${t})`,"@defaults filter":{},filter:nt})},{values:e("dropShadow")})},grayscale:({matchUtilities:r,theme:e})=>{r({grayscale:t=>({"--tw-grayscale":`grayscale(${t})`,"@defaults filter":{},filter:nt})},{values:e("grayscale")})},hueRotate:({matchUtilities:r,theme:e})=>{r({"hue-rotate":t=>({"--tw-hue-rotate":`hue-rotate(${t})`,"@defaults filter":{},filter:nt})},{values:e("hueRotate"),supportsNegativeValues:!0})},invert:({matchUtilities:r,theme:e})=>{r({invert:t=>({"--tw-invert":`invert(${t})`,"@defaults filter":{},filter:nt})},{values:e("invert")})},saturate:({matchUtilities:r,theme:e})=>{r({saturate:t=>({"--tw-saturate":`saturate(${t})`,"@defaults filter":{},filter:nt})},{values:e("saturate")})},sepia:({matchUtilities:r,theme:e})=>{r({sepia:t=>({"--tw-sepia":`sepia(${t})`,"@defaults filter":{},filter:nt})},{values:e("sepia")})},filter:({addDefaults:r,addUtilities:e})=>{r("filter",{"--tw-blur":" ","--tw-brightness":" ","--tw-contrast":" ","--tw-grayscale":" ","--tw-hue-rotate":" ","--tw-invert":" ","--tw-saturate":" ","--tw-sepia":" ","--tw-drop-shadow":" "}),e({".filter":{"@defaults filter":{},filter:nt},".filter-none":{filter:"none"}})},backdropBlur:({matchUtilities:r,theme:e})=>{r({"backdrop-blur":t=>({"--tw-backdrop-blur":t.trim()===""?" ":`blur(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropBlur")})},backdropBrightness:({matchUtilities:r,theme:e})=>{r({"backdrop-brightness":t=>({"--tw-backdrop-brightness":`brightness(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropBrightness")})},backdropContrast:({matchUtilities:r,theme:e})=>{r({"backdrop-contrast":t=>({"--tw-backdrop-contrast":`contrast(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropContrast")})},backdropGrayscale:({matchUtilities:r,theme:e})=>{r({"backdrop-grayscale":t=>({"--tw-backdrop-grayscale":`grayscale(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropGrayscale")})},backdropHueRotate:({matchUtilities:r,theme:e})=>{r({"backdrop-hue-rotate":t=>({"--tw-backdrop-hue-rotate":`hue-rotate(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropHueRotate"),supportsNegativeValues:!0})},backdropInvert:({matchUtilities:r,theme:e})=>{r({"backdrop-invert":t=>({"--tw-backdrop-invert":`invert(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropInvert")})},backdropOpacity:({matchUtilities:r,theme:e})=>{r({"backdrop-opacity":t=>({"--tw-backdrop-opacity":`opacity(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropOpacity")})},backdropSaturate:({matchUtilities:r,theme:e})=>{r({"backdrop-saturate":t=>({"--tw-backdrop-saturate":`saturate(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropSaturate")})},backdropSepia:({matchUtilities:r,theme:e})=>{r({"backdrop-sepia":t=>({"--tw-backdrop-sepia":`sepia(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropSepia")})},backdropFilter:({addDefaults:r,addUtilities:e})=>{r("backdrop-filter",{"--tw-backdrop-blur":" ","--tw-backdrop-brightness":" ","--tw-backdrop-contrast":" ","--tw-backdrop-grayscale":" ","--tw-backdrop-hue-rotate":" ","--tw-backdrop-invert":" ","--tw-backdrop-opacity":" ","--tw-backdrop-saturate":" ","--tw-backdrop-sepia":" "}),e({".backdrop-filter":{"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge},".backdrop-filter-none":{"-webkit-backdrop-filter":"none","backdrop-filter":"none"}})},transitionProperty:({matchUtilities:r,theme:e})=>{let t=e("transitionTimingFunction.DEFAULT"),i=e("transitionDuration.DEFAULT");r({transition:n=>({"transition-property":n,...n==="none"?{}:{"transition-timing-function":t,"transition-duration":i}})},{values:e("transitionProperty")})},transitionDelay:L("transitionDelay",[["delay",["transitionDelay"]]]),transitionDuration:L("transitionDuration",[["duration",["transitionDuration"]]],{filterDefault:!0}),transitionTimingFunction:L("transitionTimingFunction",[["ease",["transitionTimingFunction"]]],{filterDefault:!0}),willChange:L("willChange",[["will-change",["will-change"]]]),contain:({addDefaults:r,addUtilities:e})=>{let t="var(--tw-contain-size) var(--tw-contain-layout) var(--tw-contain-paint) var(--tw-contain-style)";r("contain",{"--tw-contain-size":" ","--tw-contain-layout":" ","--tw-contain-paint":" ","--tw-contain-style":" "}),e({".contain-none":{contain:"none"},".contain-content":{contain:"content"},".contain-strict":{contain:"strict"},".contain-size":{"@defaults contain":{},"--tw-contain-size":"size",contain:t},".contain-inline-size":{"@defaults contain":{},"--tw-contain-size":"inline-size",contain:t},".contain-layout":{"@defaults contain":{},"--tw-contain-layout":"layout",contain:t},".contain-paint":{"@defaults contain":{},"--tw-contain-paint":"paint",contain:t},".contain-style":{"@defaults contain":{},"--tw-contain-style":"style",contain:t}})},content:L("content",[["content",["--tw-content",["content","var(--tw-content)"]]]]),forcedColorAdjust:({addUtilities:r})=>{r({".forced-color-adjust-auto":{"forced-color-adjust":"auto"},".forced-color-adjust-none":{"forced-color-adjust":"none"}})}}});function p_(r){if(r===void 0)return!1;if(r==="true"||r==="1")return!0;if(r==="false"||r==="0")return!1;if(r==="*")return!0;let e=r.split(",").map(t=>t.split(":")[0]);return e.includes("-tailwindcss")?!1:!!e.includes("tailwindcss")}var Je,yh,bh,Zn,Lo,gt,Ei,It=R(()=>{u();Je=typeof m!="undefined"?{NODE_ENV:"production",DEBUG:p_(m.env.DEBUG)}:{NODE_ENV:"production",DEBUG:!1},yh=new Map,bh=new Map,Zn=new Map,Lo=new Map,gt=new String("*"),Ei=Symbol("__NONE__")});function cr(r){let e=[],t=!1;for(let i=0;i<r.length;i++){let n=r[i];if(n===":"&&!t&&e.length===0)return!1;if(d_.has(n)&&r[i-1]!=="\\"&&(t=!t),!t&&r[i-1]!=="\\"){if(wh.has(n))e.push(n);else if(vh.has(n)){let a=vh.get(n);if(e.length<=0||e.pop()!==a)return!1}}}return!(e.length>0)}var wh,vh,d_,Mo=R(()=>{u();wh=new Map([["{","}"],["[","]"],["(",")"]]),vh=new Map(Array.from(wh.entries()).map(([r,e])=>[e,r])),d_=new Set(['"',"'","`"])});function pr(r){let[e]=xh(r);return e.forEach(([t,i])=>t.removeChild(i)),r.nodes.push(...e.map(([,t])=>t)),r}function xh(r){let e=[],t=null;for(let i of r.nodes)if(i.type==="combinator")e=e.filter(([,n])=>Bo(n).includes("jumpable")),t=null;else if(i.type==="pseudo"){h_(i)?(t=i,e.push([r,i,null])):t&&m_(i,t)?e.push([r,i,t]):t=null;for(let n of i.nodes??[]){let[a,s]=xh(n);t=s||t,e.push(...a)}}return[e,t]}function kh(r){return r.value.startsWith("::")||No[r.value]!==void 0}function h_(r){return kh(r)&&Bo(r).includes("terminal")}function m_(r,e){return r.type!=="pseudo"||kh(r)?!1:Bo(e).includes("actionable")}function Bo(r){return No[r.value]??No.__default__}var No,es=R(()=>{u();No={"::after":["terminal","jumpable"],"::backdrop":["terminal","jumpable"],"::before":["terminal","jumpable"],"::cue":["terminal"],"::cue-region":["terminal"],"::first-letter":["terminal","jumpable"],"::first-line":["terminal","jumpable"],"::grammar-error":["terminal"],"::marker":["terminal","jumpable"],"::part":["terminal","actionable"],"::placeholder":["terminal","jumpable"],"::selection":["terminal","jumpable"],"::slotted":["terminal"],"::spelling-error":["terminal"],"::target-text":["terminal"],"::file-selector-button":["terminal","actionable"],"::deep":["actionable"],"::v-deep":["actionable"],"::ng-deep":["actionable"],":after":["terminal","jumpable"],":before":["terminal","jumpable"],":first-letter":["terminal","jumpable"],":first-line":["terminal","jumpable"],":where":[],":is":[],":has":[],__default__:["terminal","actionable"]}});function dr(r,{context:e,candidate:t}){let i=e?.tailwindConfig.prefix??"",n=r.map(s=>{let o=(0,st.default)().astSync(s.format);return{...s,ast:s.respectPrefix?ur(i,o):o}}),a=st.default.root({nodes:[st.default.selector({nodes:[st.default.className({value:Te(t)})]})]});for(let{ast:s}of n)[a,s]=y_(a,s),s.walkNesting(o=>o.replaceWith(...a.nodes[0].nodes)),a=s;return a}function Ah(r){let e=[];for(;r.prev()&&r.prev().type!=="combinator";)r=r.prev();for(;r&&r.type!=="combinator";)e.push(r),r=r.next();return e}function g_(r){return r.sort((e,t)=>e.type==="tag"&&t.type==="class"?-1:e.type==="class"&&t.type==="tag"?1:e.type==="class"&&t.type==="pseudo"&&t.value.startsWith("::")?-1:e.type==="pseudo"&&e.value.startsWith("::")&&t.type==="class"?1:r.index(e)-r.index(t)),r}function jo(r,e){let t=!1;r.walk(i=>{if(i.type==="class"&&i.value===e)return t=!0,!1}),t||r.remove()}function ts(r,e,{context:t,candidate:i,base:n}){let a=t?.tailwindConfig?.separator??":";n=n??ve(i,a).pop();let s=(0,st.default)().astSync(r);if(s.walkClasses(f=>{f.raws&&f.value.includes(n)&&(f.raws.value=Te((0,Sh.default)(f.raws.value)))}),s.each(f=>jo(f,n)),s.length===0)return null;let o=Array.isArray(e)?dr(e,{context:t,candidate:i}):e;if(o===null)return s.toString();let l=st.default.comment({value:"/*__simple__*/"}),c=st.default.comment({value:"/*__simple__*/"});return s.walkClasses(f=>{if(f.value!==n)return;let d=f.parent,p=o.nodes[0].nodes;if(d.nodes.length===1){f.replaceWith(...p);return}let h=Ah(f);d.insertBefore(h[0],l),d.insertAfter(h[h.length-1],c);for(let v of p)d.insertBefore(h[0],v.clone());f.remove(),h=Ah(l);let b=d.index(l);d.nodes.splice(b,h.length,...g_(st.default.selector({nodes:h})).nodes),l.remove(),c.remove()}),s.walkPseudos(f=>{f.value===Fo&&f.replaceWith(f.nodes)}),s.each(f=>pr(f)),s.toString()}function y_(r,e){let t=[];return r.walkPseudos(i=>{i.value===Fo&&t.push({pseudo:i,value:i.nodes[0].toString()})}),e.walkPseudos(i=>{if(i.value!==Fo)return;let n=i.nodes[0].toString(),a=t.find(c=>c.value===n);if(!a)return;let s=[],o=i.next();for(;o&&o.type!=="combinator";)s.push(o),o=o.next();let l=o;a.pseudo.parent.insertAfter(a.pseudo,st.default.selector({nodes:s.map(c=>c.clone())})),i.remove(),s.forEach(c=>c.remove()),l&&l.type==="combinator"&&l.remove()}),[r,e]}var st,Sh,Fo,zo=R(()=>{u();st=pe(it()),Sh=pe(Rn());fr();Wn();es();zt();Fo=":merge"});function rs(r,e){let t=(0,Uo.default)().astSync(r);return t.each(i=>{i.nodes.some(a=>a.type==="combinator")&&(i.nodes=[Uo.default.pseudo({value:":is",nodes:[i.clone()]})]),pr(i)}),`${e} ${t.toString()}`}var Uo,Vo=R(()=>{u();Uo=pe(it());es()});function Ho(r){return b_.transformSync(r)}function*w_(r){let e=1/0;for(;e>=0;){let t,i=!1;if(e===1/0&&r.endsWith("]")){let s=r.indexOf("[");r[s-1]==="-"?t=s-1:r[s-1]==="/"?(t=s-1,i=!0):t=-1}else e===1/0&&r.includes("/")?(t=r.lastIndexOf("/"),i=!0):t=r.lastIndexOf("-",e);if(t<0)break;let n=r.slice(0,t),a=r.slice(i?t:t+1);e=t-1,!(n===""||a==="/")&&(yield[n,a])}}function v_(r,e){if(r.length===0||e.tailwindConfig.prefix==="")return r;for(let t of r){let[i]=t;if(i.options.respectPrefix){let n=ee.root({nodes:[t[1].clone()]}),a=t[1].raws.tailwind.classCandidate;n.walkRules(s=>{let o=a.startsWith("-");s.selector=ur(e.tailwindConfig.prefix,s.selector,o)}),t[1]=n.nodes[0]}}return r}function x_(r,e){if(r.length===0)return r;let t=[];function i(n){return n.parent&&n.parent.type==="atrule"&&n.parent.name==="keyframes"}for(let[n,a]of r){let s=ee.root({nodes:[a.clone()]});s.walkRules(o=>{if(i(o))return;let l=(0,is.default)().astSync(o.selector);l.each(c=>jo(c,e)),Wf(l,c=>c===e?`!${c}`:c),o.selector=l.toString(),o.walkDecls(c=>c.important=!0)}),t.push([{...n,important:!0},s.nodes[0]])}return t}function k_(r,e,t){if(e.length===0)return e;let i={modifier:null,value:Ei};{let[n,...a]=ve(r,"/");if(a.length>1&&(n=n+"/"+a.slice(0,-1).join("/"),a=a.slice(-1)),a.length&&!t.variantMap.has(r)&&(r=n,i.modifier=a[0],!we(t.tailwindConfig,"generalizedModifiers")))return[]}if(r.endsWith("]")&&!r.startsWith("[")){let n=/(.)(-?)\[(.*)\]/g.exec(r);if(n){let[,a,s,o]=n;if(a==="@"&&s==="-")return[];if(a!=="@"&&s==="")return[];r=r.replace(`${s}[${o}]`,""),i.value=o}}if(Qo(r)&&!t.variantMap.has(r)){let n=t.offsets.recordVariant(r),a=K(r.slice(1,-1)),s=ve(a,",");if(s.length>1)return[];if(!s.every(os))return[];let o=s.map((l,c)=>[t.offsets.applyParallelOffset(n,c),Oi(l.trim())]);t.variantMap.set(r,o)}if(t.variantMap.has(r)){let n=Qo(r),a=t.variantOptions.get(r)?.[Pt]??{},s=t.variantMap.get(r).slice(),o=[],l=(()=>!(n||a.respectPrefix===!1))();for(let[c,f]of e){if(c.layer==="user")continue;let d=ee.root({nodes:[f.clone()]});for(let[p,h,b]of s){let w=function(){v.raws.neededBackup||(v.raws.neededBackup=!0,v.walkRules(O=>O.raws.originalSelector=O.selector))},k=function(O){return w(),v.each(B=>{B.type==="rule"&&(B.selectors=B.selectors.map(N=>O({get className(){return Ho(N)},selector:N})))}),v},v=(b??d).clone(),y=[],S=h({get container(){return w(),v},separator:t.tailwindConfig.separator,modifySelectors:k,wrap(O){let B=v.nodes;v.removeAll(),O.append(B),v.append(O)},format(O){y.push({format:O,respectPrefix:l})},args:i});if(Array.isArray(S)){for(let[O,B]of S.entries())s.push([t.offsets.applyParallelOffset(p,O),B,v.clone()]);continue}if(typeof S=="string"&&y.push({format:S,respectPrefix:l}),S===null)continue;v.raws.neededBackup&&(delete v.raws.neededBackup,v.walkRules(O=>{let B=O.raws.originalSelector;if(!B||(delete O.raws.originalSelector,B===O.selector))return;let N=O.selector,T=(0,is.default)(F=>{F.walkClasses(Y=>{Y.value=`${r}${t.tailwindConfig.separator}${Y.value}`})}).processSync(B);y.push({format:N.replace(T,"&"),respectPrefix:l}),O.selector=B})),v.nodes[0].raws.tailwind={...v.nodes[0].raws.tailwind,parentLayer:c.layer};let E=[{...c,sort:t.offsets.applyVariantOffset(c.sort,p,Object.assign(i,t.variantOptions.get(r))),collectedFormats:(c.collectedFormats??[]).concat(y)},v.nodes[0]];o.push(E)}}return o}return[]}function Wo(r,e,t={}){return!ke(r)&&!Array.isArray(r)?[[r],t]:Array.isArray(r)?Wo(r[0],e,r[1]):(e.has(r)||e.set(r,lr(r)),[e.get(r),t])}function A_(r){return S_.test(r)}function C_(r){if(!r.includes("://"))return!1;try{let e=new URL(r);return e.scheme!==""&&e.host!==""}catch(e){return!1}}function Ch(r){let e=!0;return r.walkDecls(t=>{if(!_h(t.prop,t.value))return e=!1,!1}),e}function _h(r,e){if(C_(`${r}:${e}`))return!1;try{return ee.parse(`a{${r}:${e}}`).toResult(),!0}catch(t){return!1}}function __(r,e){let[,t,i]=r.match(/^\[([a-zA-Z0-9-_]+):(\S+)\]$/)??[];if(i===void 0||!A_(t)||!cr(i))return null;let n=K(i,{property:t});return _h(t,n)?[[{sort:e.offsets.arbitraryProperty(r),layer:"utilities",options:{respectImportant:!0}},()=>({[Do(r)]:{[t]:n}})]]:null}function*E_(r,e){e.candidateRuleMap.has(r)&&(yield[e.candidateRuleMap.get(r),"DEFAULT"]),yield*function*(o){o!==null&&(yield[o,"DEFAULT"])}(__(r,e));let t=r,i=!1,n=e.tailwindConfig.prefix,a=n.length,s=t.startsWith(n)||t.startsWith(`-${n}`);t[a]==="-"&&s&&(i=!0,t=n+t.slice(a+1)),i&&e.candidateRuleMap.has(t)&&(yield[e.candidateRuleMap.get(t),"-DEFAULT"]);for(let[o,l]of w_(t))e.candidateRuleMap.has(o)&&(yield[e.candidateRuleMap.get(o),i?`-${l}`:l])}function O_(r,e){return r===gt?[gt]:ve(r,e)}function*T_(r,e){for(let t of r)t[1].raws.tailwind={...t[1].raws.tailwind,classCandidate:e,preserveSource:t[0].options?.preserveSource??!1},yield t}function*Go(r,e){let t=e.tailwindConfig.separator,[i,...n]=O_(r,t).reverse(),a=!1;i.startsWith("!")&&(a=!0,i=i.slice(1));for(let s of E_(i,e)){let o=[],l=new Map,[c,f]=s,d=c.length===1;for(let[p,h]of c){let b=[];if(typeof h=="function")for(let v of[].concat(h(f,{isOnlyPlugin:d}))){let[y,w]=Wo(v,e.postCssNodeCache);for(let k of y)b.push([{...p,options:{...p.options,...w}},k])}else if(f==="DEFAULT"||f==="-DEFAULT"){let v=h,[y,w]=Wo(v,e.postCssNodeCache);for(let k of y)b.push([{...p,options:{...p.options,...w}},k])}if(b.length>0){let v=Array.from(Zs(p.options?.types??[],f,p.options??{},e.tailwindConfig)).map(([y,w])=>w);v.length>0&&l.set(b,v),o.push(b)}}if(Qo(f)){if(o.length>1){let b=function(y){return y.length===1?y[0]:y.find(w=>{let k=l.get(w);return w.some(([{options:S},E])=>Ch(E)?S.types.some(({type:O,preferOnConflict:B})=>k.includes(O)&&B):!1)})},[p,h]=o.reduce((y,w)=>(w.some(([{options:S}])=>S.types.some(({type:E})=>E==="any"))?y[0].push(w):y[1].push(w),y),[[],[]]),v=b(h)??b(p);if(v)o=[v];else{let y=o.map(k=>new Set([...l.get(k)??[]]));for(let k of y)for(let S of k){let E=!1;for(let O of y)k!==O&&O.has(S)&&(O.delete(S),E=!0);E&&k.delete(S)}let w=[];for(let[k,S]of y.entries())for(let E of S){let O=o[k].map(([,B])=>B).flat().map(B=>B.toString().split(`
-`).slice(1,-1).map(N=>N.trim()).map(N=>`      ${N}`).join(`
-`)).join(`
-
-`);w.push(`  Use \`${r.replace("[",`[${E}:`)}\` for \`${O.trim()}\``);break}G.warn([`The class \`${r}\` is ambiguous and matches multiple utilities.`,...w,`If this is content and not a class, replace it with \`${r.replace("[","&lsqb;").replace("]","&rsqb;")}\` to silence this warning.`]);continue}}o=o.map(p=>p.filter(h=>Ch(h[1])))}o=o.flat(),o=Array.from(T_(o,i)),o=v_(o,e),a&&(o=x_(o,i));for(let p of n)o=k_(p,o,e);for(let p of o)p[1].raws.tailwind={...p[1].raws.tailwind,candidate:r},p=R_(p,{context:e,candidate:r}),p!==null&&(yield p)}}function R_(r,{context:e,candidate:t}){if(!r[0].collectedFormats)return r;let i=!0,n;try{n=dr(r[0].collectedFormats,{context:e,candidate:t})}catch{return null}let a=ee.root({nodes:[r[1].clone()]});return a.walkRules(s=>{if(!ns(s))try{let o=ts(s.selector,n,{candidate:t,context:e});if(o===null){s.remove();return}s.selector=o}catch{return i=!1,!1}}),!i||a.nodes.length===0?null:(r[1]=a.nodes[0],r)}function ns(r){return r.parent&&r.parent.type==="atrule"&&r.parent.name==="keyframes"}function P_(r){if(r===!0)return e=>{ns(e)||e.walkDecls(t=>{t.parent.type==="rule"&&!ns(t.parent)&&(t.important=!0)})};if(typeof r=="string")return e=>{ns(e)||(e.selectors=e.selectors.map(t=>rs(t,r)))}}function ss(r,e,t=!1){let i=[],n=P_(e.tailwindConfig.important);for(let a of r){if(e.notClassCache.has(a))continue;if(e.candidateRuleCache.has(a)){i=i.concat(Array.from(e.candidateRuleCache.get(a)));continue}let s=Array.from(Go(a,e));if(s.length===0){e.notClassCache.add(a);continue}e.classCache.set(a,s);let o=e.candidateRuleCache.get(a)??new Set;e.candidateRuleCache.set(a,o);for(let l of s){let[{sort:c,options:f},d]=l;if(f.respectImportant&&n){let h=ee.root({nodes:[d.clone()]});h.walkRules(n),d=h.nodes[0]}let p=[c,t?d.clone():d];o.add(p),e.ruleCache.add(p),i.push(p)}}return i}function Qo(r){return r.startsWith("[")&&r.endsWith("]")}var is,b_,S_,as=R(()=>{u();Ot();is=pe(it());Io();Kt();Wn();Fr();Be();It();zo();qo();Br();_i();Mo();zt();ct();Vo();b_=(0,is.default)(r=>r.first.filter(({type:e})=>e==="class").pop().value);S_=/^[a-z_-]/});var Eh,Oh=R(()=>{u();Eh={}});function I_(r){try{return Eh.createHash("md5").update(r,"utf-8").digest("binary")}catch(e){return""}}function Th(r,e){let t=e.toString();if(!t.includes("@tailwind"))return!1;let i=Lo.get(r),n=I_(t),a=i!==n;return Lo.set(r,n),a}var Rh=R(()=>{u();Oh();It()});function ls(r){return(r>0n)-(r<0n)}var Ph=R(()=>{u()});function Ih(r,e){let t=0n,i=0n;for(let[n,a]of e)r&n&&(t=t|n,i=i|a);return r&~t|i}var Dh=R(()=>{u()});function qh(r){let e=null;for(let t of r)e=e??t,e=e>t?e:t;return e}function D_(r,e){let t=r.length,i=e.length,n=t<i?t:i;for(let a=0;a<n;a++){let s=r.charCodeAt(a)-e.charCodeAt(a);if(s!==0)return s}return t-i}var Yo,$h=R(()=>{u();Ph();Dh();Yo=class{constructor(){this.offsets={defaults:0n,base:0n,components:0n,utilities:0n,variants:0n,user:0n},this.layerPositions={defaults:0n,base:1n,components:2n,utilities:3n,user:4n,variants:5n},this.reservedVariantBits=0n,this.variantOffsets=new Map}create(e){return{layer:e,parentLayer:e,arbitrary:0n,variants:0n,parallelIndex:0n,index:this.offsets[e]++,propertyOffset:0n,property:"",options:[]}}arbitraryProperty(e){return{...this.create("utilities"),arbitrary:1n,property:e}}forVariant(e,t=0){let i=this.variantOffsets.get(e);if(i===void 0)throw new Error(`Cannot find offset for unknown variant ${e}`);return{...this.create("variants"),variants:i<<BigInt(t)}}applyVariantOffset(e,t,i){return i.variant=t.variants,{...e,layer:"variants",parentLayer:e.layer==="variants"?e.parentLayer:e.layer,variants:e.variants|t.variants,options:i.sort?[].concat(i,e.options):e.options,parallelIndex:qh([e.parallelIndex,t.parallelIndex])}}applyParallelOffset(e,t){return{...e,parallelIndex:BigInt(t)}}recordVariants(e,t){for(let i of e)this.recordVariant(i,t(i))}recordVariant(e,t=1){return this.variantOffsets.set(e,1n<<this.reservedVariantBits),this.reservedVariantBits+=BigInt(t),{...this.create("variants"),variants:this.variantOffsets.get(e)}}compare(e,t){if(e.layer!==t.layer)return this.layerPositions[e.layer]-this.layerPositions[t.layer];if(e.parentLayer!==t.parentLayer)return this.layerPositions[e.parentLayer]-this.layerPositions[t.parentLayer];for(let i of e.options)for(let n of t.options){if(i.id!==n.id||!i.sort||!n.sort)continue;let a=qh([i.variant,n.variant])??0n,s=~(a|a-1n),o=e.variants&s,l=t.variants&s;if(o!==l)continue;let c=i.sort({value:i.value,modifier:i.modifier},{value:n.value,modifier:n.modifier});if(c!==0)return c}return e.variants!==t.variants?e.variants-t.variants:e.parallelIndex!==t.parallelIndex?e.parallelIndex-t.parallelIndex:e.arbitrary!==t.arbitrary?e.arbitrary-t.arbitrary:e.propertyOffset!==t.propertyOffset?e.propertyOffset-t.propertyOffset:e.index-t.index}recalculateVariantOffsets(){let e=Array.from(this.variantOffsets.entries()).filter(([n])=>n.startsWith("[")).sort(([n],[a])=>D_(n,a)),t=e.map(([,n])=>n).sort((n,a)=>ls(n-a));return e.map(([,n],a)=>[n,t[a]]).filter(([n,a])=>n!==a)}remapArbitraryVariantOffsets(e){let t=this.recalculateVariantOffsets();return t.length===0?e:e.map(i=>{let[n,a]=i;return n={...n,variants:Ih(n.variants,t)},[n,a]})}sortArbitraryProperties(e){let t=new Set;for(let[s]of e)s.arbitrary===1n&&t.add(s.property);if(t.size===0)return e;let i=Array.from(t).sort(),n=new Map,a=1n;for(let s of i)n.set(s,a++);return e.map(s=>{let[o,l]=s;return o={...o,propertyOffset:n.get(o.property)??0n},[o,l]})}sort(e){return e=this.remapArbitraryVariantOffsets(e),e=this.sortArbitraryProperties(e),e.sort(([t],[i])=>ls(this.compare(t,i)))}}});function Zo(r,e){let t=r.tailwindConfig.prefix;return typeof t=="function"?t(e):t+e}function Mh({type:r="any",...e}){let t=[].concat(r);return{...e,types:t.map(i=>Array.isArray(i)?{type:i[0],...i[1]}:{type:i,preferOnConflict:!1})}}function q_(r){let e=[],t="",i=0;for(let n=0;n<r.length;n++){let a=r[n];if(a==="\\")t+="\\"+r[++n];else if(a==="{")++i,e.push(t.trim()),t="";else if(a==="}"){if(--i<0)throw new Error("Your { and } are unbalanced.");e.push(t.trim()),t=""}else t+=a}return t.length>0&&e.push(t.trim()),e=e.filter(n=>n!==""),e}function $_(r,e,{before:t=[]}={}){if(t=[].concat(t),t.length<=0){r.push(e);return}let i=r.length-1;for(let n of t){let a=r.indexOf(n);a!==-1&&(i=Math.min(i,a))}r.splice(i,0,e)}function Nh(r){return Array.isArray(r)?r.flatMap(e=>!Array.isArray(e)&&!ke(e)?e:lr(e)):Nh([r])}function L_(r,e){return(0,Ko.default)(i=>{let n=[];return e&&e(i),i.walkClasses(a=>{n.push(a.value)}),n}).transformSync(r)}function M_(r){r.walkPseudos(e=>{e.value===":not"&&e.remove()})}function N_(r,e={containsNonOnDemandable:!1},t=0){let i=[],n=[];r.type==="rule"?n.push(...r.selectors):r.type==="atrule"&&r.walkRules(a=>n.push(...a.selectors));for(let a of n){let s=L_(a,M_);s.length===0&&(e.containsNonOnDemandable=!0);for(let o of s)i.push(o)}return t===0?[e.containsNonOnDemandable||i.length===0,i]:i}function us(r){return Nh(r).flatMap(e=>{let t=new Map,[i,n]=N_(e);return i&&n.unshift(gt),n.map(a=>(t.has(e)||t.set(e,e),[a,t.get(e)]))})}function os(r){return r.startsWith("@")||r.includes("&")}function Oi(r){r=r.replace(/\n+/g,"").replace(/\s{1,}/g," ").trim();let e=q_(r).map(t=>{if(!t.startsWith("@"))return({format:a})=>a(t);let[,i,n]=/@(\S*)( .+|[({].*)?/g.exec(t);return({wrap:a})=>a(ee.atRule({name:i,params:n?.trim()??""}))}).reverse();return t=>{for(let i of e)i(t)}}function B_(r,e,{variantList:t,variantMap:i,offsets:n,classList:a}){function s(p,h){return p?(0,Lh.default)(r,p,h):r}function o(p){return ur(r.prefix,p)}function l(p,h){return p===gt?gt:h.respectPrefix?e.tailwindConfig.prefix+p:p}function c(p,h,b={}){let v=kt(p),y=s(["theme",...v],h);return mt(v[0])(y,b)}let f=0,d={postcss:ee,prefix:o,e:Te,config:s,theme:c,corePlugins:p=>Array.isArray(r.corePlugins)?r.corePlugins.includes(p):s(["corePlugins",p],!0),variants:()=>[],addBase(p){for(let[h,b]of us(p)){let v=l(h,{}),y=n.create("base");e.candidateRuleMap.has(v)||e.candidateRuleMap.set(v,[]),e.candidateRuleMap.get(v).push([{sort:y,layer:"base"},b])}},addDefaults(p,h){let b={[`@defaults ${p}`]:h};for(let[v,y]of us(b)){let w=l(v,{});e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("defaults"),layer:"defaults"},y])}},addComponents(p,h){h=Object.assign({},{preserveSource:!1,respectPrefix:!0,respectImportant:!1},Array.isArray(h)?{}:h);for(let[v,y]of us(p)){let w=l(v,h);a.add(w),e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("components"),layer:"components",options:h},y])}},addUtilities(p,h){h=Object.assign({},{preserveSource:!1,respectPrefix:!0,respectImportant:!0},Array.isArray(h)?{}:h);for(let[v,y]of us(p)){let w=l(v,h);a.add(w),e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("utilities"),layer:"utilities",options:h},y])}},matchUtilities:function(p,h){h=Mh({...{respectPrefix:!0,respectImportant:!0,modifiers:!1},...h});let v=n.create("utilities");for(let y in p){let S=function(O,{isOnlyPlugin:B}){let[N,T,F]=Js(h.types,O,h,r);if(N===void 0)return[];if(!h.types.some(({type:U})=>U===T))if(B)G.warn([`Unnecessary typehint \`${T}\` in \`${y}-${O}\`.`,`You can safely update it to \`${y}-${O.replace(T+":","")}\`.`]);else return[];if(!cr(N))return[];let Y={get modifier(){return h.modifiers||G.warn(`modifier-used-without-options-for-${y}`,["Your plugin must set `modifiers: true` in its options to support modifiers."]),F}},_=we(r,"generalizedModifiers");return[].concat(_?k(N,Y):k(N)).filter(Boolean).map(U=>({[Gn(y,O)]:U}))},w=l(y,h),k=p[y];a.add([w,h]);let E=[{sort:v,layer:"utilities",options:h},S];e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push(E)}},matchComponents:function(p,h){h=Mh({...{respectPrefix:!0,respectImportant:!1,modifiers:!1},...h});let v=n.create("components");for(let y in p){let S=function(O,{isOnlyPlugin:B}){let[N,T,F]=Js(h.types,O,h,r);if(N===void 0)return[];if(!h.types.some(({type:U})=>U===T))if(B)G.warn([`Unnecessary typehint \`${T}\` in \`${y}-${O}\`.`,`You can safely update it to \`${y}-${O.replace(T+":","")}\`.`]);else return[];if(!cr(N))return[];let Y={get modifier(){return h.modifiers||G.warn(`modifier-used-without-options-for-${y}`,["Your plugin must set `modifiers: true` in its options to support modifiers."]),F}},_=we(r,"generalizedModifiers");return[].concat(_?k(N,Y):k(N)).filter(Boolean).map(U=>({[Gn(y,O)]:U}))},w=l(y,h),k=p[y];a.add([w,h]);let E=[{sort:v,layer:"components",options:h},S];e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push(E)}},addVariant(p,h,b={}){h=[].concat(h).map(v=>{if(typeof v!="string")return(y={})=>{let{args:w,modifySelectors:k,container:S,separator:E,wrap:O,format:B}=y,N=v(Object.assign({modifySelectors:k,container:S,separator:E},b.type===Xo.MatchVariant&&{args:w,wrap:O,format:B}));if(typeof N=="string"&&!os(N))throw new Error(`Your custom variant \`${p}\` has an invalid format string. Make sure it's an at-rule or contains a \`&\` placeholder.`);return Array.isArray(N)?N.filter(T=>typeof T=="string").map(T=>Oi(T)):N&&typeof N=="string"&&Oi(N)(y)};if(!os(v))throw new Error(`Your custom variant \`${p}\` has an invalid format string. Make sure it's an at-rule or contains a \`&\` placeholder.`);return Oi(v)}),$_(t,p,b),i.set(p,h),e.variantOptions.set(p,b)},matchVariant(p,h,b){let v=b?.id??++f,y=p==="@",w=we(r,"generalizedModifiers");for(let[S,E]of Object.entries(b?.values??{}))S!=="DEFAULT"&&d.addVariant(y?`${p}${S}`:`${p}-${S}`,({args:O,container:B})=>h(E,w?{modifier:O?.modifier,container:B}:{container:B}),{...b,value:E,id:v,type:Xo.MatchVariant,variantInfo:Jo.Base});let k="DEFAULT"in(b?.values??{});d.addVariant(p,({args:S,container:E})=>S?.value===Ei&&!k?null:h(S?.value===Ei?b.values.DEFAULT:S?.value??(typeof S=="string"?S:""),w?{modifier:S?.modifier,container:E}:{container:E}),{...b,id:v,type:Xo.MatchVariant,variantInfo:Jo.Dynamic})}};return d}function fs(r){return el.has(r)||el.set(r,new Map),el.get(r)}function Bh(r,e){let t=!1,i=new Map;for(let n of r){if(!n)continue;let a=sa.parse(n),s=a.hash?a.href.replace(a.hash,""):a.href;s=a.search?s.replace(a.search,""):s;let o=be.statSync(decodeURIComponent(s),{throwIfNoEntry:!1})?.mtimeMs;!o||((!e.has(n)||o>e.get(n))&&(t=!0),i.set(n,o))}return[t,i]}function Fh(r){r.walkAtRules(e=>{["responsive","variants"].includes(e.name)&&(Fh(e),e.before(e.nodes),e.remove())})}function F_(r){let e=[];return r.each(t=>{t.type==="atrule"&&["responsive","variants"].includes(t.name)&&(t.name="layer",t.params="utilities")}),r.walkAtRules("layer",t=>{if(Fh(t),t.params==="base"){for(let i of t.nodes)e.push(function({addBase:n}){n(i,{respectPrefix:!1})});t.remove()}else if(t.params==="components"){for(let i of t.nodes)e.push(function({addComponents:n}){n(i,{respectPrefix:!1,preserveSource:!0})});t.remove()}else if(t.params==="utilities"){for(let i of t.nodes)e.push(function({addUtilities:n}){n(i,{respectPrefix:!1,preserveSource:!0})});t.remove()}}),e}function j_(r,e){let t=Object.entries({...se,...mh}).map(([l,c])=>r.tailwindConfig.corePlugins.includes(l)?c:null).filter(Boolean),i=r.tailwindConfig.plugins.map(l=>(l.__isOptionsFunction&&(l=l()),typeof l=="function"?l:l.handler)),n=F_(e),a=[se.childVariant,se.pseudoElementVariants,se.pseudoClassVariants,se.hasVariants,se.ariaVariants,se.dataVariants],s=[se.supportsVariants,se.reducedMotionVariants,se.prefersContrastVariants,se.screenVariants,se.orientationVariants,se.directionVariants,se.darkVariants,se.forcedColorsVariants,se.printVariant];return(r.tailwindConfig.darkMode==="class"||Array.isArray(r.tailwindConfig.darkMode)&&r.tailwindConfig.darkMode[0]==="class")&&(s=[se.supportsVariants,se.reducedMotionVariants,se.prefersContrastVariants,se.darkVariants,se.screenVariants,se.orientationVariants,se.directionVariants,se.forcedColorsVariants,se.printVariant]),[...t,...a,...i,...s,...n]}function z_(r,e){let t=[],i=new Map;e.variantMap=i;let n=new Yo;e.offsets=n;let a=new Set,s=B_(e.tailwindConfig,e,{variantList:t,variantMap:i,offsets:n,classList:a});for(let f of r)if(Array.isArray(f))for(let d of f)d(s);else f?.(s);n.recordVariants(t,f=>i.get(f).length);for(let[f,d]of i.entries())e.variantMap.set(f,d.map((p,h)=>[n.forVariant(f,h),p]));let o=(e.tailwindConfig.safelist??[]).filter(Boolean);if(o.length>0){let f=[];for(let d of o){if(typeof d=="string"){e.changedContent.push({content:d,extension:"html"});continue}if(d instanceof RegExp){G.warn("root-regex",["Regular expressions in `safelist` work differently in Tailwind CSS v3.0.","Update your `safelist` configuration to eliminate this warning.","https://tailwindcss.com/docs/content-configuration#safelisting-classes"]);continue}f.push(d)}if(f.length>0){let d=new Map,p=e.tailwindConfig.prefix.length,h=f.some(b=>b.pattern.source.includes("!"));for(let b of a){let v=Array.isArray(b)?(()=>{let[y,w]=b,S=Object.keys(w?.values??{}).map(E=>Ci(y,E));return w?.supportsNegativeValues&&(S=[...S,...S.map(E=>"-"+E)],S=[...S,...S.map(E=>E.slice(0,p)+"-"+E.slice(p))]),w.types.some(({type:E})=>E==="color")&&(S=[...S,...S.flatMap(E=>Object.keys(e.tailwindConfig.theme.opacity).map(O=>`${E}/${O}`))]),h&&w?.respectImportant&&(S=[...S,...S.map(E=>"!"+E)]),S})():[b];for(let y of v)for(let{pattern:w,variants:k=[]}of f)if(w.lastIndex=0,d.has(w)||d.set(w,0),!!w.test(y)){d.set(w,d.get(w)+1),e.changedContent.push({content:y,extension:"html"});for(let S of k)e.changedContent.push({content:S+e.tailwindConfig.separator+y,extension:"html"})}}for(let[b,v]of d.entries())v===0&&G.warn([`The safelist pattern \`${b}\` doesn't match any Tailwind CSS classes.`,"Fix this pattern or remove it from your `safelist` configuration.","https://tailwindcss.com/docs/content-configuration#safelisting-classes"])}}let l=[].concat(e.tailwindConfig.darkMode??"media")[1]??"dark",c=[Zo(e,l),Zo(e,"group"),Zo(e,"peer")];e.getClassOrder=function(d){let p=[...d].sort((y,w)=>y===w?0:y<w?-1:1),h=new Map(p.map(y=>[y,null])),b=ss(new Set(p),e,!0);b=e.offsets.sort(b);let v=BigInt(c.length);for(let[,y]of b){let w=y.raws.tailwind.candidate;h.set(w,h.get(w)??v++)}return d.map(y=>{let w=h.get(y)??null,k=c.indexOf(y);return w===null&&k!==-1&&(w=BigInt(k)),[y,w]})},e.getClassList=function(d={}){let p=[];for(let h of a)if(Array.isArray(h)){let[b,v]=h,y=[],w=Object.keys(v?.modifiers??{});v?.types?.some(({type:E})=>E==="color")&&w.push(...Object.keys(e.tailwindConfig.theme.opacity??{}));let k={modifiers:w},S=d.includeMetadata&&w.length>0;for(let[E,O]of Object.entries(v?.values??{})){if(O==null)continue;let B=Ci(b,E);if(p.push(S?[B,k]:B),v?.supportsNegativeValues&&xt(O)){let N=Ci(b,`-${E}`);y.push(S?[N,k]:N)}}p.push(...y)}else p.push(h);return p},e.getVariants=function(){let d=Math.random().toString(36).substring(7).toUpperCase(),p=[];for(let[h,b]of e.variantOptions.entries())b.variantInfo!==Jo.Base&&p.push({name:h,isArbitrary:b.type===Symbol.for("MATCH_VARIANT"),values:Object.keys(b.values??{}),hasDash:h!=="@",selectors({modifier:v,value:y}={}){let w=`TAILWINDPLACEHOLDER${d}`,k=ee.rule({selector:`.${w}`}),S=ee.root({nodes:[k.clone()]}),E=S.toString(),O=(e.variantMap.get(h)??[]).flatMap(([oe,A])=>A),B=[];for(let oe of O){let A=[],C={args:{modifier:v,value:b.values?.[y]??y},separator:e.tailwindConfig.separator,modifySelectors(V){return S.each(Ee=>{Ee.type==="rule"&&(Ee.selectors=Ee.selectors.map(Ie=>V({get className(){return Ho(Ie)},selector:Ie})))}),S},format(V){A.push(V)},wrap(V){A.push(`@${V.name} ${V.params} { & }`)},container:S},he=oe(C);if(A.length>0&&B.push(A),Array.isArray(he))for(let V of he)A=[],V(C),B.push(A)}let N=[],T=S.toString();E!==T&&(S.walkRules(oe=>{let A=oe.selector,C=(0,Ko.default)(he=>{he.walkClasses(V=>{V.value=`${h}${e.tailwindConfig.separator}${V.value}`})}).processSync(A);N.push(A.replace(C,"&").replace(w,"&"))}),S.walkAtRules(oe=>{N.push(`@${oe.name} (${oe.params}) { & }`)}));let F=!(y in(b.values??{})),Y=b[Pt]??{},_=(()=>!(F||Y.respectPrefix===!1))();B=B.map(oe=>oe.map(A=>({format:A,respectPrefix:_}))),N=N.map(oe=>({format:oe,respectPrefix:_}));let Q={candidate:w,context:e},U=B.map(oe=>ts(`.${w}`,dr(oe,Q),Q).replace(`.${w}`,"&").replace("{ & }","").trim());return N.length>0&&U.push(dr(N,Q).toString().replace(`.${w}`,"&")),U}});return p}}function jh(r,e){!r.classCache.has(e)||(r.notClassCache.add(e),r.classCache.delete(e),r.applyClassCache.delete(e),r.candidateRuleMap.delete(e),r.candidateRuleCache.delete(e),r.stylesheetCache=null)}function U_(r,e){let t=e.raws.tailwind.candidate;if(!!t){for(let i of r.ruleCache)i[1].raws.tailwind.candidate===t&&r.ruleCache.delete(i);jh(r,t)}}function tl(r,e=[],t=ee.root()){let i={disposables:[],ruleCache:new Set,candidateRuleCache:new Map,classCache:new Map,applyClassCache:new Map,notClassCache:new Set(r.blocklist??[]),postCssNodeCache:new Map,candidateRuleMap:new Map,tailwindConfig:r,changedContent:e,variantMap:new Map,stylesheetCache:null,variantOptions:new Map,markInvalidUtilityCandidate:a=>jh(i,a),markInvalidUtilityNode:a=>U_(i,a)},n=j_(i,t);return z_(n,i),i}function zh(r,e,t,i,n,a){let s=e.opts.from,o=i!==null;Je.DEBUG&&console.log("Source path:",s);let l;if(o&&hr.has(s))l=hr.get(s);else if(Ti.has(n)){let p=Ti.get(n);Dt.get(p).add(s),hr.set(s,p),l=p}let c=Th(s,r);if(l){let[p,h]=Bh([...a],fs(l));if(!p&&!c)return[l,!1,h]}if(hr.has(s)){let p=hr.get(s);if(Dt.has(p)&&(Dt.get(p).delete(s),Dt.get(p).size===0)){Dt.delete(p);for(let[h,b]of Ti)b===p&&Ti.delete(h);for(let h of p.disposables.splice(0))h(p)}}Je.DEBUG&&console.log("Setting up new context...");let f=tl(t,[],r);Object.assign(f,{userConfigPath:i});let[,d]=Bh([...a],fs(f));return Ti.set(n,f),hr.set(s,f),Dt.has(f)||Dt.set(f,new Set),Dt.get(f).add(s),[f,!0,d]}var Lh,Ko,Pt,Xo,Jo,el,hr,Ti,Dt,_i=R(()=>{u();ft();aa();Ot();Lh=pe(Oa()),Ko=pe(it());Si();Io();Wn();Kt();fr();qo();Fr();gh();It();It();Gi();Be();Hi();Mo();as();Rh();$h();ct();zo();Pt=Symbol(),Xo={AddVariant:Symbol.for("ADD_VARIANT"),MatchVariant:Symbol.for("MATCH_VARIANT")},Jo={Base:1<<0,Dynamic:1<<1};el=new WeakMap;hr=yh,Ti=bh,Dt=Zn});function rl(r){return r.ignore?[]:r.glob?m.env.ROLLUP_WATCH==="true"?[{type:"dependency",file:r.base}]:[{type:"dir-dependency",dir:r.base,glob:r.glob}]:[{type:"dependency",file:r.base}]}var Uh=R(()=>{u()});function Vh(r,e){return{handler:r,config:e}}var Hh,Wh=R(()=>{u();Vh.withOptions=function(r,e=()=>({})){let t=function(i){return{__options:i,handler:r(i),config:e(i)}};return t.__isOptionsFunction=!0,t.__pluginFunction=r,t.__configFunction=e,t};Hh=Vh});var il={};Ge(il,{default:()=>V_});var V_,nl=R(()=>{u();Wh();V_=Hh});var Qh=x((F4,Gh)=>{u();var H_=(nl(),il).default,W_={overflow:"hidden",display:"-webkit-box","-webkit-box-orient":"vertical"},G_=H_(function({matchUtilities:r,addUtilities:e,theme:t,variants:i}){let n=t("lineClamp");r({"line-clamp":a=>({...W_,"-webkit-line-clamp":`${a}`})},{values:n}),e([{".line-clamp-none":{"-webkit-line-clamp":"unset"}}],i("lineClamp"))},{theme:{lineClamp:{1:"1",2:"2",3:"3",4:"4",5:"5",6:"6"}},variants:{lineClamp:["responsive"]}});Gh.exports=G_});function sl(r){r.content.files.length===0&&G.warn("content-problems",["The `content` option in your Tailwind CSS configuration is missing or empty.","Configure your content sources or your generated CSS will be missing styles.","https://tailwindcss.com/docs/content-configuration"]);try{let e=Qh();r.plugins.includes(e)&&(G.warn("line-clamp-in-core",["As of Tailwind CSS v3.3, the `@tailwindcss/line-clamp` plugin is now included by default.","Remove it from the `plugins` array in your configuration to eliminate this warning."]),r.plugins=r.plugins.filter(t=>t!==e))}catch{}return r}var Yh=R(()=>{u();Be()});var Kh,Xh=R(()=>{u();Kh=()=>!1});var cs,Jh=R(()=>{u();cs={sync:r=>[].concat(r),generateTasks:r=>[{dynamic:!1,base:".",negative:[],positive:[].concat(r),patterns:[].concat(r)}],escapePath:r=>r}});var al,Zh=R(()=>{u();al=r=>r});var em,tm=R(()=>{u();em=()=>""});function rm(r){let e=r,t=em(r);return t!=="."&&(e=r.substr(t.length),e.charAt(0)==="/"&&(e=e.substr(1))),e.substr(0,2)==="./"?e=e.substr(2):e.charAt(0)==="/"&&(e=e.substr(1)),{base:t,glob:e}}var im=R(()=>{u();tm()});var ps=x(Ve=>{u();"use strict";Ve.isInteger=r=>typeof r=="number"?Number.isInteger(r):typeof r=="string"&&r.trim()!==""?Number.isInteger(Number(r)):!1;Ve.find=(r,e)=>r.nodes.find(t=>t.type===e);Ve.exceedsLimit=(r,e,t=1,i)=>i===!1||!Ve.isInteger(r)||!Ve.isInteger(e)?!1:(Number(e)-Number(r))/Number(t)>=i;Ve.escapeNode=(r,e=0,t)=>{let i=r.nodes[e];!i||(t&&i.type===t||i.type==="open"||i.type==="close")&&i.escaped!==!0&&(i.value="\\"+i.value,i.escaped=!0)};Ve.encloseBrace=r=>r.type!=="brace"?!1:r.commas>>0+r.ranges>>0==0?(r.invalid=!0,!0):!1;Ve.isInvalidBrace=r=>r.type!=="brace"?!1:r.invalid===!0||r.dollar?!0:r.commas>>0+r.ranges>>0==0||r.open!==!0||r.close!==!0?(r.invalid=!0,!0):!1;Ve.isOpenOrClose=r=>r.type==="open"||r.type==="close"?!0:r.open===!0||r.close===!0;Ve.reduce=r=>r.reduce((e,t)=>(t.type==="text"&&e.push(t.value),t.type==="range"&&(t.type="text"),e),[]);Ve.flatten=(...r)=>{let e=[],t=i=>{for(let n=0;n<i.length;n++){let a=i[n];Array.isArray(a)?t(a,e):a!==void 0&&e.push(a)}return e};return t(r),e}});var ds=x((K4,sm)=>{u();"use strict";var nm=ps();sm.exports=(r,e={})=>{let t=(i,n={})=>{let a=e.escapeInvalid&&nm.isInvalidBrace(n),s=i.invalid===!0&&e.escapeInvalid===!0,o="";if(i.value)return(a||s)&&nm.isOpenOrClose(i)?"\\"+i.value:i.value;if(i.value)return i.value;if(i.nodes)for(let l of i.nodes)o+=t(l);return o};return t(r)}});var om=x((X4,am)=>{u();"use strict";am.exports=function(r){return typeof r=="number"?r-r==0:typeof r=="string"&&r.trim()!==""?Number.isFinite?Number.isFinite(+r):isFinite(+r):!1}});var gm=x((J4,mm)=>{u();"use strict";var lm=om(),Wt=(r,e,t)=>{if(lm(r)===!1)throw new TypeError("toRegexRange: expected the first argument to be a number");if(e===void 0||r===e)return String(r);if(lm(e)===!1)throw new TypeError("toRegexRange: expected the second argument to be a number.");let i={relaxZeros:!0,...t};typeof i.strictZeros=="boolean"&&(i.relaxZeros=i.strictZeros===!1);let n=String(i.relaxZeros),a=String(i.shorthand),s=String(i.capture),o=String(i.wrap),l=r+":"+e+"="+n+a+s+o;if(Wt.cache.hasOwnProperty(l))return Wt.cache[l].result;let c=Math.min(r,e),f=Math.max(r,e);if(Math.abs(c-f)===1){let v=r+"|"+e;return i.capture?`(${v})`:i.wrap===!1?v:`(?:${v})`}let d=hm(r)||hm(e),p={min:r,max:e,a:c,b:f},h=[],b=[];if(d&&(p.isPadded=d,p.maxLen=String(p.max).length),c<0){let v=f<0?Math.abs(f):1;b=um(v,Math.abs(c),p,i),c=p.a=0}return f>=0&&(h=um(c,f,p,i)),p.negatives=b,p.positives=h,p.result=Q_(b,h,i),i.capture===!0?p.result=`(${p.result})`:i.wrap!==!1&&h.length+b.length>1&&(p.result=`(?:${p.result})`),Wt.cache[l]=p,p.result};function Q_(r,e,t){let i=ol(r,e,"-",!1,t)||[],n=ol(e,r,"",!1,t)||[],a=ol(r,e,"-?",!0,t)||[];return i.concat(a).concat(n).join("|")}function Y_(r,e){let t=1,i=1,n=cm(r,t),a=new Set([e]);for(;r<=n&&n<=e;)a.add(n),t+=1,n=cm(r,t);for(n=pm(e+1,i)-1;r<n&&n<=e;)a.add(n),i+=1,n=pm(e+1,i)-1;return a=[...a],a.sort(J_),a}function K_(r,e,t){if(r===e)return{pattern:r,count:[],digits:0};let i=X_(r,e),n=i.length,a="",s=0;for(let o=0;o<n;o++){let[l,c]=i[o];l===c?a+=l:l!=="0"||c!=="9"?a+=Z_(l,c,t):s++}return s&&(a+=t.shorthand===!0?"\\d":"[0-9]"),{pattern:a,count:[s],digits:n}}function um(r,e,t,i){let n=Y_(r,e),a=[],s=r,o;for(let l=0;l<n.length;l++){let c=n[l],f=K_(String(s),String(c),i),d="";if(!t.isPadded&&o&&o.pattern===f.pattern){o.count.length>1&&o.count.pop(),o.count.push(f.count[0]),o.string=o.pattern+dm(o.count),s=c+1;continue}t.isPadded&&(d=eE(c,t,i)),f.string=d+f.pattern+dm(f.count),a.push(f),s=c+1,o=f}return a}function ol(r,e,t,i,n){let a=[];for(let s of r){let{string:o}=s;!i&&!fm(e,"string",o)&&a.push(t+o),i&&fm(e,"string",o)&&a.push(t+o)}return a}function X_(r,e){let t=[];for(let i=0;i<r.length;i++)t.push([r[i],e[i]]);return t}function J_(r,e){return r>e?1:e>r?-1:0}function fm(r,e,t){return r.some(i=>i[e]===t)}function cm(r,e){return Number(String(r).slice(0,-e)+"9".repeat(e))}function pm(r,e){return r-r%Math.pow(10,e)}function dm(r){let[e=0,t=""]=r;return t||e>1?`{${e+(t?","+t:"")}}`:""}function Z_(r,e,t){return`[${r}${e-r==1?"":"-"}${e}]`}function hm(r){return/^-?(0+)\d/.test(r)}function eE(r,e,t){if(!e.isPadded)return r;let i=Math.abs(e.maxLen-String(r).length),n=t.relaxZeros!==!1;switch(i){case 0:return"";case 1:return n?"0?":"0";case 2:return n?"0{0,2}":"00";default:return n?`0{0,${i}}`:`0{${i}}`}}Wt.cache={};Wt.clearCache=()=>Wt.cache={};mm.exports=Wt});var fl=x((Z4,Am)=>{u();"use strict";var tE=(Bn(),Nn),ym=gm(),bm=r=>r!==null&&typeof r=="object"&&!Array.isArray(r),rE=r=>e=>r===!0?Number(e):String(e),ll=r=>typeof r=="number"||typeof r=="string"&&r!=="",Ri=r=>Number.isInteger(+r),ul=r=>{let e=`${r}`,t=-1;if(e[0]==="-"&&(e=e.slice(1)),e==="0")return!1;for(;e[++t]==="0";);return t>0},iE=(r,e,t)=>typeof r=="string"||typeof e=="string"?!0:t.stringify===!0,nE=(r,e,t)=>{if(e>0){let i=r[0]==="-"?"-":"";i&&(r=r.slice(1)),r=i+r.padStart(i?e-1:e,"0")}return t===!1?String(r):r},wm=(r,e)=>{let t=r[0]==="-"?"-":"";for(t&&(r=r.slice(1),e--);r.length<e;)r="0"+r;return t?"-"+r:r},sE=(r,e)=>{r.negatives.sort((s,o)=>s<o?-1:s>o?1:0),r.positives.sort((s,o)=>s<o?-1:s>o?1:0);let t=e.capture?"":"?:",i="",n="",a;return r.positives.length&&(i=r.positives.join("|")),r.negatives.length&&(n=`-(${t}${r.negatives.join("|")})`),i&&n?a=`${i}|${n}`:a=i||n,e.wrap?`(${t}${a})`:a},vm=(r,e,t,i)=>{if(t)return ym(r,e,{wrap:!1,...i});let n=String.fromCharCode(r);if(r===e)return n;let a=String.fromCharCode(e);return`[${n}-${a}]`},xm=(r,e,t)=>{if(Array.isArray(r)){let i=t.wrap===!0,n=t.capture?"":"?:";return i?`(${n}${r.join("|")})`:r.join("|")}return ym(r,e,t)},km=(...r)=>new RangeError("Invalid range arguments: "+tE.inspect(...r)),Sm=(r,e,t)=>{if(t.strictRanges===!0)throw km([r,e]);return[]},aE=(r,e)=>{if(e.strictRanges===!0)throw new TypeError(`Expected step "${r}" to be a number`);return[]},oE=(r,e,t=1,i={})=>{let n=Number(r),a=Number(e);if(!Number.isInteger(n)||!Number.isInteger(a)){if(i.strictRanges===!0)throw km([r,e]);return[]}n===0&&(n=0),a===0&&(a=0);let s=n>a,o=String(r),l=String(e),c=String(t);t=Math.max(Math.abs(t),1);let f=ul(o)||ul(l)||ul(c),d=f?Math.max(o.length,l.length,c.length):0,p=f===!1&&iE(r,e,i)===!1,h=i.transform||rE(p);if(i.toRegex&&t===1)return vm(wm(r,d),wm(e,d),!0,i);let b={negatives:[],positives:[]},v=k=>b[k<0?"negatives":"positives"].push(Math.abs(k)),y=[],w=0;for(;s?n>=a:n<=a;)i.toRegex===!0&&t>1?v(n):y.push(nE(h(n,w),d,p)),n=s?n-t:n+t,w++;return i.toRegex===!0?t>1?sE(b,i):xm(y,null,{wrap:!1,...i}):y},lE=(r,e,t=1,i={})=>{if(!Ri(r)&&r.length>1||!Ri(e)&&e.length>1)return Sm(r,e,i);let n=i.transform||(p=>String.fromCharCode(p)),a=`${r}`.charCodeAt(0),s=`${e}`.charCodeAt(0),o=a>s,l=Math.min(a,s),c=Math.max(a,s);if(i.toRegex&&t===1)return vm(l,c,!1,i);let f=[],d=0;for(;o?a>=s:a<=s;)f.push(n(a,d)),a=o?a-t:a+t,d++;return i.toRegex===!0?xm(f,null,{wrap:!1,options:i}):f},hs=(r,e,t,i={})=>{if(e==null&&ll(r))return[r];if(!ll(r)||!ll(e))return Sm(r,e,i);if(typeof t=="function")return hs(r,e,1,{transform:t});if(bm(t))return hs(r,e,0,t);let n={...i};return n.capture===!0&&(n.wrap=!0),t=t||n.step||1,Ri(t)?Ri(r)&&Ri(e)?oE(r,e,t,n):lE(r,e,Math.max(Math.abs(t),1),n):t!=null&&!bm(t)?aE(t,n):hs(r,e,1,t)};Am.exports=hs});var Em=x((e6,_m)=>{u();"use strict";var uE=fl(),Cm=ps(),fE=(r,e={})=>{let t=(i,n={})=>{let a=Cm.isInvalidBrace(n),s=i.invalid===!0&&e.escapeInvalid===!0,o=a===!0||s===!0,l=e.escapeInvalid===!0?"\\":"",c="";if(i.isOpen===!0||i.isClose===!0)return l+i.value;if(i.type==="open")return o?l+i.value:"(";if(i.type==="close")return o?l+i.value:")";if(i.type==="comma")return i.prev.type==="comma"?"":o?i.value:"|";if(i.value)return i.value;if(i.nodes&&i.ranges>0){let f=Cm.reduce(i.nodes),d=uE(...f,{...e,wrap:!1,toRegex:!0});if(d.length!==0)return f.length>1&&d.length>1?`(${d})`:d}if(i.nodes)for(let f of i.nodes)c+=t(f,i);return c};return t(r)};_m.exports=fE});var Rm=x((t6,Tm)=>{u();"use strict";var cE=fl(),Om=ds(),mr=ps(),Gt=(r="",e="",t=!1)=>{let i=[];if(r=[].concat(r),e=[].concat(e),!e.length)return r;if(!r.length)return t?mr.flatten(e).map(n=>`{${n}}`):e;for(let n of r)if(Array.isArray(n))for(let a of n)i.push(Gt(a,e,t));else for(let a of e)t===!0&&typeof a=="string"&&(a=`{${a}}`),i.push(Array.isArray(a)?Gt(n,a,t):n+a);return mr.flatten(i)},pE=(r,e={})=>{let t=e.rangeLimit===void 0?1e3:e.rangeLimit,i=(n,a={})=>{n.queue=[];let s=a,o=a.queue;for(;s.type!=="brace"&&s.type!=="root"&&s.parent;)s=s.parent,o=s.queue;if(n.invalid||n.dollar){o.push(Gt(o.pop(),Om(n,e)));return}if(n.type==="brace"&&n.invalid!==!0&&n.nodes.length===2){o.push(Gt(o.pop(),["{}"]));return}if(n.nodes&&n.ranges>0){let d=mr.reduce(n.nodes);if(mr.exceedsLimit(...d,e.step,t))throw new RangeError("expanded array length exceeds range limit. Use options.rangeLimit to increase or disable the limit.");let p=cE(...d,e);p.length===0&&(p=Om(n,e)),o.push(Gt(o.pop(),p)),n.nodes=[];return}let l=mr.encloseBrace(n),c=n.queue,f=n;for(;f.type!=="brace"&&f.type!=="root"&&f.parent;)f=f.parent,c=f.queue;for(let d=0;d<n.nodes.length;d++){let p=n.nodes[d];if(p.type==="comma"&&n.type==="brace"){d===1&&c.push(""),c.push("");continue}if(p.type==="close"){o.push(Gt(o.pop(),c,l));continue}if(p.value&&p.type!=="open"){c.push(Gt(c.pop(),p.value));continue}p.nodes&&i(p,n)}return c};return mr.flatten(i(r))};Tm.exports=pE});var Im=x((r6,Pm)=>{u();"use strict";Pm.exports={MAX_LENGTH:1024*64,CHAR_0:"0",CHAR_9:"9",CHAR_UPPERCASE_A:"A",CHAR_LOWERCASE_A:"a",CHAR_UPPERCASE_Z:"Z",CHAR_LOWERCASE_Z:"z",CHAR_LEFT_PARENTHESES:"(",CHAR_RIGHT_PARENTHESES:")",CHAR_ASTERISK:"*",CHAR_AMPERSAND:"&",CHAR_AT:"@",CHAR_BACKSLASH:"\\",CHAR_BACKTICK:"`",CHAR_CARRIAGE_RETURN:"\r",CHAR_CIRCUMFLEX_ACCENT:"^",CHAR_COLON:":",CHAR_COMMA:",",CHAR_DOLLAR:"$",CHAR_DOT:".",CHAR_DOUBLE_QUOTE:'"',CHAR_EQUAL:"=",CHAR_EXCLAMATION_MARK:"!",CHAR_FORM_FEED:"\f",CHAR_FORWARD_SLASH:"/",CHAR_HASH:"#",CHAR_HYPHEN_MINUS:"-",CHAR_LEFT_ANGLE_BRACKET:"<",CHAR_LEFT_CURLY_BRACE:"{",CHAR_LEFT_SQUARE_BRACKET:"[",CHAR_LINE_FEED:`
-`,CHAR_NO_BREAK_SPACE:"\xA0",CHAR_PERCENT:"%",CHAR_PLUS:"+",CHAR_QUESTION_MARK:"?",CHAR_RIGHT_ANGLE_BRACKET:">",CHAR_RIGHT_CURLY_BRACE:"}",CHAR_RIGHT_SQUARE_BRACKET:"]",CHAR_SEMICOLON:";",CHAR_SINGLE_QUOTE:"'",CHAR_SPACE:" ",CHAR_TAB:"	",CHAR_UNDERSCORE:"_",CHAR_VERTICAL_LINE:"|",CHAR_ZERO_WIDTH_NOBREAK_SPACE:"\uFEFF"}});var Mm=x((i6,Lm)=>{u();"use strict";var dE=ds(),{MAX_LENGTH:Dm,CHAR_BACKSLASH:cl,CHAR_BACKTICK:hE,CHAR_COMMA:mE,CHAR_DOT:gE,CHAR_LEFT_PARENTHESES:yE,CHAR_RIGHT_PARENTHESES:bE,CHAR_LEFT_CURLY_BRACE:wE,CHAR_RIGHT_CURLY_BRACE:vE,CHAR_LEFT_SQUARE_BRACKET:qm,CHAR_RIGHT_SQUARE_BRACKET:$m,CHAR_DOUBLE_QUOTE:xE,CHAR_SINGLE_QUOTE:kE,CHAR_NO_BREAK_SPACE:SE,CHAR_ZERO_WIDTH_NOBREAK_SPACE:AE}=Im(),CE=(r,e={})=>{if(typeof r!="string")throw new TypeError("Expected a string");let t=e||{},i=typeof t.maxLength=="number"?Math.min(Dm,t.maxLength):Dm;if(r.length>i)throw new SyntaxError(`Input length (${r.length}), exceeds max characters (${i})`);let n={type:"root",input:r,nodes:[]},a=[n],s=n,o=n,l=0,c=r.length,f=0,d=0,p,h={},b=()=>r[f++],v=y=>{if(y.type==="text"&&o.type==="dot"&&(o.type="text"),o&&o.type==="text"&&y.type==="text"){o.value+=y.value;return}return s.nodes.push(y),y.parent=s,y.prev=o,o=y,y};for(v({type:"bos"});f<c;)if(s=a[a.length-1],p=b(),!(p===AE||p===SE)){if(p===cl){v({type:"text",value:(e.keepEscaping?p:"")+b()});continue}if(p===$m){v({type:"text",value:"\\"+p});continue}if(p===qm){l++;let y=!0,w;for(;f<c&&(w=b());){if(p+=w,w===qm){l++;continue}if(w===cl){p+=b();continue}if(w===$m&&(l--,l===0))break}v({type:"text",value:p});continue}if(p===yE){s=v({type:"paren",nodes:[]}),a.push(s),v({type:"text",value:p});continue}if(p===bE){if(s.type!=="paren"){v({type:"text",value:p});continue}s=a.pop(),v({type:"text",value:p}),s=a[a.length-1];continue}if(p===xE||p===kE||p===hE){let y=p,w;for(e.keepQuotes!==!0&&(p="");f<c&&(w=b());){if(w===cl){p+=w+b();continue}if(w===y){e.keepQuotes===!0&&(p+=w);break}p+=w}v({type:"text",value:p});continue}if(p===wE){d++;let y=o.value&&o.value.slice(-1)==="$"||s.dollar===!0;s=v({type:"brace",open:!0,close:!1,dollar:y,depth:d,commas:0,ranges:0,nodes:[]}),a.push(s),v({type:"open",value:p});continue}if(p===vE){if(s.type!=="brace"){v({type:"text",value:p});continue}let y="close";s=a.pop(),s.close=!0,v({type:y,value:p}),d--,s=a[a.length-1];continue}if(p===mE&&d>0){if(s.ranges>0){s.ranges=0;let y=s.nodes.shift();s.nodes=[y,{type:"text",value:dE(s)}]}v({type:"comma",value:p}),s.commas++;continue}if(p===gE&&d>0&&s.commas===0){let y=s.nodes;if(d===0||y.length===0){v({type:"text",value:p});continue}if(o.type==="dot"){if(s.range=[],o.value+=p,o.type="range",s.nodes.length!==3&&s.nodes.length!==5){s.invalid=!0,s.ranges=0,o.type="text";continue}s.ranges++,s.args=[];continue}if(o.type==="range"){y.pop();let w=y[y.length-1];w.value+=o.value+p,o=w,s.ranges--;continue}v({type:"dot",value:p});continue}v({type:"text",value:p})}do if(s=a.pop(),s.type!=="root"){s.nodes.forEach(k=>{k.nodes||(k.type==="open"&&(k.isOpen=!0),k.type==="close"&&(k.isClose=!0),k.nodes||(k.type="text"),k.invalid=!0)});let y=a[a.length-1],w=y.nodes.indexOf(s);y.nodes.splice(w,1,...s.nodes)}while(a.length>0);return v({type:"eos"}),n};Lm.exports=CE});var Fm=x((n6,Bm)=>{u();"use strict";var Nm=ds(),_E=Em(),EE=Rm(),OE=Mm(),Le=(r,e={})=>{let t=[];if(Array.isArray(r))for(let i of r){let n=Le.create(i,e);Array.isArray(n)?t.push(...n):t.push(n)}else t=[].concat(Le.create(r,e));return e&&e.expand===!0&&e.nodupes===!0&&(t=[...new Set(t)]),t};Le.parse=(r,e={})=>OE(r,e);Le.stringify=(r,e={})=>typeof r=="string"?Nm(Le.parse(r,e),e):Nm(r,e);Le.compile=(r,e={})=>(typeof r=="string"&&(r=Le.parse(r,e)),_E(r,e));Le.expand=(r,e={})=>{typeof r=="string"&&(r=Le.parse(r,e));let t=EE(r,e);return e.noempty===!0&&(t=t.filter(Boolean)),e.nodupes===!0&&(t=[...new Set(t)]),t};Le.create=(r,e={})=>r===""||r.length<3?[r]:e.expand!==!0?Le.compile(r,e):Le.expand(r,e);Bm.exports=Le});var Pi=x((s6,Hm)=>{u();"use strict";var TE=(et(),Ur),at="\\\\/",jm=`[^${at}]`,yt="\\.",RE="\\+",PE="\\?",ms="\\/",IE="(?=.)",zm="[^/]",pl=`(?:${ms}|$)`,Um=`(?:^|${ms})`,dl=`${yt}{1,2}${pl}`,DE=`(?!${yt})`,qE=`(?!${Um}${dl})`,$E=`(?!${yt}{0,1}${pl})`,LE=`(?!${dl})`,ME=`[^.${ms}]`,NE=`${zm}*?`,Vm={DOT_LITERAL:yt,PLUS_LITERAL:RE,QMARK_LITERAL:PE,SLASH_LITERAL:ms,ONE_CHAR:IE,QMARK:zm,END_ANCHOR:pl,DOTS_SLASH:dl,NO_DOT:DE,NO_DOTS:qE,NO_DOT_SLASH:$E,NO_DOTS_SLASH:LE,QMARK_NO_DOT:ME,STAR:NE,START_ANCHOR:Um},BE={...Vm,SLASH_LITERAL:`[${at}]`,QMARK:jm,STAR:`${jm}*?`,DOTS_SLASH:`${yt}{1,2}(?:[${at}]|$)`,NO_DOT:`(?!${yt})`,NO_DOTS:`(?!(?:^|[${at}])${yt}{1,2}(?:[${at}]|$))`,NO_DOT_SLASH:`(?!${yt}{0,1}(?:[${at}]|$))`,NO_DOTS_SLASH:`(?!${yt}{1,2}(?:[${at}]|$))`,QMARK_NO_DOT:`[^.${at}]`,START_ANCHOR:`(?:^|[${at}])`,END_ANCHOR:`(?:[${at}]|$)`},FE={alnum:"a-zA-Z0-9",alpha:"a-zA-Z",ascii:"\\x00-\\x7F",blank:" \\t",cntrl:"\\x00-\\x1F\\x7F",digit:"0-9",graph:"\\x21-\\x7E",lower:"a-z",print:"\\x20-\\x7E ",punct:"\\-!\"#$%&'()\\*+,./:;<=>?@[\\]^_`{|}~",space:" \\t\\r\\n\\v\\f",upper:"A-Z",word:"A-Za-z0-9_",xdigit:"A-Fa-f0-9"};Hm.exports={MAX_LENGTH:1024*64,POSIX_REGEX_SOURCE:FE,REGEX_BACKSLASH:/\\(?![*+?^${}(|)[\]])/g,REGEX_NON_SPECIAL_CHARS:/^[^@![\].,$*+?^{}()|\\/]+/,REGEX_SPECIAL_CHARS:/[-*+?.^${}(|)[\]]/,REGEX_SPECIAL_CHARS_BACKREF:/(\\?)((\W)(\3*))/g,REGEX_SPECIAL_CHARS_GLOBAL:/([-*+?.^${}(|)[\]])/g,REGEX_REMOVE_BACKSLASH:/(?:\[.*?[^\\]\]|\\(?=.))/g,REPLACEMENTS:{"***":"*","**/**":"**","**/**/**":"**"},CHAR_0:48,CHAR_9:57,CHAR_UPPERCASE_A:65,CHAR_LOWERCASE_A:97,CHAR_UPPERCASE_Z:90,CHAR_LOWERCASE_Z:122,CHAR_LEFT_PARENTHESES:40,CHAR_RIGHT_PARENTHESES:41,CHAR_ASTERISK:42,CHAR_AMPERSAND:38,CHAR_AT:64,CHAR_BACKWARD_SLASH:92,CHAR_CARRIAGE_RETURN:13,CHAR_CIRCUMFLEX_ACCENT:94,CHAR_COLON:58,CHAR_COMMA:44,CHAR_DOT:46,CHAR_DOUBLE_QUOTE:34,CHAR_EQUAL:61,CHAR_EXCLAMATION_MARK:33,CHAR_FORM_FEED:12,CHAR_FORWARD_SLASH:47,CHAR_GRAVE_ACCENT:96,CHAR_HASH:35,CHAR_HYPHEN_MINUS:45,CHAR_LEFT_ANGLE_BRACKET:60,CHAR_LEFT_CURLY_BRACE:123,CHAR_LEFT_SQUARE_BRACKET:91,CHAR_LINE_FEED:10,CHAR_NO_BREAK_SPACE:160,CHAR_PERCENT:37,CHAR_PLUS:43,CHAR_QUESTION_MARK:63,CHAR_RIGHT_ANGLE_BRACKET:62,CHAR_RIGHT_CURLY_BRACE:125,CHAR_RIGHT_SQUARE_BRACKET:93,CHAR_SEMICOLON:59,CHAR_SINGLE_QUOTE:39,CHAR_SPACE:32,CHAR_TAB:9,CHAR_UNDERSCORE:95,CHAR_VERTICAL_LINE:124,CHAR_ZERO_WIDTH_NOBREAK_SPACE:65279,SEP:TE.sep,extglobChars(r){return{"!":{type:"negate",open:"(?:(?!(?:",close:`))${r.STAR})`},"?":{type:"qmark",open:"(?:",close:")?"},"+":{type:"plus",open:"(?:",close:")+"},"*":{type:"star",open:"(?:",close:")*"},"@":{type:"at",open:"(?:",close:")"}}},globChars(r){return r===!0?BE:Vm}}});var Ii=x(Re=>{u();"use strict";var jE=(et(),Ur),zE=m.platform==="win32",{REGEX_BACKSLASH:UE,REGEX_REMOVE_BACKSLASH:VE,REGEX_SPECIAL_CHARS:HE,REGEX_SPECIAL_CHARS_GLOBAL:WE}=Pi();Re.isObject=r=>r!==null&&typeof r=="object"&&!Array.isArray(r);Re.hasRegexChars=r=>HE.test(r);Re.isRegexChar=r=>r.length===1&&Re.hasRegexChars(r);Re.escapeRegex=r=>r.replace(WE,"\\$1");Re.toPosixSlashes=r=>r.replace(UE,"/");Re.removeBackslashes=r=>r.replace(VE,e=>e==="\\"?"":e);Re.supportsLookbehinds=()=>{let r=m.version.slice(1).split(".").map(Number);return r.length===3&&r[0]>=9||r[0]===8&&r[1]>=10};Re.isWindows=r=>r&&typeof r.windows=="boolean"?r.windows:zE===!0||jE.sep==="\\";Re.escapeLast=(r,e,t)=>{let i=r.lastIndexOf(e,t);return i===-1?r:r[i-1]==="\\"?Re.escapeLast(r,e,i-1):`${r.slice(0,i)}\\${r.slice(i)}`};Re.removePrefix=(r,e={})=>{let t=r;return t.startsWith("./")&&(t=t.slice(2),e.prefix="./"),t};Re.wrapOutput=(r,e={},t={})=>{let i=t.contains?"":"^",n=t.contains?"":"$",a=`${i}(?:${r})${n}`;return e.negated===!0&&(a=`(?:^(?!${a}).*$)`),a}});var Zm=x((o6,Jm)=>{u();"use strict";var Wm=Ii(),{CHAR_ASTERISK:hl,CHAR_AT:GE,CHAR_BACKWARD_SLASH:Di,CHAR_COMMA:QE,CHAR_DOT:ml,CHAR_EXCLAMATION_MARK:gl,CHAR_FORWARD_SLASH:Gm,CHAR_LEFT_CURLY_BRACE:yl,CHAR_LEFT_PARENTHESES:bl,CHAR_LEFT_SQUARE_BRACKET:YE,CHAR_PLUS:KE,CHAR_QUESTION_MARK:Qm,CHAR_RIGHT_CURLY_BRACE:XE,CHAR_RIGHT_PARENTHESES:Ym,CHAR_RIGHT_SQUARE_BRACKET:JE}=Pi(),Km=r=>r===Gm||r===Di,Xm=r=>{r.isPrefix!==!0&&(r.depth=r.isGlobstar?1/0:1)},ZE=(r,e)=>{let t=e||{},i=r.length-1,n=t.parts===!0||t.scanToEnd===!0,a=[],s=[],o=[],l=r,c=-1,f=0,d=0,p=!1,h=!1,b=!1,v=!1,y=!1,w=!1,k=!1,S=!1,E=!1,O=!1,B=0,N,T,F={value:"",depth:0,isGlob:!1},Y=()=>c>=i,_=()=>l.charCodeAt(c+1),Q=()=>(N=T,l.charCodeAt(++c));for(;c<i;){T=Q();let he;if(T===Di){k=F.backslashes=!0,T=Q(),T===yl&&(w=!0);continue}if(w===!0||T===yl){for(B++;Y()!==!0&&(T=Q());){if(T===Di){k=F.backslashes=!0,Q();continue}if(T===yl){B++;continue}if(w!==!0&&T===ml&&(T=Q())===ml){if(p=F.isBrace=!0,b=F.isGlob=!0,O=!0,n===!0)continue;break}if(w!==!0&&T===QE){if(p=F.isBrace=!0,b=F.isGlob=!0,O=!0,n===!0)continue;break}if(T===XE&&(B--,B===0)){w=!1,p=F.isBrace=!0,O=!0;break}}if(n===!0)continue;break}if(T===Gm){if(a.push(c),s.push(F),F={value:"",depth:0,isGlob:!1},O===!0)continue;if(N===ml&&c===f+1){f+=2;continue}d=c+1;continue}if(t.noext!==!0&&(T===KE||T===GE||T===hl||T===Qm||T===gl)===!0&&_()===bl){if(b=F.isGlob=!0,v=F.isExtglob=!0,O=!0,T===gl&&c===f&&(E=!0),n===!0){for(;Y()!==!0&&(T=Q());){if(T===Di){k=F.backslashes=!0,T=Q();continue}if(T===Ym){b=F.isGlob=!0,O=!0;break}}continue}break}if(T===hl){if(N===hl&&(y=F.isGlobstar=!0),b=F.isGlob=!0,O=!0,n===!0)continue;break}if(T===Qm){if(b=F.isGlob=!0,O=!0,n===!0)continue;break}if(T===YE){for(;Y()!==!0&&(he=Q());){if(he===Di){k=F.backslashes=!0,Q();continue}if(he===JE){h=F.isBracket=!0,b=F.isGlob=!0,O=!0;break}}if(n===!0)continue;break}if(t.nonegate!==!0&&T===gl&&c===f){S=F.negated=!0,f++;continue}if(t.noparen!==!0&&T===bl){if(b=F.isGlob=!0,n===!0){for(;Y()!==!0&&(T=Q());){if(T===bl){k=F.backslashes=!0,T=Q();continue}if(T===Ym){O=!0;break}}continue}break}if(b===!0){if(O=!0,n===!0)continue;break}}t.noext===!0&&(v=!1,b=!1);let U=l,oe="",A="";f>0&&(oe=l.slice(0,f),l=l.slice(f),d-=f),U&&b===!0&&d>0?(U=l.slice(0,d),A=l.slice(d)):b===!0?(U="",A=l):U=l,U&&U!==""&&U!=="/"&&U!==l&&Km(U.charCodeAt(U.length-1))&&(U=U.slice(0,-1)),t.unescape===!0&&(A&&(A=Wm.removeBackslashes(A)),U&&k===!0&&(U=Wm.removeBackslashes(U)));let C={prefix:oe,input:r,start:f,base:U,glob:A,isBrace:p,isBracket:h,isGlob:b,isExtglob:v,isGlobstar:y,negated:S,negatedExtglob:E};if(t.tokens===!0&&(C.maxDepth=0,Km(T)||s.push(F),C.tokens=s),t.parts===!0||t.tokens===!0){let he;for(let V=0;V<a.length;V++){let Ee=he?he+1:f,Ie=a[V],De=r.slice(Ee,Ie);t.tokens&&(V===0&&f!==0?(s[V].isPrefix=!0,s[V].value=oe):s[V].value=De,Xm(s[V]),C.maxDepth+=s[V].depth),(V!==0||De!=="")&&o.push(De),he=Ie}if(he&&he+1<r.length){let V=r.slice(he+1);o.push(V),t.tokens&&(s[s.length-1].value=V,Xm(s[s.length-1]),C.maxDepth+=s[s.length-1].depth)}C.slashes=a,C.parts=o}return C};Jm.exports=ZE});var rg=x((l6,tg)=>{u();"use strict";var gs=Pi(),Me=Ii(),{MAX_LENGTH:ys,POSIX_REGEX_SOURCE:e2,REGEX_NON_SPECIAL_CHARS:t2,REGEX_SPECIAL_CHARS_BACKREF:r2,REPLACEMENTS:eg}=gs,i2=(r,e)=>{if(typeof e.expandRange=="function")return e.expandRange(...r,e);r.sort();let t=`[${r.join("-")}]`;try{new RegExp(t)}catch(i){return r.map(n=>Me.escapeRegex(n)).join("..")}return t},gr=(r,e)=>`Missing ${r}: "${e}" - use "\\\\${e}" to match literal characters`,wl=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");r=eg[r]||r;let t={...e},i=typeof t.maxLength=="number"?Math.min(ys,t.maxLength):ys,n=r.length;if(n>i)throw new SyntaxError(`Input length: ${n}, exceeds maximum allowed length: ${i}`);let a={type:"bos",value:"",output:t.prepend||""},s=[a],o=t.capture?"":"?:",l=Me.isWindows(e),c=gs.globChars(l),f=gs.extglobChars(c),{DOT_LITERAL:d,PLUS_LITERAL:p,SLASH_LITERAL:h,ONE_CHAR:b,DOTS_SLASH:v,NO_DOT:y,NO_DOT_SLASH:w,NO_DOTS_SLASH:k,QMARK:S,QMARK_NO_DOT:E,STAR:O,START_ANCHOR:B}=c,N=q=>`(${o}(?:(?!${B}${q.dot?v:d}).)*?)`,T=t.dot?"":y,F=t.dot?S:E,Y=t.bash===!0?N(t):O;t.capture&&(Y=`(${Y})`),typeof t.noext=="boolean"&&(t.noextglob=t.noext);let _={input:r,index:-1,start:0,dot:t.dot===!0,consumed:"",output:"",prefix:"",backtrack:!1,negated:!1,brackets:0,braces:0,parens:0,quotes:0,globstar:!1,tokens:s};r=Me.removePrefix(r,_),n=r.length;let Q=[],U=[],oe=[],A=a,C,he=()=>_.index===n-1,V=_.peek=(q=1)=>r[_.index+q],Ee=_.advance=()=>r[++_.index]||"",Ie=()=>r.slice(_.index+1),De=(q="",ae=0)=>{_.consumed+=q,_.index+=ae},Bi=q=>{_.output+=q.output!=null?q.output:q.value,De(q.value)},Rv=()=>{let q=1;for(;V()==="!"&&(V(2)!=="("||V(3)==="?");)Ee(),_.start++,q++;return q%2==0?!1:(_.negated=!0,_.start++,!0)},Fi=q=>{_[q]++,oe.push(q)},Ft=q=>{_[q]--,oe.pop()},W=q=>{if(A.type==="globstar"){let ae=_.braces>0&&(q.type==="comma"||q.type==="brace"),I=q.extglob===!0||Q.length&&(q.type==="pipe"||q.type==="paren");q.type!=="slash"&&q.type!=="paren"&&!ae&&!I&&(_.output=_.output.slice(0,-A.output.length),A.type="star",A.value="*",A.output=Y,_.output+=A.output)}if(Q.length&&q.type!=="paren"&&(Q[Q.length-1].inner+=q.value),(q.value||q.output)&&Bi(q),A&&A.type==="text"&&q.type==="text"){A.value+=q.value,A.output=(A.output||"")+q.value;return}q.prev=A,s.push(q),A=q},ji=(q,ae)=>{let I={...f[ae],conditions:1,inner:""};I.prev=A,I.parens=_.parens,I.output=_.output;let H=(t.capture?"(":"")+I.open;Fi("parens"),W({type:q,value:ae,output:_.output?"":b}),W({type:"paren",extglob:!0,value:Ee(),output:H}),Q.push(I)},Pv=q=>{let ae=q.close+(t.capture?")":""),I;if(q.type==="negate"){let H=Y;if(q.inner&&q.inner.length>1&&q.inner.includes("/")&&(H=N(t)),(H!==Y||he()||/^\)+$/.test(Ie()))&&(ae=q.close=`)$))${H}`),q.inner.includes("*")&&(I=Ie())&&/^\.[^\\/.]+$/.test(I)){let ce=wl(I,{...e,fastpaths:!1}).output;ae=q.close=`)${ce})${H})`}q.prev.type==="bos"&&(_.negatedExtglob=!0)}W({type:"paren",extglob:!0,value:C,output:ae}),Ft("parens")};if(t.fastpaths!==!1&&!/(^[*!]|[/()[\]{}"])/.test(r)){let q=!1,ae=r.replace(r2,(I,H,ce,Ce,ye,Ms)=>Ce==="\\"?(q=!0,I):Ce==="?"?H?H+Ce+(ye?S.repeat(ye.length):""):Ms===0?F+(ye?S.repeat(ye.length):""):S.repeat(ce.length):Ce==="."?d.repeat(ce.length):Ce==="*"?H?H+Ce+(ye?Y:""):Y:H?I:`\\${I}`);return q===!0&&(t.unescape===!0?ae=ae.replace(/\\/g,""):ae=ae.replace(/\\+/g,I=>I.length%2==0?"\\\\":I?"\\":"")),ae===r&&t.contains===!0?(_.output=r,_):(_.output=Me.wrapOutput(ae,_,e),_)}for(;!he();){if(C=Ee(),C==="\0")continue;if(C==="\\"){let I=V();if(I==="/"&&t.bash!==!0||I==="."||I===";")continue;if(!I){C+="\\",W({type:"text",value:C});continue}let H=/^\\+/.exec(Ie()),ce=0;if(H&&H[0].length>2&&(ce=H[0].length,_.index+=ce,ce%2!=0&&(C+="\\")),t.unescape===!0?C=Ee():C+=Ee(),_.brackets===0){W({type:"text",value:C});continue}}if(_.brackets>0&&(C!=="]"||A.value==="["||A.value==="[^")){if(t.posix!==!1&&C===":"){let I=A.value.slice(1);if(I.includes("[")&&(A.posix=!0,I.includes(":"))){let H=A.value.lastIndexOf("["),ce=A.value.slice(0,H),Ce=A.value.slice(H+2),ye=e2[Ce];if(ye){A.value=ce+ye,_.backtrack=!0,Ee(),!a.output&&s.indexOf(A)===1&&(a.output=b);continue}}}(C==="["&&V()!==":"||C==="-"&&V()==="]")&&(C=`\\${C}`),C==="]"&&(A.value==="["||A.value==="[^")&&(C=`\\${C}`),t.posix===!0&&C==="!"&&A.value==="["&&(C="^"),A.value+=C,Bi({value:C});continue}if(_.quotes===1&&C!=='"'){C=Me.escapeRegex(C),A.value+=C,Bi({value:C});continue}if(C==='"'){_.quotes=_.quotes===1?0:1,t.keepQuotes===!0&&W({type:"text",value:C});continue}if(C==="("){Fi("parens"),W({type:"paren",value:C});continue}if(C===")"){if(_.parens===0&&t.strictBrackets===!0)throw new SyntaxError(gr("opening","("));let I=Q[Q.length-1];if(I&&_.parens===I.parens+1){Pv(Q.pop());continue}W({type:"paren",value:C,output:_.parens?")":"\\)"}),Ft("parens");continue}if(C==="["){if(t.nobracket===!0||!Ie().includes("]")){if(t.nobracket!==!0&&t.strictBrackets===!0)throw new SyntaxError(gr("closing","]"));C=`\\${C}`}else Fi("brackets");W({type:"bracket",value:C});continue}if(C==="]"){if(t.nobracket===!0||A&&A.type==="bracket"&&A.value.length===1){W({type:"text",value:C,output:`\\${C}`});continue}if(_.brackets===0){if(t.strictBrackets===!0)throw new SyntaxError(gr("opening","["));W({type:"text",value:C,output:`\\${C}`});continue}Ft("brackets");let I=A.value.slice(1);if(A.posix!==!0&&I[0]==="^"&&!I.includes("/")&&(C=`/${C}`),A.value+=C,Bi({value:C}),t.literalBrackets===!1||Me.hasRegexChars(I))continue;let H=Me.escapeRegex(A.value);if(_.output=_.output.slice(0,-A.value.length),t.literalBrackets===!0){_.output+=H,A.value=H;continue}A.value=`(${o}${H}|${A.value})`,_.output+=A.value;continue}if(C==="{"&&t.nobrace!==!0){Fi("braces");let I={type:"brace",value:C,output:"(",outputIndex:_.output.length,tokensIndex:_.tokens.length};U.push(I),W(I);continue}if(C==="}"){let I=U[U.length-1];if(t.nobrace===!0||!I){W({type:"text",value:C,output:C});continue}let H=")";if(I.dots===!0){let ce=s.slice(),Ce=[];for(let ye=ce.length-1;ye>=0&&(s.pop(),ce[ye].type!=="brace");ye--)ce[ye].type!=="dots"&&Ce.unshift(ce[ye].value);H=i2(Ce,t),_.backtrack=!0}if(I.comma!==!0&&I.dots!==!0){let ce=_.output.slice(0,I.outputIndex),Ce=_.tokens.slice(I.tokensIndex);I.value=I.output="\\{",C=H="\\}",_.output=ce;for(let ye of Ce)_.output+=ye.output||ye.value}W({type:"brace",value:C,output:H}),Ft("braces"),U.pop();continue}if(C==="|"){Q.length>0&&Q[Q.length-1].conditions++,W({type:"text",value:C});continue}if(C===","){let I=C,H=U[U.length-1];H&&oe[oe.length-1]==="braces"&&(H.comma=!0,I="|"),W({type:"comma",value:C,output:I});continue}if(C==="/"){if(A.type==="dot"&&_.index===_.start+1){_.start=_.index+1,_.consumed="",_.output="",s.pop(),A=a;continue}W({type:"slash",value:C,output:h});continue}if(C==="."){if(_.braces>0&&A.type==="dot"){A.value==="."&&(A.output=d);let I=U[U.length-1];A.type="dots",A.output+=C,A.value+=C,I.dots=!0;continue}if(_.braces+_.parens===0&&A.type!=="bos"&&A.type!=="slash"){W({type:"text",value:C,output:d});continue}W({type:"dot",value:C,output:d});continue}if(C==="?"){if(!(A&&A.value==="(")&&t.noextglob!==!0&&V()==="("&&V(2)!=="?"){ji("qmark",C);continue}if(A&&A.type==="paren"){let H=V(),ce=C;if(H==="<"&&!Me.supportsLookbehinds())throw new Error("Node.js v10 or higher is required for regex lookbehinds");(A.value==="("&&!/[!=<:]/.test(H)||H==="<"&&!/<([!=]|\w+>)/.test(Ie()))&&(ce=`\\${C}`),W({type:"text",value:C,output:ce});continue}if(t.dot!==!0&&(A.type==="slash"||A.type==="bos")){W({type:"qmark",value:C,output:E});continue}W({type:"qmark",value:C,output:S});continue}if(C==="!"){if(t.noextglob!==!0&&V()==="("&&(V(2)!=="?"||!/[!=<:]/.test(V(3)))){ji("negate",C);continue}if(t.nonegate!==!0&&_.index===0){Rv();continue}}if(C==="+"){if(t.noextglob!==!0&&V()==="("&&V(2)!=="?"){ji("plus",C);continue}if(A&&A.value==="("||t.regex===!1){W({type:"plus",value:C,output:p});continue}if(A&&(A.type==="bracket"||A.type==="paren"||A.type==="brace")||_.parens>0){W({type:"plus",value:C});continue}W({type:"plus",value:p});continue}if(C==="@"){if(t.noextglob!==!0&&V()==="("&&V(2)!=="?"){W({type:"at",extglob:!0,value:C,output:""});continue}W({type:"text",value:C});continue}if(C!=="*"){(C==="$"||C==="^")&&(C=`\\${C}`);let I=t2.exec(Ie());I&&(C+=I[0],_.index+=I[0].length),W({type:"text",value:C});continue}if(A&&(A.type==="globstar"||A.star===!0)){A.type="star",A.star=!0,A.value+=C,A.output=Y,_.backtrack=!0,_.globstar=!0,De(C);continue}let q=Ie();if(t.noextglob!==!0&&/^\([^?]/.test(q)){ji("star",C);continue}if(A.type==="star"){if(t.noglobstar===!0){De(C);continue}let I=A.prev,H=I.prev,ce=I.type==="slash"||I.type==="bos",Ce=H&&(H.type==="star"||H.type==="globstar");if(t.bash===!0&&(!ce||q[0]&&q[0]!=="/")){W({type:"star",value:C,output:""});continue}let ye=_.braces>0&&(I.type==="comma"||I.type==="brace"),Ms=Q.length&&(I.type==="pipe"||I.type==="paren");if(!ce&&I.type!=="paren"&&!ye&&!Ms){W({type:"star",value:C,output:""});continue}for(;q.slice(0,3)==="/**";){let zi=r[_.index+4];if(zi&&zi!=="/")break;q=q.slice(3),De("/**",3)}if(I.type==="bos"&&he()){A.type="globstar",A.value+=C,A.output=N(t),_.output=A.output,_.globstar=!0,De(C);continue}if(I.type==="slash"&&I.prev.type!=="bos"&&!Ce&&he()){_.output=_.output.slice(0,-(I.output+A.output).length),I.output=`(?:${I.output}`,A.type="globstar",A.output=N(t)+(t.strictSlashes?")":"|$)"),A.value+=C,_.globstar=!0,_.output+=I.output+A.output,De(C);continue}if(I.type==="slash"&&I.prev.type!=="bos"&&q[0]==="/"){let zi=q[1]!==void 0?"|$":"";_.output=_.output.slice(0,-(I.output+A.output).length),I.output=`(?:${I.output}`,A.type="globstar",A.output=`${N(t)}${h}|${h}${zi})`,A.value+=C,_.output+=I.output+A.output,_.globstar=!0,De(C+Ee()),W({type:"slash",value:"/",output:""});continue}if(I.type==="bos"&&q[0]==="/"){A.type="globstar",A.value+=C,A.output=`(?:^|${h}|${N(t)}${h})`,_.output=A.output,_.globstar=!0,De(C+Ee()),W({type:"slash",value:"/",output:""});continue}_.output=_.output.slice(0,-A.output.length),A.type="globstar",A.output=N(t),A.value+=C,_.output+=A.output,_.globstar=!0,De(C);continue}let ae={type:"star",value:C,output:Y};if(t.bash===!0){ae.output=".*?",(A.type==="bos"||A.type==="slash")&&(ae.output=T+ae.output),W(ae);continue}if(A&&(A.type==="bracket"||A.type==="paren")&&t.regex===!0){ae.output=C,W(ae);continue}(_.index===_.start||A.type==="slash"||A.type==="dot")&&(A.type==="dot"?(_.output+=w,A.output+=w):t.dot===!0?(_.output+=k,A.output+=k):(_.output+=T,A.output+=T),V()!=="*"&&(_.output+=b,A.output+=b)),W(ae)}for(;_.brackets>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing","]"));_.output=Me.escapeLast(_.output,"["),Ft("brackets")}for(;_.parens>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing",")"));_.output=Me.escapeLast(_.output,"("),Ft("parens")}for(;_.braces>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing","}"));_.output=Me.escapeLast(_.output,"{"),Ft("braces")}if(t.strictSlashes!==!0&&(A.type==="star"||A.type==="bracket")&&W({type:"maybe_slash",value:"",output:`${h}?`}),_.backtrack===!0){_.output="";for(let q of _.tokens)_.output+=q.output!=null?q.output:q.value,q.suffix&&(_.output+=q.suffix)}return _};wl.fastpaths=(r,e)=>{let t={...e},i=typeof t.maxLength=="number"?Math.min(ys,t.maxLength):ys,n=r.length;if(n>i)throw new SyntaxError(`Input length: ${n}, exceeds maximum allowed length: ${i}`);r=eg[r]||r;let a=Me.isWindows(e),{DOT_LITERAL:s,SLASH_LITERAL:o,ONE_CHAR:l,DOTS_SLASH:c,NO_DOT:f,NO_DOTS:d,NO_DOTS_SLASH:p,STAR:h,START_ANCHOR:b}=gs.globChars(a),v=t.dot?d:f,y=t.dot?p:f,w=t.capture?"":"?:",k={negated:!1,prefix:""},S=t.bash===!0?".*?":h;t.capture&&(S=`(${S})`);let E=T=>T.noglobstar===!0?S:`(${w}(?:(?!${b}${T.dot?c:s}).)*?)`,O=T=>{switch(T){case"*":return`${v}${l}${S}`;case".*":return`${s}${l}${S}`;case"*.*":return`${v}${S}${s}${l}${S}`;case"*/*":return`${v}${S}${o}${l}${y}${S}`;case"**":return v+E(t);case"**/*":return`(?:${v}${E(t)}${o})?${y}${l}${S}`;case"**/*.*":return`(?:${v}${E(t)}${o})?${y}${S}${s}${l}${S}`;case"**/.*":return`(?:${v}${E(t)}${o})?${s}${l}${S}`;default:{let F=/^(.*?)\.(\w+)$/.exec(T);if(!F)return;let Y=O(F[1]);return Y?Y+s+F[2]:void 0}}},B=Me.removePrefix(r,k),N=O(B);return N&&t.strictSlashes!==!0&&(N+=`${o}?`),N};tg.exports=wl});var ng=x((u6,ig)=>{u();"use strict";var n2=(et(),Ur),s2=Zm(),vl=rg(),xl=Ii(),a2=Pi(),o2=r=>r&&typeof r=="object"&&!Array.isArray(r),de=(r,e,t=!1)=>{if(Array.isArray(r)){let f=r.map(p=>de(p,e,t));return p=>{for(let h of f){let b=h(p);if(b)return b}return!1}}let i=o2(r)&&r.tokens&&r.input;if(r===""||typeof r!="string"&&!i)throw new TypeError("Expected pattern to be a non-empty string");let n=e||{},a=xl.isWindows(e),s=i?de.compileRe(r,e):de.makeRe(r,e,!1,!0),o=s.state;delete s.state;let l=()=>!1;if(n.ignore){let f={...e,ignore:null,onMatch:null,onResult:null};l=de(n.ignore,f,t)}let c=(f,d=!1)=>{let{isMatch:p,match:h,output:b}=de.test(f,s,e,{glob:r,posix:a}),v={glob:r,state:o,regex:s,posix:a,input:f,output:b,match:h,isMatch:p};return typeof n.onResult=="function"&&n.onResult(v),p===!1?(v.isMatch=!1,d?v:!1):l(f)?(typeof n.onIgnore=="function"&&n.onIgnore(v),v.isMatch=!1,d?v:!1):(typeof n.onMatch=="function"&&n.onMatch(v),d?v:!0)};return t&&(c.state=o),c};de.test=(r,e,t,{glob:i,posix:n}={})=>{if(typeof r!="string")throw new TypeError("Expected input to be a string");if(r==="")return{isMatch:!1,output:""};let a=t||{},s=a.format||(n?xl.toPosixSlashes:null),o=r===i,l=o&&s?s(r):r;return o===!1&&(l=s?s(r):r,o=l===i),(o===!1||a.capture===!0)&&(a.matchBase===!0||a.basename===!0?o=de.matchBase(r,e,t,n):o=e.exec(l)),{isMatch:Boolean(o),match:o,output:l}};de.matchBase=(r,e,t,i=xl.isWindows(t))=>(e instanceof RegExp?e:de.makeRe(e,t)).test(n2.basename(r));de.isMatch=(r,e,t)=>de(e,t)(r);de.parse=(r,e)=>Array.isArray(r)?r.map(t=>de.parse(t,e)):vl(r,{...e,fastpaths:!1});de.scan=(r,e)=>s2(r,e);de.compileRe=(r,e,t=!1,i=!1)=>{if(t===!0)return r.output;let n=e||{},a=n.contains?"":"^",s=n.contains?"":"$",o=`${a}(?:${r.output})${s}`;r&&r.negated===!0&&(o=`^(?!${o}).*$`);let l=de.toRegex(o,e);return i===!0&&(l.state=r),l};de.makeRe=(r,e={},t=!1,i=!1)=>{if(!r||typeof r!="string")throw new TypeError("Expected a non-empty string");let n={negated:!1,fastpaths:!0};return e.fastpaths!==!1&&(r[0]==="."||r[0]==="*")&&(n.output=vl.fastpaths(r,e)),n.output||(n=vl(r,e)),de.compileRe(n,e,t,i)};de.toRegex=(r,e)=>{try{let t=e||{};return new RegExp(r,t.flags||(t.nocase?"i":""))}catch(t){if(e&&e.debug===!0)throw t;return/$^/}};de.constants=a2;ig.exports=de});var ag=x((f6,sg)=>{u();"use strict";sg.exports=ng()});var cg=x((c6,fg)=>{u();"use strict";var og=(Bn(),Nn),lg=Fm(),ot=ag(),kl=Ii(),ug=r=>r===""||r==="./",fe=(r,e,t)=>{e=[].concat(e),r=[].concat(r);let i=new Set,n=new Set,a=new Set,s=0,o=f=>{a.add(f.output),t&&t.onResult&&t.onResult(f)};for(let f=0;f<e.length;f++){let d=ot(String(e[f]),{...t,onResult:o},!0),p=d.state.negated||d.state.negatedExtglob;p&&s++;for(let h of r){let b=d(h,!0);!(p?!b.isMatch:b.isMatch)||(p?i.add(b.output):(i.delete(b.output),n.add(b.output)))}}let c=(s===e.length?[...a]:[...n]).filter(f=>!i.has(f));if(t&&c.length===0){if(t.failglob===!0)throw new Error(`No matches found for "${e.join(", ")}"`);if(t.nonull===!0||t.nullglob===!0)return t.unescape?e.map(f=>f.replace(/\\/g,"")):e}return c};fe.match=fe;fe.matcher=(r,e)=>ot(r,e);fe.isMatch=(r,e,t)=>ot(e,t)(r);fe.any=fe.isMatch;fe.not=(r,e,t={})=>{e=[].concat(e).map(String);let i=new Set,n=[],a=o=>{t.onResult&&t.onResult(o),n.push(o.output)},s=new Set(fe(r,e,{...t,onResult:a}));for(let o of n)s.has(o)||i.add(o);return[...i]};fe.contains=(r,e,t)=>{if(typeof r!="string")throw new TypeError(`Expected a string: "${og.inspect(r)}"`);if(Array.isArray(e))return e.some(i=>fe.contains(r,i,t));if(typeof e=="string"){if(ug(r)||ug(e))return!1;if(r.includes(e)||r.startsWith("./")&&r.slice(2).includes(e))return!0}return fe.isMatch(r,e,{...t,contains:!0})};fe.matchKeys=(r,e,t)=>{if(!kl.isObject(r))throw new TypeError("Expected the first argument to be an object");let i=fe(Object.keys(r),e,t),n={};for(let a of i)n[a]=r[a];return n};fe.some=(r,e,t)=>{let i=[].concat(r);for(let n of[].concat(e)){let a=ot(String(n),t);if(i.some(s=>a(s)))return!0}return!1};fe.every=(r,e,t)=>{let i=[].concat(r);for(let n of[].concat(e)){let a=ot(String(n),t);if(!i.every(s=>a(s)))return!1}return!0};fe.all=(r,e,t)=>{if(typeof r!="string")throw new TypeError(`Expected a string: "${og.inspect(r)}"`);return[].concat(e).every(i=>ot(i,t)(r))};fe.capture=(r,e,t)=>{let i=kl.isWindows(t),a=ot.makeRe(String(r),{...t,capture:!0}).exec(i?kl.toPosixSlashes(e):e);if(a)return a.slice(1).map(s=>s===void 0?"":s)};fe.makeRe=(...r)=>ot.makeRe(...r);fe.scan=(...r)=>ot.scan(...r);fe.parse=(r,e)=>{let t=[];for(let i of[].concat(r||[]))for(let n of lg(String(i),e))t.push(ot.parse(n,e));return t};fe.braces=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");return e&&e.nobrace===!0||!/\{.*\}/.test(r)?[r]:lg(r,e)};fe.braceExpand=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");return fe.braces(r,{...e,expand:!0})};fg.exports=fe});function dg(r,e){let t=e.content.files;t=t.filter(o=>typeof o=="string"),t=t.map(al);let i=cs.generateTasks(t),n=[],a=[];for(let o of i)n.push(...o.positive.map(l=>hg(l,!1))),a.push(...o.negative.map(l=>hg(l,!0)));let s=[...n,...a];return s=u2(r,s),s=s.flatMap(f2),s=s.map(l2),s}function hg(r,e){let t={original:r,base:r,ignore:e,pattern:r,glob:null};return Kh(r)&&Object.assign(t,rm(r)),t}function l2(r){let e=al(r.base);return e=cs.escapePath(e),r.pattern=r.glob?`${e}/${r.glob}`:e,r.pattern=r.ignore?`!${r.pattern}`:r.pattern,r}function u2(r,e){let t=[];return r.userConfigPath&&r.tailwindConfig.content.relative&&(t=[me.dirname(r.userConfigPath)]),e.map(i=>(i.base=me.resolve(...t,i.base),i))}function f2(r){let e=[r];try{let t=be.realpathSync(r.base);t!==r.base&&e.push({...r,base:t})}catch{}return e}function mg(r,e,t){let i=r.tailwindConfig.content.files.filter(s=>typeof s.raw=="string").map(({raw:s,extension:o="html"})=>({content:s,extension:o})),[n,a]=p2(e,t);for(let s of n){let o=me.extname(s).slice(1);i.push({file:s,extension:o})}return[i,a]}function c2(r){if(!r.some(a=>a.includes("**")&&!yg.test(a)))return()=>{};let t=[],i=[];for(let a of r){let s=pg.default.matcher(a);yg.test(a)&&i.push(s),t.push(s)}let n=!1;return a=>{if(n||i.some(f=>f(a)))return;let s=t.findIndex(f=>f(a));if(s===-1)return;let o=r[s],l=me.relative(m.cwd(),o);l[0]!=="."&&(l=`./${l}`);let c=gg.find(f=>a.includes(f));c&&(n=!0,G.warn("broad-content-glob-pattern",[`Your \`content\` configuration includes a pattern which looks like it's accidentally matching all of \`${c}\` and can cause serious performance issues.`,`Pattern: \`${l}\``,"See our documentation for recommendations:","https://tailwindcss.com/docs/content-configuration#pattern-recommendations"]))}}function p2(r,e){let t=r.map(o=>o.pattern),i=new Map,n=c2(t),a=new Set;Je.DEBUG&&console.time("Finding changed files");let s=cs.sync(t,{absolute:!0});for(let o of s){n(o);let l=e.get(o)||-1/0,c=be.statSync(o).mtimeMs;c>l&&(a.add(o),i.set(o,c))}return Je.DEBUG&&console.timeEnd("Finding changed files"),[a,i]}var pg,gg,yg,bg=R(()=>{u();ft();et();Xh();Jh();Zh();im();It();Be();pg=pe(cg());gg=["node_modules"],yg=new RegExp(`(${gg.map(r=>String.raw`\b${r}\b`).join("|")})`)});function wg(){}var vg=R(()=>{u()});function g2(r,e){for(let t of e){let i=`${r}${t}`;if(be.existsSync(i)&&be.statSync(i).isFile())return i}for(let t of e){let i=`${r}/index${t}`;if(be.existsSync(i))return i}return null}function*xg(r,e,t,i=me.extname(r)){let n=g2(me.resolve(e,r),d2.includes(i)?h2:m2);if(n===null||t.has(n))return;t.add(n),yield n,e=me.dirname(n),i=me.extname(n);let a=be.readFileSync(n,"utf-8");for(let s of[...a.matchAll(/import[\s\S]*?['"](.{3,}?)['"]/gi),...a.matchAll(/import[\s\S]*from[\s\S]*?['"](.{3,}?)['"]/gi),...a.matchAll(/require\(['"`](.+)['"`]\)/gi)])!s[1].startsWith(".")||(yield*xg(s[1],e,t,i))}function Sl(r){return r===null?new Set:new Set(xg(r,me.dirname(r),new Set))}var d2,h2,m2,kg=R(()=>{u();ft();et();d2=[".js",".cjs",".mjs"],h2=["",".js",".cjs",".mjs",".ts",".cts",".mts",".jsx",".tsx"],m2=["",".ts",".cts",".mts",".tsx",".js",".cjs",".mjs",".jsx"]});function y2(r,e){if(Al.has(r))return Al.get(r);let t=dg(r,e);return Al.set(r,t).get(r)}function b2(r){let e=na(r);if(e!==null){let[i,n,a,s]=Ag.get(e)||[],o=Sl(e),l=!1,c=new Map;for(let p of o){let h=be.statSync(p).mtimeMs;c.set(p,h),(!s||!s.has(p)||h>s.get(p))&&(l=!0)}if(!l)return[i,e,n,a];for(let p of o)delete pf.cache[p];let f=sl(zr(wg(e))),d=Vi(f);return Ag.set(e,[f,d,o,c]),[f,e,d,o]}let t=zr(r?.config??r??{});return t=sl(t),[t,null,Vi(t),[]]}function Cl(r){return({tailwindDirectives:e,registerDependency:t})=>(i,n)=>{let[a,s,o,l]=b2(r),c=new Set(l);if(e.size>0){c.add(n.opts.from);for(let b of n.messages)b.type==="dependency"&&c.add(b.file)}let[f,,d]=zh(i,n,a,s,o,c),p=fs(f),h=y2(f,a);if(e.size>0){for(let y of h)for(let w of rl(y))t(w);let[b,v]=mg(f,h,p);for(let y of b)f.changedContent.push(y);for(let[y,w]of v.entries())d.set(y,w)}for(let b of l)t({type:"dependency",file:b});for(let[b,v]of d.entries())p.set(b,v);return f}}var Sg,Ag,Al,Cg=R(()=>{u();ft();Sg=pe(Ns());yf();ia();sc();_i();Uh();Yh();bg();vg();kg();Ag=new Sg.default({maxSize:100}),Al=new WeakMap});function _l(r){let e=new Set,t=new Set,i=new Set;if(r.walkAtRules(n=>{n.name==="apply"&&i.add(n),n.name==="import"&&(n.params==='"tailwindcss/base"'||n.params==="'tailwindcss/base'"?(n.name="tailwind",n.params="base"):n.params==='"tailwindcss/components"'||n.params==="'tailwindcss/components'"?(n.name="tailwind",n.params="components"):n.params==='"tailwindcss/utilities"'||n.params==="'tailwindcss/utilities'"?(n.name="tailwind",n.params="utilities"):(n.params==='"tailwindcss/screens"'||n.params==="'tailwindcss/screens'"||n.params==='"tailwindcss/variants"'||n.params==="'tailwindcss/variants'")&&(n.name="tailwind",n.params="variants")),n.name==="tailwind"&&(n.params==="screens"&&(n.params="variants"),e.add(n.params)),["layer","responsive","variants"].includes(n.name)&&(["responsive","variants"].includes(n.name)&&G.warn(`${n.name}-at-rule-deprecated`,[`The \`@${n.name}\` directive has been deprecated in Tailwind CSS v3.0.`,"Use `@layer utilities` or `@layer components` instead.","https://tailwindcss.com/docs/upgrade-guide#replace-variants-with-layer"]),t.add(n))}),!e.has("base")||!e.has("components")||!e.has("utilities")){for(let n of t)if(n.name==="layer"&&["base","components","utilities"].includes(n.params)){if(!e.has(n.params))throw n.error(`\`@layer ${n.params}\` is used but no matching \`@tailwind ${n.params}\` directive is present.`)}else if(n.name==="responsive"){if(!e.has("utilities"))throw n.error("`@responsive` is used but `@tailwind utilities` is missing.")}else if(n.name==="variants"&&!e.has("utilities"))throw n.error("`@variants` is used but `@tailwind utilities` is missing.")}return{tailwindDirectives:e,applyDirectives:i}}var _g=R(()=>{u();Be()});function Qt(r,e=void 0,t=void 0){return r.map(i=>{let n=i.clone();return t!==void 0&&(n.raws.tailwind={...n.raws.tailwind,...t}),e!==void 0&&Eg(n,a=>{if(a.raws.tailwind?.preserveSource===!0&&a.source)return!1;a.source=e}),n})}function Eg(r,e){e(r)!==!1&&r.each?.(t=>Eg(t,e))}var Og=R(()=>{u()});function El(r){return r=Array.isArray(r)?r:[r],r=r.map(e=>e instanceof RegExp?e.source:e),r.join("")}function Ne(r){return new RegExp(El(r),"g")}function qt(r){return`(?:${r.map(El).join("|")})`}function Ol(r){return`(?:${El(r)})?`}function Rg(r){return r&&w2.test(r)?r.replace(Tg,"\\$&"):r||""}var Tg,w2,Pg=R(()=>{u();Tg=/[\\^$.*+?()[\]{}|]/g,w2=RegExp(Tg.source)});function Ig(r){let e=Array.from(v2(r));return t=>{let i=[];for(let n of e)for(let a of t.match(n)??[])i.push(S2(a));for(let n of i.slice()){let a=ve(n,".");for(let s=0;s<a.length;s++){let o=a[s];if(s>=a.length-1){i.push(o);continue}let l=Number(a[s+1]);isNaN(l)?i.push(o):s++}}return i}}function*v2(r){let e=r.tailwindConfig.separator,t=r.tailwindConfig.prefix!==""?Ol(Ne([/-?/,Rg(r.tailwindConfig.prefix)])):"",i=qt([/\[[^\s:'"`]+:[^\s\[\]]+\]/,/\[[^\s:'"`\]]+:[^\s]+?\[[^\s]+\][^\s]+?\]/,Ne([qt([/-?(?:\w+)/,/@(?:\w+)/]),Ol(qt([Ne([qt([/-(?:\w+-)*\['[^\s]+'\]/,/-(?:\w+-)*\["[^\s]+"\]/,/-(?:\w+-)*\[`[^\s]+`\]/,/-(?:\w+-)*\[(?:[^\s\[\]]+\[[^\s\[\]]+\])*[^\s:\[\]]+\]/]),/(?![{([]])/,/(?:\/[^\s'"`\\><$]*)?/]),Ne([qt([/-(?:\w+-)*\['[^\s]+'\]/,/-(?:\w+-)*\["[^\s]+"\]/,/-(?:\w+-)*\[`[^\s]+`\]/,/-(?:\w+-)*\[(?:[^\s\[\]]+\[[^\s\[\]]+\])*[^\s\[\]]+\]/]),/(?![{([]])/,/(?:\/[^\s'"`\\$]*)?/]),/[-\/][^\s'"`\\$={><]*/]))])]),n=[qt([Ne([/@\[[^\s"'`]+\](\/[^\s"'`]+)?/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s"'`]+\]\/[\w_-]+/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s"'`]+\]/,e]),Ne([/[^\s"'`\[\\]+/,e])]),qt([Ne([/([^\s"'`\[\\]+-)?\[[^\s`]+\]\/[\w_-]+/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s`]+\]/,e]),Ne([/[^\s`\[\\]+/,e])])];for(let a of n)yield Ne(["((?=((",a,")+))\\2)?",/!?/,t,i]);yield/[^<>"'`\s.(){}[\]#=%$][^<>"'`\s(){}[\]#=%$]*[^<>"'`\s.(){}[\]#=%:$]/g}function S2(r){if(!r.includes("-["))return r;let e=0,t=[],i=r.matchAll(x2);i=Array.from(i).flatMap(n=>{let[,...a]=n;return a.map((s,o)=>Object.assign([],n,{index:n.index+o,0:s}))});for(let n of i){let a=n[0],s=t[t.length-1];if(a===s?t.pop():(a==="'"||a==='"'||a==="`")&&t.push(a),!s){if(a==="["){e++;continue}else if(a==="]"){e--;continue}if(e<0)return r.substring(0,n.index-1);if(e===0&&!k2.test(a))return r.substring(0,n.index)}}return r}var x2,k2,Dg=R(()=>{u();Pg();zt();x2=/([\[\]'"`])([^\[\]'"`])?/g,k2=/[^"'`\s<>\]]+/});function A2(r,e){let t=r.tailwindConfig.content.extract;return t[e]||t.DEFAULT||$g[e]||$g.DEFAULT(r)}function C2(r,e){let t=r.content.transform;return t[e]||t.DEFAULT||Lg[e]||Lg.DEFAULT}function _2(r,e,t,i){qi.has(e)||qi.set(e,new qg.default({maxSize:25e3}));for(let n of r.split(`
-`))if(n=n.trim(),!i.has(n))if(i.add(n),qi.get(e).has(n))for(let a of qi.get(e).get(n))t.add(a);else{let a=e(n).filter(o=>o!=="!*"),s=new Set(a);for(let o of s)t.add(o);qi.get(e).set(n,s)}}function E2(r,e){let t=e.offsets.sort(r),i={base:new Set,defaults:new Set,components:new Set,utilities:new Set,variants:new Set};for(let[n,a]of t)i[n.layer].add(a);return i}function Tl(r){return async e=>{let t={base:null,components:null,utilities:null,variants:null};if(e.walkAtRules(y=>{y.name==="tailwind"&&Object.keys(t).includes(y.params)&&(t[y.params]=y)}),Object.values(t).every(y=>y===null))return e;let i=new Set([...r.candidates??[],gt]),n=new Set;bt.DEBUG&&console.time("Reading changed files");let a=[];for(let y of r.changedContent){let w=C2(r.tailwindConfig,y.extension),k=A2(r,y.extension);a.push([y,{transformer:w,extractor:k}])}let s=500;for(let y=0;y<a.length;y+=s){let w=a.slice(y,y+s);await Promise.all(w.map(async([{file:k,content:S},{transformer:E,extractor:O}])=>{S=k?await be.promises.readFile(k,"utf8"):S,_2(E(S),O,i,n)}))}bt.DEBUG&&console.timeEnd("Reading changed files");let o=r.classCache.size;bt.DEBUG&&console.time("Generate rules"),bt.DEBUG&&console.time("Sorting candidates");let l=new Set([...i].sort((y,w)=>y===w?0:y<w?-1:1));bt.DEBUG&&console.timeEnd("Sorting candidates"),ss(l,r),bt.DEBUG&&console.timeEnd("Generate rules"),bt.DEBUG&&console.time("Build stylesheet"),(r.stylesheetCache===null||r.classCache.size!==o)&&(r.stylesheetCache=E2([...r.ruleCache],r)),bt.DEBUG&&console.timeEnd("Build stylesheet");let{defaults:c,base:f,components:d,utilities:p,variants:h}=r.stylesheetCache;t.base&&(t.base.before(Qt([...c,...f],t.base.source,{layer:"base"})),t.base.remove()),t.components&&(t.components.before(Qt([...d],t.components.source,{layer:"components"})),t.components.remove()),t.utilities&&(t.utilities.before(Qt([...p],t.utilities.source,{layer:"utilities"})),t.utilities.remove());let b=Array.from(h).filter(y=>{let w=y.raws.tailwind?.parentLayer;return w==="components"?t.components!==null:w==="utilities"?t.utilities!==null:!0});t.variants?(t.variants.before(Qt(b,t.variants.source,{layer:"variants"})),t.variants.remove()):b.length>0&&e.append(Qt(b,e.source,{layer:"variants"})),e.source.end=e.source.end??e.source.start;let v=b.some(y=>y.raws.tailwind?.parentLayer==="utilities");t.utilities&&p.size===0&&!v&&G.warn("content-problems",["No utility classes were detected in your source files. If this is unexpected, double-check the `content` option in your Tailwind CSS configuration.","https://tailwindcss.com/docs/content-configuration"]),bt.DEBUG&&(console.log("Potential classes: ",i.size),console.log("Active contexts: ",Zn.size)),r.changedContent=[],e.walkAtRules("layer",y=>{Object.keys(t).includes(y.params)&&y.remove()})}}var qg,bt,$g,Lg,qi,Mg=R(()=>{u();ft();qg=pe(Ns());It();as();Be();Og();Dg();bt=Je,$g={DEFAULT:Ig},Lg={DEFAULT:r=>r,svelte:r=>r.replace(/(?:^|\s)class:/g," ")};qi=new WeakMap});function ws(r){let e=new Map;ee.root({nodes:[r.clone()]}).walkRules(a=>{(0,bs.default)(s=>{s.walkClasses(o=>{let l=o.parent.toString(),c=e.get(l);c||e.set(l,c=new Set),c.add(o.value)})}).processSync(a.selector)});let i=Array.from(e.values(),a=>Array.from(a)),n=i.flat();return Object.assign(n,{groups:i})}function Rl(r){return O2.astSync(r)}function Ng(r,e){let t=new Set;for(let i of r)t.add(i.split(e).pop());return Array.from(t)}function Bg(r,e){let t=r.tailwindConfig.prefix;return typeof t=="function"?t(e):t+e}function*Fg(r){for(yield r;r.parent;)yield r.parent,r=r.parent}function T2(r,e={}){let t=r.nodes;r.nodes=[];let i=r.clone(e);return r.nodes=t,i}function R2(r){for(let e of Fg(r))if(r!==e){if(e.type==="root")break;r=T2(e,{nodes:[r]})}return r}function P2(r,e){let t=new Map;return r.walkRules(i=>{for(let s of Fg(i))if(s.raws.tailwind?.layer!==void 0)return;let n=R2(i),a=e.offsets.create("user");for(let s of ws(i)){let o=t.get(s)||[];t.set(s,o),o.push([{layer:"user",sort:a,important:!1},n])}}),t}function I2(r,e){for(let t of r){if(e.notClassCache.has(t)||e.applyClassCache.has(t))continue;if(e.classCache.has(t)){e.applyClassCache.set(t,e.classCache.get(t).map(([n,a])=>[n,a.clone()]));continue}let i=Array.from(Go(t,e));if(i.length===0){e.notClassCache.add(t);continue}e.applyClassCache.set(t,i)}return e.applyClassCache}function D2(r){let e=null;return{get:t=>(e=e||r(),e.get(t)),has:t=>(e=e||r(),e.has(t))}}function q2(r){return{get:e=>r.flatMap(t=>t.get(e)||[]),has:e=>r.some(t=>t.has(e))}}function jg(r){let e=r.split(/[\s\t\n]+/g);return e[e.length-1]==="!important"?[e.slice(0,-1),!0]:[e,!1]}function zg(r,e,t){let i=new Set,n=[];if(r.walkAtRules("apply",l=>{let[c]=jg(l.params);for(let f of c)i.add(f);n.push(l)}),n.length===0)return;let a=q2([t,I2(i,e)]);function s(l,c,f){let d=Rl(l),p=Rl(c),b=Rl(`.${Te(f)}`).nodes[0].nodes[0];return d.each(v=>{let y=new Set;p.each(w=>{let k=!1;w=w.clone(),w.walkClasses(S=>{S.value===b.value&&(k||(S.replaceWith(...v.nodes.map(E=>E.clone())),y.add(w),k=!0))})});for(let w of y){let k=[[]];for(let S of w.nodes)S.type==="combinator"?(k.push(S),k.push([])):k[k.length-1].push(S);w.nodes=[];for(let S of k)Array.isArray(S)&&S.sort((E,O)=>E.type==="tag"&&O.type==="class"?-1:E.type==="class"&&O.type==="tag"?1:E.type==="class"&&O.type==="pseudo"&&O.value.startsWith("::")?-1:E.type==="pseudo"&&E.value.startsWith("::")&&O.type==="class"?1:0),w.nodes=w.nodes.concat(S)}v.replaceWith(...y)}),d.toString()}let o=new Map;for(let l of n){let[c]=o.get(l.parent)||[[],l.source];o.set(l.parent,[c,l.source]);let[f,d]=jg(l.params);if(l.parent.type==="atrule"){if(l.parent.name==="screen"){let p=l.parent.params;throw l.error(`@apply is not supported within nested at-rules like @screen. We suggest you write this as @apply ${f.map(h=>`${p}:${h}`).join(" ")} instead.`)}throw l.error(`@apply is not supported within nested at-rules like @${l.parent.name}. You can fix this by un-nesting @${l.parent.name}.`)}for(let p of f){if([Bg(e,"group"),Bg(e,"peer")].includes(p))throw l.error(`@apply should not be used with the '${p}' utility`);if(!a.has(p))throw l.error(`The \`${p}\` class does not exist. If \`${p}\` is a custom class, make sure it is defined within a \`@layer\` directive.`);let h=a.get(p);for(let[,b]of h)b.type!=="atrule"&&b.walkRules(()=>{throw l.error([`The \`${p}\` class cannot be used with \`@apply\` because \`@apply\` does not currently support nested CSS.`,"Rewrite the selector without nesting or configure the `tailwindcss/nesting` plugin:","https://tailwindcss.com/docs/using-with-preprocessors#nesting"].join(`
-`))});c.push([p,d,h])}}for(let[l,[c,f]]of o){let d=[];for(let[h,b,v]of c){let y=[h,...Ng([h],e.tailwindConfig.separator)];for(let[w,k]of v){let S=ws(l),E=ws(k);if(E=E.groups.filter(T=>T.some(F=>y.includes(F))).flat(),E=E.concat(Ng(E,e.tailwindConfig.separator)),S.some(T=>E.includes(T)))throw k.error(`You cannot \`@apply\` the \`${h}\` utility here because it creates a circular dependency.`);let B=ee.root({nodes:[k.clone()]});B.walk(T=>{T.source=f}),(k.type!=="atrule"||k.type==="atrule"&&k.name!=="keyframes")&&B.walkRules(T=>{if(!ws(T).some(U=>U===h)){T.remove();return}let F=typeof e.tailwindConfig.important=="string"?e.tailwindConfig.important:null,_=l.raws.tailwind!==void 0&&F&&l.selector.indexOf(F)===0?l.selector.slice(F.length):l.selector;_===""&&(_=l.selector),T.selector=s(_,T.selector,h),F&&_!==l.selector&&(T.selector=rs(T.selector,F)),T.walkDecls(U=>{U.important=w.important||b});let Q=(0,bs.default)().astSync(T.selector);Q.each(U=>pr(U)),T.selector=Q.toString()}),!!B.nodes[0]&&d.push([w.sort,B.nodes[0]])}}let p=e.offsets.sort(d).map(h=>h[1]);l.after(p)}for(let l of n)l.parent.nodes.length>1?l.remove():l.parent.remove();zg(r,e,t)}function Pl(r){return e=>{let t=D2(()=>P2(e,r));zg(e,r,t)}}var bs,O2,Ug=R(()=>{u();Ot();bs=pe(it());as();fr();Vo();es();O2=(0,bs.default)()});var Vg=x((rq,vs)=>{u();(function(){"use strict";function r(i,n,a){if(!i)return null;r.caseSensitive||(i=i.toLowerCase());var s=r.threshold===null?null:r.threshold*i.length,o=r.thresholdAbsolute,l;s!==null&&o!==null?l=Math.min(s,o):s!==null?l=s:o!==null?l=o:l=null;var c,f,d,p,h,b=n.length;for(h=0;h<b;h++)if(f=n[h],a&&(f=f[a]),!!f&&(r.caseSensitive?d=f:d=f.toLowerCase(),p=t(i,d,l),(l===null||p<l)&&(l=p,a&&r.returnWinningObject?c=n[h]:c=f,r.returnFirstMatch)))return c;return c||r.nullResultValue}r.threshold=.4,r.thresholdAbsolute=20,r.caseSensitive=!1,r.nullResultValue=null,r.returnWinningObject=null,r.returnFirstMatch=!1,typeof vs!="undefined"&&vs.exports?vs.exports=r:window.didYouMean=r;var e=Math.pow(2,32)-1;function t(i,n,a){a=a||a===0?a:e;var s=i.length,o=n.length;if(s===0)return Math.min(a+1,o);if(o===0)return Math.min(a+1,s);if(Math.abs(s-o)>a)return a+1;var l=[],c,f,d,p,h;for(c=0;c<=o;c++)l[c]=[c];for(f=0;f<=s;f++)l[0][f]=f;for(c=1;c<=o;c++){for(d=e,p=1,c>a&&(p=c-a),h=o+1,h>a+c&&(h=a+c),f=1;f<=s;f++)fh?l[c][f]=a+1:n.charAt(c-1)===i.charAt(f-1)?l[c][f]=l[c-1][f-1]:l[c][f]=Math.min(l[c-1][f-1]+1,Math.min(l[c][f-1]+1,l[c-1][f]+1)),l[c][f]<d&&(d=l[c][f]);if(d>a)return a+1}return l[o][s]}})()});var Wg=x((iq,Hg)=>{u();var Il="(".charCodeAt(0),Dl=")".charCodeAt(0),xs="'".charCodeAt(0),ql='"'.charCodeAt(0),$l="\\".charCodeAt(0),yr="/".charCodeAt(0),Ll=",".charCodeAt(0),Ml=":".charCodeAt(0),ks="*".charCodeAt(0),$2="u".charCodeAt(0),L2="U".charCodeAt(0),M2="+".charCodeAt(0),N2=/^[a-f0-9?-]+$/i;Hg.exports=function(r){for(var e=[],t=r,i,n,a,s,o,l,c,f,d=0,p=t.charCodeAt(d),h=t.length,b=[{nodes:e}],v=0,y,w="",k="",S="";d<h;)if(p<=32){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);s=t.slice(d,i),a=e[e.length-1],p===Dl&&v?S=s:a&&a.type==="div"?(a.after=s,a.sourceEndIndex+=s.length):p===Ll||p===Ml||p===yr&&t.charCodeAt(i+1)!==ks&&(!y||y&&y.type==="function"&&!1)?k=s:e.push({type:"space",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}else if(p===xs||p===ql){i=d,n=p===xs?"'":'"',s={type:"string",sourceIndex:d,quote:n};do if(o=!1,i=t.indexOf(n,i+1),~i)for(l=i;t.charCodeAt(l-1)===$l;)l-=1,o=!o;else t+=n,i=t.length-1,s.unclosed=!0;while(o);s.value=t.slice(d+1,i),s.sourceEndIndex=s.unclosed?i:i+1,e.push(s),d=i+1,p=t.charCodeAt(d)}else if(p===yr&&t.charCodeAt(d+1)===ks)i=t.indexOf("*/",d),s={type:"comment",sourceIndex:d,sourceEndIndex:i+2},i===-1&&(s.unclosed=!0,i=t.length,s.sourceEndIndex=i),s.value=t.slice(d+2,i),e.push(s),d=i+2,p=t.charCodeAt(d);else if((p===yr||p===ks)&&y&&y.type==="function")s=t[d],e.push({type:"word",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s}),d+=1,p=t.charCodeAt(d);else if(p===yr||p===Ll||p===Ml)s=t[d],e.push({type:"div",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s,before:k,after:""}),k="",d+=1,p=t.charCodeAt(d);else if(Il===p){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);if(f=d,s={type:"function",sourceIndex:d-w.length,value:w,before:t.slice(f+1,i)},d=i,w==="url"&&p!==xs&&p!==ql){i-=1;do if(o=!1,i=t.indexOf(")",i+1),~i)for(l=i;t.charCodeAt(l-1)===$l;)l-=1,o=!o;else t+=")",i=t.length-1,s.unclosed=!0;while(o);c=i;do c-=1,p=t.charCodeAt(c);while(p<=32);f<c?(d!==c+1?s.nodes=[{type:"word",sourceIndex:d,sourceEndIndex:c+1,value:t.slice(d,c+1)}]:s.nodes=[],s.unclosed&&c+1!==i?(s.after="",s.nodes.push({type:"space",sourceIndex:c+1,sourceEndIndex:i,value:t.slice(c+1,i)})):(s.after=t.slice(c+1,i),s.sourceEndIndex=i)):(s.after="",s.nodes=[]),d=i+1,s.sourceEndIndex=s.unclosed?i:d,p=t.charCodeAt(d),e.push(s)}else v+=1,s.after="",s.sourceEndIndex=d+1,e.push(s),b.push(s),e=s.nodes=[],y=s;w=""}else if(Dl===p&&v)d+=1,p=t.charCodeAt(d),y.after=S,y.sourceEndIndex+=S.length,S="",v-=1,b[b.length-1].sourceEndIndex=d,b.pop(),y=b[v],e=y.nodes;else{i=d;do p===$l&&(i+=1),i+=1,p=t.charCodeAt(i);while(i<h&&!(p<=32||p===xs||p===ql||p===Ll||p===Ml||p===yr||p===Il||p===ks&&y&&y.type==="function"&&!0||p===yr&&y.type==="function"&&!0||p===Dl&&v));s=t.slice(d,i),Il===p?w=s:($2===s.charCodeAt(0)||L2===s.charCodeAt(0))&&M2===s.charCodeAt(1)&&N2.test(s.slice(2))?e.push({type:"unicode-range",sourceIndex:d,sourceEndIndex:i,value:s}):e.push({type:"word",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}for(d=b.length-1;d;d-=1)b[d].unclosed=!0,b[d].sourceEndIndex=t.length;return b[0].nodes}});var Qg=x((nq,Gg)=>{u();Gg.exports=function r(e,t,i){var n,a,s,o;for(n=0,a=e.length;n<a;n+=1)s=e[n],i||(o=t(s,n,e)),o!==!1&&s.type==="function"&&Array.isArray(s.nodes)&&r(s.nodes,t,i),i&&t(s,n,e)}});var Jg=x((sq,Xg)=>{u();function Yg(r,e){var t=r.type,i=r.value,n,a;return e&&(a=e(r))!==void 0?a:t==="word"||t==="space"?i:t==="string"?(n=r.quote||"",n+i+(r.unclosed?"":n)):t==="comment"?"/*"+i+(r.unclosed?"":"*/"):t==="div"?(r.before||"")+i+(r.after||""):Array.isArray(r.nodes)?(n=Kg(r.nodes,e),t!=="function"?n:i+"("+(r.before||"")+n+(r.after||"")+(r.unclosed?"":")")):i}function Kg(r,e){var t,i;if(Array.isArray(r)){for(t="",i=r.length-1;~i;i-=1)t=Yg(r[i],e)+t;return t}return Yg(r,e)}Xg.exports=Kg});var ey=x((aq,Zg)=>{u();var Ss="-".charCodeAt(0),As="+".charCodeAt(0),Nl=".".charCodeAt(0),B2="e".charCodeAt(0),F2="E".charCodeAt(0);function j2(r){var e=r.charCodeAt(0),t;if(e===As||e===Ss){if(t=r.charCodeAt(1),t>=48&&t<=57)return!0;var i=r.charCodeAt(2);return t===Nl&&i>=48&&i<=57}return e===Nl?(t=r.charCodeAt(1),t>=48&&t<=57):e>=48&&e<=57}Zg.exports=function(r){var e=0,t=r.length,i,n,a;if(t===0||!j2(r))return!1;for(i=r.charCodeAt(e),(i===As||i===Ss)&&e++;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),i===Nl&&n>=48&&n<=57)for(e+=2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),a=r.charCodeAt(e+2),(i===B2||i===F2)&&(n>=48&&n<=57||(n===As||n===Ss)&&a>=48&&a<=57))for(e+=n===As||n===Ss?3:2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;return{number:r.slice(0,e),unit:r.slice(e)}}});var ny=x((oq,iy)=>{u();var z2=Wg(),ty=Qg(),ry=Jg();function $t(r){return this instanceof $t?(this.nodes=z2(r),this):new $t(r)}$t.prototype.toString=function(){return Array.isArray(this.nodes)?ry(this.nodes):""};$t.prototype.walk=function(r,e){return ty(this.nodes,r,e),this};$t.unit=ey();$t.walk=ty;$t.stringify=ry;iy.exports=$t});function Fl(r){return typeof r=="object"&&r!==null}function U2(r,e){let t=kt(e);do if(t.pop(),(0,$i.default)(r,t)!==void 0)break;while(t.length);return t.length?t:void 0}function br(r){return typeof r=="string"?r:r.reduce((e,t,i)=>t.includes(".")?`${e}[${t}]`:i===0?t:`${e}.${t}`,"")}function ay(r){return r.map(e=>`'${e}'`).join(", ")}function oy(r){return ay(Object.keys(r))}function jl(r,e,t,i={}){let n=Array.isArray(e)?br(e):e.replace(/^['"]+|['"]+$/g,""),a=Array.isArray(e)?e:kt(n),s=(0,$i.default)(r.theme,a,t);if(s===void 0){let l=`'${n}' does not exist in your theme config.`,c=a.slice(0,-1),f=(0,$i.default)(r.theme,c);if(Fl(f)){let d=Object.keys(f).filter(h=>jl(r,[...c,h]).isValid),p=(0,sy.default)(a[a.length-1],d);p?l+=` Did you mean '${br([...c,p])}'?`:d.length>0&&(l+=` '${br(c)}' has the following valid keys: ${ay(d)}`)}else{let d=U2(r.theme,n);if(d){let p=(0,$i.default)(r.theme,d);Fl(p)?l+=` '${br(d)}' has the following keys: ${oy(p)}`:l+=` '${br(d)}' is not an object.`}else l+=` Your theme has the following top-level keys: ${oy(r.theme)}`}return{isValid:!1,error:l}}if(!(typeof s=="string"||typeof s=="number"||typeof s=="function"||s instanceof String||s instanceof Number||Array.isArray(s))){let l=`'${n}' was found but does not resolve to a string.`;if(Fl(s)){let c=Object.keys(s).filter(f=>jl(r,[...a,f]).isValid);c.length&&(l+=` Did you mean something like '${br([...a,c[0]])}'?`)}return{isValid:!1,error:l}}let[o]=a;return{isValid:!0,value:mt(o)(s,i)}}function V2(r,e,t){e=e.map(n=>ly(r,n,t));let i=[""];for(let n of e)n.type==="div"&&n.value===","?i.push(""):i[i.length-1]+=Bl.default.stringify(n);return i}function ly(r,e,t){if(e.type==="function"&&t[e.value]!==void 0){let i=V2(r,e.nodes,t);e.type="word",e.value=t[e.value](r,...i)}return e}function H2(r,e,t){return Object.keys(t).some(n=>e.includes(`${n}(`))?(0,Bl.default)(e).walk(n=>{ly(r,n,t)}).toString():e}function*G2(r){r=r.replace(/^['"]+|['"]+$/g,"");let e=r.match(/^([^\s]+)(?![^\[]*\])(?:\s*\/\s*([^\/\s]+))$/),t;yield[r,void 0],e&&(r=e[1],t=e[2],yield[r,t])}function Q2(r,e,t){let i=Array.from(G2(e)).map(([n,a])=>Object.assign(jl(r,n,t,{opacityValue:a}),{resolvedPath:n,alpha:a}));return i.find(n=>n.isValid)??i[0]}function uy(r){let e=r.tailwindConfig,t={theme:(i,n,...a)=>{let{isValid:s,value:o,error:l,alpha:c}=Q2(e,n,a.length?a:void 0);if(!s){let p=i.parent,h=p?.raws.tailwind?.candidate;if(p&&h!==void 0){r.markInvalidUtilityNode(p),p.remove(),G.warn("invalid-theme-key-in-class",[`The utility \`${h}\` contains an invalid theme value and was not generated.`]);return}throw i.error(l)}let f=Xt(o),d=f!==void 0&&typeof f=="function";return(c!==void 0||d)&&(c===void 0&&(c=1),o=Ze(f,c,f)),o},screen:(i,n)=>{n=n.replace(/^['"]+/g,"").replace(/['"]+$/g,"");let s=Rt(e.theme.screens).find(({name:o})=>o===n);if(!s)throw i.error(`The '${n}' screen does not exist in your theme.`);return Tt(s)}};return i=>{i.walk(n=>{let a=W2[n.type];a!==void 0&&(n[a]=H2(n,n[a],t))})}}var $i,sy,Bl,W2,fy=R(()=>{u();$i=pe(Oa()),sy=pe(Vg());Si();Bl=pe(ny());Xn();Qn();Gi();Lr();Fr();Be();W2={atrule:"params",decl:"value"}});function cy({tailwindConfig:{theme:r}}){return function(e){e.walkAtRules("screen",t=>{let i=t.params,a=Rt(r.screens).find(({name:s})=>s===i);if(!a)throw t.error(`No \`${i}\` screen found.`);t.name="media",t.params=Tt(a)})}}var py=R(()=>{u();Xn();Qn()});function Y2(r){let e=r.filter(o=>o.type!=="pseudo"||o.nodes.length>0?!0:o.value.startsWith("::")||[":before",":after",":first-line",":first-letter"].includes(o.value)).reverse(),t=new Set(["tag","class","id","attribute"]),i=e.findIndex(o=>t.has(o.type));if(i===-1)return e.reverse().join("").trim();let n=e[i],a=dy[n.type]?dy[n.type](n):n;e=e.slice(0,i);let s=e.findIndex(o=>o.type==="combinator"&&o.value===">");return s!==-1&&(e.splice(0,s),e.unshift(Cs.default.universal())),[a,...e.reverse()].join("").trim()}function X2(r){return zl.has(r)||zl.set(r,K2.transformSync(r)),zl.get(r)}function Ul({tailwindConfig:r}){return e=>{let t=new Map,i=new Set;if(e.walkAtRules("defaults",n=>{if(n.nodes&&n.nodes.length>0){i.add(n);return}let a=n.params;t.has(a)||t.set(a,new Set),t.get(a).add(n.parent),n.remove()}),we(r,"optimizeUniversalDefaults"))for(let n of i){let a=new Map,s=t.get(n.params)??[];for(let o of s)for(let l of X2(o.selector)){let c=l.includes(":-")||l.includes("::-")||l.includes(":has")?l:"__DEFAULT__",f=a.get(c)??new Set;a.set(c,f),f.add(l)}if(a.size===0){n.remove();continue}for(let[,o]of a){let l=ee.rule({source:n.source});l.selectors=[...o],l.append(n.nodes.map(c=>c.clone())),n.before(l)}n.remove()}else if(i.size){let n=ee.rule({selectors:["*","::before","::after"]});for(let s of i)n.append(s.nodes),n.parent||s.before(n),n.source||(n.source=s.source),s.remove();let a=n.clone({selectors:["::backdrop"]});n.after(a)}}}var Cs,dy,K2,zl,hy=R(()=>{u();Ot();Cs=pe(it());ct();dy={id(r){return Cs.default.attribute({attribute:"id",operator:"=",value:r.value,quoteMark:'"'})}};K2=(0,Cs.default)(r=>r.map(e=>{let t=e.split(i=>i.type==="combinator"&&i.value===" ").pop();return Y2(t)})),zl=new Map});function Vl(){function r(e){let t=null;e.each(i=>{if(!J2.has(i.type)){t=null;return}if(t===null){t=i;return}let n=my[i.type];i.type==="atrule"&&i.name==="font-face"?t=i:n.every(a=>(i[a]??"").replace(/\s+/g," ")===(t[a]??"").replace(/\s+/g," "))?(i.nodes&&t.append(i.nodes),i.remove()):t=i}),e.each(i=>{i.type==="atrule"&&r(i)})}return e=>{r(e)}}var my,J2,gy=R(()=>{u();my={atrule:["name","params"],rule:["selector"]},J2=new Set(Object.keys(my))});function Hl(){return r=>{r.walkRules(e=>{let t=new Map,i=new Set([]),n=new Map;e.walkDecls(a=>{if(a.parent===e){if(t.has(a.prop)){if(t.get(a.prop).value===a.value){i.add(t.get(a.prop)),t.set(a.prop,a);return}n.has(a.prop)||n.set(a.prop,new Set),n.get(a.prop).add(t.get(a.prop)),n.get(a.prop).add(a)}t.set(a.prop,a)}});for(let a of i)a.remove();for(let a of n.values()){let s=new Map;for(let o of a){let l=eO(o.value);l!==null&&(s.has(l)||s.set(l,new Set),s.get(l).add(o))}for(let o of s.values()){let l=Array.from(o).slice(0,-1);for(let c of l)c.remove()}}})}}function eO(r){let e=/^-?\d*.?\d+([\w%]+)?$/g.exec(r);return e?e[1]??Z2:null}var Z2,yy=R(()=>{u();Z2=Symbol("unitless-number")});function tO(r){if(!r.walkAtRules)return;let e=new Set;if(r.walkAtRules("apply",t=>{e.add(t.parent)}),e.size!==0)for(let t of e){let i=[],n=[];for(let a of t.nodes)a.type==="atrule"&&a.name==="apply"?(n.length>0&&(i.push(n),n=[]),i.push([a])):n.push(a);if(n.length>0&&i.push(n),i.length!==1){for(let a of[...i].reverse()){let s=t.clone({nodes:[]});s.append(a),t.after(s)}t.remove()}}}function _s(){return r=>{tO(r)}}var by=R(()=>{u()});function Es(r){return async function(e,t){let{tailwindDirectives:i,applyDirectives:n}=_l(e);_s()(e,t);let a=r({tailwindDirectives:i,applyDirectives:n,registerDependency(s){t.messages.push({plugin:"tailwindcss",parent:t.opts.from,...s})},createContext(s,o){return tl(s,o,e)}})(e,t);if(a.tailwindConfig.separator==="-")throw new Error("The '-' character cannot be used as a custom separator in JIT mode due to parsing ambiguity. Please use another character like '_' instead.");Of(a.tailwindConfig),await Tl(a)(e,t),_s()(e,t),Pl(a)(e,t),uy(a)(e,t),cy(a)(e,t),Ul(a)(e,t),Vl(a)(e,t),Hl(a)(e,t)}}var wy=R(()=>{u();_g();Mg();Ug();fy();py();hy();gy();yy();by();_i();ct()});function vy(r,e){let t=null,i=null;return r.walkAtRules("config",n=>{if(i=n.source?.input.file??e.opts.from??null,i===null)throw n.error("The `@config` directive cannot be used without setting `from` in your PostCSS config.");if(t)throw n.error("Only one `@config` directive is allowed per file.");let a=n.params.match(/(['"])(.*?)\1/);if(!a)throw n.error("A path is required when using the `@config` directive.");let s=a[2];if(me.isAbsolute(s))throw n.error("The `@config` directive cannot be used with an absolute path.");if(t=me.resolve(me.dirname(i),s),!be.existsSync(t))throw n.error(`The config file at "${s}" does not exist. Make sure the path is correct and the file exists.`);n.remove()}),t||null}var xy=R(()=>{u();ft();et()});var ky=x((Vq,Wl)=>{u();Cg();wy();It();xy();Wl.exports=function(e){return{postcssPlugin:"tailwindcss",plugins:[Je.DEBUG&&function(t){return console.log(`
-`),console.time("JIT TOTAL"),t},async function(t,i){e=vy(t,i)??e;let n=Cl(e);if(t.type==="document"){let a=t.nodes.filter(s=>s.type==="root");for(let s of a)s.type==="root"&&await Es(n)(s,i);return}await Es(n)(t,i)},Je.DEBUG&&function(t){return console.timeEnd("JIT TOTAL"),console.log(`
-`),t}].filter(Boolean)}};Wl.exports.postcss=!0});var Ay=x((Hq,Sy)=>{u();Sy.exports=ky()});var Gl=x((Wq,Cy)=>{u();Cy.exports=()=>["and_chr 114","and_uc 15.5","chrome 114","chrome 113","chrome 109","edge 114","firefox 114","ios_saf 16.5","ios_saf 16.4","ios_saf 16.3","ios_saf 16.1","opera 99","safari 16.5","samsung 21"]});var Os={};Ge(Os,{agents:()=>rO,feature:()=>iO});function iO(){return{status:"cr",title:"CSS Feature Queries",stats:{ie:{"6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","5.5":"n"},edge:{"12":"y","13":"y","14":"y","15":"y","16":"y","17":"y","18":"y","79":"y","80":"y","81":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y"},firefox:{"2":"n","3":"n","4":"n","5":"n","6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","12":"n","13":"n","14":"n","15":"n","16":"n","17":"n","18":"n","19":"n","20":"n","21":"n","22":"y","23":"y","24":"y","25":"y","26":"y","27":"y","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","59":"y","60":"y","61":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","82":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y","115":"y","116":"y","117":"y","3.5":"n","3.6":"n"},chrome:{"4":"n","5":"n","6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","12":"n","13":"n","14":"n","15":"n","16":"n","17":"n","18":"n","19":"n","20":"n","21":"n","22":"n","23":"n","24":"n","25":"n","26":"n","27":"n","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","59":"y","60":"y","61":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y","115":"y","116":"y","117":"y"},safari:{"4":"n","5":"n","6":"n","7":"n","8":"n","9":"y","10":"y","11":"y","12":"y","13":"y","14":"y","15":"y","17":"y","9.1":"y","10.1":"y","11.1":"y","12.1":"y","13.1":"y","14.1":"y","15.1":"y","15.2-15.3":"y","15.4":"y","15.5":"y","15.6":"y","16.0":"y","16.1":"y","16.2":"y","16.3":"y","16.4":"y","16.5":"y","16.6":"y",TP:"y","3.1":"n","3.2":"n","5.1":"n","6.1":"n","7.1":"n"},opera:{"9":"n","11":"n","12":"n","15":"y","16":"y","17":"y","18":"y","19":"y","20":"y","21":"y","22":"y","23":"y","24":"y","25":"y","26":"y","27":"y","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","60":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","82":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","12.1":"y","9.5-9.6":"n","10.0-10.1":"n","10.5":"n","10.6":"n","11.1":"n","11.5":"n","11.6":"n"},ios_saf:{"8":"n","17":"y","9.0-9.2":"y","9.3":"y","10.0-10.2":"y","10.3":"y","11.0-11.2":"y","11.3-11.4":"y","12.0-12.1":"y","12.2-12.5":"y","13.0-13.1":"y","13.2":"y","13.3":"y","13.4-13.7":"y","14.0-14.4":"y","14.5-14.8":"y","15.0-15.1":"y","15.2-15.3":"y","15.4":"y","15.5":"y","15.6":"y","16.0":"y","16.1":"y","16.2":"y","16.3":"y","16.4":"y","16.5":"y","16.6":"y","3.2":"n","4.0-4.1":"n","4.2-4.3":"n","5.0-5.1":"n","6.0-6.1":"n","7.0-7.1":"n","8.1-8.4":"n"},op_mini:{all:"y"},android:{"3":"n","4":"n","114":"y","4.4":"y","4.4.3-4.4.4":"y","2.1":"n","2.2":"n","2.3":"n","4.1":"n","4.2-4.3":"n"},bb:{"7":"n","10":"n"},op_mob:{"10":"n","11":"n","12":"n","73":"y","11.1":"n","11.5":"n","12.1":"n"},and_chr:{"114":"y"},and_ff:{"115":"y"},ie_mob:{"10":"n","11":"n"},and_uc:{"15.5":"y"},samsung:{"4":"y","20":"y","21":"y","5.0-5.4":"y","6.2-6.4":"y","7.2-7.4":"y","8.2":"y","9.2":"y","10.1":"y","11.1-11.2":"y","12.0":"y","13.0":"y","14.0":"y","15.0":"y","16.0":"y","17.0":"y","18.0":"y","19.0":"y"},and_qq:{"13.1":"y"},baidu:{"13.18":"y"},kaios:{"2.5":"y","3.0-3.1":"y"}}}}var rO,Ts=R(()=>{u();rO={ie:{prefix:"ms"},edge:{prefix:"webkit",prefix_exceptions:{"12":"ms","13":"ms","14":"ms","15":"ms","16":"ms","17":"ms","18":"ms"}},firefox:{prefix:"moz"},chrome:{prefix:"webkit"},safari:{prefix:"webkit"},opera:{prefix:"webkit",prefix_exceptions:{"9":"o","11":"o","12":"o","9.5-9.6":"o","10.0-10.1":"o","10.5":"o","10.6":"o","11.1":"o","11.5":"o","11.6":"o","12.1":"o"}},ios_saf:{prefix:"webkit"},op_mini:{prefix:"o"},android:{prefix:"webkit"},bb:{prefix:"webkit"},op_mob:{prefix:"o",prefix_exceptions:{"73":"webkit"}},and_chr:{prefix:"webkit"},and_ff:{prefix:"moz"},ie_mob:{prefix:"ms"},and_uc:{prefix:"webkit",prefix_exceptions:{"15.5":"webkit"}},samsung:{prefix:"webkit"},and_qq:{prefix:"webkit"},baidu:{prefix:"webkit"},kaios:{prefix:"moz"}}});var _y=x(()=>{u()});var _e=x((Yq,Lt)=>{u();var{list:Ql}=$e();Lt.exports.error=function(r){let e=new Error(r);throw e.autoprefixer=!0,e};Lt.exports.uniq=function(r){return[...new Set(r)]};Lt.exports.removeNote=function(r){return r.includes(" ")?r.split(" ")[0]:r};Lt.exports.escapeRegexp=function(r){return r.replace(/[$()*+-.?[\\\]^{|}]/g,"\\$&")};Lt.exports.regexp=function(r,e=!0){return e&&(r=this.escapeRegexp(r)),new RegExp(`(^|[\\s,(])(${r}($|[\\s(,]))`,"gi")};Lt.exports.editList=function(r,e){let t=Ql.comma(r),i=e(t,[]);if(t===i)return r;let n=r.match(/,\s*/);return n=n?n[0]:", ",i.join(n)};Lt.exports.splitSelector=function(r){return Ql.comma(r).map(e=>Ql.space(e).map(t=>t.split(/(?=\.|#)/g)))}});var Mt=x((Kq,Ty)=>{u();var nO=Gl(),Ey=(Ts(),Os).agents,sO=_e(),Oy=class{static prefixes(){if(this.prefixesCache)return this.prefixesCache;this.prefixesCache=[];for(let e in Ey)this.prefixesCache.push(`-${Ey[e].prefix}-`);return this.prefixesCache=sO.uniq(this.prefixesCache).sort((e,t)=>t.length-e.length),this.prefixesCache}static withPrefix(e){return this.prefixesRegexp||(this.prefixesRegexp=new RegExp(this.prefixes().join("|"))),this.prefixesRegexp.test(e)}constructor(e,t,i,n){this.data=e,this.options=i||{},this.browserslistOpts=n||{},this.selected=this.parse(t)}parse(e){let t={};for(let i in this.browserslistOpts)t[i]=this.browserslistOpts[i];return t.path=this.options.from,nO(e,t)}prefix(e){let[t,i]=e.split(" "),n=this.data[t],a=n.prefix_exceptions&&n.prefix_exceptions[i];return a||(a=n.prefix),`-${a}-`}isSelected(e){return this.selected.includes(e)}};Ty.exports=Oy});var Li=x((Xq,Ry)=>{u();Ry.exports={prefix(r){let e=r.match(/^(-\w+-)/);return e?e[0]:""},unprefixed(r){return r.replace(/^-\w+-/,"")}}});var wr=x((Jq,Iy)=>{u();var aO=Mt(),Py=Li(),oO=_e();function Yl(r,e){let t=new r.constructor;for(let i of Object.keys(r||{})){let n=r[i];i==="parent"&&typeof n=="object"?e&&(t[i]=e):i==="source"||i===null?t[i]=n:Array.isArray(n)?t[i]=n.map(a=>Yl(a,t)):i!=="_autoprefixerPrefix"&&i!=="_autoprefixerValues"&&i!=="proxyCache"&&(typeof n=="object"&&n!==null&&(n=Yl(n,t)),t[i]=n)}return t}var Rs=class{static hack(e){return this.hacks||(this.hacks={}),e.names.map(t=>(this.hacks[t]=e,this.hacks[t]))}static load(e,t,i){let n=this.hacks&&this.hacks[e];return n?new n(e,t,i):new this(e,t,i)}static clone(e,t){let i=Yl(e);for(let n in t)i[n]=t[n];return i}constructor(e,t,i){this.prefixes=t,this.name=e,this.all=i}parentPrefix(e){let t;return typeof e._autoprefixerPrefix!="undefined"?t=e._autoprefixerPrefix:e.type==="decl"&&e.prop[0]==="-"?t=Py.prefix(e.prop):e.type==="root"?t=!1:e.type==="rule"&&e.selector.includes(":-")&&/:(-\w+-)/.test(e.selector)?t=e.selector.match(/:(-\w+-)/)[1]:e.type==="atrule"&&e.name[0]==="-"?t=Py.prefix(e.name):t=this.parentPrefix(e.parent),aO.prefixes().includes(t)||(t=!1),e._autoprefixerPrefix=t,e._autoprefixerPrefix}process(e,t){if(!this.check(e))return;let i=this.parentPrefix(e),n=this.prefixes.filter(s=>!i||i===oO.removeNote(s)),a=[];for(let s of n)this.add(e,s,a.concat([s]),t)&&a.push(s);return a}clone(e,t){return Rs.clone(e,t)}};Iy.exports=Rs});var j=x((Zq,$y)=>{u();var lO=wr(),uO=Mt(),Dy=_e(),qy=class extends lO{check(){return!0}prefixed(e,t){return t+e}normalize(e){return e}otherPrefixes(e,t){for(let i of uO.prefixes())if(i!==t&&e.includes(i))return!0;return!1}set(e,t){return e.prop=this.prefixed(e.prop,t),e}needCascade(e){return e._autoprefixerCascade||(e._autoprefixerCascade=this.all.options.cascade!==!1&&e.raw("before").includes(`
-`)),e._autoprefixerCascade}maxPrefixed(e,t){if(t._autoprefixerMax)return t._autoprefixerMax;let i=0;for(let n of e)n=Dy.removeNote(n),n.length>i&&(i=n.length);return t._autoprefixerMax=i,t._autoprefixerMax}calcBefore(e,t,i=""){let a=this.maxPrefixed(e,t)-Dy.removeNote(i).length,s=t.raw("before");return a>0&&(s+=Array(a).fill(" ").join("")),s}restoreBefore(e){let t=e.raw("before").split(`
-`),i=t[t.length-1];this.all.group(e).up(n=>{let a=n.raw("before").split(`
-`),s=a[a.length-1];s.length<i.length&&(i=s)}),t[t.length-1]=i,e.raws.before=t.join(`
-`)}insert(e,t,i){let n=this.set(this.clone(e),t);if(!(!n||e.parent.some(s=>s.prop===n.prop&&s.value===n.value)))return this.needCascade(e)&&(n.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,n)}isAlready(e,t){let i=this.all.group(e).up(n=>n.prop===t);return i||(i=this.all.group(e).down(n=>n.prop===t)),i}add(e,t,i,n){let a=this.prefixed(e.prop,t);if(!(this.isAlready(e,a)||this.otherPrefixes(e.value,t)))return this.insert(e,t,i,n)}process(e,t){if(!this.needCascade(e)){super.process(e,t);return}let i=super.process(e,t);!i||!i.length||(this.restoreBefore(e),e.raws.before=this.calcBefore(i,e))}old(e,t){return[this.prefixed(e,t)]}};$y.exports=qy});var My=x((e$,Ly)=>{u();Ly.exports=function r(e){return{mul:t=>new r(e*t),div:t=>new r(e/t),simplify:()=>new r(e),toString:()=>e.toString()}}});var Fy=x((t$,By)=>{u();var fO=My(),cO=wr(),Kl=_e(),pO=/(min|max)-resolution\s*:\s*\d*\.?\d+(dppx|dpcm|dpi|x)/gi,dO=/(min|max)-resolution(\s*:\s*)(\d*\.?\d+)(dppx|dpcm|dpi|x)/i,Ny=class extends cO{prefixName(e,t){return e==="-moz-"?t+"--moz-device-pixel-ratio":e+t+"-device-pixel-ratio"}prefixQuery(e,t,i,n,a){return n=new fO(n),a==="dpi"?n=n.div(96):a==="dpcm"&&(n=n.mul(2.54).div(96)),n=n.simplify(),e==="-o-"&&(n=n.n+"/"+n.d),this.prefixName(e,t)+i+n}clean(e){if(!this.bad){this.bad=[];for(let t of this.prefixes)this.bad.push(this.prefixName(t,"min")),this.bad.push(this.prefixName(t,"max"))}e.params=Kl.editList(e.params,t=>t.filter(i=>this.bad.every(n=>!i.includes(n))))}process(e){let t=this.parentPrefix(e),i=t?[t]:this.prefixes;e.params=Kl.editList(e.params,(n,a)=>{for(let s of n){if(!s.includes("min-resolution")&&!s.includes("max-resolution")){a.push(s);continue}for(let o of i){let l=s.replace(pO,c=>{let f=c.match(dO);return this.prefixQuery(o,f[1],f[2],f[3],f[4])});a.push(l)}a.push(s)}return Kl.uniq(a)})}};By.exports=Ny});var zy=x((r$,jy)=>{u();var Xl="(".charCodeAt(0),Jl=")".charCodeAt(0),Ps="'".charCodeAt(0),Zl='"'.charCodeAt(0),eu="\\".charCodeAt(0),vr="/".charCodeAt(0),tu=",".charCodeAt(0),ru=":".charCodeAt(0),Is="*".charCodeAt(0),hO="u".charCodeAt(0),mO="U".charCodeAt(0),gO="+".charCodeAt(0),yO=/^[a-f0-9?-]+$/i;jy.exports=function(r){for(var e=[],t=r,i,n,a,s,o,l,c,f,d=0,p=t.charCodeAt(d),h=t.length,b=[{nodes:e}],v=0,y,w="",k="",S="";d<h;)if(p<=32){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);s=t.slice(d,i),a=e[e.length-1],p===Jl&&v?S=s:a&&a.type==="div"?(a.after=s,a.sourceEndIndex+=s.length):p===tu||p===ru||p===vr&&t.charCodeAt(i+1)!==Is&&(!y||y&&y.type==="function"&&y.value!=="calc")?k=s:e.push({type:"space",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}else if(p===Ps||p===Zl){i=d,n=p===Ps?"'":'"',s={type:"string",sourceIndex:d,quote:n};do if(o=!1,i=t.indexOf(n,i+1),~i)for(l=i;t.charCodeAt(l-1)===eu;)l-=1,o=!o;else t+=n,i=t.length-1,s.unclosed=!0;while(o);s.value=t.slice(d+1,i),s.sourceEndIndex=s.unclosed?i:i+1,e.push(s),d=i+1,p=t.charCodeAt(d)}else if(p===vr&&t.charCodeAt(d+1)===Is)i=t.indexOf("*/",d),s={type:"comment",sourceIndex:d,sourceEndIndex:i+2},i===-1&&(s.unclosed=!0,i=t.length,s.sourceEndIndex=i),s.value=t.slice(d+2,i),e.push(s),d=i+2,p=t.charCodeAt(d);else if((p===vr||p===Is)&&y&&y.type==="function"&&y.value==="calc")s=t[d],e.push({type:"word",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s}),d+=1,p=t.charCodeAt(d);else if(p===vr||p===tu||p===ru)s=t[d],e.push({type:"div",sourceIndex:d-k.length,sourceEndIndex:d+s.length,value:s,before:k,after:""}),k="",d+=1,p=t.charCodeAt(d);else if(Xl===p){i=d;do i+=1,p=t.charCodeAt(i);while(p<=32);if(f=d,s={type:"function",sourceIndex:d-w.length,value:w,before:t.slice(f+1,i)},d=i,w==="url"&&p!==Ps&&p!==Zl){i-=1;do if(o=!1,i=t.indexOf(")",i+1),~i)for(l=i;t.charCodeAt(l-1)===eu;)l-=1,o=!o;else t+=")",i=t.length-1,s.unclosed=!0;while(o);c=i;do c-=1,p=t.charCodeAt(c);while(p<=32);f<c?(d!==c+1?s.nodes=[{type:"word",sourceIndex:d,sourceEndIndex:c+1,value:t.slice(d,c+1)}]:s.nodes=[],s.unclosed&&c+1!==i?(s.after="",s.nodes.push({type:"space",sourceIndex:c+1,sourceEndIndex:i,value:t.slice(c+1,i)})):(s.after=t.slice(c+1,i),s.sourceEndIndex=i)):(s.after="",s.nodes=[]),d=i+1,s.sourceEndIndex=s.unclosed?i:d,p=t.charCodeAt(d),e.push(s)}else v+=1,s.after="",s.sourceEndIndex=d+1,e.push(s),b.push(s),e=s.nodes=[],y=s;w=""}else if(Jl===p&&v)d+=1,p=t.charCodeAt(d),y.after=S,y.sourceEndIndex+=S.length,S="",v-=1,b[b.length-1].sourceEndIndex=d,b.pop(),y=b[v],e=y.nodes;else{i=d;do p===eu&&(i+=1),i+=1,p=t.charCodeAt(i);while(i<h&&!(p<=32||p===Ps||p===Zl||p===tu||p===ru||p===vr||p===Xl||p===Is&&y&&y.type==="function"&&y.value==="calc"||p===vr&&y.type==="function"&&y.value==="calc"||p===Jl&&v));s=t.slice(d,i),Xl===p?w=s:(hO===s.charCodeAt(0)||mO===s.charCodeAt(0))&&gO===s.charCodeAt(1)&&yO.test(s.slice(2))?e.push({type:"unicode-range",sourceIndex:d,sourceEndIndex:i,value:s}):e.push({type:"word",sourceIndex:d,sourceEndIndex:i,value:s}),d=i}for(d=b.length-1;d;d-=1)b[d].unclosed=!0,b[d].sourceEndIndex=t.length;return b[0].nodes}});var Vy=x((i$,Uy)=>{u();Uy.exports=function r(e,t,i){var n,a,s,o;for(n=0,a=e.length;n<a;n+=1)s=e[n],i||(o=t(s,n,e)),o!==!1&&s.type==="function"&&Array.isArray(s.nodes)&&r(s.nodes,t,i),i&&t(s,n,e)}});var Qy=x((n$,Gy)=>{u();function Hy(r,e){var t=r.type,i=r.value,n,a;return e&&(a=e(r))!==void 0?a:t==="word"||t==="space"?i:t==="string"?(n=r.quote||"",n+i+(r.unclosed?"":n)):t==="comment"?"/*"+i+(r.unclosed?"":"*/"):t==="div"?(r.before||"")+i+(r.after||""):Array.isArray(r.nodes)?(n=Wy(r.nodes,e),t!=="function"?n:i+"("+(r.before||"")+n+(r.after||"")+(r.unclosed?"":")")):i}function Wy(r,e){var t,i;if(Array.isArray(r)){for(t="",i=r.length-1;~i;i-=1)t=Hy(r[i],e)+t;return t}return Hy(r,e)}Gy.exports=Wy});var Ky=x((s$,Yy)=>{u();var Ds="-".charCodeAt(0),qs="+".charCodeAt(0),iu=".".charCodeAt(0),bO="e".charCodeAt(0),wO="E".charCodeAt(0);function vO(r){var e=r.charCodeAt(0),t;if(e===qs||e===Ds){if(t=r.charCodeAt(1),t>=48&&t<=57)return!0;var i=r.charCodeAt(2);return t===iu&&i>=48&&i<=57}return e===iu?(t=r.charCodeAt(1),t>=48&&t<=57):e>=48&&e<=57}Yy.exports=function(r){var e=0,t=r.length,i,n,a;if(t===0||!vO(r))return!1;for(i=r.charCodeAt(e),(i===qs||i===Ds)&&e++;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),i===iu&&n>=48&&n<=57)for(e+=2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),a=r.charCodeAt(e+2),(i===bO||i===wO)&&(n>=48&&n<=57||(n===qs||n===Ds)&&a>=48&&a<=57))for(e+=n===qs||n===Ds?3:2;e<t&&(i=r.charCodeAt(e),!(i<48||i>57));)e+=1;return{number:r.slice(0,e),unit:r.slice(e)}}});var $s=x((a$,Zy)=>{u();var xO=zy(),Xy=Vy(),Jy=Qy();function Nt(r){return this instanceof Nt?(this.nodes=xO(r),this):new Nt(r)}Nt.prototype.toString=function(){return Array.isArray(this.nodes)?Jy(this.nodes):""};Nt.prototype.walk=function(r,e){return Xy(this.nodes,r,e),this};Nt.unit=Ky();Nt.walk=Xy;Nt.stringify=Jy;Zy.exports=Nt});var nb=x((o$,ib)=>{u();var{list:kO}=$e(),eb=$s(),SO=Mt(),tb=Li(),rb=class{constructor(e){this.props=["transition","transition-property"],this.prefixes=e}add(e,t){let i,n,a=this.prefixes.add[e.prop],s=this.ruleVendorPrefixes(e),o=s||a&&a.prefixes||[],l=this.parse(e.value),c=l.map(h=>this.findProp(h)),f=[];if(c.some(h=>h[0]==="-"))return;for(let h of l){if(n=this.findProp(h),n[0]==="-")continue;let b=this.prefixes.add[n];if(!(!b||!b.prefixes))for(i of b.prefixes){if(s&&!s.some(y=>i.includes(y)))continue;let v=this.prefixes.prefixed(n,i);v!=="-ms-transform"&&!c.includes(v)&&(this.disabled(n,i)||f.push(this.clone(n,v,h)))}}l=l.concat(f);let d=this.stringify(l),p=this.stringify(this.cleanFromUnprefixed(l,"-webkit-"));if(o.includes("-webkit-")&&this.cloneBefore(e,`-webkit-${e.prop}`,p),this.cloneBefore(e,e.prop,p),o.includes("-o-")){let h=this.stringify(this.cleanFromUnprefixed(l,"-o-"));this.cloneBefore(e,`-o-${e.prop}`,h)}for(i of o)if(i!=="-webkit-"&&i!=="-o-"){let h=this.stringify(this.cleanOtherPrefixes(l,i));this.cloneBefore(e,i+e.prop,h)}d!==e.value&&!this.already(e,e.prop,d)&&(this.checkForWarning(t,e),e.cloneBefore(),e.value=d)}findProp(e){let t=e[0].value;if(/^\d/.test(t)){for(let[i,n]of e.entries())if(i!==0&&n.type==="word")return n.value}return t}already(e,t,i){return e.parent.some(n=>n.prop===t&&n.value===i)}cloneBefore(e,t,i){this.already(e,t,i)||e.cloneBefore({prop:t,value:i})}checkForWarning(e,t){if(t.prop!=="transition-property")return;let i=!1,n=!1;t.parent.each(a=>{if(a.type!=="decl"||a.prop.indexOf("transition-")!==0)return;let s=kO.comma(a.value);if(a.prop==="transition-property"){s.forEach(o=>{let l=this.prefixes.add[o];l&&l.prefixes&&l.prefixes.length>0&&(i=!0)});return}return n=n||s.length>1,!1}),i&&n&&t.warn(e,"Replace transition-property to transition, because Autoprefixer could not support any cases of transition-property and other transition-*")}remove(e){let t=this.parse(e.value);t=t.filter(s=>{let o=this.prefixes.remove[this.findProp(s)];return!o||!o.remove});let i=this.stringify(t);if(e.value===i)return;if(t.length===0){e.remove();return}let n=e.parent.some(s=>s.prop===e.prop&&s.value===i),a=e.parent.some(s=>s!==e&&s.prop===e.prop&&s.value.length>i.length);if(n||a){e.remove();return}e.value=i}parse(e){let t=eb(e),i=[],n=[];for(let a of t.nodes)n.push(a),a.type==="div"&&a.value===","&&(i.push(n),n=[]);return i.push(n),i.filter(a=>a.length>0)}stringify(e){if(e.length===0)return"";let t=[];for(let i of e)i[i.length-1].type!=="div"&&i.push(this.div(e)),t=t.concat(i);return t[0].type==="div"&&(t=t.slice(1)),t[t.length-1].type==="div"&&(t=t.slice(0,-2+1||void 0)),eb.stringify({nodes:t})}clone(e,t,i){let n=[],a=!1;for(let s of i)!a&&s.type==="word"&&s.value===e?(n.push({type:"word",value:t}),a=!0):n.push(s);return n}div(e){for(let t of e)for(let i of t)if(i.type==="div"&&i.value===",")return i;return{type:"div",value:",",after:" "}}cleanOtherPrefixes(e,t){return e.filter(i=>{let n=tb.prefix(this.findProp(i));return n===""||n===t})}cleanFromUnprefixed(e,t){let i=e.map(a=>this.findProp(a)).filter(a=>a.slice(0,t.length)===t).map(a=>this.prefixes.unprefixed(a)),n=[];for(let a of e){let s=this.findProp(a),o=tb.prefix(s);!i.includes(s)&&(o===t||o==="")&&n.push(a)}return n}disabled(e,t){let i=["order","justify-content","align-self","align-content"];if(e.includes("flex")||i.includes(e)){if(this.prefixes.options.flexbox===!1)return!0;if(this.prefixes.options.flexbox==="no-2009")return t.includes("2009")}}ruleVendorPrefixes(e){let{parent:t}=e;if(t.type!=="rule")return!1;if(!t.selector.includes(":-"))return!1;let i=SO.prefixes().filter(n=>t.selector.includes(":"+n));return i.length>0?i:!1}};ib.exports=rb});var xr=x((l$,ab)=>{u();var AO=_e(),sb=class{constructor(e,t,i,n){this.unprefixed=e,this.prefixed=t,this.string=i||t,this.regexp=n||AO.regexp(t)}check(e){return e.includes(this.string)?!!e.match(this.regexp):!1}};ab.exports=sb});var He=x((u$,lb)=>{u();var CO=wr(),_O=xr(),EO=Li(),OO=_e(),ob=class extends CO{static save(e,t){let i=t.prop,n=[];for(let a in t._autoprefixerValues){let s=t._autoprefixerValues[a];if(s===t.value)continue;let o,l=EO.prefix(i);if(l==="-pie-")continue;if(l===a){o=t.value=s,n.push(o);continue}let c=e.prefixed(i,a),f=t.parent;if(!f.every(b=>b.prop!==c)){n.push(o);continue}let d=s.replace(/\s+/," ");if(f.some(b=>b.prop===t.prop&&b.value.replace(/\s+/," ")===d)){n.push(o);continue}let h=this.clone(t,{value:s});o=t.parent.insertBefore(t,h),n.push(o)}return n}check(e){let t=e.value;return t.includes(this.name)?!!t.match(this.regexp()):!1}regexp(){return this.regexpCache||(this.regexpCache=OO.regexp(this.name))}replace(e,t){return e.replace(this.regexp(),`$1${t}$2`)}value(e){return e.raws.value&&e.raws.value.value===e.value?e.raws.value.raw:e.value}add(e,t){e._autoprefixerValues||(e._autoprefixerValues={});let i=e._autoprefixerValues[t]||this.value(e),n;do if(n=i,i=this.replace(i,t),i===!1)return;while(i!==n);e._autoprefixerValues[t]=i}old(e){return new _O(this.name,e+this.name)}};lb.exports=ob});var Bt=x((f$,ub)=>{u();ub.exports={}});var su=x((c$,pb)=>{u();var fb=$s(),TO=He(),RO=Bt().insertAreas,PO=/(^|[^-])linear-gradient\(\s*(top|left|right|bottom)/i,IO=/(^|[^-])radial-gradient\(\s*\d+(\w*|%)\s+\d+(\w*|%)\s*,/i,DO=/(!\s*)?autoprefixer:\s*ignore\s+next/i,qO=/(!\s*)?autoprefixer\s*grid:\s*(on|off|(no-)?autoplace)/i,$O=["width","height","min-width","max-width","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size"];function nu(r){return r.parent.some(e=>e.prop==="grid-template"||e.prop==="grid-template-areas")}function LO(r){let e=r.parent.some(i=>i.prop==="grid-template-rows"),t=r.parent.some(i=>i.prop==="grid-template-columns");return e&&t}var cb=class{constructor(e){this.prefixes=e}add(e,t){let i=this.prefixes.add["@resolution"],n=this.prefixes.add["@keyframes"],a=this.prefixes.add["@viewport"],s=this.prefixes.add["@supports"];e.walkAtRules(f=>{if(f.name==="keyframes"){if(!this.disabled(f,t))return n&&n.process(f)}else if(f.name==="viewport"){if(!this.disabled(f,t))return a&&a.process(f)}else if(f.name==="supports"){if(this.prefixes.options.supports!==!1&&!this.disabled(f,t))return s.process(f)}else if(f.name==="media"&&f.params.includes("-resolution")&&!this.disabled(f,t))return i&&i.process(f)}),e.walkRules(f=>{if(!this.disabled(f,t))return this.prefixes.add.selectors.map(d=>d.process(f,t))});function o(f){return f.parent.nodes.some(d=>{if(d.type!=="decl")return!1;let p=d.prop==="display"&&/(inline-)?grid/.test(d.value),h=d.prop.startsWith("grid-template"),b=/^grid-([A-z]+-)?gap/.test(d.prop);return p||h||b})}function l(f){return f.parent.some(d=>d.prop==="display"&&/(inline-)?flex/.test(d.value))}let c=this.gridStatus(e,t)&&this.prefixes.add["grid-area"]&&this.prefixes.add["grid-area"].prefixes;return e.walkDecls(f=>{if(this.disabledDecl(f,t))return;let d=f.parent,p=f.prop,h=f.value;if(p==="grid-row-span"){t.warn("grid-row-span is not part of final Grid Layout. Use grid-row.",{node:f});return}else if(p==="grid-column-span"){t.warn("grid-column-span is not part of final Grid Layout. Use grid-column.",{node:f});return}else if(p==="display"&&h==="box"){t.warn("You should write display: flex by final spec instead of display: box",{node:f});return}else if(p==="text-emphasis-position")(h==="under"||h==="over")&&t.warn("You should use 2 values for text-emphasis-position For example, `under left` instead of just `under`.",{node:f});else if(/^(align|justify|place)-(items|content)$/.test(p)&&l(f))(h==="start"||h==="end")&&t.warn(`${h} value has mixed support, consider using flex-${h} instead`,{node:f});else if(p==="text-decoration-skip"&&h==="ink")t.warn("Replace text-decoration-skip: ink to text-decoration-skip-ink: auto, because spec had been changed",{node:f});else{if(c&&this.gridStatus(f,t))if(f.value==="subgrid"&&t.warn("IE does not support subgrid",{node:f}),/^(align|justify|place)-items$/.test(p)&&o(f)){let v=p.replace("-items","-self");t.warn(`IE does not support ${p} on grid containers. Try using ${v} on child elements instead: ${f.parent.selector} > * { ${v}: ${f.value} }`,{node:f})}else if(/^(align|justify|place)-content$/.test(p)&&o(f))t.warn(`IE does not support ${f.prop} on grid containers`,{node:f});else if(p==="display"&&f.value==="contents"){t.warn("Please do not use display: contents; if you have grid setting enabled",{node:f});return}else if(f.prop==="grid-gap"){let v=this.gridStatus(f,t);v==="autoplace"&&!LO(f)&&!nu(f)?t.warn("grid-gap only works if grid-template(-areas) is being used or both rows and columns have been declared and cells have not been manually placed inside the explicit grid",{node:f}):(v===!0||v==="no-autoplace")&&!nu(f)&&t.warn("grid-gap only works if grid-template(-areas) is being used",{node:f})}else if(p==="grid-auto-columns"){t.warn("grid-auto-columns is not supported by IE",{node:f});return}else if(p==="grid-auto-rows"){t.warn("grid-auto-rows is not supported by IE",{node:f});return}else if(p==="grid-auto-flow"){let v=d.some(w=>w.prop==="grid-template-rows"),y=d.some(w=>w.prop==="grid-template-columns");nu(f)?t.warn("grid-auto-flow is not supported by IE",{node:f}):h.includes("dense")?t.warn("grid-auto-flow: dense is not supported by IE",{node:f}):!v&&!y&&t.warn("grid-auto-flow works only if grid-template-rows and grid-template-columns are present in the same rule",{node:f});return}else if(h.includes("auto-fit")){t.warn("auto-fit value is not supported by IE",{node:f,word:"auto-fit"});return}else if(h.includes("auto-fill")){t.warn("auto-fill value is not supported by IE",{node:f,word:"auto-fill"});return}else p.startsWith("grid-template")&&h.includes("[")&&t.warn("Autoprefixer currently does not support line names. Try using grid-template-areas instead.",{node:f,word:"["});if(h.includes("radial-gradient"))if(IO.test(f.value))t.warn("Gradient has outdated direction syntax. New syntax is like `closest-side at 0 0` instead of `0 0, closest-side`.",{node:f});else{let v=fb(h);for(let y of v.nodes)if(y.type==="function"&&y.value==="radial-gradient")for(let w of y.nodes)w.type==="word"&&(w.value==="cover"?t.warn("Gradient has outdated direction syntax. Replace `cover` to `farthest-corner`.",{node:f}):w.value==="contain"&&t.warn("Gradient has outdated direction syntax. Replace `contain` to `closest-side`.",{node:f}))}h.includes("linear-gradient")&&PO.test(h)&&t.warn("Gradient has outdated direction syntax. New syntax is like `to left` instead of `right`.",{node:f})}$O.includes(f.prop)&&(f.value.includes("-fill-available")||(f.value.includes("fill-available")?t.warn("Replace fill-available to stretch, because spec had been changed",{node:f}):f.value.includes("fill")&&fb(h).nodes.some(y=>y.type==="word"&&y.value==="fill")&&t.warn("Replace fill to stretch, because spec had been changed",{node:f})));let b;if(f.prop==="transition"||f.prop==="transition-property")return this.prefixes.transition.add(f,t);if(f.prop==="align-self"){if(this.displayType(f)!=="grid"&&this.prefixes.options.flexbox!==!1&&(b=this.prefixes.add["align-self"],b&&b.prefixes&&b.process(f)),this.gridStatus(f,t)!==!1&&(b=this.prefixes.add["grid-row-align"],b&&b.prefixes))return b.process(f,t)}else if(f.prop==="justify-self"){if(this.gridStatus(f,t)!==!1&&(b=this.prefixes.add["grid-column-align"],b&&b.prefixes))return b.process(f,t)}else if(f.prop==="place-self"){if(b=this.prefixes.add["place-self"],b&&b.prefixes&&this.gridStatus(f,t)!==!1)return b.process(f,t)}else if(b=this.prefixes.add[f.prop],b&&b.prefixes)return b.process(f,t)}),this.gridStatus(e,t)&&RO(e,this.disabled),e.walkDecls(f=>{if(this.disabledValue(f,t))return;let d=this.prefixes.unprefixed(f.prop),p=this.prefixes.values("add",d);if(Array.isArray(p))for(let h of p)h.process&&h.process(f,t);TO.save(this.prefixes,f)})}remove(e,t){let i=this.prefixes.remove["@resolution"];e.walkAtRules((n,a)=>{this.prefixes.remove[`@${n.name}`]?this.disabled(n,t)||n.parent.removeChild(a):n.name==="media"&&n.params.includes("-resolution")&&i&&i.clean(n)});for(let n of this.prefixes.remove.selectors)e.walkRules((a,s)=>{n.check(a)&&(this.disabled(a,t)||a.parent.removeChild(s))});return e.walkDecls((n,a)=>{if(this.disabled(n,t))return;let s=n.parent,o=this.prefixes.unprefixed(n.prop);if((n.prop==="transition"||n.prop==="transition-property")&&this.prefixes.transition.remove(n),this.prefixes.remove[n.prop]&&this.prefixes.remove[n.prop].remove){let l=this.prefixes.group(n).down(c=>this.prefixes.normalize(c.prop)===o);if(o==="flex-flow"&&(l=!0),n.prop==="-webkit-box-orient"){let c={"flex-direction":!0,"flex-flow":!0};if(!n.parent.some(f=>c[f.prop]))return}if(l&&!this.withHackValue(n)){n.raw("before").includes(`
-`)&&this.reduceSpaces(n),s.removeChild(a);return}}for(let l of this.prefixes.values("remove",o)){if(!l.check||!l.check(n.value))continue;if(o=l.unprefixed,this.prefixes.group(n).down(f=>f.value.includes(o))){s.removeChild(a);return}}})}withHackValue(e){return e.prop==="-webkit-background-clip"&&e.value==="text"}disabledValue(e,t){return this.gridStatus(e,t)===!1&&e.type==="decl"&&e.prop==="display"&&e.value.includes("grid")||this.prefixes.options.flexbox===!1&&e.type==="decl"&&e.prop==="display"&&e.value.includes("flex")||e.type==="decl"&&e.prop==="content"?!0:this.disabled(e,t)}disabledDecl(e,t){if(this.gridStatus(e,t)===!1&&e.type==="decl"&&(e.prop.includes("grid")||e.prop==="justify-items"))return!0;if(this.prefixes.options.flexbox===!1&&e.type==="decl"){let i=["order","justify-content","align-items","align-content"];if(e.prop.includes("flex")||i.includes(e.prop))return!0}return this.disabled(e,t)}disabled(e,t){if(!e)return!1;if(e._autoprefixerDisabled!==void 0)return e._autoprefixerDisabled;if(e.parent){let n=e.prev();if(n&&n.type==="comment"&&DO.test(n.text))return e._autoprefixerDisabled=!0,e._autoprefixerSelfDisabled=!0,!0}let i=null;if(e.nodes){let n;e.each(a=>{a.type==="comment"&&/(!\s*)?autoprefixer:\s*(off|on)/i.test(a.text)&&(typeof n!="undefined"?t.warn("Second Autoprefixer control comment was ignored. Autoprefixer applies control comment to whole block, not to next rules.",{node:a}):n=/on/i.test(a.text))}),n!==void 0&&(i=!n)}if(!e.nodes||i===null)if(e.parent){let n=this.disabled(e.parent,t);e.parent._autoprefixerSelfDisabled===!0?i=!1:i=n}else i=!1;return e._autoprefixerDisabled=i,i}reduceSpaces(e){let t=!1;if(this.prefixes.group(e).up(()=>(t=!0,!0)),t)return;let i=e.raw("before").split(`
-`),n=i[i.length-1].length,a=!1;this.prefixes.group(e).down(s=>{i=s.raw("before").split(`
-`);let o=i.length-1;i[o].length>n&&(a===!1&&(a=i[o].length-n),i[o]=i[o].slice(0,-a),s.raws.before=i.join(`
-`))})}displayType(e){for(let t of e.parent.nodes)if(t.prop==="display"){if(t.value.includes("flex"))return"flex";if(t.value.includes("grid"))return"grid"}return!1}gridStatus(e,t){if(!e)return!1;if(e._autoprefixerGridStatus!==void 0)return e._autoprefixerGridStatus;let i=null;if(e.nodes){let n;e.each(a=>{if(a.type==="comment"&&qO.test(a.text)){let s=/:\s*autoplace/i.test(a.text),o=/no-autoplace/i.test(a.text);typeof n!="undefined"?t.warn("Second Autoprefixer grid control comment was ignored. Autoprefixer applies control comments to the whole block, not to the next rules.",{node:a}):s?n="autoplace":o?n=!0:n=/on/i.test(a.text)}}),n!==void 0&&(i=n)}if(e.type==="atrule"&&e.name==="supports"){let n=e.params;n.includes("grid")&&n.includes("auto")&&(i=!1)}if(!e.nodes||i===null)if(e.parent){let n=this.gridStatus(e.parent,t);e.parent._autoprefixerSelfDisabled===!0?i=!1:i=n}else typeof this.prefixes.options.grid!="undefined"?i=this.prefixes.options.grid:typeof m.env.AUTOPREFIXER_GRID!="undefined"?m.env.AUTOPREFIXER_GRID==="autoplace"?i="autoplace":i=!0:i=!1;return e._autoprefixerGridStatus=i,i}};pb.exports=cb});var hb=x((p$,db)=>{u();db.exports={A:{A:{"2":"K E F G A B JC"},B:{"1":"C L M H N D O P Q R S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I"},C:{"1":"2 3 4 5 6 7 8 9 AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB 0B dB 1B eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R 2B S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I uB 3B 4B","2":"0 1 KC zB J K E F G A B C L M H N D O k l LC MC"},D:{"1":"8 9 AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB 0B dB 1B eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I uB 3B 4B","2":"0 1 2 3 4 5 6 7 J K E F G A B C L M H N D O k l"},E:{"1":"G A B C L M H D RC 6B vB wB 7B SC TC 8B 9B xB AC yB BC CC DC EC FC GC UC","2":"0 J K E F NC 5B OC PC QC"},F:{"1":"1 2 3 4 5 6 7 8 9 H N D O k l AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB dB eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R 2B S T U V W X Y Z a b c d e f g h i j wB","2":"G B C VC WC XC YC vB HC ZC"},G:{"1":"D fC gC hC iC jC kC lC mC nC oC pC qC rC sC tC 8B 9B xB AC yB BC CC DC EC FC GC","2":"F 5B aC IC bC cC dC eC"},H:{"1":"uC"},I:{"1":"I zC 0C","2":"zB J vC wC xC yC IC"},J:{"2":"E A"},K:{"1":"m","2":"A B C vB HC wB"},L:{"1":"I"},M:{"1":"uB"},N:{"2":"A B"},O:{"1":"xB"},P:{"1":"J k l 1C 2C 3C 4C 5C 6B 6C 7C 8C 9C AD yB BD CD DD"},Q:{"1":"7B"},R:{"1":"ED"},S:{"1":"FD GD"}},B:4,C:"CSS Feature Queries"}});var bb=x((d$,yb)=>{u();function mb(r){return r[r.length-1]}var gb={parse(r){let e=[""],t=[e];for(let i of r){if(i==="("){e=[""],mb(t).push(e),t.push(e);continue}if(i===")"){t.pop(),e=mb(t),e.push("");continue}e[e.length-1]+=i}return t[0]},stringify(r){let e="";for(let t of r){if(typeof t=="object"){e+=`(${gb.stringify(t)})`;continue}e+=t}return e}};yb.exports=gb});var Sb=x((h$,kb)=>{u();var MO=hb(),{feature:NO}=(Ts(),Os),{parse:BO}=$e(),FO=Mt(),au=bb(),jO=He(),zO=_e(),wb=NO(MO),vb=[];for(let r in wb.stats){let e=wb.stats[r];for(let t in e){let i=e[t];/y/.test(i)&&vb.push(r+" "+t)}}var xb=class{constructor(e,t){this.Prefixes=e,this.all=t}prefixer(){if(this.prefixerCache)return this.prefixerCache;let e=this.all.browsers.selected.filter(i=>vb.includes(i)),t=new FO(this.all.browsers.data,e,this.all.options);return this.prefixerCache=new this.Prefixes(this.all.data,t,this.all.options),this.prefixerCache}parse(e){let t=e.split(":"),i=t[0],n=t[1];return n||(n=""),[i.trim(),n.trim()]}virtual(e){let[t,i]=this.parse(e),n=BO("a{}").first;return n.append({prop:t,value:i,raws:{before:""}}),n}prefixed(e){let t=this.virtual(e);if(this.disabled(t.first))return t.nodes;let i={warn:()=>null},n=this.prefixer().add[t.first.prop];n&&n.process&&n.process(t.first,i);for(let a of t.nodes){for(let s of this.prefixer().values("add",t.first.prop))s.process(a);jO.save(this.all,a)}return t.nodes}isNot(e){return typeof e=="string"&&/not\s*/i.test(e)}isOr(e){return typeof e=="string"&&/\s*or\s*/i.test(e)}isProp(e){return typeof e=="object"&&e.length===1&&typeof e[0]=="string"}isHack(e,t){return!new RegExp(`(\\(|\\s)${zO.escapeRegexp(t)}:`).test(e)}toRemove(e,t){let[i,n]=this.parse(e),a=this.all.unprefixed(i),s=this.all.cleaner();if(s.remove[i]&&s.remove[i].remove&&!this.isHack(t,a))return!0;for(let o of s.values("remove",a))if(o.check(n))return!0;return!1}remove(e,t){let i=0;for(;i<e.length;){if(!this.isNot(e[i-1])&&this.isProp(e[i])&&this.isOr(e[i+1])){if(this.toRemove(e[i][0],t)){e.splice(i,2);continue}i+=2;continue}typeof e[i]=="object"&&(e[i]=this.remove(e[i],t)),i+=1}return e}cleanBrackets(e){return e.map(t=>typeof t!="object"?t:t.length===1&&typeof t[0]=="object"?this.cleanBrackets(t[0]):this.cleanBrackets(t))}convert(e){let t=[""];for(let i of e)t.push([`${i.prop}: ${i.value}`]),t.push(" or ");return t[t.length-1]="",t}normalize(e){if(typeof e!="object")return e;if(e=e.filter(t=>t!==""),typeof e[0]=="string"){let t=e[0].trim();if(t.includes(":")||t==="selector"||t==="not selector")return[au.stringify(e)]}return e.map(t=>this.normalize(t))}add(e,t){return e.map(i=>{if(this.isProp(i)){let n=this.prefixed(i[0]);return n.length>1?this.convert(n):i}return typeof i=="object"?this.add(i,t):i})}process(e){let t=au.parse(e.params);t=this.normalize(t),t=this.remove(t,e.params),t=this.add(t,e.params),t=this.cleanBrackets(t),e.params=au.stringify(t)}disabled(e){if(!this.all.options.grid&&(e.prop==="display"&&e.value.includes("grid")||e.prop.includes("grid")||e.prop==="justify-items"))return!0;if(this.all.options.flexbox===!1){if(e.prop==="display"&&e.value.includes("flex"))return!0;let t=["order","justify-content","align-items","align-content"];if(e.prop.includes("flex")||t.includes(e.prop))return!0}return!1}};kb.exports=xb});var _b=x((m$,Cb)=>{u();var Ab=class{constructor(e,t){this.prefix=t,this.prefixed=e.prefixed(this.prefix),this.regexp=e.regexp(this.prefix),this.prefixeds=e.possible().map(i=>[e.prefixed(i),e.regexp(i)]),this.unprefixed=e.name,this.nameRegexp=e.regexp()}isHack(e){let t=e.parent.index(e)+1,i=e.parent.nodes;for(;t<i.length;){let n=i[t].selector;if(!n)return!0;if(n.includes(this.unprefixed)&&n.match(this.nameRegexp))return!1;let a=!1;for(let[s,o]of this.prefixeds)if(n.includes(s)&&n.match(o)){a=!0;break}if(!a)return!0;t+=1}return!0}check(e){return!(!e.selector.includes(this.prefixed)||!e.selector.match(this.regexp)||this.isHack(e))}};Cb.exports=Ab});var kr=x((g$,Ob)=>{u();var{list:UO}=$e(),VO=_b(),HO=wr(),WO=Mt(),GO=_e(),Eb=class extends HO{constructor(e,t,i){super(e,t,i);this.regexpCache=new Map}check(e){return e.selector.includes(this.name)?!!e.selector.match(this.regexp()):!1}prefixed(e){return this.name.replace(/^(\W*)/,`$1${e}`)}regexp(e){if(!this.regexpCache.has(e)){let t=e?this.prefixed(e):this.name;this.regexpCache.set(e,new RegExp(`(^|[^:"'=])${GO.escapeRegexp(t)}`,"gi"))}return this.regexpCache.get(e)}possible(){return WO.prefixes()}prefixeds(e){if(e._autoprefixerPrefixeds){if(e._autoprefixerPrefixeds[this.name])return e._autoprefixerPrefixeds}else e._autoprefixerPrefixeds={};let t={};if(e.selector.includes(",")){let n=UO.comma(e.selector).filter(a=>a.includes(this.name));for(let a of this.possible())t[a]=n.map(s=>this.replace(s,a)).join(", ")}else for(let i of this.possible())t[i]=this.replace(e.selector,i);return e._autoprefixerPrefixeds[this.name]=t,e._autoprefixerPrefixeds}already(e,t,i){let n=e.parent.index(e)-1;for(;n>=0;){let a=e.parent.nodes[n];if(a.type!=="rule")return!1;let s=!1;for(let o in t[this.name]){let l=t[this.name][o];if(a.selector===l){if(i===o)return!0;s=!0;break}}if(!s)return!1;n-=1}return!1}replace(e,t){return e.replace(this.regexp(),`$1${this.prefixed(t)}`)}add(e,t){let i=this.prefixeds(e);if(this.already(e,i,t))return;let n=this.clone(e,{selector:i[this.name][t]});e.parent.insertBefore(e,n)}old(e){return new VO(this,e)}};Ob.exports=Eb});var Pb=x((y$,Rb)=>{u();var QO=wr(),Tb=class extends QO{add(e,t){let i=t+e.name;if(e.parent.some(s=>s.name===i&&s.params===e.params))return;let a=this.clone(e,{name:i});return e.parent.insertBefore(e,a)}process(e){let t=this.parentPrefix(e);for(let i of this.prefixes)(!t||t===i)&&this.add(e,i)}};Rb.exports=Tb});var Db=x((b$,Ib)=>{u();var YO=kr(),ou=class extends YO{prefixed(e){return e==="-webkit-"?":-webkit-full-screen":e==="-moz-"?":-moz-full-screen":`:${e}fullscreen`}};ou.names=[":fullscreen"];Ib.exports=ou});var $b=x((w$,qb)=>{u();var KO=kr(),lu=class extends KO{possible(){return super.possible().concat(["-moz- old","-ms- old"])}prefixed(e){return e==="-webkit-"?"::-webkit-input-placeholder":e==="-ms-"?"::-ms-input-placeholder":e==="-ms- old"?":-ms-input-placeholder":e==="-moz- old"?":-moz-placeholder":`::${e}placeholder`}};lu.names=["::placeholder"];qb.exports=lu});var Mb=x((v$,Lb)=>{u();var XO=kr(),uu=class extends XO{prefixed(e){return e==="-ms-"?":-ms-input-placeholder":`:${e}placeholder-shown`}};uu.names=[":placeholder-shown"];Lb.exports=uu});var Bb=x((x$,Nb)=>{u();var JO=kr(),ZO=_e(),fu=class extends JO{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=ZO.uniq(this.prefixes.map(n=>"-webkit-")))}prefixed(e){return e==="-webkit-"?"::-webkit-file-upload-button":`::${e}file-selector-button`}};fu.names=["::file-selector-button"];Nb.exports=fu});var Pe=x((k$,Fb)=>{u();Fb.exports=function(r){let e;return r==="-webkit- 2009"||r==="-moz-"?e=2009:r==="-ms-"?e=2012:r==="-webkit-"&&(e="final"),r==="-webkit- 2009"&&(r="-webkit-"),[e,r]}});var Vb=x((S$,Ub)=>{u();var jb=$e().list,zb=Pe(),eT=j(),Sr=class extends eT{prefixed(e,t){let i;return[i,t]=zb(t),i===2009?t+"box-flex":super.prefixed(e,t)}normalize(){return"flex"}set(e,t){let i=zb(t)[0];if(i===2009)return e.value=jb.space(e.value)[0],e.value=Sr.oldValues[e.value]||e.value,super.set(e,t);if(i===2012){let n=jb.space(e.value);n.length===3&&n[2]==="0"&&(e.value=n.slice(0,2).concat("0px").join(" "))}return super.set(e,t)}};Sr.names=["flex","box-flex"];Sr.oldValues={auto:"1",none:"0"};Ub.exports=Sr});var Gb=x((A$,Wb)=>{u();var Hb=Pe(),tT=j(),cu=class extends tT{prefixed(e,t){let i;return[i,t]=Hb(t),i===2009?t+"box-ordinal-group":i===2012?t+"flex-order":super.prefixed(e,t)}normalize(){return"order"}set(e,t){return Hb(t)[0]===2009&&/\d/.test(e.value)?(e.value=(parseInt(e.value)+1).toString(),super.set(e,t)):super.set(e,t)}};cu.names=["order","flex-order","box-ordinal-group"];Wb.exports=cu});var Yb=x((C$,Qb)=>{u();var rT=j(),pu=class extends rT{check(e){let t=e.value;return!t.toLowerCase().includes("alpha(")&&!t.includes("DXImageTransform.Microsoft")&&!t.includes("data:image/svg+xml")}};pu.names=["filter"];Qb.exports=pu});var Xb=x((_$,Kb)=>{u();var iT=j(),du=class extends iT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=this.clone(e),s=e.prop.replace(/end$/,"start"),o=t+e.prop.replace(/end$/,"span");if(!e.parent.some(l=>l.prop===o)){if(a.prop=o,e.value.includes("span"))a.value=e.value.replace(/span\s/i,"");else{let l;if(e.parent.walkDecls(s,c=>{l=c}),l){let c=Number(e.value)-Number(l.value)+"";a.value=c}else e.warn(n,`Can not prefix ${e.prop} (${s} is not found)`)}e.cloneBefore(a)}}};du.names=["grid-row-end","grid-column-end"];Kb.exports=du});var Zb=x((E$,Jb)=>{u();var nT=j(),hu=class extends nT{check(e){return!e.value.split(/\s+/).some(t=>{let i=t.toLowerCase();return i==="reverse"||i==="alternate-reverse"})}};hu.names=["animation","animation-direction"];Jb.exports=hu});var tw=x((O$,ew)=>{u();var sT=Pe(),aT=j(),mu=class extends aT{insert(e,t,i){let n;if([n,t]=sT(t),n!==2009)return super.insert(e,t,i);let a=e.value.split(/\s+/).filter(d=>d!=="wrap"&&d!=="nowrap"&&"wrap-reverse");if(a.length===0||e.parent.some(d=>d.prop===t+"box-orient"||d.prop===t+"box-direction"))return;let o=a[0],l=o.includes("row")?"horizontal":"vertical",c=o.includes("reverse")?"reverse":"normal",f=this.clone(e);return f.prop=t+"box-orient",f.value=l,this.needCascade(e)&&(f.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,f),f=this.clone(e),f.prop=t+"box-direction",f.value=c,this.needCascade(e)&&(f.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,f)}};mu.names=["flex-flow","box-direction","box-orient"];ew.exports=mu});var iw=x((T$,rw)=>{u();var oT=Pe(),lT=j(),gu=class extends lT{normalize(){return"flex"}prefixed(e,t){let i;return[i,t]=oT(t),i===2009?t+"box-flex":i===2012?t+"flex-positive":super.prefixed(e,t)}};gu.names=["flex-grow","flex-positive"];rw.exports=gu});var sw=x((R$,nw)=>{u();var uT=Pe(),fT=j(),yu=class extends fT{set(e,t){if(uT(t)[0]!==2009)return super.set(e,t)}};yu.names=["flex-wrap"];nw.exports=yu});var ow=x((P$,aw)=>{u();var cT=j(),Ar=Bt(),bu=class extends cT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=Ar.parse(e),[s,o]=Ar.translate(a,0,2),[l,c]=Ar.translate(a,1,3);[["grid-row",s],["grid-row-span",o],["grid-column",l],["grid-column-span",c]].forEach(([f,d])=>{Ar.insertDecl(e,f,d)}),Ar.warnTemplateSelectorNotFound(e,n),Ar.warnIfGridRowColumnExists(e,n)}};bu.names=["grid-area"];aw.exports=bu});var uw=x((I$,lw)=>{u();var pT=j(),Mi=Bt(),wu=class extends pT{insert(e,t,i){if(t!=="-ms-")return super.insert(e,t,i);if(e.parent.some(s=>s.prop==="-ms-grid-row-align"))return;let[[n,a]]=Mi.parse(e);a?(Mi.insertDecl(e,"grid-row-align",n),Mi.insertDecl(e,"grid-column-align",a)):(Mi.insertDecl(e,"grid-row-align",n),Mi.insertDecl(e,"grid-column-align",n))}};wu.names=["place-self"];lw.exports=wu});var cw=x((D$,fw)=>{u();var dT=j(),vu=class extends dT{check(e){let t=e.value;return!t.includes("/")||t.includes("span")}normalize(e){return e.replace("-start","")}prefixed(e,t){let i=super.prefixed(e,t);return t==="-ms-"&&(i=i.replace("-start","")),i}};vu.names=["grid-row-start","grid-column-start"];fw.exports=vu});var hw=x((q$,dw)=>{u();var pw=Pe(),hT=j(),Cr=class extends hT{check(e){return e.parent&&!e.parent.some(t=>t.prop&&t.prop.startsWith("grid-"))}prefixed(e,t){let i;return[i,t]=pw(t),i===2012?t+"flex-item-align":super.prefixed(e,t)}normalize(){return"align-self"}set(e,t){let i=pw(t)[0];if(i===2012)return e.value=Cr.oldValues[e.value]||e.value,super.set(e,t);if(i==="final")return super.set(e,t)}};Cr.names=["align-self","flex-item-align"];Cr.oldValues={"flex-end":"end","flex-start":"start"};dw.exports=Cr});var gw=x(($$,mw)=>{u();var mT=j(),gT=_e(),xu=class extends mT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=gT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}};xu.names=["appearance"];mw.exports=xu});var ww=x((L$,bw)=>{u();var yw=Pe(),yT=j(),ku=class extends yT{normalize(){return"flex-basis"}prefixed(e,t){let i;return[i,t]=yw(t),i===2012?t+"flex-preferred-size":super.prefixed(e,t)}set(e,t){let i;if([i,t]=yw(t),i===2012||i==="final")return super.set(e,t)}};ku.names=["flex-basis","flex-preferred-size"];bw.exports=ku});var xw=x((M$,vw)=>{u();var bT=j(),Su=class extends bT{normalize(){return this.name.replace("box-image","border")}prefixed(e,t){let i=super.prefixed(e,t);return t==="-webkit-"&&(i=i.replace("border","box-image")),i}};Su.names=["mask-border","mask-border-source","mask-border-slice","mask-border-width","mask-border-outset","mask-border-repeat","mask-box-image","mask-box-image-source","mask-box-image-slice","mask-box-image-width","mask-box-image-outset","mask-box-image-repeat"];vw.exports=Su});var Sw=x((N$,kw)=>{u();var wT=j(),lt=class extends wT{insert(e,t,i){let n=e.prop==="mask-composite",a;n?a=e.value.split(","):a=e.value.match(lt.regexp)||[],a=a.map(c=>c.trim()).filter(c=>c);let s=a.length,o;if(s&&(o=this.clone(e),o.value=a.map(c=>lt.oldValues[c]||c).join(", "),a.includes("intersect")&&(o.value+=", xor"),o.prop=t+"mask-composite"),n)return s?(this.needCascade(e)&&(o.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,o)):void 0;let l=this.clone(e);return l.prop=t+l.prop,s&&(l.value=l.value.replace(lt.regexp,"")),this.needCascade(e)&&(l.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,l),s?(this.needCascade(e)&&(o.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,o)):e}};lt.names=["mask","mask-composite"];lt.oldValues={add:"source-over",subtract:"source-out",intersect:"source-in",exclude:"xor"};lt.regexp=new RegExp(`\\s+(${Object.keys(lt.oldValues).join("|")})\\b(?!\\))\\s*(?=[,])`,"ig");kw.exports=lt});var _w=x((B$,Cw)=>{u();var Aw=Pe(),vT=j(),_r=class extends vT{prefixed(e,t){let i;return[i,t]=Aw(t),i===2009?t+"box-align":i===2012?t+"flex-align":super.prefixed(e,t)}normalize(){return"align-items"}set(e,t){let i=Aw(t)[0];return(i===2009||i===2012)&&(e.value=_r.oldValues[e.value]||e.value),super.set(e,t)}};_r.names=["align-items","flex-align","box-align"];_r.oldValues={"flex-end":"end","flex-start":"start"};Cw.exports=_r});var Ow=x((F$,Ew)=>{u();var xT=j(),Au=class extends xT{set(e,t){return t==="-ms-"&&e.value==="contain"&&(e.value="element"),super.set(e,t)}insert(e,t,i){if(!(e.value==="all"&&t==="-ms-"))return super.insert(e,t,i)}};Au.names=["user-select"];Ew.exports=Au});var Pw=x((j$,Rw)=>{u();var Tw=Pe(),kT=j(),Cu=class extends kT{normalize(){return"flex-shrink"}prefixed(e,t){let i;return[i,t]=Tw(t),i===2012?t+"flex-negative":super.prefixed(e,t)}set(e,t){let i;if([i,t]=Tw(t),i===2012||i==="final")return super.set(e,t)}};Cu.names=["flex-shrink","flex-negative"];Rw.exports=Cu});var Dw=x((z$,Iw)=>{u();var ST=j(),_u=class extends ST{prefixed(e,t){return`${t}column-${e}`}normalize(e){return e.includes("inside")?"break-inside":e.includes("before")?"break-before":"break-after"}set(e,t){return(e.prop==="break-inside"&&e.value==="avoid-column"||e.value==="avoid-page")&&(e.value="avoid"),super.set(e,t)}insert(e,t,i){if(e.prop!=="break-inside")return super.insert(e,t,i);if(!(/region/i.test(e.value)||/page/i.test(e.value)))return super.insert(e,t,i)}};_u.names=["break-inside","page-break-inside","column-break-inside","break-before","page-break-before","column-break-before","break-after","page-break-after","column-break-after"];Iw.exports=_u});var $w=x((U$,qw)=>{u();var AT=j(),Eu=class extends AT{prefixed(e,t){return t+"print-color-adjust"}normalize(){return"color-adjust"}};Eu.names=["color-adjust","print-color-adjust"];qw.exports=Eu});var Mw=x((V$,Lw)=>{u();var CT=j(),Er=class extends CT{insert(e,t,i){if(t==="-ms-"){let n=this.set(this.clone(e),t);this.needCascade(e)&&(n.raws.before=this.calcBefore(i,e,t));let a="ltr";return e.parent.nodes.forEach(s=>{s.prop==="direction"&&(s.value==="rtl"||s.value==="ltr")&&(a=s.value)}),n.value=Er.msValues[a][e.value]||e.value,e.parent.insertBefore(e,n)}return super.insert(e,t,i)}};Er.names=["writing-mode"];Er.msValues={ltr:{"horizontal-tb":"lr-tb","vertical-rl":"tb-rl","vertical-lr":"tb-lr"},rtl:{"horizontal-tb":"rl-tb","vertical-rl":"bt-rl","vertical-lr":"bt-lr"}};Lw.exports=Er});var Bw=x((H$,Nw)=>{u();var _T=j(),Ou=class extends _T{set(e,t){return e.value=e.value.replace(/\s+fill(\s)/,"$1"),super.set(e,t)}};Ou.names=["border-image"];Nw.exports=Ou});var zw=x((W$,jw)=>{u();var Fw=Pe(),ET=j(),Or=class extends ET{prefixed(e,t){let i;return[i,t]=Fw(t),i===2012?t+"flex-line-pack":super.prefixed(e,t)}normalize(){return"align-content"}set(e,t){let i=Fw(t)[0];if(i===2012)return e.value=Or.oldValues[e.value]||e.value,super.set(e,t);if(i==="final")return super.set(e,t)}};Or.names=["align-content","flex-line-pack"];Or.oldValues={"flex-end":"end","flex-start":"start","space-between":"justify","space-around":"distribute"};jw.exports=Or});var Vw=x((G$,Uw)=>{u();var OT=j(),We=class extends OT{prefixed(e,t){return t==="-moz-"?t+(We.toMozilla[e]||e):super.prefixed(e,t)}normalize(e){return We.toNormal[e]||e}};We.names=["border-radius"];We.toMozilla={};We.toNormal={};for(let r of["top","bottom"])for(let e of["left","right"]){let t=`border-${r}-${e}-radius`,i=`border-radius-${r}${e}`;We.names.push(t),We.names.push(i),We.toMozilla[t]=i,We.toNormal[i]=t}Uw.exports=We});var Ww=x((Q$,Hw)=>{u();var TT=j(),Tu=class extends TT{prefixed(e,t){return e.includes("-start")?t+e.replace("-block-start","-before"):t+e.replace("-block-end","-after")}normalize(e){return e.includes("-before")?e.replace("-before","-block-start"):e.replace("-after","-block-end")}};Tu.names=["border-block-start","border-block-end","margin-block-start","margin-block-end","padding-block-start","padding-block-end","border-before","border-after","margin-before","margin-after","padding-before","padding-after"];Hw.exports=Tu});var Qw=x((Y$,Gw)=>{u();var RT=j(),{parseTemplate:PT,warnMissedAreas:IT,getGridGap:DT,warnGridGap:qT,inheritGridGap:$T}=Bt(),Ru=class extends RT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);if(e.parent.some(h=>h.prop==="-ms-grid-rows"))return;let a=DT(e),s=$T(e,a),{rows:o,columns:l,areas:c}=PT({decl:e,gap:s||a}),f=Object.keys(c).length>0,d=Boolean(o),p=Boolean(l);return qT({gap:a,hasColumns:p,decl:e,result:n}),IT(c,e,n),(d&&p||f)&&e.cloneBefore({prop:"-ms-grid-rows",value:o,raws:{}}),p&&e.cloneBefore({prop:"-ms-grid-columns",value:l,raws:{}}),e}};Ru.names=["grid-template"];Gw.exports=Ru});var Kw=x((K$,Yw)=>{u();var LT=j(),Pu=class extends LT{prefixed(e,t){return t+e.replace("-inline","")}normalize(e){return e.replace(/(margin|padding|border)-(start|end)/,"$1-inline-$2")}};Pu.names=["border-inline-start","border-inline-end","margin-inline-start","margin-inline-end","padding-inline-start","padding-inline-end","border-start","border-end","margin-start","margin-end","padding-start","padding-end"];Yw.exports=Pu});var Jw=x((X$,Xw)=>{u();var MT=j(),Iu=class extends MT{check(e){return!e.value.includes("flex-")&&e.value!=="baseline"}prefixed(e,t){return t+"grid-row-align"}normalize(){return"align-self"}};Iu.names=["grid-row-align"];Xw.exports=Iu});var e0=x((J$,Zw)=>{u();var NT=j(),Tr=class extends NT{keyframeParents(e){let{parent:t}=e;for(;t;){if(t.type==="atrule"&&t.name==="keyframes")return!0;({parent:t}=t)}return!1}contain3d(e){if(e.prop==="transform-origin")return!1;for(let t of Tr.functions3d)if(e.value.includes(`${t}(`))return!0;return!1}set(e,t){return e=super.set(e,t),t==="-ms-"&&(e.value=e.value.replace(/rotatez/gi,"rotate")),e}insert(e,t,i){if(t==="-ms-"){if(!this.contain3d(e)&&!this.keyframeParents(e))return super.insert(e,t,i)}else if(t==="-o-"){if(!this.contain3d(e))return super.insert(e,t,i)}else return super.insert(e,t,i)}};Tr.names=["transform","transform-origin"];Tr.functions3d=["matrix3d","translate3d","translateZ","scale3d","scaleZ","rotate3d","rotateX","rotateY","perspective"];Zw.exports=Tr});var i0=x((Z$,r0)=>{u();var t0=Pe(),BT=j(),Du=class extends BT{normalize(){return"flex-direction"}insert(e,t,i){let n;if([n,t]=t0(t),n!==2009)return super.insert(e,t,i);if(e.parent.some(f=>f.prop===t+"box-orient"||f.prop===t+"box-direction"))return;let s=e.value,o,l;s==="inherit"||s==="initial"||s==="unset"?(o=s,l=s):(o=s.includes("row")?"horizontal":"vertical",l=s.includes("reverse")?"reverse":"normal");let c=this.clone(e);return c.prop=t+"box-orient",c.value=o,this.needCascade(e)&&(c.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,c),c=this.clone(e),c.prop=t+"box-direction",c.value=l,this.needCascade(e)&&(c.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,c)}old(e,t){let i;return[i,t]=t0(t),i===2009?[t+"box-orient",t+"box-direction"]:super.old(e,t)}};Du.names=["flex-direction","box-direction","box-orient"];r0.exports=Du});var s0=x((eL,n0)=>{u();var FT=j(),qu=class extends FT{check(e){return e.value==="pixelated"}prefixed(e,t){return t==="-ms-"?"-ms-interpolation-mode":super.prefixed(e,t)}set(e,t){return t!=="-ms-"?super.set(e,t):(e.prop="-ms-interpolation-mode",e.value="nearest-neighbor",e)}normalize(){return"image-rendering"}process(e,t){return super.process(e,t)}};qu.names=["image-rendering","interpolation-mode"];n0.exports=qu});var o0=x((tL,a0)=>{u();var jT=j(),zT=_e(),$u=class extends jT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=zT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}};$u.names=["backdrop-filter"];a0.exports=$u});var u0=x((rL,l0)=>{u();var UT=j(),VT=_e(),Lu=class extends UT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=VT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}check(e){return e.value.toLowerCase()==="text"}};Lu.names=["background-clip"];l0.exports=Lu});var c0=x((iL,f0)=>{u();var HT=j(),WT=["none","underline","overline","line-through","blink","inherit","initial","unset"],Mu=class extends HT{check(e){return e.value.split(/\s+/).some(t=>!WT.includes(t))}};Mu.names=["text-decoration"];f0.exports=Mu});var h0=x((nL,d0)=>{u();var p0=Pe(),GT=j(),Rr=class extends GT{prefixed(e,t){let i;return[i,t]=p0(t),i===2009?t+"box-pack":i===2012?t+"flex-pack":super.prefixed(e,t)}normalize(){return"justify-content"}set(e,t){let i=p0(t)[0];if(i===2009||i===2012){let n=Rr.oldValues[e.value]||e.value;if(e.value=n,i!==2009||n!=="distribute")return super.set(e,t)}else if(i==="final")return super.set(e,t)}};Rr.names=["justify-content","flex-pack","box-pack"];Rr.oldValues={"flex-end":"end","flex-start":"start","space-between":"justify","space-around":"distribute"};d0.exports=Rr});var g0=x((sL,m0)=>{u();var QT=j(),Nu=class extends QT{set(e,t){let i=e.value.toLowerCase();return t==="-webkit-"&&!i.includes(" ")&&i!=="contain"&&i!=="cover"&&(e.value=e.value+" "+e.value),super.set(e,t)}};Nu.names=["background-size"];m0.exports=Nu});var b0=x((aL,y0)=>{u();var YT=j(),Bu=Bt(),Fu=class extends YT{insert(e,t,i){if(t!=="-ms-")return super.insert(e,t,i);let n=Bu.parse(e),[a,s]=Bu.translate(n,0,1);n[0]&&n[0].includes("span")&&(s=n[0].join("").replace(/\D/g,"")),[[e.prop,a],[`${e.prop}-span`,s]].forEach(([l,c])=>{Bu.insertDecl(e,l,c)})}};Fu.names=["grid-row","grid-column"];y0.exports=Fu});var x0=x((oL,v0)=>{u();var KT=j(),{prefixTrackProp:w0,prefixTrackValue:XT,autoplaceGridItems:JT,getGridGap:ZT,inheritGridGap:eR}=Bt(),tR=su(),ju=class extends KT{prefixed(e,t){return t==="-ms-"?w0({prop:e,prefix:t}):super.prefixed(e,t)}normalize(e){return e.replace(/^grid-(rows|columns)/,"grid-template-$1")}insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let{parent:a,prop:s,value:o}=e,l=s.includes("rows"),c=s.includes("columns"),f=a.some(k=>k.prop==="grid-template"||k.prop==="grid-template-areas");if(f&&l)return!1;let d=new tR({options:{}}),p=d.gridStatus(a,n),h=ZT(e);h=eR(e,h)||h;let b=l?h.row:h.column;(p==="no-autoplace"||p===!0)&&!f&&(b=null);let v=XT({value:o,gap:b});e.cloneBefore({prop:w0({prop:s,prefix:t}),value:v});let y=a.nodes.find(k=>k.prop==="grid-auto-flow"),w="row";if(y&&!d.disabled(y,n)&&(w=y.value.trim()),p==="autoplace"){let k=a.nodes.find(E=>E.prop==="grid-template-rows");if(!k&&f)return;if(!k&&!f){e.warn(n,"Autoplacement does not work without grid-template-rows property");return}!a.nodes.find(E=>E.prop==="grid-template-columns")&&!f&&e.warn(n,"Autoplacement does not work without grid-template-columns property"),c&&!f&&JT(e,n,h,w)}}};ju.names=["grid-template-rows","grid-template-columns","grid-rows","grid-columns"];v0.exports=ju});var S0=x((lL,k0)=>{u();var rR=j(),zu=class extends rR{check(e){return!e.value.includes("flex-")&&e.value!=="baseline"}prefixed(e,t){return t+"grid-column-align"}normalize(){return"justify-self"}};zu.names=["grid-column-align"];k0.exports=zu});var C0=x((uL,A0)=>{u();var iR=j(),Uu=class extends iR{prefixed(e,t){return t+"scroll-chaining"}normalize(){return"overscroll-behavior"}set(e,t){return e.value==="auto"?e.value="chained":(e.value==="none"||e.value==="contain")&&(e.value="none"),super.set(e,t)}};Uu.names=["overscroll-behavior","scroll-chaining"];A0.exports=Uu});var O0=x((fL,E0)=>{u();var nR=j(),{parseGridAreas:sR,warnMissedAreas:aR,prefixTrackProp:oR,prefixTrackValue:_0,getGridGap:lR,warnGridGap:uR,inheritGridGap:fR}=Bt();function cR(r){return r.trim().slice(1,-1).split(/["']\s*["']?/g)}var Vu=class extends nR{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=!1,s=!1,o=e.parent,l=lR(e);l=fR(e,l)||l,o.walkDecls(/-ms-grid-rows/,d=>d.remove()),o.walkDecls(/grid-template-(rows|columns)/,d=>{if(d.prop==="grid-template-rows"){s=!0;let{prop:p,value:h}=d;d.cloneBefore({prop:oR({prop:p,prefix:t}),value:_0({value:h,gap:l.row})})}else a=!0});let c=cR(e.value);a&&!s&&l.row&&c.length>1&&e.cloneBefore({prop:"-ms-grid-rows",value:_0({value:`repeat(${c.length}, auto)`,gap:l.row}),raws:{}}),uR({gap:l,hasColumns:a,decl:e,result:n});let f=sR({rows:c,gap:l});return aR(f,e,n),e}};Vu.names=["grid-template-areas"];E0.exports=Vu});var R0=x((cL,T0)=>{u();var pR=j(),Hu=class extends pR{set(e,t){return t==="-webkit-"&&(e.value=e.value.replace(/\s*(right|left)\s*/i,"")),super.set(e,t)}};Hu.names=["text-emphasis-position"];T0.exports=Hu});var I0=x((pL,P0)=>{u();var dR=j(),Wu=class extends dR{set(e,t){return e.prop==="text-decoration-skip-ink"&&e.value==="auto"?(e.prop=t+"text-decoration-skip",e.value="ink",e):super.set(e,t)}};Wu.names=["text-decoration-skip-ink","text-decoration-skip"];P0.exports=Wu});var N0=x((dL,M0)=>{u();"use strict";M0.exports={wrap:D0,limit:q0,validate:$0,test:Gu,curry:hR,name:L0};function D0(r,e,t){var i=e-r;return((t-r)%i+i)%i+r}function q0(r,e,t){return Math.max(r,Math.min(e,t))}function $0(r,e,t,i,n){if(!Gu(r,e,t,i,n))throw new Error(t+" is outside of range ["+r+","+e+")");return t}function Gu(r,e,t,i,n){return!(t<r||t>e||n&&t===e||i&&t===r)}function L0(r,e,t,i){return(t?"(":"[")+r+","+e+(i?")":"]")}function hR(r,e,t,i){var n=L0.bind(null,r,e,t,i);return{wrap:D0.bind(null,r,e),limit:q0.bind(null,r,e),validate:function(a){return $0(r,e,a,t,i)},test:function(a){return Gu(r,e,a,t,i)},toString:n,name:n}}});var j0=x((hL,F0)=>{u();var Qu=$s(),mR=N0(),gR=xr(),yR=He(),bR=_e(),B0=/top|left|right|bottom/gi,wt=class extends yR{replace(e,t){let i=Qu(e);for(let n of i.nodes)if(n.type==="function"&&n.value===this.name)if(n.nodes=this.newDirection(n.nodes),n.nodes=this.normalize(n.nodes),t==="-webkit- old"){if(!this.oldWebkit(n))return!1}else n.nodes=this.convertDirection(n.nodes),n.value=t+n.value;return i.toString()}replaceFirst(e,...t){return t.map(n=>n===" "?{type:"space",value:n}:{type:"word",value:n}).concat(e.slice(1))}normalizeUnit(e,t){return`${parseFloat(e)/t*360}deg`}normalize(e){if(!e[0])return e;if(/-?\d+(.\d+)?grad/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,400);else if(/-?\d+(.\d+)?rad/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,2*Math.PI);else if(/-?\d+(.\d+)?turn/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,1);else if(e[0].value.includes("deg")){let t=parseFloat(e[0].value);t=mR.wrap(0,360,t),e[0].value=`${t}deg`}return e[0].value==="0deg"?e=this.replaceFirst(e,"to"," ","top"):e[0].value==="90deg"?e=this.replaceFirst(e,"to"," ","right"):e[0].value==="180deg"?e=this.replaceFirst(e,"to"," ","bottom"):e[0].value==="270deg"&&(e=this.replaceFirst(e,"to"," ","left")),e}newDirection(e){if(e[0].value==="to"||(B0.lastIndex=0,!B0.test(e[0].value)))return e;e.unshift({type:"word",value:"to"},{type:"space",value:" "});for(let t=2;t<e.length&&e[t].type!=="div";t++)e[t].type==="word"&&(e[t].value=this.revertDirection(e[t].value));return e}isRadial(e){let t="before";for(let i of e)if(t==="before"&&i.type==="space")t="at";else if(t==="at"&&i.value==="at")t="after";else{if(t==="after"&&i.type==="space")return!0;if(i.type==="div")break;t="before"}return!1}convertDirection(e){return e.length>0&&(e[0].value==="to"?this.fixDirection(e):e[0].value.includes("deg")?this.fixAngle(e):this.isRadial(e)&&this.fixRadial(e)),e}fixDirection(e){e.splice(0,2);for(let t of e){if(t.type==="div")break;t.type==="word"&&(t.value=this.revertDirection(t.value))}}fixAngle(e){let t=e[0].value;t=parseFloat(t),t=Math.abs(450-t)%360,t=this.roundFloat(t,3),e[0].value=`${t}deg`}fixRadial(e){let t=[],i=[],n,a,s,o,l;for(o=0;o<e.length-2;o++)if(n=e[o],a=e[o+1],s=e[o+2],n.type==="space"&&a.value==="at"&&s.type==="space"){l=o+3;break}else t.push(n);let c;for(o=l;o<e.length;o++)if(e[o].type==="div"){c=e[o];break}else i.push(e[o]);e.splice(0,o,...i,c,...t)}revertDirection(e){return wt.directions[e.toLowerCase()]||e}roundFloat(e,t){return parseFloat(e.toFixed(t))}oldWebkit(e){let{nodes:t}=e,i=Qu.stringify(e.nodes);if(this.name!=="linear-gradient"||t[0]&&t[0].value.includes("deg")||i.includes("px")||i.includes("-corner")||i.includes("-side"))return!1;let n=[[]];for(let a of t)n[n.length-1].push(a),a.type==="div"&&a.value===","&&n.push([]);this.oldDirection(n),this.colorStops(n),e.nodes=[];for(let a of n)e.nodes=e.nodes.concat(a);return e.nodes.unshift({type:"word",value:"linear"},this.cloneDiv(e.nodes)),e.value="-webkit-gradient",!0}oldDirection(e){let t=this.cloneDiv(e[0]);if(e[0][0].value!=="to")return e.unshift([{type:"word",value:wt.oldDirections.bottom},t]);{let i=[];for(let a of e[0].slice(2))a.type==="word"&&i.push(a.value.toLowerCase());i=i.join(" ");let n=wt.oldDirections[i]||i;return e[0]=[{type:"word",value:n},t],e[0]}}cloneDiv(e){for(let t of e)if(t.type==="div"&&t.value===",")return t;return{type:"div",value:",",after:" "}}colorStops(e){let t=[];for(let i=0;i<e.length;i++){let n,a=e[i],s;if(i===0)continue;let o=Qu.stringify(a[0]);a[1]&&a[1].type==="word"?n=a[1].value:a[2]&&a[2].type==="word"&&(n=a[2].value);let l;i===1&&(!n||n==="0%")?l=`from(${o})`:i===e.length-1&&(!n||n==="100%")?l=`to(${o})`:n?l=`color-stop(${n}, ${o})`:l=`color-stop(${o})`;let c=a[a.length-1];e[i]=[{type:"word",value:l}],c.type==="div"&&c.value===","&&(s=e[i].push(c)),t.push(s)}return t}old(e){if(e==="-webkit-"){let t=this.name==="linear-gradient"?"linear":"radial",i="-gradient",n=bR.regexp(`-webkit-(${t}-gradient|gradient\\(\\s*${t})`,!1);return new gR(this.name,e+this.name,i,n)}else return super.old(e)}add(e,t){let i=e.prop;if(i.includes("mask")){if(t==="-webkit-"||t==="-webkit- old")return super.add(e,t)}else if(i==="list-style"||i==="list-style-image"||i==="content"){if(t==="-webkit-"||t==="-webkit- old")return super.add(e,t)}else return super.add(e,t)}};wt.names=["linear-gradient","repeating-linear-gradient","radial-gradient","repeating-radial-gradient"];wt.directions={top:"bottom",left:"right",bottom:"top",right:"left"};wt.oldDirections={top:"left bottom, left top",left:"right top, left top",bottom:"left top, left bottom",right:"left top, right top","top right":"left bottom, right top","top left":"right bottom, left top","right top":"left bottom, right top","right bottom":"left top, right bottom","bottom right":"left top, right bottom","bottom left":"right top, left bottom","left top":"right bottom, left top","left bottom":"right top, left bottom"};F0.exports=wt});var V0=x((mL,U0)=>{u();var wR=xr(),vR=He();function z0(r){return new RegExp(`(^|[\\s,(])(${r}($|[\\s),]))`,"gi")}var Yu=class extends vR{regexp(){return this.regexpCache||(this.regexpCache=z0(this.name)),this.regexpCache}isStretch(){return this.name==="stretch"||this.name==="fill"||this.name==="fill-available"}replace(e,t){return t==="-moz-"&&this.isStretch()?e.replace(this.regexp(),"$1-moz-available$3"):t==="-webkit-"&&this.isStretch()?e.replace(this.regexp(),"$1-webkit-fill-available$3"):super.replace(e,t)}old(e){let t=e+this.name;return this.isStretch()&&(e==="-moz-"?t="-moz-available":e==="-webkit-"&&(t="-webkit-fill-available")),new wR(this.name,t,t,z0(t))}add(e,t){if(!(e.prop.includes("grid")&&t!=="-webkit-"))return super.add(e,t)}};Yu.names=["max-content","min-content","fit-content","fill","fill-available","stretch"];U0.exports=Yu});var G0=x((gL,W0)=>{u();var H0=xr(),xR=He(),Ku=class extends xR{replace(e,t){return t==="-webkit-"?e.replace(this.regexp(),"$1-webkit-optimize-contrast"):t==="-moz-"?e.replace(this.regexp(),"$1-moz-crisp-edges"):super.replace(e,t)}old(e){return e==="-webkit-"?new H0(this.name,"-webkit-optimize-contrast"):e==="-moz-"?new H0(this.name,"-moz-crisp-edges"):super.old(e)}};Ku.names=["pixelated"];W0.exports=Ku});var Y0=x((yL,Q0)=>{u();var kR=He(),Xu=class extends kR{replace(e,t){let i=super.replace(e,t);return t==="-webkit-"&&(i=i.replace(/("[^"]+"|'[^']+')(\s+\d+\w)/gi,"url($1)$2")),i}};Xu.names=["image-set"];Q0.exports=Xu});var X0=x((bL,K0)=>{u();var SR=$e().list,AR=He(),Ju=class extends AR{replace(e,t){return SR.space(e).map(i=>{if(i.slice(0,+this.name.length+1)!==this.name+"(")return i;let n=i.lastIndexOf(")"),a=i.slice(n+1),s=i.slice(this.name.length+1,n);if(t==="-webkit-"){let o=s.match(/\d*.?\d+%?/);o?(s=s.slice(o[0].length).trim(),s+=`, ${o[0]}`):s+=", 0.5"}return t+this.name+"("+s+")"+a}).join(" ")}};Ju.names=["cross-fade"];K0.exports=Ju});var Z0=x((wL,J0)=>{u();var CR=Pe(),_R=xr(),ER=He(),Zu=class extends ER{constructor(e,t){super(e,t);e==="display-flex"&&(this.name="flex")}check(e){return e.prop==="display"&&e.value===this.name}prefixed(e){let t,i;return[t,e]=CR(e),t===2009?this.name==="flex"?i="box":i="inline-box":t===2012?this.name==="flex"?i="flexbox":i="inline-flexbox":t==="final"&&(i=this.name),e+i}replace(e,t){return this.prefixed(t)}old(e){let t=this.prefixed(e);if(!!t)return new _R(this.name,t)}};Zu.names=["display-flex","inline-flex"];J0.exports=Zu});var tv=x((vL,ev)=>{u();var OR=He(),ef=class extends OR{constructor(e,t){super(e,t);e==="display-grid"&&(this.name="grid")}check(e){return e.prop==="display"&&e.value===this.name}};ef.names=["display-grid","inline-grid"];ev.exports=ef});var iv=x((xL,rv)=>{u();var TR=He(),tf=class extends TR{constructor(e,t){super(e,t);e==="filter-function"&&(this.name="filter")}};tf.names=["filter","filter-function"];rv.exports=tf});var ov=x((kL,av)=>{u();var nv=Li(),z=j(),sv=Fy(),RR=nb(),PR=su(),IR=Sb(),rf=Mt(),Pr=kr(),DR=Pb(),ut=He(),Ir=_e(),qR=Db(),$R=$b(),LR=Mb(),MR=Bb(),NR=Vb(),BR=Gb(),FR=Yb(),jR=Xb(),zR=Zb(),UR=tw(),VR=iw(),HR=sw(),WR=ow(),GR=uw(),QR=cw(),YR=hw(),KR=gw(),XR=ww(),JR=xw(),ZR=Sw(),e5=_w(),t5=Ow(),r5=Pw(),i5=Dw(),n5=$w(),s5=Mw(),a5=Bw(),o5=zw(),l5=Vw(),u5=Ww(),f5=Qw(),c5=Kw(),p5=Jw(),d5=e0(),h5=i0(),m5=s0(),g5=o0(),y5=u0(),b5=c0(),w5=h0(),v5=g0(),x5=b0(),k5=x0(),S5=S0(),A5=C0(),C5=O0(),_5=R0(),E5=I0(),O5=j0(),T5=V0(),R5=G0(),P5=Y0(),I5=X0(),D5=Z0(),q5=tv(),$5=iv();Pr.hack(qR);Pr.hack($R);Pr.hack(LR);Pr.hack(MR);z.hack(NR);z.hack(BR);z.hack(FR);z.hack(jR);z.hack(zR);z.hack(UR);z.hack(VR);z.hack(HR);z.hack(WR);z.hack(GR);z.hack(QR);z.hack(YR);z.hack(KR);z.hack(XR);z.hack(JR);z.hack(ZR);z.hack(e5);z.hack(t5);z.hack(r5);z.hack(i5);z.hack(n5);z.hack(s5);z.hack(a5);z.hack(o5);z.hack(l5);z.hack(u5);z.hack(f5);z.hack(c5);z.hack(p5);z.hack(d5);z.hack(h5);z.hack(m5);z.hack(g5);z.hack(y5);z.hack(b5);z.hack(w5);z.hack(v5);z.hack(x5);z.hack(k5);z.hack(S5);z.hack(A5);z.hack(C5);z.hack(_5);z.hack(E5);ut.hack(O5);ut.hack(T5);ut.hack(R5);ut.hack(P5);ut.hack(I5);ut.hack(D5);ut.hack(q5);ut.hack($5);var nf=new Map,Ni=class{constructor(e,t,i={}){this.data=e,this.browsers=t,this.options=i,[this.add,this.remove]=this.preprocess(this.select(this.data)),this.transition=new RR(this),this.processor=new PR(this)}cleaner(){if(this.cleanerCache)return this.cleanerCache;if(this.browsers.selected.length){let e=new rf(this.browsers.data,[]);this.cleanerCache=new Ni(this.data,e,this.options)}else return this;return this.cleanerCache}select(e){let t={add:{},remove:{}};for(let i in e){let n=e[i],a=n.browsers.map(l=>{let c=l.split(" ");return{browser:`${c[0]} ${c[1]}`,note:c[2]}}),s=a.filter(l=>l.note).map(l=>`${this.browsers.prefix(l.browser)} ${l.note}`);s=Ir.uniq(s),a=a.filter(l=>this.browsers.isSelected(l.browser)).map(l=>{let c=this.browsers.prefix(l.browser);return l.note?`${c} ${l.note}`:c}),a=this.sort(Ir.uniq(a)),this.options.flexbox==="no-2009"&&(a=a.filter(l=>!l.includes("2009")));let o=n.browsers.map(l=>this.browsers.prefix(l));n.mistakes&&(o=o.concat(n.mistakes)),o=o.concat(s),o=Ir.uniq(o),a.length?(t.add[i]=a,a.length<o.length&&(t.remove[i]=o.filter(l=>!a.includes(l)))):t.remove[i]=o}return t}sort(e){return e.sort((t,i)=>{let n=Ir.removeNote(t).length,a=Ir.removeNote(i).length;return n===a?i.length-t.length:a-n})}preprocess(e){let t={selectors:[],"@supports":new IR(Ni,this)};for(let n in e.add){let a=e.add[n];if(n==="@keyframes"||n==="@viewport")t[n]=new DR(n,a,this);else if(n==="@resolution")t[n]=new sv(n,a,this);else if(this.data[n].selector)t.selectors.push(Pr.load(n,a,this));else{let s=this.data[n].props;if(s){let o=ut.load(n,a,this);for(let l of s)t[l]||(t[l]={values:[]}),t[l].values.push(o)}else{let o=t[n]&&t[n].values||[];t[n]=z.load(n,a,this),t[n].values=o}}}let i={selectors:[]};for(let n in e.remove){let a=e.remove[n];if(this.data[n].selector){let s=Pr.load(n,a);for(let o of a)i.selectors.push(s.old(o))}else if(n==="@keyframes"||n==="@viewport")for(let s of a){let o=`@${s}${n.slice(1)}`;i[o]={remove:!0}}else if(n==="@resolution")i[n]=new sv(n,a,this);else{let s=this.data[n].props;if(s){let o=ut.load(n,[],this);for(let l of a){let c=o.old(l);if(c)for(let f of s)i[f]||(i[f]={}),i[f].values||(i[f].values=[]),i[f].values.push(c)}}else for(let o of a){let l=this.decl(n).old(n,o);if(n==="align-self"){let c=t[n]&&t[n].prefixes;if(c){if(o==="-webkit- 2009"&&c.includes("-webkit-"))continue;if(o==="-webkit-"&&c.includes("-webkit- 2009"))continue}}for(let c of l)i[c]||(i[c]={}),i[c].remove=!0}}}return[t,i]}decl(e){return nf.has(e)||nf.set(e,z.load(e)),nf.get(e)}unprefixed(e){let t=this.normalize(nv.unprefixed(e));return t==="flex-direction"&&(t="flex-flow"),t}normalize(e){return this.decl(e).normalize(e)}prefixed(e,t){return e=nv.unprefixed(e),this.decl(e).prefixed(e,t)}values(e,t){let i=this[e],n=i["*"]&&i["*"].values,a=i[t]&&i[t].values;return n&&a?Ir.uniq(n.concat(a)):n||a||[]}group(e){let t=e.parent,i=t.index(e),{length:n}=t.nodes,a=this.unprefixed(e.prop),s=(o,l)=>{for(i+=o;i>=0&&i<n;){let c=t.nodes[i];if(c.type==="decl"){if(o===-1&&c.prop===a&&!rf.withPrefix(c.value)||this.unprefixed(c.prop)!==a)break;if(l(c)===!0)return!0;if(o===1&&c.prop===a&&!rf.withPrefix(c.value))break}i+=o}return!1};return{up(o){return s(-1,o)},down(o){return s(1,o)}}}};av.exports=Ni});var uv=x((SL,lv)=>{u();lv.exports={"backdrop-filter":{feature:"css-backdrop-filter",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},element:{props:["background","background-image","border-image","mask","list-style","list-style-image","content","mask-image"],feature:"css-element-function",browsers:["firefox 114"]},"user-select":{mistakes:["-khtml-"],feature:"user-select-none",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},"background-clip":{feature:"background-clip-text",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},hyphens:{feature:"css-hyphens",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},fill:{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"fill-available":{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},stretch:{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["firefox 114"]},"fit-content":{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["firefox 114"]},"text-decoration-style":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-color":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-line":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-skip":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-skip-ink":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-size-adjust":{feature:"text-size-adjust",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"mask-clip":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-composite":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-image":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-origin":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-repeat":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-repeat":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-source":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},mask:{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-position":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-size":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-outset":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-width":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-slice":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"clip-path":{feature:"css-clip-path",browsers:["samsung 21"]},"box-decoration-break":{feature:"css-boxdecorationbreak",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","opera 99","safari 16.5","samsung 21"]},appearance:{feature:"css-appearance",browsers:["samsung 21"]},"image-set":{props:["background","background-image","border-image","cursor","mask","mask-image","list-style","list-style-image","content"],feature:"css-image-set",browsers:["and_uc 15.5","chrome 109","samsung 21"]},"cross-fade":{props:["background","background-image","border-image","mask","list-style","list-style-image","content","mask-image"],feature:"css-cross-fade",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},isolate:{props:["unicode-bidi"],feature:"css-unicode-bidi",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},"color-adjust":{feature:"css-color-adjust",browsers:["chrome 109","chrome 113","chrome 114","edge 114","opera 99"]}}});var cv=x((AL,fv)=>{u();fv.exports={}});var mv=x((CL,hv)=>{u();var L5=Gl(),{agents:M5}=(Ts(),Os),sf=_y(),N5=Mt(),B5=ov(),F5=uv(),j5=cv(),pv={browsers:M5,prefixes:F5},dv=`
-  Replace Autoprefixer \`browsers\` option to Browserslist config.
-  Use \`browserslist\` key in \`package.json\` or \`.browserslistrc\` file.
-
-  Using \`browsers\` option can cause errors. Browserslist config can
-  be used for Babel, Autoprefixer, postcss-normalize and other tools.
-
-  If you really need to use option, rename it to \`overrideBrowserslist\`.
-
-  Learn more at:
-  https://github.com/browserslist/browserslist#readme
-  https://twitter.com/browserslist
-
-`;function z5(r){return Object.prototype.toString.apply(r)==="[object Object]"}var af=new Map;function U5(r,e){e.browsers.selected.length!==0&&(e.add.selectors.length>0||Object.keys(e.add).length>2||r.warn(`Autoprefixer target browsers do not need any prefixes.You do not need Autoprefixer anymore.
-Check your Browserslist config to be sure that your targets are set up correctly.
-
-  Learn more at:
-  https://github.com/postcss/autoprefixer#readme
-  https://github.com/browserslist/browserslist#readme
-
-`))}hv.exports=Dr;function Dr(...r){let e;if(r.length===1&&z5(r[0])?(e=r[0],r=void 0):r.length===0||r.length===1&&!r[0]?r=void 0:r.length<=2&&(Array.isArray(r[0])||!r[0])?(e=r[1],r=r[0]):typeof r[r.length-1]=="object"&&(e=r.pop()),e||(e={}),e.browser)throw new Error("Change `browser` option to `overrideBrowserslist` in Autoprefixer");if(e.browserslist)throw new Error("Change `browserslist` option to `overrideBrowserslist` in Autoprefixer");e.overrideBrowserslist?r=e.overrideBrowserslist:e.browsers&&(typeof console!="undefined"&&console.warn&&(sf.red?console.warn(sf.red(dv.replace(/`[^`]+`/g,n=>sf.yellow(n.slice(1,-1))))):console.warn(dv)),r=e.browsers);let t={ignoreUnknownVersions:e.ignoreUnknownVersions,stats:e.stats,env:e.env};function i(n){let a=pv,s=new N5(a.browsers,r,n,t),o=s.selected.join(", ")+JSON.stringify(e);return af.has(o)||af.set(o,new B5(a.prefixes,s,e)),af.get(o)}return{postcssPlugin:"autoprefixer",prepare(n){let a=i({from:n.opts.from,env:e.env});return{OnceExit(s){U5(n,a),e.remove!==!1&&a.processor.remove(s,n),e.add!==!1&&a.processor.add(s,n)}}},info(n){return n=n||{},n.from=n.from||m.cwd(),j5(i(n))},options:e,browsers:r}}Dr.postcss=!0;Dr.data=pv;Dr.defaults=L5.defaults;Dr.info=()=>Dr().info()});var gv={};Ge(gv,{default:()=>V5});var V5,yv=R(()=>{u();V5=[]});var wv={};Ge(wv,{default:()=>H5});var bv,H5,vv=R(()=>{u();Yi();bv=pe(en()),H5=St(bv.default.theme)});var kv={};Ge(kv,{default:()=>W5});var xv,W5,Sv=R(()=>{u();Yi();xv=pe(en()),W5=St(xv.default)});u();"use strict";var G5=vt(Ay()),Q5=vt($e()),Y5=vt(mv()),K5=vt((yv(),gv)),X5=vt((vv(),wv)),J5=vt((Sv(),kv)),Z5=vt((zs(),Af)),eP=vt((nl(),il)),tP=vt((ia(),ic));function vt(r){return r&&r.__esModule?r:{default:r}}console.warn("cdn.tailwindcss.com should not be used in production. To use Tailwind CSS in production, install it as a PostCSS plugin or use the Tailwind CLI: https://tailwindcss.com/docs/installation");var Ls="tailwind",of="text/tailwindcss",Av="/template.html",Yt,Cv=!0,_v=0,lf=new Set,uf,Ev="",Ov=(r=!1)=>({get(e,t){return(!r||t==="config")&&typeof e[t]=="object"&&e[t]!==null?new Proxy(e[t],Ov()):e[t]},set(e,t,i){return e[t]=i,(!r||t==="config")&&ff(!0),!0}});window[Ls]=new Proxy({config:{},defaultTheme:X5.default,defaultConfig:J5.default,colors:Z5.default,plugin:eP.default,resolveConfig:tP.default},Ov(!0));function Tv(r){uf.observe(r,{attributes:!0,attributeFilter:["type"],characterData:!0,subtree:!0,childList:!0})}new MutationObserver(async r=>{let e=!1;if(!uf){uf=new MutationObserver(async()=>await ff(!0));for(let t of document.querySelectorAll(`style[type="${of}"]`))Tv(t)}for(let t of r)for(let i of t.addedNodes)i.nodeType===1&&i.tagName==="STYLE"&&i.getAttribute("type")===of&&(Tv(i),e=!0);await ff(e)}).observe(document.documentElement,{attributes:!0,attributeFilter:["class"],childList:!0,subtree:!0});async function ff(r=!1){r&&(_v++,lf.clear());let e="";for(let i of document.querySelectorAll(`style[type="${of}"]`))e+=i.textContent;let t=new Set;for(let i of document.querySelectorAll("[class]"))for(let n of i.classList)lf.has(n)||t.add(n);if(document.body&&(Cv||t.size>0||e!==Ev||!Yt||!Yt.isConnected)){for(let n of t)lf.add(n);Cv=!1,Ev=e,self[Av]=Array.from(t).join(" ");let{css:i}=await(0,Q5.default)([(0,G5.default)({...window[Ls].config,_hash:_v,content:{files:[Av],extract:{html:n=>n.split(" ")}},plugins:[...K5.default,...Array.isArray(window[Ls].config.plugins)?window[Ls].config.plugins:[]]}),(0,Y5.default)({remove:!1})]).process(`@tailwind base;@tailwind components;@tailwind utilities;${e}`);(!Yt||!Yt.isConnected)&&(Yt=document.createElement("style"),document.head.append(Yt)),Yt.textContent=i}}})();
-/*!
- * fill-range <https://github.com/jonschlinkert/fill-range>
- *
- * Copyright (c) 2014-present, Jon Schlinkert.
- * Licensed under the MIT License.
- */
-/*!
- * is-number <https://github.com/jonschlinkert/is-number>
- *
- * Copyright (c) 2014-present, Jon Schlinkert.
- * Released under the MIT License.
- */
-/*!
- * to-regex-range <https://github.com/micromatch/to-regex-range>
- *
- * Copyright (c) 2015-present, Jon Schlinkert.
- * Released under the MIT License.
- */
-/*! https://mths.be/cssesc v3.0.0 by @mathias */
-
diff --git a/examples/server/public/deps_vue.esm-browser.js b/examples/server/public/deps_vue.esm-browser.js
deleted file mode 100644
index 4679d9614..000000000
--- a/examples/server/public/deps_vue.esm-browser.js
+++ /dev/null
@@ -1,18160 +0,0 @@
-/**
-* vue v3.5.12
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/
-/*! #__NO_SIDE_EFFECTS__ */
-// @__NO_SIDE_EFFECTS__
-function makeMap(str) {
-  const map = /* @__PURE__ */ Object.create(null);
-  for (const key of str.split(",")) map[key] = 1;
-  return (val) => val in map;
-}
-
-const EMPTY_OBJ = Object.freeze({}) ;
-const EMPTY_ARR = Object.freeze([]) ;
-const NOOP = () => {
-};
-const NO = () => false;
-const isOn = (key) => key.charCodeAt(0) === 111 && key.charCodeAt(1) === 110 && // uppercase letter
-(key.charCodeAt(2) > 122 || key.charCodeAt(2) < 97);
-const isModelListener = (key) => key.startsWith("onUpdate:");
-const extend = Object.assign;
-const remove = (arr, el) => {
-  const i = arr.indexOf(el);
-  if (i > -1) {
-    arr.splice(i, 1);
-  }
-};
-const hasOwnProperty$1 = Object.prototype.hasOwnProperty;
-const hasOwn = (val, key) => hasOwnProperty$1.call(val, key);
-const isArray = Array.isArray;
-const isMap = (val) => toTypeString(val) === "[object Map]";
-const isSet = (val) => toTypeString(val) === "[object Set]";
-const isDate = (val) => toTypeString(val) === "[object Date]";
-const isRegExp = (val) => toTypeString(val) === "[object RegExp]";
-const isFunction = (val) => typeof val === "function";
-const isString = (val) => typeof val === "string";
-const isSymbol = (val) => typeof val === "symbol";
-const isObject = (val) => val !== null && typeof val === "object";
-const isPromise = (val) => {
-  return (isObject(val) || isFunction(val)) && isFunction(val.then) && isFunction(val.catch);
-};
-const objectToString = Object.prototype.toString;
-const toTypeString = (value) => objectToString.call(value);
-const toRawType = (value) => {
-  return toTypeString(value).slice(8, -1);
-};
-const isPlainObject = (val) => toTypeString(val) === "[object Object]";
-const isIntegerKey = (key) => isString(key) && key !== "NaN" && key[0] !== "-" && "" + parseInt(key, 10) === key;
-const isReservedProp = /* @__PURE__ */ makeMap(
-  // the leading comma is intentional so empty string "" is also included
-  ",key,ref,ref_for,ref_key,onVnodeBeforeMount,onVnodeMounted,onVnodeBeforeUpdate,onVnodeUpdated,onVnodeBeforeUnmount,onVnodeUnmounted"
-);
-const isBuiltInDirective = /* @__PURE__ */ makeMap(
-  "bind,cloak,else-if,else,for,html,if,model,on,once,pre,show,slot,text,memo"
-);
-const cacheStringFunction = (fn) => {
-  const cache = /* @__PURE__ */ Object.create(null);
-  return (str) => {
-    const hit = cache[str];
-    return hit || (cache[str] = fn(str));
-  };
-};
-const camelizeRE = /-(\w)/g;
-const camelize = cacheStringFunction(
-  (str) => {
-    return str.replace(camelizeRE, (_, c) => c ? c.toUpperCase() : "");
-  }
-);
-const hyphenateRE = /\B([A-Z])/g;
-const hyphenate = cacheStringFunction(
-  (str) => str.replace(hyphenateRE, "-$1").toLowerCase()
-);
-const capitalize = cacheStringFunction((str) => {
-  return str.charAt(0).toUpperCase() + str.slice(1);
-});
-const toHandlerKey = cacheStringFunction(
-  (str) => {
-    const s = str ? `on${capitalize(str)}` : ``;
-    return s;
-  }
-);
-const hasChanged = (value, oldValue) => !Object.is(value, oldValue);
-const invokeArrayFns = (fns, ...arg) => {
-  for (let i = 0; i < fns.length; i++) {
-    fns[i](...arg);
-  }
-};
-const def = (obj, key, value, writable = false) => {
-  Object.defineProperty(obj, key, {
-    configurable: true,
-    enumerable: false,
-    writable,
-    value
-  });
-};
-const looseToNumber = (val) => {
-  const n = parseFloat(val);
-  return isNaN(n) ? val : n;
-};
-const toNumber = (val) => {
-  const n = isString(val) ? Number(val) : NaN;
-  return isNaN(n) ? val : n;
-};
-let _globalThis;
-const getGlobalThis = () => {
-  return _globalThis || (_globalThis = typeof globalThis !== "undefined" ? globalThis : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : {});
-};
-function genCacheKey(source, options) {
-  return source + JSON.stringify(
-    options,
-    (_, val) => typeof val === "function" ? val.toString() : val
-  );
-}
-
-const PatchFlagNames = {
-  [1]: `TEXT`,
-  [2]: `CLASS`,
-  [4]: `STYLE`,
-  [8]: `PROPS`,
-  [16]: `FULL_PROPS`,
-  [32]: `NEED_HYDRATION`,
-  [64]: `STABLE_FRAGMENT`,
-  [128]: `KEYED_FRAGMENT`,
-  [256]: `UNKEYED_FRAGMENT`,
-  [512]: `NEED_PATCH`,
-  [1024]: `DYNAMIC_SLOTS`,
-  [2048]: `DEV_ROOT_FRAGMENT`,
-  [-1]: `HOISTED`,
-  [-2]: `BAIL`
-};
-
-const slotFlagsText = {
-  [1]: "STABLE",
-  [2]: "DYNAMIC",
-  [3]: "FORWARDED"
-};
-
-const GLOBALS_ALLOWED = "Infinity,undefined,NaN,isFinite,isNaN,parseFloat,parseInt,decodeURI,decodeURIComponent,encodeURI,encodeURIComponent,Math,Number,Date,Array,Object,Boolean,String,RegExp,Map,Set,JSON,Intl,BigInt,console,Error,Symbol";
-const isGloballyAllowed = /* @__PURE__ */ makeMap(GLOBALS_ALLOWED);
-
-const range = 2;
-function generateCodeFrame(source, start = 0, end = source.length) {
-  start = Math.max(0, Math.min(start, source.length));
-  end = Math.max(0, Math.min(end, source.length));
-  if (start > end) return "";
-  let lines = source.split(/(\r?\n)/);
-  const newlineSequences = lines.filter((_, idx) => idx % 2 === 1);
-  lines = lines.filter((_, idx) => idx % 2 === 0);
-  let count = 0;
-  const res = [];
-  for (let i = 0; i < lines.length; i++) {
-    count += lines[i].length + (newlineSequences[i] && newlineSequences[i].length || 0);
-    if (count >= start) {
-      for (let j = i - range; j <= i + range || end > count; j++) {
-        if (j < 0 || j >= lines.length) continue;
-        const line = j + 1;
-        res.push(
-          `${line}${" ".repeat(Math.max(3 - String(line).length, 0))}|  ${lines[j]}`
-        );
-        const lineLength = lines[j].length;
-        const newLineSeqLength = newlineSequences[j] && newlineSequences[j].length || 0;
-        if (j === i) {
-          const pad = start - (count - (lineLength + newLineSeqLength));
-          const length = Math.max(
-            1,
-            end > count ? lineLength - pad : end - start
-          );
-          res.push(`   |  ` + " ".repeat(pad) + "^".repeat(length));
-        } else if (j > i) {
-          if (end > count) {
-            const length = Math.max(Math.min(end - count, lineLength), 1);
-            res.push(`   |  ` + "^".repeat(length));
-          }
-          count += lineLength + newLineSeqLength;
-        }
-      }
-      break;
-    }
-  }
-  return res.join("\n");
-}
-
-function normalizeStyle(value) {
-  if (isArray(value)) {
-    const res = {};
-    for (let i = 0; i < value.length; i++) {
-      const item = value[i];
-      const normalized = isString(item) ? parseStringStyle(item) : normalizeStyle(item);
-      if (normalized) {
-        for (const key in normalized) {
-          res[key] = normalized[key];
-        }
-      }
-    }
-    return res;
-  } else if (isString(value) || isObject(value)) {
-    return value;
-  }
-}
-const listDelimiterRE = /;(?![^(]*\))/g;
-const propertyDelimiterRE = /:([^]+)/;
-const styleCommentRE = /\/\*[^]*?\*\//g;
-function parseStringStyle(cssText) {
-  const ret = {};
-  cssText.replace(styleCommentRE, "").split(listDelimiterRE).forEach((item) => {
-    if (item) {
-      const tmp = item.split(propertyDelimiterRE);
-      tmp.length > 1 && (ret[tmp[0].trim()] = tmp[1].trim());
-    }
-  });
-  return ret;
-}
-function stringifyStyle(styles) {
-  let ret = "";
-  if (!styles || isString(styles)) {
-    return ret;
-  }
-  for (const key in styles) {
-    const value = styles[key];
-    if (isString(value) || typeof value === "number") {
-      const normalizedKey = key.startsWith(`--`) ? key : hyphenate(key);
-      ret += `${normalizedKey}:${value};`;
-    }
-  }
-  return ret;
-}
-function normalizeClass(value) {
-  let res = "";
-  if (isString(value)) {
-    res = value;
-  } else if (isArray(value)) {
-    for (let i = 0; i < value.length; i++) {
-      const normalized = normalizeClass(value[i]);
-      if (normalized) {
-        res += normalized + " ";
-      }
-    }
-  } else if (isObject(value)) {
-    for (const name in value) {
-      if (value[name]) {
-        res += name + " ";
-      }
-    }
-  }
-  return res.trim();
-}
-function normalizeProps(props) {
-  if (!props) return null;
-  let { class: klass, style } = props;
-  if (klass && !isString(klass)) {
-    props.class = normalizeClass(klass);
-  }
-  if (style) {
-    props.style = normalizeStyle(style);
-  }
-  return props;
-}
-
-const HTML_TAGS = "html,body,base,head,link,meta,style,title,address,article,aside,footer,header,hgroup,h1,h2,h3,h4,h5,h6,nav,section,div,dd,dl,dt,figcaption,figure,picture,hr,img,li,main,ol,p,pre,ul,a,b,abbr,bdi,bdo,br,cite,code,data,dfn,em,i,kbd,mark,q,rp,rt,ruby,s,samp,small,span,strong,sub,sup,time,u,var,wbr,area,audio,map,track,video,embed,object,param,source,canvas,script,noscript,del,ins,caption,col,colgroup,table,thead,tbody,td,th,tr,button,datalist,fieldset,form,input,label,legend,meter,optgroup,option,output,progress,select,textarea,details,dialog,menu,summary,template,blockquote,iframe,tfoot";
-const SVG_TAGS = "svg,animate,animateMotion,animateTransform,circle,clipPath,color-profile,defs,desc,discard,ellipse,feBlend,feColorMatrix,feComponentTransfer,feComposite,feConvolveMatrix,feDiffuseLighting,feDisplacementMap,feDistantLight,feDropShadow,feFlood,feFuncA,feFuncB,feFuncG,feFuncR,feGaussianBlur,feImage,feMerge,feMergeNode,feMorphology,feOffset,fePointLight,feSpecularLighting,feSpotLight,feTile,feTurbulence,filter,foreignObject,g,hatch,hatchpath,image,line,linearGradient,marker,mask,mesh,meshgradient,meshpatch,meshrow,metadata,mpath,path,pattern,polygon,polyline,radialGradient,rect,set,solidcolor,stop,switch,symbol,text,textPath,title,tspan,unknown,use,view";
-const MATH_TAGS = "annotation,annotation-xml,maction,maligngroup,malignmark,math,menclose,merror,mfenced,mfrac,mfraction,mglyph,mi,mlabeledtr,mlongdiv,mmultiscripts,mn,mo,mover,mpadded,mphantom,mprescripts,mroot,mrow,ms,mscarries,mscarry,msgroup,msline,mspace,msqrt,msrow,mstack,mstyle,msub,msubsup,msup,mtable,mtd,mtext,mtr,munder,munderover,none,semantics";
-const VOID_TAGS = "area,base,br,col,embed,hr,img,input,link,meta,param,source,track,wbr";
-const isHTMLTag = /* @__PURE__ */ makeMap(HTML_TAGS);
-const isSVGTag = /* @__PURE__ */ makeMap(SVG_TAGS);
-const isMathMLTag = /* @__PURE__ */ makeMap(MATH_TAGS);
-const isVoidTag = /* @__PURE__ */ makeMap(VOID_TAGS);
-
-const specialBooleanAttrs = `itemscope,allowfullscreen,formnovalidate,ismap,nomodule,novalidate,readonly`;
-const isSpecialBooleanAttr = /* @__PURE__ */ makeMap(specialBooleanAttrs);
-const isBooleanAttr = /* @__PURE__ */ makeMap(
-  specialBooleanAttrs + `,async,autofocus,autoplay,controls,default,defer,disabled,hidden,inert,loop,open,required,reversed,scoped,seamless,checked,muted,multiple,selected`
-);
-function includeBooleanAttr(value) {
-  return !!value || value === "";
-}
-const isKnownHtmlAttr = /* @__PURE__ */ makeMap(
-  `accept,accept-charset,accesskey,action,align,allow,alt,async,autocapitalize,autocomplete,autofocus,autoplay,background,bgcolor,border,buffered,capture,challenge,charset,checked,cite,class,code,codebase,color,cols,colspan,content,contenteditable,contextmenu,controls,coords,crossorigin,csp,data,datetime,decoding,default,defer,dir,dirname,disabled,download,draggable,dropzone,enctype,enterkeyhint,for,form,formaction,formenctype,formmethod,formnovalidate,formtarget,headers,height,hidden,high,href,hreflang,http-equiv,icon,id,importance,inert,integrity,ismap,itemprop,keytype,kind,label,lang,language,loading,list,loop,low,manifest,max,maxlength,minlength,media,min,multiple,muted,name,novalidate,open,optimum,pattern,ping,placeholder,poster,preload,radiogroup,readonly,referrerpolicy,rel,required,reversed,rows,rowspan,sandbox,scope,scoped,selected,shape,size,sizes,slot,span,spellcheck,src,srcdoc,srclang,srcset,start,step,style,summary,tabindex,target,title,translate,type,usemap,value,width,wrap`
-);
-const isKnownSvgAttr = /* @__PURE__ */ makeMap(
-  `xmlns,accent-height,accumulate,additive,alignment-baseline,alphabetic,amplitude,arabic-form,ascent,attributeName,attributeType,azimuth,baseFrequency,baseline-shift,baseProfile,bbox,begin,bias,by,calcMode,cap-height,class,clip,clipPathUnits,clip-path,clip-rule,color,color-interpolation,color-interpolation-filters,color-profile,color-rendering,contentScriptType,contentStyleType,crossorigin,cursor,cx,cy,d,decelerate,descent,diffuseConstant,direction,display,divisor,dominant-baseline,dur,dx,dy,edgeMode,elevation,enable-background,end,exponent,fill,fill-opacity,fill-rule,filter,filterRes,filterUnits,flood-color,flood-opacity,font-family,font-size,font-size-adjust,font-stretch,font-style,font-variant,font-weight,format,from,fr,fx,fy,g1,g2,glyph-name,glyph-orientation-horizontal,glyph-orientation-vertical,glyphRef,gradientTransform,gradientUnits,hanging,height,href,hreflang,horiz-adv-x,horiz-origin-x,id,ideographic,image-rendering,in,in2,intercept,k,k1,k2,k3,k4,kernelMatrix,kernelUnitLength,kerning,keyPoints,keySplines,keyTimes,lang,lengthAdjust,letter-spacing,lighting-color,limitingConeAngle,local,marker-end,marker-mid,marker-start,markerHeight,markerUnits,markerWidth,mask,maskContentUnits,maskUnits,mathematical,max,media,method,min,mode,name,numOctaves,offset,opacity,operator,order,orient,orientation,origin,overflow,overline-position,overline-thickness,panose-1,paint-order,path,pathLength,patternContentUnits,patternTransform,patternUnits,ping,pointer-events,points,pointsAtX,pointsAtY,pointsAtZ,preserveAlpha,preserveAspectRatio,primitiveUnits,r,radius,referrerPolicy,refX,refY,rel,rendering-intent,repeatCount,repeatDur,requiredExtensions,requiredFeatures,restart,result,rotate,rx,ry,scale,seed,shape-rendering,slope,spacing,specularConstant,specularExponent,speed,spreadMethod,startOffset,stdDeviation,stemh,stemv,stitchTiles,stop-color,stop-opacity,strikethrough-position,strikethrough-thickness,string,stroke,stroke-dasharray,stroke-dashoffset,stroke-linecap,stroke-linejoin,stroke-miterlimit,stroke-opacity,stroke-width,style,surfaceScale,systemLanguage,tabindex,tableValues,target,targetX,targetY,text-anchor,text-decoration,text-rendering,textLength,to,transform,transform-origin,type,u1,u2,underline-position,underline-thickness,unicode,unicode-bidi,unicode-range,units-per-em,v-alphabetic,v-hanging,v-ideographic,v-mathematical,values,vector-effect,version,vert-adv-y,vert-origin-x,vert-origin-y,viewBox,viewTarget,visibility,width,widths,word-spacing,writing-mode,x,x-height,x1,x2,xChannelSelector,xlink:actuate,xlink:arcrole,xlink:href,xlink:role,xlink:show,xlink:title,xlink:type,xmlns:xlink,xml:base,xml:lang,xml:space,y,y1,y2,yChannelSelector,z,zoomAndPan`
-);
-function isRenderableAttrValue(value) {
-  if (value == null) {
-    return false;
-  }
-  const type = typeof value;
-  return type === "string" || type === "number" || type === "boolean";
-}
-
-const cssVarNameEscapeSymbolsRE = /[ !"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g;
-function getEscapedCssVarName(key, doubleEscape) {
-  return key.replace(
-    cssVarNameEscapeSymbolsRE,
-    (s) => `\\${s}`
-  );
-}
-
-function looseCompareArrays(a, b) {
-  if (a.length !== b.length) return false;
-  let equal = true;
-  for (let i = 0; equal && i < a.length; i++) {
-    equal = looseEqual(a[i], b[i]);
-  }
-  return equal;
-}
-function looseEqual(a, b) {
-  if (a === b) return true;
-  let aValidType = isDate(a);
-  let bValidType = isDate(b);
-  if (aValidType || bValidType) {
-    return aValidType && bValidType ? a.getTime() === b.getTime() : false;
-  }
-  aValidType = isSymbol(a);
-  bValidType = isSymbol(b);
-  if (aValidType || bValidType) {
-    return a === b;
-  }
-  aValidType = isArray(a);
-  bValidType = isArray(b);
-  if (aValidType || bValidType) {
-    return aValidType && bValidType ? looseCompareArrays(a, b) : false;
-  }
-  aValidType = isObject(a);
-  bValidType = isObject(b);
-  if (aValidType || bValidType) {
-    if (!aValidType || !bValidType) {
-      return false;
-    }
-    const aKeysCount = Object.keys(a).length;
-    const bKeysCount = Object.keys(b).length;
-    if (aKeysCount !== bKeysCount) {
-      return false;
-    }
-    for (const key in a) {
-      const aHasKey = a.hasOwnProperty(key);
-      const bHasKey = b.hasOwnProperty(key);
-      if (aHasKey && !bHasKey || !aHasKey && bHasKey || !looseEqual(a[key], b[key])) {
-        return false;
-      }
-    }
-  }
-  return String(a) === String(b);
-}
-function looseIndexOf(arr, val) {
-  return arr.findIndex((item) => looseEqual(item, val));
-}
-
-const isRef$1 = (val) => {
-  return !!(val && val["__v_isRef"] === true);
-};
-const toDisplayString = (val) => {
-  return isString(val) ? val : val == null ? "" : isArray(val) || isObject(val) && (val.toString === objectToString || !isFunction(val.toString)) ? isRef$1(val) ? toDisplayString(val.value) : JSON.stringify(val, replacer, 2) : String(val);
-};
-const replacer = (_key, val) => {
-  if (isRef$1(val)) {
-    return replacer(_key, val.value);
-  } else if (isMap(val)) {
-    return {
-      [`Map(${val.size})`]: [...val.entries()].reduce(
-        (entries, [key, val2], i) => {
-          entries[stringifySymbol(key, i) + " =>"] = val2;
-          return entries;
-        },
-        {}
-      )
-    };
-  } else if (isSet(val)) {
-    return {
-      [`Set(${val.size})`]: [...val.values()].map((v) => stringifySymbol(v))
-    };
-  } else if (isSymbol(val)) {
-    return stringifySymbol(val);
-  } else if (isObject(val) && !isArray(val) && !isPlainObject(val)) {
-    return String(val);
-  }
-  return val;
-};
-const stringifySymbol = (v, i = "") => {
-  var _a;
-  return (
-    // Symbol.description in es2019+ so we need to cast here to pass
-    // the lib: es2016 check
-    isSymbol(v) ? `Symbol(${(_a = v.description) != null ? _a : i})` : v
-  );
-};
-
-function warn$2(msg, ...args) {
-  console.warn(`[Vue warn] ${msg}`, ...args);
-}
-
-let activeEffectScope;
-class EffectScope {
-  constructor(detached = false) {
-    this.detached = detached;
-    /**
-     * @internal
-     */
-    this._active = true;
-    /**
-     * @internal
-     */
-    this.effects = [];
-    /**
-     * @internal
-     */
-    this.cleanups = [];
-    this._isPaused = false;
-    this.parent = activeEffectScope;
-    if (!detached && activeEffectScope) {
-      this.index = (activeEffectScope.scopes || (activeEffectScope.scopes = [])).push(
-        this
-      ) - 1;
-    }
-  }
-  get active() {
-    return this._active;
-  }
-  pause() {
-    if (this._active) {
-      this._isPaused = true;
-      let i, l;
-      if (this.scopes) {
-        for (i = 0, l = this.scopes.length; i < l; i++) {
-          this.scopes[i].pause();
-        }
-      }
-      for (i = 0, l = this.effects.length; i < l; i++) {
-        this.effects[i].pause();
-      }
-    }
-  }
-  /**
-   * Resumes the effect scope, including all child scopes and effects.
-   */
-  resume() {
-    if (this._active) {
-      if (this._isPaused) {
-        this._isPaused = false;
-        let i, l;
-        if (this.scopes) {
-          for (i = 0, l = this.scopes.length; i < l; i++) {
-            this.scopes[i].resume();
-          }
-        }
-        for (i = 0, l = this.effects.length; i < l; i++) {
-          this.effects[i].resume();
-        }
-      }
-    }
-  }
-  run(fn) {
-    if (this._active) {
-      const currentEffectScope = activeEffectScope;
-      try {
-        activeEffectScope = this;
-        return fn();
-      } finally {
-        activeEffectScope = currentEffectScope;
-      }
-    } else {
-      warn$2(`cannot run an inactive effect scope.`);
-    }
-  }
-  /**
-   * This should only be called on non-detached scopes
-   * @internal
-   */
-  on() {
-    activeEffectScope = this;
-  }
-  /**
-   * This should only be called on non-detached scopes
-   * @internal
-   */
-  off() {
-    activeEffectScope = this.parent;
-  }
-  stop(fromParent) {
-    if (this._active) {
-      let i, l;
-      for (i = 0, l = this.effects.length; i < l; i++) {
-        this.effects[i].stop();
-      }
-      for (i = 0, l = this.cleanups.length; i < l; i++) {
-        this.cleanups[i]();
-      }
-      if (this.scopes) {
-        for (i = 0, l = this.scopes.length; i < l; i++) {
-          this.scopes[i].stop(true);
-        }
-      }
-      if (!this.detached && this.parent && !fromParent) {
-        const last = this.parent.scopes.pop();
-        if (last && last !== this) {
-          this.parent.scopes[this.index] = last;
-          last.index = this.index;
-        }
-      }
-      this.parent = void 0;
-      this._active = false;
-    }
-  }
-}
-function effectScope(detached) {
-  return new EffectScope(detached);
-}
-function getCurrentScope() {
-  return activeEffectScope;
-}
-function onScopeDispose(fn, failSilently = false) {
-  if (activeEffectScope) {
-    activeEffectScope.cleanups.push(fn);
-  } else if (!failSilently) {
-    warn$2(
-      `onScopeDispose() is called when there is no active effect scope to be associated with.`
-    );
-  }
-}
-
-let activeSub;
-const pausedQueueEffects = /* @__PURE__ */ new WeakSet();
-class ReactiveEffect {
-  constructor(fn) {
-    this.fn = fn;
-    /**
-     * @internal
-     */
-    this.deps = void 0;
-    /**
-     * @internal
-     */
-    this.depsTail = void 0;
-    /**
-     * @internal
-     */
-    this.flags = 1 | 4;
-    /**
-     * @internal
-     */
-    this.next = void 0;
-    /**
-     * @internal
-     */
-    this.cleanup = void 0;
-    this.scheduler = void 0;
-    if (activeEffectScope && activeEffectScope.active) {
-      activeEffectScope.effects.push(this);
-    }
-  }
-  pause() {
-    this.flags |= 64;
-  }
-  resume() {
-    if (this.flags & 64) {
-      this.flags &= ~64;
-      if (pausedQueueEffects.has(this)) {
-        pausedQueueEffects.delete(this);
-        this.trigger();
-      }
-    }
-  }
-  /**
-   * @internal
-   */
-  notify() {
-    if (this.flags & 2 && !(this.flags & 32)) {
-      return;
-    }
-    if (!(this.flags & 8)) {
-      batch(this);
-    }
-  }
-  run() {
-    if (!(this.flags & 1)) {
-      return this.fn();
-    }
-    this.flags |= 2;
-    cleanupEffect(this);
-    prepareDeps(this);
-    const prevEffect = activeSub;
-    const prevShouldTrack = shouldTrack;
-    activeSub = this;
-    shouldTrack = true;
-    try {
-      return this.fn();
-    } finally {
-      if (activeSub !== this) {
-        warn$2(
-          "Active effect was not restored correctly - this is likely a Vue internal bug."
-        );
-      }
-      cleanupDeps(this);
-      activeSub = prevEffect;
-      shouldTrack = prevShouldTrack;
-      this.flags &= ~2;
-    }
-  }
-  stop() {
-    if (this.flags & 1) {
-      for (let link = this.deps; link; link = link.nextDep) {
-        removeSub(link);
-      }
-      this.deps = this.depsTail = void 0;
-      cleanupEffect(this);
-      this.onStop && this.onStop();
-      this.flags &= ~1;
-    }
-  }
-  trigger() {
-    if (this.flags & 64) {
-      pausedQueueEffects.add(this);
-    } else if (this.scheduler) {
-      this.scheduler();
-    } else {
-      this.runIfDirty();
-    }
-  }
-  /**
-   * @internal
-   */
-  runIfDirty() {
-    if (isDirty(this)) {
-      this.run();
-    }
-  }
-  get dirty() {
-    return isDirty(this);
-  }
-}
-let batchDepth = 0;
-let batchedSub;
-let batchedComputed;
-function batch(sub, isComputed = false) {
-  sub.flags |= 8;
-  if (isComputed) {
-    sub.next = batchedComputed;
-    batchedComputed = sub;
-    return;
-  }
-  sub.next = batchedSub;
-  batchedSub = sub;
-}
-function startBatch() {
-  batchDepth++;
-}
-function endBatch() {
-  if (--batchDepth > 0) {
-    return;
-  }
-  if (batchedComputed) {
-    let e = batchedComputed;
-    batchedComputed = void 0;
-    while (e) {
-      const next = e.next;
-      e.next = void 0;
-      e.flags &= ~8;
-      e = next;
-    }
-  }
-  let error;
-  while (batchedSub) {
-    let e = batchedSub;
-    batchedSub = void 0;
-    while (e) {
-      const next = e.next;
-      e.next = void 0;
-      e.flags &= ~8;
-      if (e.flags & 1) {
-        try {
-          ;
-          e.trigger();
-        } catch (err) {
-          if (!error) error = err;
-        }
-      }
-      e = next;
-    }
-  }
-  if (error) throw error;
-}
-function prepareDeps(sub) {
-  for (let link = sub.deps; link; link = link.nextDep) {
-    link.version = -1;
-    link.prevActiveLink = link.dep.activeLink;
-    link.dep.activeLink = link;
-  }
-}
-function cleanupDeps(sub) {
-  let head;
-  let tail = sub.depsTail;
-  let link = tail;
-  while (link) {
-    const prev = link.prevDep;
-    if (link.version === -1) {
-      if (link === tail) tail = prev;
-      removeSub(link);
-      removeDep(link);
-    } else {
-      head = link;
-    }
-    link.dep.activeLink = link.prevActiveLink;
-    link.prevActiveLink = void 0;
-    link = prev;
-  }
-  sub.deps = head;
-  sub.depsTail = tail;
-}
-function isDirty(sub) {
-  for (let link = sub.deps; link; link = link.nextDep) {
-    if (link.dep.version !== link.version || link.dep.computed && (refreshComputed(link.dep.computed) || link.dep.version !== link.version)) {
-      return true;
-    }
-  }
-  if (sub._dirty) {
-    return true;
-  }
-  return false;
-}
-function refreshComputed(computed) {
-  if (computed.flags & 4 && !(computed.flags & 16)) {
-    return;
-  }
-  computed.flags &= ~16;
-  if (computed.globalVersion === globalVersion) {
-    return;
-  }
-  computed.globalVersion = globalVersion;
-  const dep = computed.dep;
-  computed.flags |= 2;
-  if (dep.version > 0 && !computed.isSSR && computed.deps && !isDirty(computed)) {
-    computed.flags &= ~2;
-    return;
-  }
-  const prevSub = activeSub;
-  const prevShouldTrack = shouldTrack;
-  activeSub = computed;
-  shouldTrack = true;
-  try {
-    prepareDeps(computed);
-    const value = computed.fn(computed._value);
-    if (dep.version === 0 || hasChanged(value, computed._value)) {
-      computed._value = value;
-      dep.version++;
-    }
-  } catch (err) {
-    dep.version++;
-    throw err;
-  } finally {
-    activeSub = prevSub;
-    shouldTrack = prevShouldTrack;
-    cleanupDeps(computed);
-    computed.flags &= ~2;
-  }
-}
-function removeSub(link, soft = false) {
-  const { dep, prevSub, nextSub } = link;
-  if (prevSub) {
-    prevSub.nextSub = nextSub;
-    link.prevSub = void 0;
-  }
-  if (nextSub) {
-    nextSub.prevSub = prevSub;
-    link.nextSub = void 0;
-  }
-  if (dep.subsHead === link) {
-    dep.subsHead = nextSub;
-  }
-  if (dep.subs === link) {
-    dep.subs = prevSub;
-    if (!prevSub && dep.computed) {
-      dep.computed.flags &= ~4;
-      for (let l = dep.computed.deps; l; l = l.nextDep) {
-        removeSub(l, true);
-      }
-    }
-  }
-  if (!soft && !--dep.sc && dep.map) {
-    dep.map.delete(dep.key);
-  }
-}
-function removeDep(link) {
-  const { prevDep, nextDep } = link;
-  if (prevDep) {
-    prevDep.nextDep = nextDep;
-    link.prevDep = void 0;
-  }
-  if (nextDep) {
-    nextDep.prevDep = prevDep;
-    link.nextDep = void 0;
-  }
-}
-function effect(fn, options) {
-  if (fn.effect instanceof ReactiveEffect) {
-    fn = fn.effect.fn;
-  }
-  const e = new ReactiveEffect(fn);
-  if (options) {
-    extend(e, options);
-  }
-  try {
-    e.run();
-  } catch (err) {
-    e.stop();
-    throw err;
-  }
-  const runner = e.run.bind(e);
-  runner.effect = e;
-  return runner;
-}
-function stop(runner) {
-  runner.effect.stop();
-}
-let shouldTrack = true;
-const trackStack = [];
-function pauseTracking() {
-  trackStack.push(shouldTrack);
-  shouldTrack = false;
-}
-function resetTracking() {
-  const last = trackStack.pop();
-  shouldTrack = last === void 0 ? true : last;
-}
-function cleanupEffect(e) {
-  const { cleanup } = e;
-  e.cleanup = void 0;
-  if (cleanup) {
-    const prevSub = activeSub;
-    activeSub = void 0;
-    try {
-      cleanup();
-    } finally {
-      activeSub = prevSub;
-    }
-  }
-}
-
-let globalVersion = 0;
-class Link {
-  constructor(sub, dep) {
-    this.sub = sub;
-    this.dep = dep;
-    this.version = dep.version;
-    this.nextDep = this.prevDep = this.nextSub = this.prevSub = this.prevActiveLink = void 0;
-  }
-}
-class Dep {
-  constructor(computed) {
-    this.computed = computed;
-    this.version = 0;
-    /**
-     * Link between this dep and the current active effect
-     */
-    this.activeLink = void 0;
-    /**
-     * Doubly linked list representing the subscribing effects (tail)
-     */
-    this.subs = void 0;
-    /**
-     * For object property deps cleanup
-     */
-    this.map = void 0;
-    this.key = void 0;
-    /**
-     * Subscriber counter
-     */
-    this.sc = 0;
-    {
-      this.subsHead = void 0;
-    }
-  }
-  track(debugInfo) {
-    if (!activeSub || !shouldTrack || activeSub === this.computed) {
-      return;
-    }
-    let link = this.activeLink;
-    if (link === void 0 || link.sub !== activeSub) {
-      link = this.activeLink = new Link(activeSub, this);
-      if (!activeSub.deps) {
-        activeSub.deps = activeSub.depsTail = link;
-      } else {
-        link.prevDep = activeSub.depsTail;
-        activeSub.depsTail.nextDep = link;
-        activeSub.depsTail = link;
-      }
-      addSub(link);
-    } else if (link.version === -1) {
-      link.version = this.version;
-      if (link.nextDep) {
-        const next = link.nextDep;
-        next.prevDep = link.prevDep;
-        if (link.prevDep) {
-          link.prevDep.nextDep = next;
-        }
-        link.prevDep = activeSub.depsTail;
-        link.nextDep = void 0;
-        activeSub.depsTail.nextDep = link;
-        activeSub.depsTail = link;
-        if (activeSub.deps === link) {
-          activeSub.deps = next;
-        }
-      }
-    }
-    if (activeSub.onTrack) {
-      activeSub.onTrack(
-        extend(
-          {
-            effect: activeSub
-          },
-          debugInfo
-        )
-      );
-    }
-    return link;
-  }
-  trigger(debugInfo) {
-    this.version++;
-    globalVersion++;
-    this.notify(debugInfo);
-  }
-  notify(debugInfo) {
-    startBatch();
-    try {
-      if (true) {
-        for (let head = this.subsHead; head; head = head.nextSub) {
-          if (head.sub.onTrigger && !(head.sub.flags & 8)) {
-            head.sub.onTrigger(
-              extend(
-                {
-                  effect: head.sub
-                },
-                debugInfo
-              )
-            );
-          }
-        }
-      }
-      for (let link = this.subs; link; link = link.prevSub) {
-        if (link.sub.notify()) {
-          ;
-          link.sub.dep.notify();
-        }
-      }
-    } finally {
-      endBatch();
-    }
-  }
-}
-function addSub(link) {
-  link.dep.sc++;
-  if (link.sub.flags & 4) {
-    const computed = link.dep.computed;
-    if (computed && !link.dep.subs) {
-      computed.flags |= 4 | 16;
-      for (let l = computed.deps; l; l = l.nextDep) {
-        addSub(l);
-      }
-    }
-    const currentTail = link.dep.subs;
-    if (currentTail !== link) {
-      link.prevSub = currentTail;
-      if (currentTail) currentTail.nextSub = link;
-    }
-    if (link.dep.subsHead === void 0) {
-      link.dep.subsHead = link;
-    }
-    link.dep.subs = link;
-  }
-}
-const targetMap = /* @__PURE__ */ new WeakMap();
-const ITERATE_KEY = Symbol(
-  "Object iterate" 
-);
-const MAP_KEY_ITERATE_KEY = Symbol(
-  "Map keys iterate" 
-);
-const ARRAY_ITERATE_KEY = Symbol(
-  "Array iterate" 
-);
-function track(target, type, key) {
-  if (shouldTrack && activeSub) {
-    let depsMap = targetMap.get(target);
-    if (!depsMap) {
-      targetMap.set(target, depsMap = /* @__PURE__ */ new Map());
-    }
-    let dep = depsMap.get(key);
-    if (!dep) {
-      depsMap.set(key, dep = new Dep());
-      dep.map = depsMap;
-      dep.key = key;
-    }
-    {
-      dep.track({
-        target,
-        type,
-        key
-      });
-    }
-  }
-}
-function trigger(target, type, key, newValue, oldValue, oldTarget) {
-  const depsMap = targetMap.get(target);
-  if (!depsMap) {
-    globalVersion++;
-    return;
-  }
-  const run = (dep) => {
-    if (dep) {
-      {
-        dep.trigger({
-          target,
-          type,
-          key,
-          newValue,
-          oldValue,
-          oldTarget
-        });
-      }
-    }
-  };
-  startBatch();
-  if (type === "clear") {
-    depsMap.forEach(run);
-  } else {
-    const targetIsArray = isArray(target);
-    const isArrayIndex = targetIsArray && isIntegerKey(key);
-    if (targetIsArray && key === "length") {
-      const newLength = Number(newValue);
-      depsMap.forEach((dep, key2) => {
-        if (key2 === "length" || key2 === ARRAY_ITERATE_KEY || !isSymbol(key2) && key2 >= newLength) {
-          run(dep);
-        }
-      });
-    } else {
-      if (key !== void 0 || depsMap.has(void 0)) {
-        run(depsMap.get(key));
-      }
-      if (isArrayIndex) {
-        run(depsMap.get(ARRAY_ITERATE_KEY));
-      }
-      switch (type) {
-        case "add":
-          if (!targetIsArray) {
-            run(depsMap.get(ITERATE_KEY));
-            if (isMap(target)) {
-              run(depsMap.get(MAP_KEY_ITERATE_KEY));
-            }
-          } else if (isArrayIndex) {
-            run(depsMap.get("length"));
-          }
-          break;
-        case "delete":
-          if (!targetIsArray) {
-            run(depsMap.get(ITERATE_KEY));
-            if (isMap(target)) {
-              run(depsMap.get(MAP_KEY_ITERATE_KEY));
-            }
-          }
-          break;
-        case "set":
-          if (isMap(target)) {
-            run(depsMap.get(ITERATE_KEY));
-          }
-          break;
-      }
-    }
-  }
-  endBatch();
-}
-function getDepFromReactive(object, key) {
-  const depMap = targetMap.get(object);
-  return depMap && depMap.get(key);
-}
-
-function reactiveReadArray(array) {
-  const raw = toRaw(array);
-  if (raw === array) return raw;
-  track(raw, "iterate", ARRAY_ITERATE_KEY);
-  return isShallow(array) ? raw : raw.map(toReactive);
-}
-function shallowReadArray(arr) {
-  track(arr = toRaw(arr), "iterate", ARRAY_ITERATE_KEY);
-  return arr;
-}
-const arrayInstrumentations = {
-  __proto__: null,
-  [Symbol.iterator]() {
-    return iterator(this, Symbol.iterator, toReactive);
-  },
-  concat(...args) {
-    return reactiveReadArray(this).concat(
-      ...args.map((x) => isArray(x) ? reactiveReadArray(x) : x)
-    );
-  },
-  entries() {
-    return iterator(this, "entries", (value) => {
-      value[1] = toReactive(value[1]);
-      return value;
-    });
-  },
-  every(fn, thisArg) {
-    return apply(this, "every", fn, thisArg, void 0, arguments);
-  },
-  filter(fn, thisArg) {
-    return apply(this, "filter", fn, thisArg, (v) => v.map(toReactive), arguments);
-  },
-  find(fn, thisArg) {
-    return apply(this, "find", fn, thisArg, toReactive, arguments);
-  },
-  findIndex(fn, thisArg) {
-    return apply(this, "findIndex", fn, thisArg, void 0, arguments);
-  },
-  findLast(fn, thisArg) {
-    return apply(this, "findLast", fn, thisArg, toReactive, arguments);
-  },
-  findLastIndex(fn, thisArg) {
-    return apply(this, "findLastIndex", fn, thisArg, void 0, arguments);
-  },
-  // flat, flatMap could benefit from ARRAY_ITERATE but are not straight-forward to implement
-  forEach(fn, thisArg) {
-    return apply(this, "forEach", fn, thisArg, void 0, arguments);
-  },
-  includes(...args) {
-    return searchProxy(this, "includes", args);
-  },
-  indexOf(...args) {
-    return searchProxy(this, "indexOf", args);
-  },
-  join(separator) {
-    return reactiveReadArray(this).join(separator);
-  },
-  // keys() iterator only reads `length`, no optimisation required
-  lastIndexOf(...args) {
-    return searchProxy(this, "lastIndexOf", args);
-  },
-  map(fn, thisArg) {
-    return apply(this, "map", fn, thisArg, void 0, arguments);
-  },
-  pop() {
-    return noTracking(this, "pop");
-  },
-  push(...args) {
-    return noTracking(this, "push", args);
-  },
-  reduce(fn, ...args) {
-    return reduce(this, "reduce", fn, args);
-  },
-  reduceRight(fn, ...args) {
-    return reduce(this, "reduceRight", fn, args);
-  },
-  shift() {
-    return noTracking(this, "shift");
-  },
-  // slice could use ARRAY_ITERATE but also seems to beg for range tracking
-  some(fn, thisArg) {
-    return apply(this, "some", fn, thisArg, void 0, arguments);
-  },
-  splice(...args) {
-    return noTracking(this, "splice", args);
-  },
-  toReversed() {
-    return reactiveReadArray(this).toReversed();
-  },
-  toSorted(comparer) {
-    return reactiveReadArray(this).toSorted(comparer);
-  },
-  toSpliced(...args) {
-    return reactiveReadArray(this).toSpliced(...args);
-  },
-  unshift(...args) {
-    return noTracking(this, "unshift", args);
-  },
-  values() {
-    return iterator(this, "values", toReactive);
-  }
-};
-function iterator(self, method, wrapValue) {
-  const arr = shallowReadArray(self);
-  const iter = arr[method]();
-  if (arr !== self && !isShallow(self)) {
-    iter._next = iter.next;
-    iter.next = () => {
-      const result = iter._next();
-      if (result.value) {
-        result.value = wrapValue(result.value);
-      }
-      return result;
-    };
-  }
-  return iter;
-}
-const arrayProto = Array.prototype;
-function apply(self, method, fn, thisArg, wrappedRetFn, args) {
-  const arr = shallowReadArray(self);
-  const needsWrap = arr !== self && !isShallow(self);
-  const methodFn = arr[method];
-  if (methodFn !== arrayProto[method]) {
-    const result2 = methodFn.apply(self, args);
-    return needsWrap ? toReactive(result2) : result2;
-  }
-  let wrappedFn = fn;
-  if (arr !== self) {
-    if (needsWrap) {
-      wrappedFn = function(item, index) {
-        return fn.call(this, toReactive(item), index, self);
-      };
-    } else if (fn.length > 2) {
-      wrappedFn = function(item, index) {
-        return fn.call(this, item, index, self);
-      };
-    }
-  }
-  const result = methodFn.call(arr, wrappedFn, thisArg);
-  return needsWrap && wrappedRetFn ? wrappedRetFn(result) : result;
-}
-function reduce(self, method, fn, args) {
-  const arr = shallowReadArray(self);
-  let wrappedFn = fn;
-  if (arr !== self) {
-    if (!isShallow(self)) {
-      wrappedFn = function(acc, item, index) {
-        return fn.call(this, acc, toReactive(item), index, self);
-      };
-    } else if (fn.length > 3) {
-      wrappedFn = function(acc, item, index) {
-        return fn.call(this, acc, item, index, self);
-      };
-    }
-  }
-  return arr[method](wrappedFn, ...args);
-}
-function searchProxy(self, method, args) {
-  const arr = toRaw(self);
-  track(arr, "iterate", ARRAY_ITERATE_KEY);
-  const res = arr[method](...args);
-  if ((res === -1 || res === false) && isProxy(args[0])) {
-    args[0] = toRaw(args[0]);
-    return arr[method](...args);
-  }
-  return res;
-}
-function noTracking(self, method, args = []) {
-  pauseTracking();
-  startBatch();
-  const res = toRaw(self)[method].apply(self, args);
-  endBatch();
-  resetTracking();
-  return res;
-}
-
-const isNonTrackableKeys = /* @__PURE__ */ makeMap(`__proto__,__v_isRef,__isVue`);
-const builtInSymbols = new Set(
-  /* @__PURE__ */ Object.getOwnPropertyNames(Symbol).filter((key) => key !== "arguments" && key !== "caller").map((key) => Symbol[key]).filter(isSymbol)
-);
-function hasOwnProperty(key) {
-  if (!isSymbol(key)) key = String(key);
-  const obj = toRaw(this);
-  track(obj, "has", key);
-  return obj.hasOwnProperty(key);
-}
-class BaseReactiveHandler {
-  constructor(_isReadonly = false, _isShallow = false) {
-    this._isReadonly = _isReadonly;
-    this._isShallow = _isShallow;
-  }
-  get(target, key, receiver) {
-    const isReadonly2 = this._isReadonly, isShallow2 = this._isShallow;
-    if (key === "__v_isReactive") {
-      return !isReadonly2;
-    } else if (key === "__v_isReadonly") {
-      return isReadonly2;
-    } else if (key === "__v_isShallow") {
-      return isShallow2;
-    } else if (key === "__v_raw") {
-      if (receiver === (isReadonly2 ? isShallow2 ? shallowReadonlyMap : readonlyMap : isShallow2 ? shallowReactiveMap : reactiveMap).get(target) || // receiver is not the reactive proxy, but has the same prototype
-      // this means the receiver is a user proxy of the reactive proxy
-      Object.getPrototypeOf(target) === Object.getPrototypeOf(receiver)) {
-        return target;
-      }
-      return;
-    }
-    const targetIsArray = isArray(target);
-    if (!isReadonly2) {
-      let fn;
-      if (targetIsArray && (fn = arrayInstrumentations[key])) {
-        return fn;
-      }
-      if (key === "hasOwnProperty") {
-        return hasOwnProperty;
-      }
-    }
-    const res = Reflect.get(
-      target,
-      key,
-      // if this is a proxy wrapping a ref, return methods using the raw ref
-      // as receiver so that we don't have to call `toRaw` on the ref in all
-      // its class methods
-      isRef(target) ? target : receiver
-    );
-    if (isSymbol(key) ? builtInSymbols.has(key) : isNonTrackableKeys(key)) {
-      return res;
-    }
-    if (!isReadonly2) {
-      track(target, "get", key);
-    }
-    if (isShallow2) {
-      return res;
-    }
-    if (isRef(res)) {
-      return targetIsArray && isIntegerKey(key) ? res : res.value;
-    }
-    if (isObject(res)) {
-      return isReadonly2 ? readonly(res) : reactive(res);
-    }
-    return res;
-  }
-}
-class MutableReactiveHandler extends BaseReactiveHandler {
-  constructor(isShallow2 = false) {
-    super(false, isShallow2);
-  }
-  set(target, key, value, receiver) {
-    let oldValue = target[key];
-    if (!this._isShallow) {
-      const isOldValueReadonly = isReadonly(oldValue);
-      if (!isShallow(value) && !isReadonly(value)) {
-        oldValue = toRaw(oldValue);
-        value = toRaw(value);
-      }
-      if (!isArray(target) && isRef(oldValue) && !isRef(value)) {
-        if (isOldValueReadonly) {
-          return false;
-        } else {
-          oldValue.value = value;
-          return true;
-        }
-      }
-    }
-    const hadKey = isArray(target) && isIntegerKey(key) ? Number(key) < target.length : hasOwn(target, key);
-    const result = Reflect.set(
-      target,
-      key,
-      value,
-      isRef(target) ? target : receiver
-    );
-    if (target === toRaw(receiver)) {
-      if (!hadKey) {
-        trigger(target, "add", key, value);
-      } else if (hasChanged(value, oldValue)) {
-        trigger(target, "set", key, value, oldValue);
-      }
-    }
-    return result;
-  }
-  deleteProperty(target, key) {
-    const hadKey = hasOwn(target, key);
-    const oldValue = target[key];
-    const result = Reflect.deleteProperty(target, key);
-    if (result && hadKey) {
-      trigger(target, "delete", key, void 0, oldValue);
-    }
-    return result;
-  }
-  has(target, key) {
-    const result = Reflect.has(target, key);
-    if (!isSymbol(key) || !builtInSymbols.has(key)) {
-      track(target, "has", key);
-    }
-    return result;
-  }
-  ownKeys(target) {
-    track(
-      target,
-      "iterate",
-      isArray(target) ? "length" : ITERATE_KEY
-    );
-    return Reflect.ownKeys(target);
-  }
-}
-class ReadonlyReactiveHandler extends BaseReactiveHandler {
-  constructor(isShallow2 = false) {
-    super(true, isShallow2);
-  }
-  set(target, key) {
-    {
-      warn$2(
-        `Set operation on key "${String(key)}" failed: target is readonly.`,
-        target
-      );
-    }
-    return true;
-  }
-  deleteProperty(target, key) {
-    {
-      warn$2(
-        `Delete operation on key "${String(key)}" failed: target is readonly.`,
-        target
-      );
-    }
-    return true;
-  }
-}
-const mutableHandlers = /* @__PURE__ */ new MutableReactiveHandler();
-const readonlyHandlers = /* @__PURE__ */ new ReadonlyReactiveHandler();
-const shallowReactiveHandlers = /* @__PURE__ */ new MutableReactiveHandler(true);
-const shallowReadonlyHandlers = /* @__PURE__ */ new ReadonlyReactiveHandler(true);
-
-const toShallow = (value) => value;
-const getProto = (v) => Reflect.getPrototypeOf(v);
-function createIterableMethod(method, isReadonly2, isShallow2) {
-  return function(...args) {
-    const target = this["__v_raw"];
-    const rawTarget = toRaw(target);
-    const targetIsMap = isMap(rawTarget);
-    const isPair = method === "entries" || method === Symbol.iterator && targetIsMap;
-    const isKeyOnly = method === "keys" && targetIsMap;
-    const innerIterator = target[method](...args);
-    const wrap = isShallow2 ? toShallow : isReadonly2 ? toReadonly : toReactive;
-    !isReadonly2 && track(
-      rawTarget,
-      "iterate",
-      isKeyOnly ? MAP_KEY_ITERATE_KEY : ITERATE_KEY
-    );
-    return {
-      // iterator protocol
-      next() {
-        const { value, done } = innerIterator.next();
-        return done ? { value, done } : {
-          value: isPair ? [wrap(value[0]), wrap(value[1])] : wrap(value),
-          done
-        };
-      },
-      // iterable protocol
-      [Symbol.iterator]() {
-        return this;
-      }
-    };
-  };
-}
-function createReadonlyMethod(type) {
-  return function(...args) {
-    {
-      const key = args[0] ? `on key "${args[0]}" ` : ``;
-      warn$2(
-        `${capitalize(type)} operation ${key}failed: target is readonly.`,
-        toRaw(this)
-      );
-    }
-    return type === "delete" ? false : type === "clear" ? void 0 : this;
-  };
-}
-function createInstrumentations(readonly, shallow) {
-  const instrumentations = {
-    get(key) {
-      const target = this["__v_raw"];
-      const rawTarget = toRaw(target);
-      const rawKey = toRaw(key);
-      if (!readonly) {
-        if (hasChanged(key, rawKey)) {
-          track(rawTarget, "get", key);
-        }
-        track(rawTarget, "get", rawKey);
-      }
-      const { has } = getProto(rawTarget);
-      const wrap = shallow ? toShallow : readonly ? toReadonly : toReactive;
-      if (has.call(rawTarget, key)) {
-        return wrap(target.get(key));
-      } else if (has.call(rawTarget, rawKey)) {
-        return wrap(target.get(rawKey));
-      } else if (target !== rawTarget) {
-        target.get(key);
-      }
-    },
-    get size() {
-      const target = this["__v_raw"];
-      !readonly && track(toRaw(target), "iterate", ITERATE_KEY);
-      return Reflect.get(target, "size", target);
-    },
-    has(key) {
-      const target = this["__v_raw"];
-      const rawTarget = toRaw(target);
-      const rawKey = toRaw(key);
-      if (!readonly) {
-        if (hasChanged(key, rawKey)) {
-          track(rawTarget, "has", key);
-        }
-        track(rawTarget, "has", rawKey);
-      }
-      return key === rawKey ? target.has(key) : target.has(key) || target.has(rawKey);
-    },
-    forEach(callback, thisArg) {
-      const observed = this;
-      const target = observed["__v_raw"];
-      const rawTarget = toRaw(target);
-      const wrap = shallow ? toShallow : readonly ? toReadonly : toReactive;
-      !readonly && track(rawTarget, "iterate", ITERATE_KEY);
-      return target.forEach((value, key) => {
-        return callback.call(thisArg, wrap(value), wrap(key), observed);
-      });
-    }
-  };
-  extend(
-    instrumentations,
-    readonly ? {
-      add: createReadonlyMethod("add"),
-      set: createReadonlyMethod("set"),
-      delete: createReadonlyMethod("delete"),
-      clear: createReadonlyMethod("clear")
-    } : {
-      add(value) {
-        if (!shallow && !isShallow(value) && !isReadonly(value)) {
-          value = toRaw(value);
-        }
-        const target = toRaw(this);
-        const proto = getProto(target);
-        const hadKey = proto.has.call(target, value);
-        if (!hadKey) {
-          target.add(value);
-          trigger(target, "add", value, value);
-        }
-        return this;
-      },
-      set(key, value) {
-        if (!shallow && !isShallow(value) && !isReadonly(value)) {
-          value = toRaw(value);
-        }
-        const target = toRaw(this);
-        const { has, get } = getProto(target);
-        let hadKey = has.call(target, key);
-        if (!hadKey) {
-          key = toRaw(key);
-          hadKey = has.call(target, key);
-        } else {
-          checkIdentityKeys(target, has, key);
-        }
-        const oldValue = get.call(target, key);
-        target.set(key, value);
-        if (!hadKey) {
-          trigger(target, "add", key, value);
-        } else if (hasChanged(value, oldValue)) {
-          trigger(target, "set", key, value, oldValue);
-        }
-        return this;
-      },
-      delete(key) {
-        const target = toRaw(this);
-        const { has, get } = getProto(target);
-        let hadKey = has.call(target, key);
-        if (!hadKey) {
-          key = toRaw(key);
-          hadKey = has.call(target, key);
-        } else {
-          checkIdentityKeys(target, has, key);
-        }
-        const oldValue = get ? get.call(target, key) : void 0;
-        const result = target.delete(key);
-        if (hadKey) {
-          trigger(target, "delete", key, void 0, oldValue);
-        }
-        return result;
-      },
-      clear() {
-        const target = toRaw(this);
-        const hadItems = target.size !== 0;
-        const oldTarget = isMap(target) ? new Map(target) : new Set(target) ;
-        const result = target.clear();
-        if (hadItems) {
-          trigger(
-            target,
-            "clear",
-            void 0,
-            void 0,
-            oldTarget
-          );
-        }
-        return result;
-      }
-    }
-  );
-  const iteratorMethods = [
-    "keys",
-    "values",
-    "entries",
-    Symbol.iterator
-  ];
-  iteratorMethods.forEach((method) => {
-    instrumentations[method] = createIterableMethod(method, readonly, shallow);
-  });
-  return instrumentations;
-}
-function createInstrumentationGetter(isReadonly2, shallow) {
-  const instrumentations = createInstrumentations(isReadonly2, shallow);
-  return (target, key, receiver) => {
-    if (key === "__v_isReactive") {
-      return !isReadonly2;
-    } else if (key === "__v_isReadonly") {
-      return isReadonly2;
-    } else if (key === "__v_raw") {
-      return target;
-    }
-    return Reflect.get(
-      hasOwn(instrumentations, key) && key in target ? instrumentations : target,
-      key,
-      receiver
-    );
-  };
-}
-const mutableCollectionHandlers = {
-  get: /* @__PURE__ */ createInstrumentationGetter(false, false)
-};
-const shallowCollectionHandlers = {
-  get: /* @__PURE__ */ createInstrumentationGetter(false, true)
-};
-const readonlyCollectionHandlers = {
-  get: /* @__PURE__ */ createInstrumentationGetter(true, false)
-};
-const shallowReadonlyCollectionHandlers = {
-  get: /* @__PURE__ */ createInstrumentationGetter(true, true)
-};
-function checkIdentityKeys(target, has, key) {
-  const rawKey = toRaw(key);
-  if (rawKey !== key && has.call(target, rawKey)) {
-    const type = toRawType(target);
-    warn$2(
-      `Reactive ${type} contains both the raw and reactive versions of the same object${type === `Map` ? ` as keys` : ``}, which can lead to inconsistencies. Avoid differentiating between the raw and reactive versions of an object and only use the reactive version if possible.`
-    );
-  }
-}
-
-const reactiveMap = /* @__PURE__ */ new WeakMap();
-const shallowReactiveMap = /* @__PURE__ */ new WeakMap();
-const readonlyMap = /* @__PURE__ */ new WeakMap();
-const shallowReadonlyMap = /* @__PURE__ */ new WeakMap();
-function targetTypeMap(rawType) {
-  switch (rawType) {
-    case "Object":
-    case "Array":
-      return 1 /* COMMON */;
-    case "Map":
-    case "Set":
-    case "WeakMap":
-    case "WeakSet":
-      return 2 /* COLLECTION */;
-    default:
-      return 0 /* INVALID */;
-  }
-}
-function getTargetType(value) {
-  return value["__v_skip"] || !Object.isExtensible(value) ? 0 /* INVALID */ : targetTypeMap(toRawType(value));
-}
-function reactive(target) {
-  if (isReadonly(target)) {
-    return target;
-  }
-  return createReactiveObject(
-    target,
-    false,
-    mutableHandlers,
-    mutableCollectionHandlers,
-    reactiveMap
-  );
-}
-function shallowReactive(target) {
-  return createReactiveObject(
-    target,
-    false,
-    shallowReactiveHandlers,
-    shallowCollectionHandlers,
-    shallowReactiveMap
-  );
-}
-function readonly(target) {
-  return createReactiveObject(
-    target,
-    true,
-    readonlyHandlers,
-    readonlyCollectionHandlers,
-    readonlyMap
-  );
-}
-function shallowReadonly(target) {
-  return createReactiveObject(
-    target,
-    true,
-    shallowReadonlyHandlers,
-    shallowReadonlyCollectionHandlers,
-    shallowReadonlyMap
-  );
-}
-function createReactiveObject(target, isReadonly2, baseHandlers, collectionHandlers, proxyMap) {
-  if (!isObject(target)) {
-    {
-      warn$2(
-        `value cannot be made ${isReadonly2 ? "readonly" : "reactive"}: ${String(
-          target
-        )}`
-      );
-    }
-    return target;
-  }
-  if (target["__v_raw"] && !(isReadonly2 && target["__v_isReactive"])) {
-    return target;
-  }
-  const existingProxy = proxyMap.get(target);
-  if (existingProxy) {
-    return existingProxy;
-  }
-  const targetType = getTargetType(target);
-  if (targetType === 0 /* INVALID */) {
-    return target;
-  }
-  const proxy = new Proxy(
-    target,
-    targetType === 2 /* COLLECTION */ ? collectionHandlers : baseHandlers
-  );
-  proxyMap.set(target, proxy);
-  return proxy;
-}
-function isReactive(value) {
-  if (isReadonly(value)) {
-    return isReactive(value["__v_raw"]);
-  }
-  return !!(value && value["__v_isReactive"]);
-}
-function isReadonly(value) {
-  return !!(value && value["__v_isReadonly"]);
-}
-function isShallow(value) {
-  return !!(value && value["__v_isShallow"]);
-}
-function isProxy(value) {
-  return value ? !!value["__v_raw"] : false;
-}
-function toRaw(observed) {
-  const raw = observed && observed["__v_raw"];
-  return raw ? toRaw(raw) : observed;
-}
-function markRaw(value) {
-  if (!hasOwn(value, "__v_skip") && Object.isExtensible(value)) {
-    def(value, "__v_skip", true);
-  }
-  return value;
-}
-const toReactive = (value) => isObject(value) ? reactive(value) : value;
-const toReadonly = (value) => isObject(value) ? readonly(value) : value;
-
-function isRef(r) {
-  return r ? r["__v_isRef"] === true : false;
-}
-function ref(value) {
-  return createRef(value, false);
-}
-function shallowRef(value) {
-  return createRef(value, true);
-}
-function createRef(rawValue, shallow) {
-  if (isRef(rawValue)) {
-    return rawValue;
-  }
-  return new RefImpl(rawValue, shallow);
-}
-class RefImpl {
-  constructor(value, isShallow2) {
-    this.dep = new Dep();
-    this["__v_isRef"] = true;
-    this["__v_isShallow"] = false;
-    this._rawValue = isShallow2 ? value : toRaw(value);
-    this._value = isShallow2 ? value : toReactive(value);
-    this["__v_isShallow"] = isShallow2;
-  }
-  get value() {
-    {
-      this.dep.track({
-        target: this,
-        type: "get",
-        key: "value"
-      });
-    }
-    return this._value;
-  }
-  set value(newValue) {
-    const oldValue = this._rawValue;
-    const useDirectValue = this["__v_isShallow"] || isShallow(newValue) || isReadonly(newValue);
-    newValue = useDirectValue ? newValue : toRaw(newValue);
-    if (hasChanged(newValue, oldValue)) {
-      this._rawValue = newValue;
-      this._value = useDirectValue ? newValue : toReactive(newValue);
-      {
-        this.dep.trigger({
-          target: this,
-          type: "set",
-          key: "value",
-          newValue,
-          oldValue
-        });
-      }
-    }
-  }
-}
-function triggerRef(ref2) {
-  if (ref2.dep) {
-    {
-      ref2.dep.trigger({
-        target: ref2,
-        type: "set",
-        key: "value",
-        newValue: ref2._value
-      });
-    }
-  }
-}
-function unref(ref2) {
-  return isRef(ref2) ? ref2.value : ref2;
-}
-function toValue(source) {
-  return isFunction(source) ? source() : unref(source);
-}
-const shallowUnwrapHandlers = {
-  get: (target, key, receiver) => key === "__v_raw" ? target : unref(Reflect.get(target, key, receiver)),
-  set: (target, key, value, receiver) => {
-    const oldValue = target[key];
-    if (isRef(oldValue) && !isRef(value)) {
-      oldValue.value = value;
-      return true;
-    } else {
-      return Reflect.set(target, key, value, receiver);
-    }
-  }
-};
-function proxyRefs(objectWithRefs) {
-  return isReactive(objectWithRefs) ? objectWithRefs : new Proxy(objectWithRefs, shallowUnwrapHandlers);
-}
-class CustomRefImpl {
-  constructor(factory) {
-    this["__v_isRef"] = true;
-    this._value = void 0;
-    const dep = this.dep = new Dep();
-    const { get, set } = factory(dep.track.bind(dep), dep.trigger.bind(dep));
-    this._get = get;
-    this._set = set;
-  }
-  get value() {
-    return this._value = this._get();
-  }
-  set value(newVal) {
-    this._set(newVal);
-  }
-}
-function customRef(factory) {
-  return new CustomRefImpl(factory);
-}
-function toRefs(object) {
-  if (!isProxy(object)) {
-    warn$2(`toRefs() expects a reactive object but received a plain one.`);
-  }
-  const ret = isArray(object) ? new Array(object.length) : {};
-  for (const key in object) {
-    ret[key] = propertyToRef(object, key);
-  }
-  return ret;
-}
-class ObjectRefImpl {
-  constructor(_object, _key, _defaultValue) {
-    this._object = _object;
-    this._key = _key;
-    this._defaultValue = _defaultValue;
-    this["__v_isRef"] = true;
-    this._value = void 0;
-  }
-  get value() {
-    const val = this._object[this._key];
-    return this._value = val === void 0 ? this._defaultValue : val;
-  }
-  set value(newVal) {
-    this._object[this._key] = newVal;
-  }
-  get dep() {
-    return getDepFromReactive(toRaw(this._object), this._key);
-  }
-}
-class GetterRefImpl {
-  constructor(_getter) {
-    this._getter = _getter;
-    this["__v_isRef"] = true;
-    this["__v_isReadonly"] = true;
-    this._value = void 0;
-  }
-  get value() {
-    return this._value = this._getter();
-  }
-}
-function toRef(source, key, defaultValue) {
-  if (isRef(source)) {
-    return source;
-  } else if (isFunction(source)) {
-    return new GetterRefImpl(source);
-  } else if (isObject(source) && arguments.length > 1) {
-    return propertyToRef(source, key, defaultValue);
-  } else {
-    return ref(source);
-  }
-}
-function propertyToRef(source, key, defaultValue) {
-  const val = source[key];
-  return isRef(val) ? val : new ObjectRefImpl(source, key, defaultValue);
-}
-
-class ComputedRefImpl {
-  constructor(fn, setter, isSSR) {
-    this.fn = fn;
-    this.setter = setter;
-    /**
-     * @internal
-     */
-    this._value = void 0;
-    /**
-     * @internal
-     */
-    this.dep = new Dep(this);
-    /**
-     * @internal
-     */
-    this.__v_isRef = true;
-    // TODO isolatedDeclarations "__v_isReadonly"
-    // A computed is also a subscriber that tracks other deps
-    /**
-     * @internal
-     */
-    this.deps = void 0;
-    /**
-     * @internal
-     */
-    this.depsTail = void 0;
-    /**
-     * @internal
-     */
-    this.flags = 16;
-    /**
-     * @internal
-     */
-    this.globalVersion = globalVersion - 1;
-    /**
-     * @internal
-     */
-    this.next = void 0;
-    // for backwards compat
-    this.effect = this;
-    this["__v_isReadonly"] = !setter;
-    this.isSSR = isSSR;
-  }
-  /**
-   * @internal
-   */
-  notify() {
-    this.flags |= 16;
-    if (!(this.flags & 8) && // avoid infinite self recursion
-    activeSub !== this) {
-      batch(this, true);
-      return true;
-    }
-  }
-  get value() {
-    const link = this.dep.track({
-      target: this,
-      type: "get",
-      key: "value"
-    }) ;
-    refreshComputed(this);
-    if (link) {
-      link.version = this.dep.version;
-    }
-    return this._value;
-  }
-  set value(newValue) {
-    if (this.setter) {
-      this.setter(newValue);
-    } else {
-      warn$2("Write operation failed: computed value is readonly");
-    }
-  }
-}
-function computed$1(getterOrOptions, debugOptions, isSSR = false) {
-  let getter;
-  let setter;
-  if (isFunction(getterOrOptions)) {
-    getter = getterOrOptions;
-  } else {
-    getter = getterOrOptions.get;
-    setter = getterOrOptions.set;
-  }
-  const cRef = new ComputedRefImpl(getter, setter, isSSR);
-  if (debugOptions && !isSSR) {
-    cRef.onTrack = debugOptions.onTrack;
-    cRef.onTrigger = debugOptions.onTrigger;
-  }
-  return cRef;
-}
-
-const TrackOpTypes = {
-  "GET": "get",
-  "HAS": "has",
-  "ITERATE": "iterate"
-};
-const TriggerOpTypes = {
-  "SET": "set",
-  "ADD": "add",
-  "DELETE": "delete",
-  "CLEAR": "clear"
-};
-
-const INITIAL_WATCHER_VALUE = {};
-const cleanupMap = /* @__PURE__ */ new WeakMap();
-let activeWatcher = void 0;
-function getCurrentWatcher() {
-  return activeWatcher;
-}
-function onWatcherCleanup(cleanupFn, failSilently = false, owner = activeWatcher) {
-  if (owner) {
-    let cleanups = cleanupMap.get(owner);
-    if (!cleanups) cleanupMap.set(owner, cleanups = []);
-    cleanups.push(cleanupFn);
-  } else if (!failSilently) {
-    warn$2(
-      `onWatcherCleanup() was called when there was no active watcher to associate with.`
-    );
-  }
-}
-function watch$1(source, cb, options = EMPTY_OBJ) {
-  const { immediate, deep, once, scheduler, augmentJob, call } = options;
-  const warnInvalidSource = (s) => {
-    (options.onWarn || warn$2)(
-      `Invalid watch source: `,
-      s,
-      `A watch source can only be a getter/effect function, a ref, a reactive object, or an array of these types.`
-    );
-  };
-  const reactiveGetter = (source2) => {
-    if (deep) return source2;
-    if (isShallow(source2) || deep === false || deep === 0)
-      return traverse(source2, 1);
-    return traverse(source2);
-  };
-  let effect;
-  let getter;
-  let cleanup;
-  let boundCleanup;
-  let forceTrigger = false;
-  let isMultiSource = false;
-  if (isRef(source)) {
-    getter = () => source.value;
-    forceTrigger = isShallow(source);
-  } else if (isReactive(source)) {
-    getter = () => reactiveGetter(source);
-    forceTrigger = true;
-  } else if (isArray(source)) {
-    isMultiSource = true;
-    forceTrigger = source.some((s) => isReactive(s) || isShallow(s));
-    getter = () => source.map((s) => {
-      if (isRef(s)) {
-        return s.value;
-      } else if (isReactive(s)) {
-        return reactiveGetter(s);
-      } else if (isFunction(s)) {
-        return call ? call(s, 2) : s();
-      } else {
-        warnInvalidSource(s);
-      }
-    });
-  } else if (isFunction(source)) {
-    if (cb) {
-      getter = call ? () => call(source, 2) : source;
-    } else {
-      getter = () => {
-        if (cleanup) {
-          pauseTracking();
-          try {
-            cleanup();
-          } finally {
-            resetTracking();
-          }
-        }
-        const currentEffect = activeWatcher;
-        activeWatcher = effect;
-        try {
-          return call ? call(source, 3, [boundCleanup]) : source(boundCleanup);
-        } finally {
-          activeWatcher = currentEffect;
-        }
-      };
-    }
-  } else {
-    getter = NOOP;
-    warnInvalidSource(source);
-  }
-  if (cb && deep) {
-    const baseGetter = getter;
-    const depth = deep === true ? Infinity : deep;
-    getter = () => traverse(baseGetter(), depth);
-  }
-  const scope = getCurrentScope();
-  const watchHandle = () => {
-    effect.stop();
-    if (scope) {
-      remove(scope.effects, effect);
-    }
-  };
-  if (once && cb) {
-    const _cb = cb;
-    cb = (...args) => {
-      _cb(...args);
-      watchHandle();
-    };
-  }
-  let oldValue = isMultiSource ? new Array(source.length).fill(INITIAL_WATCHER_VALUE) : INITIAL_WATCHER_VALUE;
-  const job = (immediateFirstRun) => {
-    if (!(effect.flags & 1) || !effect.dirty && !immediateFirstRun) {
-      return;
-    }
-    if (cb) {
-      const newValue = effect.run();
-      if (deep || forceTrigger || (isMultiSource ? newValue.some((v, i) => hasChanged(v, oldValue[i])) : hasChanged(newValue, oldValue))) {
-        if (cleanup) {
-          cleanup();
-        }
-        const currentWatcher = activeWatcher;
-        activeWatcher = effect;
-        try {
-          const args = [
-            newValue,
-            // pass undefined as the old value when it's changed for the first time
-            oldValue === INITIAL_WATCHER_VALUE ? void 0 : isMultiSource && oldValue[0] === INITIAL_WATCHER_VALUE ? [] : oldValue,
-            boundCleanup
-          ];
-          call ? call(cb, 3, args) : (
-            // @ts-expect-error
-            cb(...args)
-          );
-          oldValue = newValue;
-        } finally {
-          activeWatcher = currentWatcher;
-        }
-      }
-    } else {
-      effect.run();
-    }
-  };
-  if (augmentJob) {
-    augmentJob(job);
-  }
-  effect = new ReactiveEffect(getter);
-  effect.scheduler = scheduler ? () => scheduler(job, false) : job;
-  boundCleanup = (fn) => onWatcherCleanup(fn, false, effect);
-  cleanup = effect.onStop = () => {
-    const cleanups = cleanupMap.get(effect);
-    if (cleanups) {
-      if (call) {
-        call(cleanups, 4);
-      } else {
-        for (const cleanup2 of cleanups) cleanup2();
-      }
-      cleanupMap.delete(effect);
-    }
-  };
-  {
-    effect.onTrack = options.onTrack;
-    effect.onTrigger = options.onTrigger;
-  }
-  if (cb) {
-    if (immediate) {
-      job(true);
-    } else {
-      oldValue = effect.run();
-    }
-  } else if (scheduler) {
-    scheduler(job.bind(null, true), true);
-  } else {
-    effect.run();
-  }
-  watchHandle.pause = effect.pause.bind(effect);
-  watchHandle.resume = effect.resume.bind(effect);
-  watchHandle.stop = watchHandle;
-  return watchHandle;
-}
-function traverse(value, depth = Infinity, seen) {
-  if (depth <= 0 || !isObject(value) || value["__v_skip"]) {
-    return value;
-  }
-  seen = seen || /* @__PURE__ */ new Set();
-  if (seen.has(value)) {
-    return value;
-  }
-  seen.add(value);
-  depth--;
-  if (isRef(value)) {
-    traverse(value.value, depth, seen);
-  } else if (isArray(value)) {
-    for (let i = 0; i < value.length; i++) {
-      traverse(value[i], depth, seen);
-    }
-  } else if (isSet(value) || isMap(value)) {
-    value.forEach((v) => {
-      traverse(v, depth, seen);
-    });
-  } else if (isPlainObject(value)) {
-    for (const key in value) {
-      traverse(value[key], depth, seen);
-    }
-    for (const key of Object.getOwnPropertySymbols(value)) {
-      if (Object.prototype.propertyIsEnumerable.call(value, key)) {
-        traverse(value[key], depth, seen);
-      }
-    }
-  }
-  return value;
-}
-
-const stack$1 = [];
-function pushWarningContext(vnode) {
-  stack$1.push(vnode);
-}
-function popWarningContext() {
-  stack$1.pop();
-}
-let isWarning = false;
-function warn$1(msg, ...args) {
-  if (isWarning) return;
-  isWarning = true;
-  pauseTracking();
-  const instance = stack$1.length ? stack$1[stack$1.length - 1].component : null;
-  const appWarnHandler = instance && instance.appContext.config.warnHandler;
-  const trace = getComponentTrace();
-  if (appWarnHandler) {
-    callWithErrorHandling(
-      appWarnHandler,
-      instance,
-      11,
-      [
-        // eslint-disable-next-line no-restricted-syntax
-        msg + args.map((a) => {
-          var _a, _b;
-          return (_b = (_a = a.toString) == null ? void 0 : _a.call(a)) != null ? _b : JSON.stringify(a);
-        }).join(""),
-        instance && instance.proxy,
-        trace.map(
-          ({ vnode }) => `at <${formatComponentName(instance, vnode.type)}>`
-        ).join("\n"),
-        trace
-      ]
-    );
-  } else {
-    const warnArgs = [`[Vue warn]: ${msg}`, ...args];
-    if (trace.length && // avoid spamming console during tests
-    true) {
-      warnArgs.push(`
-`, ...formatTrace(trace));
-    }
-    console.warn(...warnArgs);
-  }
-  resetTracking();
-  isWarning = false;
-}
-function getComponentTrace() {
-  let currentVNode = stack$1[stack$1.length - 1];
-  if (!currentVNode) {
-    return [];
-  }
-  const normalizedStack = [];
-  while (currentVNode) {
-    const last = normalizedStack[0];
-    if (last && last.vnode === currentVNode) {
-      last.recurseCount++;
-    } else {
-      normalizedStack.push({
-        vnode: currentVNode,
-        recurseCount: 0
-      });
-    }
-    const parentInstance = currentVNode.component && currentVNode.component.parent;
-    currentVNode = parentInstance && parentInstance.vnode;
-  }
-  return normalizedStack;
-}
-function formatTrace(trace) {
-  const logs = [];
-  trace.forEach((entry, i) => {
-    logs.push(...i === 0 ? [] : [`
-`], ...formatTraceEntry(entry));
-  });
-  return logs;
-}
-function formatTraceEntry({ vnode, recurseCount }) {
-  const postfix = recurseCount > 0 ? `... (${recurseCount} recursive calls)` : ``;
-  const isRoot = vnode.component ? vnode.component.parent == null : false;
-  const open = ` at <${formatComponentName(
-    vnode.component,
-    vnode.type,
-    isRoot
-  )}`;
-  const close = `>` + postfix;
-  return vnode.props ? [open, ...formatProps(vnode.props), close] : [open + close];
-}
-function formatProps(props) {
-  const res = [];
-  const keys = Object.keys(props);
-  keys.slice(0, 3).forEach((key) => {
-    res.push(...formatProp(key, props[key]));
-  });
-  if (keys.length > 3) {
-    res.push(` ...`);
-  }
-  return res;
-}
-function formatProp(key, value, raw) {
-  if (isString(value)) {
-    value = JSON.stringify(value);
-    return raw ? value : [`${key}=${value}`];
-  } else if (typeof value === "number" || typeof value === "boolean" || value == null) {
-    return raw ? value : [`${key}=${value}`];
-  } else if (isRef(value)) {
-    value = formatProp(key, toRaw(value.value), true);
-    return raw ? value : [`${key}=Ref<`, value, `>`];
-  } else if (isFunction(value)) {
-    return [`${key}=fn${value.name ? `<${value.name}>` : ``}`];
-  } else {
-    value = toRaw(value);
-    return raw ? value : [`${key}=`, value];
-  }
-}
-function assertNumber(val, type) {
-  if (val === void 0) {
-    return;
-  } else if (typeof val !== "number") {
-    warn$1(`${type} is not a valid number - got ${JSON.stringify(val)}.`);
-  } else if (isNaN(val)) {
-    warn$1(`${type} is NaN - the duration expression might be incorrect.`);
-  }
-}
-
-const ErrorCodes = {
-  "SETUP_FUNCTION": 0,
-  "0": "SETUP_FUNCTION",
-  "RENDER_FUNCTION": 1,
-  "1": "RENDER_FUNCTION",
-  "NATIVE_EVENT_HANDLER": 5,
-  "5": "NATIVE_EVENT_HANDLER",
-  "COMPONENT_EVENT_HANDLER": 6,
-  "6": "COMPONENT_EVENT_HANDLER",
-  "VNODE_HOOK": 7,
-  "7": "VNODE_HOOK",
-  "DIRECTIVE_HOOK": 8,
-  "8": "DIRECTIVE_HOOK",
-  "TRANSITION_HOOK": 9,
-  "9": "TRANSITION_HOOK",
-  "APP_ERROR_HANDLER": 10,
-  "10": "APP_ERROR_HANDLER",
-  "APP_WARN_HANDLER": 11,
-  "11": "APP_WARN_HANDLER",
-  "FUNCTION_REF": 12,
-  "12": "FUNCTION_REF",
-  "ASYNC_COMPONENT_LOADER": 13,
-  "13": "ASYNC_COMPONENT_LOADER",
-  "SCHEDULER": 14,
-  "14": "SCHEDULER",
-  "COMPONENT_UPDATE": 15,
-  "15": "COMPONENT_UPDATE",
-  "APP_UNMOUNT_CLEANUP": 16,
-  "16": "APP_UNMOUNT_CLEANUP"
-};
-const ErrorTypeStrings$1 = {
-  ["sp"]: "serverPrefetch hook",
-  ["bc"]: "beforeCreate hook",
-  ["c"]: "created hook",
-  ["bm"]: "beforeMount hook",
-  ["m"]: "mounted hook",
-  ["bu"]: "beforeUpdate hook",
-  ["u"]: "updated",
-  ["bum"]: "beforeUnmount hook",
-  ["um"]: "unmounted hook",
-  ["a"]: "activated hook",
-  ["da"]: "deactivated hook",
-  ["ec"]: "errorCaptured hook",
-  ["rtc"]: "renderTracked hook",
-  ["rtg"]: "renderTriggered hook",
-  [0]: "setup function",
-  [1]: "render function",
-  [2]: "watcher getter",
-  [3]: "watcher callback",
-  [4]: "watcher cleanup function",
-  [5]: "native event handler",
-  [6]: "component event handler",
-  [7]: "vnode hook",
-  [8]: "directive hook",
-  [9]: "transition hook",
-  [10]: "app errorHandler",
-  [11]: "app warnHandler",
-  [12]: "ref function",
-  [13]: "async component loader",
-  [14]: "scheduler flush",
-  [15]: "component update",
-  [16]: "app unmount cleanup function"
-};
-function callWithErrorHandling(fn, instance, type, args) {
-  try {
-    return args ? fn(...args) : fn();
-  } catch (err) {
-    handleError(err, instance, type);
-  }
-}
-function callWithAsyncErrorHandling(fn, instance, type, args) {
-  if (isFunction(fn)) {
-    const res = callWithErrorHandling(fn, instance, type, args);
-    if (res && isPromise(res)) {
-      res.catch((err) => {
-        handleError(err, instance, type);
-      });
-    }
-    return res;
-  }
-  if (isArray(fn)) {
-    const values = [];
-    for (let i = 0; i < fn.length; i++) {
-      values.push(callWithAsyncErrorHandling(fn[i], instance, type, args));
-    }
-    return values;
-  } else {
-    warn$1(
-      `Invalid value type passed to callWithAsyncErrorHandling(): ${typeof fn}`
-    );
-  }
-}
-function handleError(err, instance, type, throwInDev = true) {
-  const contextVNode = instance ? instance.vnode : null;
-  const { errorHandler, throwUnhandledErrorInProduction } = instance && instance.appContext.config || EMPTY_OBJ;
-  if (instance) {
-    let cur = instance.parent;
-    const exposedInstance = instance.proxy;
-    const errorInfo = ErrorTypeStrings$1[type] ;
-    while (cur) {
-      const errorCapturedHooks = cur.ec;
-      if (errorCapturedHooks) {
-        for (let i = 0; i < errorCapturedHooks.length; i++) {
-          if (errorCapturedHooks[i](err, exposedInstance, errorInfo) === false) {
-            return;
-          }
-        }
-      }
-      cur = cur.parent;
-    }
-    if (errorHandler) {
-      pauseTracking();
-      callWithErrorHandling(errorHandler, null, 10, [
-        err,
-        exposedInstance,
-        errorInfo
-      ]);
-      resetTracking();
-      return;
-    }
-  }
-  logError(err, type, contextVNode, throwInDev, throwUnhandledErrorInProduction);
-}
-function logError(err, type, contextVNode, throwInDev = true, throwInProd = false) {
-  {
-    const info = ErrorTypeStrings$1[type];
-    if (contextVNode) {
-      pushWarningContext(contextVNode);
-    }
-    warn$1(`Unhandled error${info ? ` during execution of ${info}` : ``}`);
-    if (contextVNode) {
-      popWarningContext();
-    }
-    if (throwInDev) {
-      throw err;
-    } else {
-      console.error(err);
-    }
-  }
-}
-
-const queue = [];
-let flushIndex = -1;
-const pendingPostFlushCbs = [];
-let activePostFlushCbs = null;
-let postFlushIndex = 0;
-const resolvedPromise = /* @__PURE__ */ Promise.resolve();
-let currentFlushPromise = null;
-const RECURSION_LIMIT = 100;
-function nextTick(fn) {
-  const p = currentFlushPromise || resolvedPromise;
-  return fn ? p.then(this ? fn.bind(this) : fn) : p;
-}
-function findInsertionIndex(id) {
-  let start = flushIndex + 1;
-  let end = queue.length;
-  while (start < end) {
-    const middle = start + end >>> 1;
-    const middleJob = queue[middle];
-    const middleJobId = getId(middleJob);
-    if (middleJobId < id || middleJobId === id && middleJob.flags & 2) {
-      start = middle + 1;
-    } else {
-      end = middle;
-    }
-  }
-  return start;
-}
-function queueJob(job) {
-  if (!(job.flags & 1)) {
-    const jobId = getId(job);
-    const lastJob = queue[queue.length - 1];
-    if (!lastJob || // fast path when the job id is larger than the tail
-    !(job.flags & 2) && jobId >= getId(lastJob)) {
-      queue.push(job);
-    } else {
-      queue.splice(findInsertionIndex(jobId), 0, job);
-    }
-    job.flags |= 1;
-    queueFlush();
-  }
-}
-function queueFlush() {
-  if (!currentFlushPromise) {
-    currentFlushPromise = resolvedPromise.then(flushJobs);
-  }
-}
-function queuePostFlushCb(cb) {
-  if (!isArray(cb)) {
-    if (activePostFlushCbs && cb.id === -1) {
-      activePostFlushCbs.splice(postFlushIndex + 1, 0, cb);
-    } else if (!(cb.flags & 1)) {
-      pendingPostFlushCbs.push(cb);
-      cb.flags |= 1;
-    }
-  } else {
-    pendingPostFlushCbs.push(...cb);
-  }
-  queueFlush();
-}
-function flushPreFlushCbs(instance, seen, i = flushIndex + 1) {
-  {
-    seen = seen || /* @__PURE__ */ new Map();
-  }
-  for (; i < queue.length; i++) {
-    const cb = queue[i];
-    if (cb && cb.flags & 2) {
-      if (instance && cb.id !== instance.uid) {
-        continue;
-      }
-      if (checkRecursiveUpdates(seen, cb)) {
-        continue;
-      }
-      queue.splice(i, 1);
-      i--;
-      if (cb.flags & 4) {
-        cb.flags &= ~1;
-      }
-      cb();
-      if (!(cb.flags & 4)) {
-        cb.flags &= ~1;
-      }
-    }
-  }
-}
-function flushPostFlushCbs(seen) {
-  if (pendingPostFlushCbs.length) {
-    const deduped = [...new Set(pendingPostFlushCbs)].sort(
-      (a, b) => getId(a) - getId(b)
-    );
-    pendingPostFlushCbs.length = 0;
-    if (activePostFlushCbs) {
-      activePostFlushCbs.push(...deduped);
-      return;
-    }
-    activePostFlushCbs = deduped;
-    {
-      seen = seen || /* @__PURE__ */ new Map();
-    }
-    for (postFlushIndex = 0; postFlushIndex < activePostFlushCbs.length; postFlushIndex++) {
-      const cb = activePostFlushCbs[postFlushIndex];
-      if (checkRecursiveUpdates(seen, cb)) {
-        continue;
-      }
-      if (cb.flags & 4) {
-        cb.flags &= ~1;
-      }
-      if (!(cb.flags & 8)) cb();
-      cb.flags &= ~1;
-    }
-    activePostFlushCbs = null;
-    postFlushIndex = 0;
-  }
-}
-const getId = (job) => job.id == null ? job.flags & 2 ? -1 : Infinity : job.id;
-function flushJobs(seen) {
-  {
-    seen = seen || /* @__PURE__ */ new Map();
-  }
-  const check = (job) => checkRecursiveUpdates(seen, job) ;
-  try {
-    for (flushIndex = 0; flushIndex < queue.length; flushIndex++) {
-      const job = queue[flushIndex];
-      if (job && !(job.flags & 8)) {
-        if (check(job)) {
-          continue;
-        }
-        if (job.flags & 4) {
-          job.flags &= ~1;
-        }
-        callWithErrorHandling(
-          job,
-          job.i,
-          job.i ? 15 : 14
-        );
-        if (!(job.flags & 4)) {
-          job.flags &= ~1;
-        }
-      }
-    }
-  } finally {
-    for (; flushIndex < queue.length; flushIndex++) {
-      const job = queue[flushIndex];
-      if (job) {
-        job.flags &= ~1;
-      }
-    }
-    flushIndex = -1;
-    queue.length = 0;
-    flushPostFlushCbs(seen);
-    currentFlushPromise = null;
-    if (queue.length || pendingPostFlushCbs.length) {
-      flushJobs(seen);
-    }
-  }
-}
-function checkRecursiveUpdates(seen, fn) {
-  const count = seen.get(fn) || 0;
-  if (count > RECURSION_LIMIT) {
-    const instance = fn.i;
-    const componentName = instance && getComponentName(instance.type);
-    handleError(
-      `Maximum recursive updates exceeded${componentName ? ` in component <${componentName}>` : ``}. This means you have a reactive effect that is mutating its own dependencies and thus recursively triggering itself. Possible sources include component template, render function, updated hook or watcher source function.`,
-      null,
-      10
-    );
-    return true;
-  }
-  seen.set(fn, count + 1);
-  return false;
-}
-
-let isHmrUpdating = false;
-const hmrDirtyComponents = /* @__PURE__ */ new Map();
-{
-  getGlobalThis().__VUE_HMR_RUNTIME__ = {
-    createRecord: tryWrap(createRecord),
-    rerender: tryWrap(rerender),
-    reload: tryWrap(reload)
-  };
-}
-const map = /* @__PURE__ */ new Map();
-function registerHMR(instance) {
-  const id = instance.type.__hmrId;
-  let record = map.get(id);
-  if (!record) {
-    createRecord(id, instance.type);
-    record = map.get(id);
-  }
-  record.instances.add(instance);
-}
-function unregisterHMR(instance) {
-  map.get(instance.type.__hmrId).instances.delete(instance);
-}
-function createRecord(id, initialDef) {
-  if (map.has(id)) {
-    return false;
-  }
-  map.set(id, {
-    initialDef: normalizeClassComponent(initialDef),
-    instances: /* @__PURE__ */ new Set()
-  });
-  return true;
-}
-function normalizeClassComponent(component) {
-  return isClassComponent(component) ? component.__vccOpts : component;
-}
-function rerender(id, newRender) {
-  const record = map.get(id);
-  if (!record) {
-    return;
-  }
-  record.initialDef.render = newRender;
-  [...record.instances].forEach((instance) => {
-    if (newRender) {
-      instance.render = newRender;
-      normalizeClassComponent(instance.type).render = newRender;
-    }
-    instance.renderCache = [];
-    isHmrUpdating = true;
-    instance.update();
-    isHmrUpdating = false;
-  });
-}
-function reload(id, newComp) {
-  const record = map.get(id);
-  if (!record) return;
-  newComp = normalizeClassComponent(newComp);
-  updateComponentDef(record.initialDef, newComp);
-  const instances = [...record.instances];
-  for (let i = 0; i < instances.length; i++) {
-    const instance = instances[i];
-    const oldComp = normalizeClassComponent(instance.type);
-    let dirtyInstances = hmrDirtyComponents.get(oldComp);
-    if (!dirtyInstances) {
-      if (oldComp !== record.initialDef) {
-        updateComponentDef(oldComp, newComp);
-      }
-      hmrDirtyComponents.set(oldComp, dirtyInstances = /* @__PURE__ */ new Set());
-    }
-    dirtyInstances.add(instance);
-    instance.appContext.propsCache.delete(instance.type);
-    instance.appContext.emitsCache.delete(instance.type);
-    instance.appContext.optionsCache.delete(instance.type);
-    if (instance.ceReload) {
-      dirtyInstances.add(instance);
-      instance.ceReload(newComp.styles);
-      dirtyInstances.delete(instance);
-    } else if (instance.parent) {
-      queueJob(() => {
-        isHmrUpdating = true;
-        instance.parent.update();
-        isHmrUpdating = false;
-        dirtyInstances.delete(instance);
-      });
-    } else if (instance.appContext.reload) {
-      instance.appContext.reload();
-    } else if (typeof window !== "undefined") {
-      window.location.reload();
-    } else {
-      console.warn(
-        "[HMR] Root or manually mounted instance modified. Full reload required."
-      );
-    }
-    if (instance.root.ce && instance !== instance.root) {
-      instance.root.ce._removeChildStyle(oldComp);
-    }
-  }
-  queuePostFlushCb(() => {
-    hmrDirtyComponents.clear();
-  });
-}
-function updateComponentDef(oldComp, newComp) {
-  extend(oldComp, newComp);
-  for (const key in oldComp) {
-    if (key !== "__file" && !(key in newComp)) {
-      delete oldComp[key];
-    }
-  }
-}
-function tryWrap(fn) {
-  return (id, arg) => {
-    try {
-      return fn(id, arg);
-    } catch (e) {
-      console.error(e);
-      console.warn(
-        `[HMR] Something went wrong during Vue component hot-reload. Full reload required.`
-      );
-    }
-  };
-}
-
-let devtools$1;
-let buffer = [];
-let devtoolsNotInstalled = false;
-function emit$1(event, ...args) {
-  if (devtools$1) {
-    devtools$1.emit(event, ...args);
-  } else if (!devtoolsNotInstalled) {
-    buffer.push({ event, args });
-  }
-}
-function setDevtoolsHook$1(hook, target) {
-  var _a, _b;
-  devtools$1 = hook;
-  if (devtools$1) {
-    devtools$1.enabled = true;
-    buffer.forEach(({ event, args }) => devtools$1.emit(event, ...args));
-    buffer = [];
-  } else if (
-    // handle late devtools injection - only do this if we are in an actual
-    // browser environment to avoid the timer handle stalling test runner exit
-    // (#4815)
-    typeof window !== "undefined" && // some envs mock window but not fully
-    window.HTMLElement && // also exclude jsdom
-    // eslint-disable-next-line no-restricted-syntax
-    !((_b = (_a = window.navigator) == null ? void 0 : _a.userAgent) == null ? void 0 : _b.includes("jsdom"))
-  ) {
-    const replay = target.__VUE_DEVTOOLS_HOOK_REPLAY__ = target.__VUE_DEVTOOLS_HOOK_REPLAY__ || [];
-    replay.push((newHook) => {
-      setDevtoolsHook$1(newHook, target);
-    });
-    setTimeout(() => {
-      if (!devtools$1) {
-        target.__VUE_DEVTOOLS_HOOK_REPLAY__ = null;
-        devtoolsNotInstalled = true;
-        buffer = [];
-      }
-    }, 3e3);
-  } else {
-    devtoolsNotInstalled = true;
-    buffer = [];
-  }
-}
-function devtoolsInitApp(app, version) {
-  emit$1("app:init" /* APP_INIT */, app, version, {
-    Fragment,
-    Text,
-    Comment,
-    Static
-  });
-}
-function devtoolsUnmountApp(app) {
-  emit$1("app:unmount" /* APP_UNMOUNT */, app);
-}
-const devtoolsComponentAdded = /* @__PURE__ */ createDevtoolsComponentHook("component:added" /* COMPONENT_ADDED */);
-const devtoolsComponentUpdated = /* @__PURE__ */ createDevtoolsComponentHook("component:updated" /* COMPONENT_UPDATED */);
-const _devtoolsComponentRemoved = /* @__PURE__ */ createDevtoolsComponentHook(
-  "component:removed" /* COMPONENT_REMOVED */
-);
-const devtoolsComponentRemoved = (component) => {
-  if (devtools$1 && typeof devtools$1.cleanupBuffer === "function" && // remove the component if it wasn't buffered
-  !devtools$1.cleanupBuffer(component)) {
-    _devtoolsComponentRemoved(component);
-  }
-};
-/*! #__NO_SIDE_EFFECTS__ */
-// @__NO_SIDE_EFFECTS__
-function createDevtoolsComponentHook(hook) {
-  return (component) => {
-    emit$1(
-      hook,
-      component.appContext.app,
-      component.uid,
-      component.parent ? component.parent.uid : void 0,
-      component
-    );
-  };
-}
-const devtoolsPerfStart = /* @__PURE__ */ createDevtoolsPerformanceHook("perf:start" /* PERFORMANCE_START */);
-const devtoolsPerfEnd = /* @__PURE__ */ createDevtoolsPerformanceHook("perf:end" /* PERFORMANCE_END */);
-function createDevtoolsPerformanceHook(hook) {
-  return (component, type, time) => {
-    emit$1(hook, component.appContext.app, component.uid, component, type, time);
-  };
-}
-function devtoolsComponentEmit(component, event, params) {
-  emit$1(
-    "component:emit" /* COMPONENT_EMIT */,
-    component.appContext.app,
-    component,
-    event,
-    params
-  );
-}
-
-let currentRenderingInstance = null;
-let currentScopeId = null;
-function setCurrentRenderingInstance(instance) {
-  const prev = currentRenderingInstance;
-  currentRenderingInstance = instance;
-  currentScopeId = instance && instance.type.__scopeId || null;
-  return prev;
-}
-function pushScopeId(id) {
-  currentScopeId = id;
-}
-function popScopeId() {
-  currentScopeId = null;
-}
-const withScopeId = (_id) => withCtx;
-function withCtx(fn, ctx = currentRenderingInstance, isNonScopedSlot) {
-  if (!ctx) return fn;
-  if (fn._n) {
-    return fn;
-  }
-  const renderFnWithContext = (...args) => {
-    if (renderFnWithContext._d) {
-      setBlockTracking(-1);
-    }
-    const prevInstance = setCurrentRenderingInstance(ctx);
-    let res;
-    try {
-      res = fn(...args);
-    } finally {
-      setCurrentRenderingInstance(prevInstance);
-      if (renderFnWithContext._d) {
-        setBlockTracking(1);
-      }
-    }
-    {
-      devtoolsComponentUpdated(ctx);
-    }
-    return res;
-  };
-  renderFnWithContext._n = true;
-  renderFnWithContext._c = true;
-  renderFnWithContext._d = true;
-  return renderFnWithContext;
-}
-
-function validateDirectiveName(name) {
-  if (isBuiltInDirective(name)) {
-    warn$1("Do not use built-in directive ids as custom directive id: " + name);
-  }
-}
-function withDirectives(vnode, directives) {
-  if (currentRenderingInstance === null) {
-    warn$1(`withDirectives can only be used inside render functions.`);
-    return vnode;
-  }
-  const instance = getComponentPublicInstance(currentRenderingInstance);
-  const bindings = vnode.dirs || (vnode.dirs = []);
-  for (let i = 0; i < directives.length; i++) {
-    let [dir, value, arg, modifiers = EMPTY_OBJ] = directives[i];
-    if (dir) {
-      if (isFunction(dir)) {
-        dir = {
-          mounted: dir,
-          updated: dir
-        };
-      }
-      if (dir.deep) {
-        traverse(value);
-      }
-      bindings.push({
-        dir,
-        instance,
-        value,
-        oldValue: void 0,
-        arg,
-        modifiers
-      });
-    }
-  }
-  return vnode;
-}
-function invokeDirectiveHook(vnode, prevVNode, instance, name) {
-  const bindings = vnode.dirs;
-  const oldBindings = prevVNode && prevVNode.dirs;
-  for (let i = 0; i < bindings.length; i++) {
-    const binding = bindings[i];
-    if (oldBindings) {
-      binding.oldValue = oldBindings[i].value;
-    }
-    let hook = binding.dir[name];
-    if (hook) {
-      pauseTracking();
-      callWithAsyncErrorHandling(hook, instance, 8, [
-        vnode.el,
-        binding,
-        vnode,
-        prevVNode
-      ]);
-      resetTracking();
-    }
-  }
-}
-
-const TeleportEndKey = Symbol("_vte");
-const isTeleport = (type) => type.__isTeleport;
-const isTeleportDisabled = (props) => props && (props.disabled || props.disabled === "");
-const isTeleportDeferred = (props) => props && (props.defer || props.defer === "");
-const isTargetSVG = (target) => typeof SVGElement !== "undefined" && target instanceof SVGElement;
-const isTargetMathML = (target) => typeof MathMLElement === "function" && target instanceof MathMLElement;
-const resolveTarget = (props, select) => {
-  const targetSelector = props && props.to;
-  if (isString(targetSelector)) {
-    if (!select) {
-      warn$1(
-        `Current renderer does not support string target for Teleports. (missing querySelector renderer option)`
-      );
-      return null;
-    } else {
-      const target = select(targetSelector);
-      if (!target && !isTeleportDisabled(props)) {
-        warn$1(
-          `Failed to locate Teleport target with selector "${targetSelector}". Note the target element must exist before the component is mounted - i.e. the target cannot be rendered by the component itself, and ideally should be outside of the entire Vue component tree.`
-        );
-      }
-      return target;
-    }
-  } else {
-    if (!targetSelector && !isTeleportDisabled(props)) {
-      warn$1(`Invalid Teleport target: ${targetSelector}`);
-    }
-    return targetSelector;
-  }
-};
-const TeleportImpl = {
-  name: "Teleport",
-  __isTeleport: true,
-  process(n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, internals) {
-    const {
-      mc: mountChildren,
-      pc: patchChildren,
-      pbc: patchBlockChildren,
-      o: { insert, querySelector, createText, createComment }
-    } = internals;
-    const disabled = isTeleportDisabled(n2.props);
-    let { shapeFlag, children, dynamicChildren } = n2;
-    if (isHmrUpdating) {
-      optimized = false;
-      dynamicChildren = null;
-    }
-    if (n1 == null) {
-      const placeholder = n2.el = createComment("teleport start") ;
-      const mainAnchor = n2.anchor = createComment("teleport end") ;
-      insert(placeholder, container, anchor);
-      insert(mainAnchor, container, anchor);
-      const mount = (container2, anchor2) => {
-        if (shapeFlag & 16) {
-          if (parentComponent && parentComponent.isCE) {
-            parentComponent.ce._teleportTarget = container2;
-          }
-          mountChildren(
-            children,
-            container2,
-            anchor2,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-        }
-      };
-      const mountToTarget = () => {
-        const target = n2.target = resolveTarget(n2.props, querySelector);
-        const targetAnchor = prepareAnchor(target, n2, createText, insert);
-        if (target) {
-          if (namespace !== "svg" && isTargetSVG(target)) {
-            namespace = "svg";
-          } else if (namespace !== "mathml" && isTargetMathML(target)) {
-            namespace = "mathml";
-          }
-          if (!disabled) {
-            mount(target, targetAnchor);
-            updateCssVars(n2, false);
-          }
-        } else if (!disabled) {
-          warn$1(
-            "Invalid Teleport target on mount:",
-            target,
-            `(${typeof target})`
-          );
-        }
-      };
-      if (disabled) {
-        mount(container, mainAnchor);
-        updateCssVars(n2, true);
-      }
-      if (isTeleportDeferred(n2.props)) {
-        queuePostRenderEffect(mountToTarget, parentSuspense);
-      } else {
-        mountToTarget();
-      }
-    } else {
-      n2.el = n1.el;
-      n2.targetStart = n1.targetStart;
-      const mainAnchor = n2.anchor = n1.anchor;
-      const target = n2.target = n1.target;
-      const targetAnchor = n2.targetAnchor = n1.targetAnchor;
-      const wasDisabled = isTeleportDisabled(n1.props);
-      const currentContainer = wasDisabled ? container : target;
-      const currentAnchor = wasDisabled ? mainAnchor : targetAnchor;
-      if (namespace === "svg" || isTargetSVG(target)) {
-        namespace = "svg";
-      } else if (namespace === "mathml" || isTargetMathML(target)) {
-        namespace = "mathml";
-      }
-      if (dynamicChildren) {
-        patchBlockChildren(
-          n1.dynamicChildren,
-          dynamicChildren,
-          currentContainer,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds
-        );
-        traverseStaticChildren(n1, n2, true);
-      } else if (!optimized) {
-        patchChildren(
-          n1,
-          n2,
-          currentContainer,
-          currentAnchor,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          false
-        );
-      }
-      if (disabled) {
-        if (!wasDisabled) {
-          moveTeleport(
-            n2,
-            container,
-            mainAnchor,
-            internals,
-            1
-          );
-        } else {
-          if (n2.props && n1.props && n2.props.to !== n1.props.to) {
-            n2.props.to = n1.props.to;
-          }
-        }
-      } else {
-        if ((n2.props && n2.props.to) !== (n1.props && n1.props.to)) {
-          const nextTarget = n2.target = resolveTarget(
-            n2.props,
-            querySelector
-          );
-          if (nextTarget) {
-            moveTeleport(
-              n2,
-              nextTarget,
-              null,
-              internals,
-              0
-            );
-          } else {
-            warn$1(
-              "Invalid Teleport target on update:",
-              target,
-              `(${typeof target})`
-            );
-          }
-        } else if (wasDisabled) {
-          moveTeleport(
-            n2,
-            target,
-            targetAnchor,
-            internals,
-            1
-          );
-        }
-      }
-      updateCssVars(n2, disabled);
-    }
-  },
-  remove(vnode, parentComponent, parentSuspense, { um: unmount, o: { remove: hostRemove } }, doRemove) {
-    const {
-      shapeFlag,
-      children,
-      anchor,
-      targetStart,
-      targetAnchor,
-      target,
-      props
-    } = vnode;
-    if (target) {
-      hostRemove(targetStart);
-      hostRemove(targetAnchor);
-    }
-    doRemove && hostRemove(anchor);
-    if (shapeFlag & 16) {
-      const shouldRemove = doRemove || !isTeleportDisabled(props);
-      for (let i = 0; i < children.length; i++) {
-        const child = children[i];
-        unmount(
-          child,
-          parentComponent,
-          parentSuspense,
-          shouldRemove,
-          !!child.dynamicChildren
-        );
-      }
-    }
-  },
-  move: moveTeleport,
-  hydrate: hydrateTeleport
-};
-function moveTeleport(vnode, container, parentAnchor, { o: { insert }, m: move }, moveType = 2) {
-  if (moveType === 0) {
-    insert(vnode.targetAnchor, container, parentAnchor);
-  }
-  const { el, anchor, shapeFlag, children, props } = vnode;
-  const isReorder = moveType === 2;
-  if (isReorder) {
-    insert(el, container, parentAnchor);
-  }
-  if (!isReorder || isTeleportDisabled(props)) {
-    if (shapeFlag & 16) {
-      for (let i = 0; i < children.length; i++) {
-        move(
-          children[i],
-          container,
-          parentAnchor,
-          2
-        );
-      }
-    }
-  }
-  if (isReorder) {
-    insert(anchor, container, parentAnchor);
-  }
-}
-function hydrateTeleport(node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized, {
-  o: { nextSibling, parentNode, querySelector, insert, createText }
-}, hydrateChildren) {
-  const target = vnode.target = resolveTarget(
-    vnode.props,
-    querySelector
-  );
-  if (target) {
-    const disabled = isTeleportDisabled(vnode.props);
-    const targetNode = target._lpa || target.firstChild;
-    if (vnode.shapeFlag & 16) {
-      if (disabled) {
-        vnode.anchor = hydrateChildren(
-          nextSibling(node),
-          vnode,
-          parentNode(node),
-          parentComponent,
-          parentSuspense,
-          slotScopeIds,
-          optimized
-        );
-        vnode.targetStart = targetNode;
-        vnode.targetAnchor = targetNode && nextSibling(targetNode);
-      } else {
-        vnode.anchor = nextSibling(node);
-        let targetAnchor = targetNode;
-        while (targetAnchor) {
-          if (targetAnchor && targetAnchor.nodeType === 8) {
-            if (targetAnchor.data === "teleport start anchor") {
-              vnode.targetStart = targetAnchor;
-            } else if (targetAnchor.data === "teleport anchor") {
-              vnode.targetAnchor = targetAnchor;
-              target._lpa = vnode.targetAnchor && nextSibling(vnode.targetAnchor);
-              break;
-            }
-          }
-          targetAnchor = nextSibling(targetAnchor);
-        }
-        if (!vnode.targetAnchor) {
-          prepareAnchor(target, vnode, createText, insert);
-        }
-        hydrateChildren(
-          targetNode && nextSibling(targetNode),
-          vnode,
-          target,
-          parentComponent,
-          parentSuspense,
-          slotScopeIds,
-          optimized
-        );
-      }
-    }
-    updateCssVars(vnode, disabled);
-  }
-  return vnode.anchor && nextSibling(vnode.anchor);
-}
-const Teleport = TeleportImpl;
-function updateCssVars(vnode, isDisabled) {
-  const ctx = vnode.ctx;
-  if (ctx && ctx.ut) {
-    let node, anchor;
-    if (isDisabled) {
-      node = vnode.el;
-      anchor = vnode.anchor;
-    } else {
-      node = vnode.targetStart;
-      anchor = vnode.targetAnchor;
-    }
-    while (node && node !== anchor) {
-      if (node.nodeType === 1) node.setAttribute("data-v-owner", ctx.uid);
-      node = node.nextSibling;
-    }
-    ctx.ut();
-  }
-}
-function prepareAnchor(target, vnode, createText, insert) {
-  const targetStart = vnode.targetStart = createText("");
-  const targetAnchor = vnode.targetAnchor = createText("");
-  targetStart[TeleportEndKey] = targetAnchor;
-  if (target) {
-    insert(targetStart, target);
-    insert(targetAnchor, target);
-  }
-  return targetAnchor;
-}
-
-const leaveCbKey = Symbol("_leaveCb");
-const enterCbKey$1 = Symbol("_enterCb");
-function useTransitionState() {
-  const state = {
-    isMounted: false,
-    isLeaving: false,
-    isUnmounting: false,
-    leavingVNodes: /* @__PURE__ */ new Map()
-  };
-  onMounted(() => {
-    state.isMounted = true;
-  });
-  onBeforeUnmount(() => {
-    state.isUnmounting = true;
-  });
-  return state;
-}
-const TransitionHookValidator = [Function, Array];
-const BaseTransitionPropsValidators = {
-  mode: String,
-  appear: Boolean,
-  persisted: Boolean,
-  // enter
-  onBeforeEnter: TransitionHookValidator,
-  onEnter: TransitionHookValidator,
-  onAfterEnter: TransitionHookValidator,
-  onEnterCancelled: TransitionHookValidator,
-  // leave
-  onBeforeLeave: TransitionHookValidator,
-  onLeave: TransitionHookValidator,
-  onAfterLeave: TransitionHookValidator,
-  onLeaveCancelled: TransitionHookValidator,
-  // appear
-  onBeforeAppear: TransitionHookValidator,
-  onAppear: TransitionHookValidator,
-  onAfterAppear: TransitionHookValidator,
-  onAppearCancelled: TransitionHookValidator
-};
-const recursiveGetSubtree = (instance) => {
-  const subTree = instance.subTree;
-  return subTree.component ? recursiveGetSubtree(subTree.component) : subTree;
-};
-const BaseTransitionImpl = {
-  name: `BaseTransition`,
-  props: BaseTransitionPropsValidators,
-  setup(props, { slots }) {
-    const instance = getCurrentInstance();
-    const state = useTransitionState();
-    return () => {
-      const children = slots.default && getTransitionRawChildren(slots.default(), true);
-      if (!children || !children.length) {
-        return;
-      }
-      const child = findNonCommentChild(children);
-      const rawProps = toRaw(props);
-      const { mode } = rawProps;
-      if (mode && mode !== "in-out" && mode !== "out-in" && mode !== "default") {
-        warn$1(`invalid <transition> mode: ${mode}`);
-      }
-      if (state.isLeaving) {
-        return emptyPlaceholder(child);
-      }
-      const innerChild = getInnerChild$1(child);
-      if (!innerChild) {
-        return emptyPlaceholder(child);
-      }
-      let enterHooks = resolveTransitionHooks(
-        innerChild,
-        rawProps,
-        state,
-        instance,
-        // #11061, ensure enterHooks is fresh after clone
-        (hooks) => enterHooks = hooks
-      );
-      if (innerChild.type !== Comment) {
-        setTransitionHooks(innerChild, enterHooks);
-      }
-      const oldChild = instance.subTree;
-      const oldInnerChild = oldChild && getInnerChild$1(oldChild);
-      if (oldInnerChild && oldInnerChild.type !== Comment && !isSameVNodeType(innerChild, oldInnerChild) && recursiveGetSubtree(instance).type !== Comment) {
-        const leavingHooks = resolveTransitionHooks(
-          oldInnerChild,
-          rawProps,
-          state,
-          instance
-        );
-        setTransitionHooks(oldInnerChild, leavingHooks);
-        if (mode === "out-in" && innerChild.type !== Comment) {
-          state.isLeaving = true;
-          leavingHooks.afterLeave = () => {
-            state.isLeaving = false;
-            if (!(instance.job.flags & 8)) {
-              instance.update();
-            }
-            delete leavingHooks.afterLeave;
-          };
-          return emptyPlaceholder(child);
-        } else if (mode === "in-out" && innerChild.type !== Comment) {
-          leavingHooks.delayLeave = (el, earlyRemove, delayedLeave) => {
-            const leavingVNodesCache = getLeavingNodesForType(
-              state,
-              oldInnerChild
-            );
-            leavingVNodesCache[String(oldInnerChild.key)] = oldInnerChild;
-            el[leaveCbKey] = () => {
-              earlyRemove();
-              el[leaveCbKey] = void 0;
-              delete enterHooks.delayedLeave;
-            };
-            enterHooks.delayedLeave = delayedLeave;
-          };
-        }
-      }
-      return child;
-    };
-  }
-};
-function findNonCommentChild(children) {
-  let child = children[0];
-  if (children.length > 1) {
-    let hasFound = false;
-    for (const c of children) {
-      if (c.type !== Comment) {
-        if (hasFound) {
-          warn$1(
-            "<transition> can only be used on a single element or component. Use <transition-group> for lists."
-          );
-          break;
-        }
-        child = c;
-        hasFound = true;
-      }
-    }
-  }
-  return child;
-}
-const BaseTransition = BaseTransitionImpl;
-function getLeavingNodesForType(state, vnode) {
-  const { leavingVNodes } = state;
-  let leavingVNodesCache = leavingVNodes.get(vnode.type);
-  if (!leavingVNodesCache) {
-    leavingVNodesCache = /* @__PURE__ */ Object.create(null);
-    leavingVNodes.set(vnode.type, leavingVNodesCache);
-  }
-  return leavingVNodesCache;
-}
-function resolveTransitionHooks(vnode, props, state, instance, postClone) {
-  const {
-    appear,
-    mode,
-    persisted = false,
-    onBeforeEnter,
-    onEnter,
-    onAfterEnter,
-    onEnterCancelled,
-    onBeforeLeave,
-    onLeave,
-    onAfterLeave,
-    onLeaveCancelled,
-    onBeforeAppear,
-    onAppear,
-    onAfterAppear,
-    onAppearCancelled
-  } = props;
-  const key = String(vnode.key);
-  const leavingVNodesCache = getLeavingNodesForType(state, vnode);
-  const callHook = (hook, args) => {
-    hook && callWithAsyncErrorHandling(
-      hook,
-      instance,
-      9,
-      args
-    );
-  };
-  const callAsyncHook = (hook, args) => {
-    const done = args[1];
-    callHook(hook, args);
-    if (isArray(hook)) {
-      if (hook.every((hook2) => hook2.length <= 1)) done();
-    } else if (hook.length <= 1) {
-      done();
-    }
-  };
-  const hooks = {
-    mode,
-    persisted,
-    beforeEnter(el) {
-      let hook = onBeforeEnter;
-      if (!state.isMounted) {
-        if (appear) {
-          hook = onBeforeAppear || onBeforeEnter;
-        } else {
-          return;
-        }
-      }
-      if (el[leaveCbKey]) {
-        el[leaveCbKey](
-          true
-          /* cancelled */
-        );
-      }
-      const leavingVNode = leavingVNodesCache[key];
-      if (leavingVNode && isSameVNodeType(vnode, leavingVNode) && leavingVNode.el[leaveCbKey]) {
-        leavingVNode.el[leaveCbKey]();
-      }
-      callHook(hook, [el]);
-    },
-    enter(el) {
-      let hook = onEnter;
-      let afterHook = onAfterEnter;
-      let cancelHook = onEnterCancelled;
-      if (!state.isMounted) {
-        if (appear) {
-          hook = onAppear || onEnter;
-          afterHook = onAfterAppear || onAfterEnter;
-          cancelHook = onAppearCancelled || onEnterCancelled;
-        } else {
-          return;
-        }
-      }
-      let called = false;
-      const done = el[enterCbKey$1] = (cancelled) => {
-        if (called) return;
-        called = true;
-        if (cancelled) {
-          callHook(cancelHook, [el]);
-        } else {
-          callHook(afterHook, [el]);
-        }
-        if (hooks.delayedLeave) {
-          hooks.delayedLeave();
-        }
-        el[enterCbKey$1] = void 0;
-      };
-      if (hook) {
-        callAsyncHook(hook, [el, done]);
-      } else {
-        done();
-      }
-    },
-    leave(el, remove) {
-      const key2 = String(vnode.key);
-      if (el[enterCbKey$1]) {
-        el[enterCbKey$1](
-          true
-          /* cancelled */
-        );
-      }
-      if (state.isUnmounting) {
-        return remove();
-      }
-      callHook(onBeforeLeave, [el]);
-      let called = false;
-      const done = el[leaveCbKey] = (cancelled) => {
-        if (called) return;
-        called = true;
-        remove();
-        if (cancelled) {
-          callHook(onLeaveCancelled, [el]);
-        } else {
-          callHook(onAfterLeave, [el]);
-        }
-        el[leaveCbKey] = void 0;
-        if (leavingVNodesCache[key2] === vnode) {
-          delete leavingVNodesCache[key2];
-        }
-      };
-      leavingVNodesCache[key2] = vnode;
-      if (onLeave) {
-        callAsyncHook(onLeave, [el, done]);
-      } else {
-        done();
-      }
-    },
-    clone(vnode2) {
-      const hooks2 = resolveTransitionHooks(
-        vnode2,
-        props,
-        state,
-        instance,
-        postClone
-      );
-      if (postClone) postClone(hooks2);
-      return hooks2;
-    }
-  };
-  return hooks;
-}
-function emptyPlaceholder(vnode) {
-  if (isKeepAlive(vnode)) {
-    vnode = cloneVNode(vnode);
-    vnode.children = null;
-    return vnode;
-  }
-}
-function getInnerChild$1(vnode) {
-  if (!isKeepAlive(vnode)) {
-    if (isTeleport(vnode.type) && vnode.children) {
-      return findNonCommentChild(vnode.children);
-    }
-    return vnode;
-  }
-  if (vnode.component) {
-    return vnode.component.subTree;
-  }
-  const { shapeFlag, children } = vnode;
-  if (children) {
-    if (shapeFlag & 16) {
-      return children[0];
-    }
-    if (shapeFlag & 32 && isFunction(children.default)) {
-      return children.default();
-    }
-  }
-}
-function setTransitionHooks(vnode, hooks) {
-  if (vnode.shapeFlag & 6 && vnode.component) {
-    vnode.transition = hooks;
-    setTransitionHooks(vnode.component.subTree, hooks);
-  } else if (vnode.shapeFlag & 128) {
-    vnode.ssContent.transition = hooks.clone(vnode.ssContent);
-    vnode.ssFallback.transition = hooks.clone(vnode.ssFallback);
-  } else {
-    vnode.transition = hooks;
-  }
-}
-function getTransitionRawChildren(children, keepComment = false, parentKey) {
-  let ret = [];
-  let keyedFragmentCount = 0;
-  for (let i = 0; i < children.length; i++) {
-    let child = children[i];
-    const key = parentKey == null ? child.key : String(parentKey) + String(child.key != null ? child.key : i);
-    if (child.type === Fragment) {
-      if (child.patchFlag & 128) keyedFragmentCount++;
-      ret = ret.concat(
-        getTransitionRawChildren(child.children, keepComment, key)
-      );
-    } else if (keepComment || child.type !== Comment) {
-      ret.push(key != null ? cloneVNode(child, { key }) : child);
-    }
-  }
-  if (keyedFragmentCount > 1) {
-    for (let i = 0; i < ret.length; i++) {
-      ret[i].patchFlag = -2;
-    }
-  }
-  return ret;
-}
-
-/*! #__NO_SIDE_EFFECTS__ */
-// @__NO_SIDE_EFFECTS__
-function defineComponent(options, extraOptions) {
-  return isFunction(options) ? (
-    // #8236: extend call and options.name access are considered side-effects
-    // by Rollup, so we have to wrap it in a pure-annotated IIFE.
-    /* @__PURE__ */ (() => extend({ name: options.name }, extraOptions, { setup: options }))()
-  ) : options;
-}
-
-function useId() {
-  const i = getCurrentInstance();
-  if (i) {
-    return (i.appContext.config.idPrefix || "v") + "-" + i.ids[0] + i.ids[1]++;
-  } else {
-    warn$1(
-      `useId() is called when there is no active component instance to be associated with.`
-    );
-  }
-  return "";
-}
-function markAsyncBoundary(instance) {
-  instance.ids = [instance.ids[0] + instance.ids[2]++ + "-", 0, 0];
-}
-
-const knownTemplateRefs = /* @__PURE__ */ new WeakSet();
-function useTemplateRef(key) {
-  const i = getCurrentInstance();
-  const r = shallowRef(null);
-  if (i) {
-    const refs = i.refs === EMPTY_OBJ ? i.refs = {} : i.refs;
-    let desc;
-    if ((desc = Object.getOwnPropertyDescriptor(refs, key)) && !desc.configurable) {
-      warn$1(`useTemplateRef('${key}') already exists.`);
-    } else {
-      Object.defineProperty(refs, key, {
-        enumerable: true,
-        get: () => r.value,
-        set: (val) => r.value = val
-      });
-    }
-  } else {
-    warn$1(
-      `useTemplateRef() is called when there is no active component instance to be associated with.`
-    );
-  }
-  const ret = readonly(r) ;
-  {
-    knownTemplateRefs.add(ret);
-  }
-  return ret;
-}
-
-function setRef(rawRef, oldRawRef, parentSuspense, vnode, isUnmount = false) {
-  if (isArray(rawRef)) {
-    rawRef.forEach(
-      (r, i) => setRef(
-        r,
-        oldRawRef && (isArray(oldRawRef) ? oldRawRef[i] : oldRawRef),
-        parentSuspense,
-        vnode,
-        isUnmount
-      )
-    );
-    return;
-  }
-  if (isAsyncWrapper(vnode) && !isUnmount) {
-    return;
-  }
-  const refValue = vnode.shapeFlag & 4 ? getComponentPublicInstance(vnode.component) : vnode.el;
-  const value = isUnmount ? null : refValue;
-  const { i: owner, r: ref } = rawRef;
-  if (!owner) {
-    warn$1(
-      `Missing ref owner context. ref cannot be used on hoisted vnodes. A vnode with ref must be created inside the render function.`
-    );
-    return;
-  }
-  const oldRef = oldRawRef && oldRawRef.r;
-  const refs = owner.refs === EMPTY_OBJ ? owner.refs = {} : owner.refs;
-  const setupState = owner.setupState;
-  const rawSetupState = toRaw(setupState);
-  const canSetSetupRef = setupState === EMPTY_OBJ ? () => false : (key) => {
-    {
-      if (hasOwn(rawSetupState, key) && !isRef(rawSetupState[key])) {
-        warn$1(
-          `Template ref "${key}" used on a non-ref value. It will not work in the production build.`
-        );
-      }
-      if (knownTemplateRefs.has(rawSetupState[key])) {
-        return false;
-      }
-    }
-    return hasOwn(rawSetupState, key);
-  };
-  if (oldRef != null && oldRef !== ref) {
-    if (isString(oldRef)) {
-      refs[oldRef] = null;
-      if (canSetSetupRef(oldRef)) {
-        setupState[oldRef] = null;
-      }
-    } else if (isRef(oldRef)) {
-      oldRef.value = null;
-    }
-  }
-  if (isFunction(ref)) {
-    callWithErrorHandling(ref, owner, 12, [value, refs]);
-  } else {
-    const _isString = isString(ref);
-    const _isRef = isRef(ref);
-    if (_isString || _isRef) {
-      const doSet = () => {
-        if (rawRef.f) {
-          const existing = _isString ? canSetSetupRef(ref) ? setupState[ref] : refs[ref] : ref.value;
-          if (isUnmount) {
-            isArray(existing) && remove(existing, refValue);
-          } else {
-            if (!isArray(existing)) {
-              if (_isString) {
-                refs[ref] = [refValue];
-                if (canSetSetupRef(ref)) {
-                  setupState[ref] = refs[ref];
-                }
-              } else {
-                ref.value = [refValue];
-                if (rawRef.k) refs[rawRef.k] = ref.value;
-              }
-            } else if (!existing.includes(refValue)) {
-              existing.push(refValue);
-            }
-          }
-        } else if (_isString) {
-          refs[ref] = value;
-          if (canSetSetupRef(ref)) {
-            setupState[ref] = value;
-          }
-        } else if (_isRef) {
-          ref.value = value;
-          if (rawRef.k) refs[rawRef.k] = value;
-        } else {
-          warn$1("Invalid template ref type:", ref, `(${typeof ref})`);
-        }
-      };
-      if (value) {
-        doSet.id = -1;
-        queuePostRenderEffect(doSet, parentSuspense);
-      } else {
-        doSet();
-      }
-    } else {
-      warn$1("Invalid template ref type:", ref, `(${typeof ref})`);
-    }
-  }
-}
-
-let hasLoggedMismatchError = false;
-const logMismatchError = () => {
-  if (hasLoggedMismatchError) {
-    return;
-  }
-  console.error("Hydration completed but contains mismatches.");
-  hasLoggedMismatchError = true;
-};
-const isSVGContainer = (container) => container.namespaceURI.includes("svg") && container.tagName !== "foreignObject";
-const isMathMLContainer = (container) => container.namespaceURI.includes("MathML");
-const getContainerType = (container) => {
-  if (container.nodeType !== 1) return void 0;
-  if (isSVGContainer(container)) return "svg";
-  if (isMathMLContainer(container)) return "mathml";
-  return void 0;
-};
-const isComment = (node) => node.nodeType === 8;
-function createHydrationFunctions(rendererInternals) {
-  const {
-    mt: mountComponent,
-    p: patch,
-    o: {
-      patchProp,
-      createText,
-      nextSibling,
-      parentNode,
-      remove,
-      insert,
-      createComment
-    }
-  } = rendererInternals;
-  const hydrate = (vnode, container) => {
-    if (!container.hasChildNodes()) {
-      warn$1(
-        `Attempting to hydrate existing markup but container is empty. Performing full mount instead.`
-      );
-      patch(null, vnode, container);
-      flushPostFlushCbs();
-      container._vnode = vnode;
-      return;
-    }
-    hydrateNode(container.firstChild, vnode, null, null, null);
-    flushPostFlushCbs();
-    container._vnode = vnode;
-  };
-  const hydrateNode = (node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized = false) => {
-    optimized = optimized || !!vnode.dynamicChildren;
-    const isFragmentStart = isComment(node) && node.data === "[";
-    const onMismatch = () => handleMismatch(
-      node,
-      vnode,
-      parentComponent,
-      parentSuspense,
-      slotScopeIds,
-      isFragmentStart
-    );
-    const { type, ref, shapeFlag, patchFlag } = vnode;
-    let domType = node.nodeType;
-    vnode.el = node;
-    {
-      def(node, "__vnode", vnode, true);
-      def(node, "__vueParentComponent", parentComponent, true);
-    }
-    if (patchFlag === -2) {
-      optimized = false;
-      vnode.dynamicChildren = null;
-    }
-    let nextNode = null;
-    switch (type) {
-      case Text:
-        if (domType !== 3) {
-          if (vnode.children === "") {
-            insert(vnode.el = createText(""), parentNode(node), node);
-            nextNode = node;
-          } else {
-            nextNode = onMismatch();
-          }
-        } else {
-          if (node.data !== vnode.children) {
-            warn$1(
-              `Hydration text mismatch in`,
-              node.parentNode,
-              `
-  - rendered on server: ${JSON.stringify(
-                node.data
-              )}
-  - expected on client: ${JSON.stringify(vnode.children)}`
-            );
-            logMismatchError();
-            node.data = vnode.children;
-          }
-          nextNode = nextSibling(node);
-        }
-        break;
-      case Comment:
-        if (isTemplateNode(node)) {
-          nextNode = nextSibling(node);
-          replaceNode(
-            vnode.el = node.content.firstChild,
-            node,
-            parentComponent
-          );
-        } else if (domType !== 8 || isFragmentStart) {
-          nextNode = onMismatch();
-        } else {
-          nextNode = nextSibling(node);
-        }
-        break;
-      case Static:
-        if (isFragmentStart) {
-          node = nextSibling(node);
-          domType = node.nodeType;
-        }
-        if (domType === 1 || domType === 3) {
-          nextNode = node;
-          const needToAdoptContent = !vnode.children.length;
-          for (let i = 0; i < vnode.staticCount; i++) {
-            if (needToAdoptContent)
-              vnode.children += nextNode.nodeType === 1 ? nextNode.outerHTML : nextNode.data;
-            if (i === vnode.staticCount - 1) {
-              vnode.anchor = nextNode;
-            }
-            nextNode = nextSibling(nextNode);
-          }
-          return isFragmentStart ? nextSibling(nextNode) : nextNode;
-        } else {
-          onMismatch();
-        }
-        break;
-      case Fragment:
-        if (!isFragmentStart) {
-          nextNode = onMismatch();
-        } else {
-          nextNode = hydrateFragment(
-            node,
-            vnode,
-            parentComponent,
-            parentSuspense,
-            slotScopeIds,
-            optimized
-          );
-        }
-        break;
-      default:
-        if (shapeFlag & 1) {
-          if ((domType !== 1 || vnode.type.toLowerCase() !== node.tagName.toLowerCase()) && !isTemplateNode(node)) {
-            nextNode = onMismatch();
-          } else {
-            nextNode = hydrateElement(
-              node,
-              vnode,
-              parentComponent,
-              parentSuspense,
-              slotScopeIds,
-              optimized
-            );
-          }
-        } else if (shapeFlag & 6) {
-          vnode.slotScopeIds = slotScopeIds;
-          const container = parentNode(node);
-          if (isFragmentStart) {
-            nextNode = locateClosingAnchor(node);
-          } else if (isComment(node) && node.data === "teleport start") {
-            nextNode = locateClosingAnchor(node, node.data, "teleport end");
-          } else {
-            nextNode = nextSibling(node);
-          }
-          mountComponent(
-            vnode,
-            container,
-            null,
-            parentComponent,
-            parentSuspense,
-            getContainerType(container),
-            optimized
-          );
-          if (isAsyncWrapper(vnode)) {
-            let subTree;
-            if (isFragmentStart) {
-              subTree = createVNode(Fragment);
-              subTree.anchor = nextNode ? nextNode.previousSibling : container.lastChild;
-            } else {
-              subTree = node.nodeType === 3 ? createTextVNode("") : createVNode("div");
-            }
-            subTree.el = node;
-            vnode.component.subTree = subTree;
-          }
-        } else if (shapeFlag & 64) {
-          if (domType !== 8) {
-            nextNode = onMismatch();
-          } else {
-            nextNode = vnode.type.hydrate(
-              node,
-              vnode,
-              parentComponent,
-              parentSuspense,
-              slotScopeIds,
-              optimized,
-              rendererInternals,
-              hydrateChildren
-            );
-          }
-        } else if (shapeFlag & 128) {
-          nextNode = vnode.type.hydrate(
-            node,
-            vnode,
-            parentComponent,
-            parentSuspense,
-            getContainerType(parentNode(node)),
-            slotScopeIds,
-            optimized,
-            rendererInternals,
-            hydrateNode
-          );
-        } else {
-          warn$1("Invalid HostVNode type:", type, `(${typeof type})`);
-        }
-    }
-    if (ref != null) {
-      setRef(ref, null, parentSuspense, vnode);
-    }
-    return nextNode;
-  };
-  const hydrateElement = (el, vnode, parentComponent, parentSuspense, slotScopeIds, optimized) => {
-    optimized = optimized || !!vnode.dynamicChildren;
-    const { type, props, patchFlag, shapeFlag, dirs, transition } = vnode;
-    const forcePatch = type === "input" || type === "option";
-    {
-      if (dirs) {
-        invokeDirectiveHook(vnode, null, parentComponent, "created");
-      }
-      let needCallTransitionHooks = false;
-      if (isTemplateNode(el)) {
-        needCallTransitionHooks = needTransition(
-          null,
-          // no need check parentSuspense in hydration
-          transition
-        ) && parentComponent && parentComponent.vnode.props && parentComponent.vnode.props.appear;
-        const content = el.content.firstChild;
-        if (needCallTransitionHooks) {
-          transition.beforeEnter(content);
-        }
-        replaceNode(content, el, parentComponent);
-        vnode.el = el = content;
-      }
-      if (shapeFlag & 16 && // skip if element has innerHTML / textContent
-      !(props && (props.innerHTML || props.textContent))) {
-        let next = hydrateChildren(
-          el.firstChild,
-          vnode,
-          el,
-          parentComponent,
-          parentSuspense,
-          slotScopeIds,
-          optimized
-        );
-        let hasWarned = false;
-        while (next) {
-          if (!isMismatchAllowed(el, 1 /* CHILDREN */)) {
-            if (!hasWarned) {
-              warn$1(
-                `Hydration children mismatch on`,
-                el,
-                `
-Server rendered element contains more child nodes than client vdom.`
-              );
-              hasWarned = true;
-            }
-            logMismatchError();
-          }
-          const cur = next;
-          next = next.nextSibling;
-          remove(cur);
-        }
-      } else if (shapeFlag & 8) {
-        let clientText = vnode.children;
-        if (clientText[0] === "\n" && (el.tagName === "PRE" || el.tagName === "TEXTAREA")) {
-          clientText = clientText.slice(1);
-        }
-        if (el.textContent !== clientText) {
-          if (!isMismatchAllowed(el, 0 /* TEXT */)) {
-            warn$1(
-              `Hydration text content mismatch on`,
-              el,
-              `
-  - rendered on server: ${el.textContent}
-  - expected on client: ${vnode.children}`
-            );
-            logMismatchError();
-          }
-          el.textContent = vnode.children;
-        }
-      }
-      if (props) {
-        {
-          const isCustomElement = el.tagName.includes("-");
-          for (const key in props) {
-            if (// #11189 skip if this node has directives that have created hooks
-            // as it could have mutated the DOM in any possible way
-            !(dirs && dirs.some((d) => d.dir.created)) && propHasMismatch(el, key, props[key], vnode, parentComponent)) {
-              logMismatchError();
-            }
-            if (forcePatch && (key.endsWith("value") || key === "indeterminate") || isOn(key) && !isReservedProp(key) || // force hydrate v-bind with .prop modifiers
-            key[0] === "." || isCustomElement) {
-              patchProp(el, key, null, props[key], void 0, parentComponent);
-            }
-          }
-        }
-      }
-      let vnodeHooks;
-      if (vnodeHooks = props && props.onVnodeBeforeMount) {
-        invokeVNodeHook(vnodeHooks, parentComponent, vnode);
-      }
-      if (dirs) {
-        invokeDirectiveHook(vnode, null, parentComponent, "beforeMount");
-      }
-      if ((vnodeHooks = props && props.onVnodeMounted) || dirs || needCallTransitionHooks) {
-        queueEffectWithSuspense(() => {
-          vnodeHooks && invokeVNodeHook(vnodeHooks, parentComponent, vnode);
-          needCallTransitionHooks && transition.enter(el);
-          dirs && invokeDirectiveHook(vnode, null, parentComponent, "mounted");
-        }, parentSuspense);
-      }
-    }
-    return el.nextSibling;
-  };
-  const hydrateChildren = (node, parentVNode, container, parentComponent, parentSuspense, slotScopeIds, optimized) => {
-    optimized = optimized || !!parentVNode.dynamicChildren;
-    const children = parentVNode.children;
-    const l = children.length;
-    let hasWarned = false;
-    for (let i = 0; i < l; i++) {
-      const vnode = optimized ? children[i] : children[i] = normalizeVNode(children[i]);
-      const isText = vnode.type === Text;
-      if (node) {
-        if (isText && !optimized) {
-          if (i + 1 < l && normalizeVNode(children[i + 1]).type === Text) {
-            insert(
-              createText(
-                node.data.slice(vnode.children.length)
-              ),
-              container,
-              nextSibling(node)
-            );
-            node.data = vnode.children;
-          }
-        }
-        node = hydrateNode(
-          node,
-          vnode,
-          parentComponent,
-          parentSuspense,
-          slotScopeIds,
-          optimized
-        );
-      } else if (isText && !vnode.children) {
-        insert(vnode.el = createText(""), container);
-      } else {
-        if (!isMismatchAllowed(container, 1 /* CHILDREN */)) {
-          if (!hasWarned) {
-            warn$1(
-              `Hydration children mismatch on`,
-              container,
-              `
-Server rendered element contains fewer child nodes than client vdom.`
-            );
-            hasWarned = true;
-          }
-          logMismatchError();
-        }
-        patch(
-          null,
-          vnode,
-          container,
-          null,
-          parentComponent,
-          parentSuspense,
-          getContainerType(container),
-          slotScopeIds
-        );
-      }
-    }
-    return node;
-  };
-  const hydrateFragment = (node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized) => {
-    const { slotScopeIds: fragmentSlotScopeIds } = vnode;
-    if (fragmentSlotScopeIds) {
-      slotScopeIds = slotScopeIds ? slotScopeIds.concat(fragmentSlotScopeIds) : fragmentSlotScopeIds;
-    }
-    const container = parentNode(node);
-    const next = hydrateChildren(
-      nextSibling(node),
-      vnode,
-      container,
-      parentComponent,
-      parentSuspense,
-      slotScopeIds,
-      optimized
-    );
-    if (next && isComment(next) && next.data === "]") {
-      return nextSibling(vnode.anchor = next);
-    } else {
-      logMismatchError();
-      insert(vnode.anchor = createComment(`]`), container, next);
-      return next;
-    }
-  };
-  const handleMismatch = (node, vnode, parentComponent, parentSuspense, slotScopeIds, isFragment) => {
-    if (!isMismatchAllowed(node.parentElement, 1 /* CHILDREN */)) {
-      warn$1(
-        `Hydration node mismatch:
-- rendered on server:`,
-        node,
-        node.nodeType === 3 ? `(text)` : isComment(node) && node.data === "[" ? `(start of fragment)` : ``,
-        `
-- expected on client:`,
-        vnode.type
-      );
-      logMismatchError();
-    }
-    vnode.el = null;
-    if (isFragment) {
-      const end = locateClosingAnchor(node);
-      while (true) {
-        const next2 = nextSibling(node);
-        if (next2 && next2 !== end) {
-          remove(next2);
-        } else {
-          break;
-        }
-      }
-    }
-    const next = nextSibling(node);
-    const container = parentNode(node);
-    remove(node);
-    patch(
-      null,
-      vnode,
-      container,
-      next,
-      parentComponent,
-      parentSuspense,
-      getContainerType(container),
-      slotScopeIds
-    );
-    return next;
-  };
-  const locateClosingAnchor = (node, open = "[", close = "]") => {
-    let match = 0;
-    while (node) {
-      node = nextSibling(node);
-      if (node && isComment(node)) {
-        if (node.data === open) match++;
-        if (node.data === close) {
-          if (match === 0) {
-            return nextSibling(node);
-          } else {
-            match--;
-          }
-        }
-      }
-    }
-    return node;
-  };
-  const replaceNode = (newNode, oldNode, parentComponent) => {
-    const parentNode2 = oldNode.parentNode;
-    if (parentNode2) {
-      parentNode2.replaceChild(newNode, oldNode);
-    }
-    let parent = parentComponent;
-    while (parent) {
-      if (parent.vnode.el === oldNode) {
-        parent.vnode.el = parent.subTree.el = newNode;
-      }
-      parent = parent.parent;
-    }
-  };
-  const isTemplateNode = (node) => {
-    return node.nodeType === 1 && node.tagName === "TEMPLATE";
-  };
-  return [hydrate, hydrateNode];
-}
-function propHasMismatch(el, key, clientValue, vnode, instance) {
-  let mismatchType;
-  let mismatchKey;
-  let actual;
-  let expected;
-  if (key === "class") {
-    actual = el.getAttribute("class");
-    expected = normalizeClass(clientValue);
-    if (!isSetEqual(toClassSet(actual || ""), toClassSet(expected))) {
-      mismatchType = 2 /* CLASS */;
-      mismatchKey = `class`;
-    }
-  } else if (key === "style") {
-    actual = el.getAttribute("style") || "";
-    expected = isString(clientValue) ? clientValue : stringifyStyle(normalizeStyle(clientValue));
-    const actualMap = toStyleMap(actual);
-    const expectedMap = toStyleMap(expected);
-    if (vnode.dirs) {
-      for (const { dir, value } of vnode.dirs) {
-        if (dir.name === "show" && !value) {
-          expectedMap.set("display", "none");
-        }
-      }
-    }
-    if (instance) {
-      resolveCssVars(instance, vnode, expectedMap);
-    }
-    if (!isMapEqual(actualMap, expectedMap)) {
-      mismatchType = 3 /* STYLE */;
-      mismatchKey = "style";
-    }
-  } else if (el instanceof SVGElement && isKnownSvgAttr(key) || el instanceof HTMLElement && (isBooleanAttr(key) || isKnownHtmlAttr(key))) {
-    if (isBooleanAttr(key)) {
-      actual = el.hasAttribute(key);
-      expected = includeBooleanAttr(clientValue);
-    } else if (clientValue == null) {
-      actual = el.hasAttribute(key);
-      expected = false;
-    } else {
-      if (el.hasAttribute(key)) {
-        actual = el.getAttribute(key);
-      } else if (key === "value" && el.tagName === "TEXTAREA") {
-        actual = el.value;
-      } else {
-        actual = false;
-      }
-      expected = isRenderableAttrValue(clientValue) ? String(clientValue) : false;
-    }
-    if (actual !== expected) {
-      mismatchType = 4 /* ATTRIBUTE */;
-      mismatchKey = key;
-    }
-  }
-  if (mismatchType != null && !isMismatchAllowed(el, mismatchType)) {
-    const format = (v) => v === false ? `(not rendered)` : `${mismatchKey}="${v}"`;
-    const preSegment = `Hydration ${MismatchTypeString[mismatchType]} mismatch on`;
-    const postSegment = `
-  - rendered on server: ${format(actual)}
-  - expected on client: ${format(expected)}
-  Note: this mismatch is check-only. The DOM will not be rectified in production due to performance overhead.
-  You should fix the source of the mismatch.`;
-    {
-      warn$1(preSegment, el, postSegment);
-    }
-    return true;
-  }
-  return false;
-}
-function toClassSet(str) {
-  return new Set(str.trim().split(/\s+/));
-}
-function isSetEqual(a, b) {
-  if (a.size !== b.size) {
-    return false;
-  }
-  for (const s of a) {
-    if (!b.has(s)) {
-      return false;
-    }
-  }
-  return true;
-}
-function toStyleMap(str) {
-  const styleMap = /* @__PURE__ */ new Map();
-  for (const item of str.split(";")) {
-    let [key, value] = item.split(":");
-    key = key.trim();
-    value = value && value.trim();
-    if (key && value) {
-      styleMap.set(key, value);
-    }
-  }
-  return styleMap;
-}
-function isMapEqual(a, b) {
-  if (a.size !== b.size) {
-    return false;
-  }
-  for (const [key, value] of a) {
-    if (value !== b.get(key)) {
-      return false;
-    }
-  }
-  return true;
-}
-function resolveCssVars(instance, vnode, expectedMap) {
-  const root = instance.subTree;
-  if (instance.getCssVars && (vnode === root || root && root.type === Fragment && root.children.includes(vnode))) {
-    const cssVars = instance.getCssVars();
-    for (const key in cssVars) {
-      expectedMap.set(
-        `--${getEscapedCssVarName(key)}`,
-        String(cssVars[key])
-      );
-    }
-  }
-  if (vnode === root && instance.parent) {
-    resolveCssVars(instance.parent, instance.vnode, expectedMap);
-  }
-}
-const allowMismatchAttr = "data-allow-mismatch";
-const MismatchTypeString = {
-  [0 /* TEXT */]: "text",
-  [1 /* CHILDREN */]: "children",
-  [2 /* CLASS */]: "class",
-  [3 /* STYLE */]: "style",
-  [4 /* ATTRIBUTE */]: "attribute"
-};
-function isMismatchAllowed(el, allowedType) {
-  if (allowedType === 0 /* TEXT */ || allowedType === 1 /* CHILDREN */) {
-    while (el && !el.hasAttribute(allowMismatchAttr)) {
-      el = el.parentElement;
-    }
-  }
-  const allowedAttr = el && el.getAttribute(allowMismatchAttr);
-  if (allowedAttr == null) {
-    return false;
-  } else if (allowedAttr === "") {
-    return true;
-  } else {
-    const list = allowedAttr.split(",");
-    if (allowedType === 0 /* TEXT */ && list.includes("children")) {
-      return true;
-    }
-    return allowedAttr.split(",").includes(MismatchTypeString[allowedType]);
-  }
-}
-
-const requestIdleCallback = getGlobalThis().requestIdleCallback || ((cb) => setTimeout(cb, 1));
-const cancelIdleCallback = getGlobalThis().cancelIdleCallback || ((id) => clearTimeout(id));
-const hydrateOnIdle = (timeout = 1e4) => (hydrate) => {
-  const id = requestIdleCallback(hydrate, { timeout });
-  return () => cancelIdleCallback(id);
-};
-function elementIsVisibleInViewport(el) {
-  const { top, left, bottom, right } = el.getBoundingClientRect();
-  const { innerHeight, innerWidth } = window;
-  return (top > 0 && top < innerHeight || bottom > 0 && bottom < innerHeight) && (left > 0 && left < innerWidth || right > 0 && right < innerWidth);
-}
-const hydrateOnVisible = (opts) => (hydrate, forEach) => {
-  const ob = new IntersectionObserver((entries) => {
-    for (const e of entries) {
-      if (!e.isIntersecting) continue;
-      ob.disconnect();
-      hydrate();
-      break;
-    }
-  }, opts);
-  forEach((el) => {
-    if (!(el instanceof Element)) return;
-    if (elementIsVisibleInViewport(el)) {
-      hydrate();
-      ob.disconnect();
-      return false;
-    }
-    ob.observe(el);
-  });
-  return () => ob.disconnect();
-};
-const hydrateOnMediaQuery = (query) => (hydrate) => {
-  if (query) {
-    const mql = matchMedia(query);
-    if (mql.matches) {
-      hydrate();
-    } else {
-      mql.addEventListener("change", hydrate, { once: true });
-      return () => mql.removeEventListener("change", hydrate);
-    }
-  }
-};
-const hydrateOnInteraction = (interactions = []) => (hydrate, forEach) => {
-  if (isString(interactions)) interactions = [interactions];
-  let hasHydrated = false;
-  const doHydrate = (e) => {
-    if (!hasHydrated) {
-      hasHydrated = true;
-      teardown();
-      hydrate();
-      e.target.dispatchEvent(new e.constructor(e.type, e));
-    }
-  };
-  const teardown = () => {
-    forEach((el) => {
-      for (const i of interactions) {
-        el.removeEventListener(i, doHydrate);
-      }
-    });
-  };
-  forEach((el) => {
-    for (const i of interactions) {
-      el.addEventListener(i, doHydrate, { once: true });
-    }
-  });
-  return teardown;
-};
-function forEachElement(node, cb) {
-  if (isComment(node) && node.data === "[") {
-    let depth = 1;
-    let next = node.nextSibling;
-    while (next) {
-      if (next.nodeType === 1) {
-        const result = cb(next);
-        if (result === false) {
-          break;
-        }
-      } else if (isComment(next)) {
-        if (next.data === "]") {
-          if (--depth === 0) break;
-        } else if (next.data === "[") {
-          depth++;
-        }
-      }
-      next = next.nextSibling;
-    }
-  } else {
-    cb(node);
-  }
-}
-
-const isAsyncWrapper = (i) => !!i.type.__asyncLoader;
-/*! #__NO_SIDE_EFFECTS__ */
-// @__NO_SIDE_EFFECTS__
-function defineAsyncComponent(source) {
-  if (isFunction(source)) {
-    source = { loader: source };
-  }
-  const {
-    loader,
-    loadingComponent,
-    errorComponent,
-    delay = 200,
-    hydrate: hydrateStrategy,
-    timeout,
-    // undefined = never times out
-    suspensible = true,
-    onError: userOnError
-  } = source;
-  let pendingRequest = null;
-  let resolvedComp;
-  let retries = 0;
-  const retry = () => {
-    retries++;
-    pendingRequest = null;
-    return load();
-  };
-  const load = () => {
-    let thisRequest;
-    return pendingRequest || (thisRequest = pendingRequest = loader().catch((err) => {
-      err = err instanceof Error ? err : new Error(String(err));
-      if (userOnError) {
-        return new Promise((resolve, reject) => {
-          const userRetry = () => resolve(retry());
-          const userFail = () => reject(err);
-          userOnError(err, userRetry, userFail, retries + 1);
-        });
-      } else {
-        throw err;
-      }
-    }).then((comp) => {
-      if (thisRequest !== pendingRequest && pendingRequest) {
-        return pendingRequest;
-      }
-      if (!comp) {
-        warn$1(
-          `Async component loader resolved to undefined. If you are using retry(), make sure to return its return value.`
-        );
-      }
-      if (comp && (comp.__esModule || comp[Symbol.toStringTag] === "Module")) {
-        comp = comp.default;
-      }
-      if (comp && !isObject(comp) && !isFunction(comp)) {
-        throw new Error(`Invalid async component load result: ${comp}`);
-      }
-      resolvedComp = comp;
-      return comp;
-    }));
-  };
-  return defineComponent({
-    name: "AsyncComponentWrapper",
-    __asyncLoader: load,
-    __asyncHydrate(el, instance, hydrate) {
-      const doHydrate = hydrateStrategy ? () => {
-        const teardown = hydrateStrategy(
-          hydrate,
-          (cb) => forEachElement(el, cb)
-        );
-        if (teardown) {
-          (instance.bum || (instance.bum = [])).push(teardown);
-        }
-      } : hydrate;
-      if (resolvedComp) {
-        doHydrate();
-      } else {
-        load().then(() => !instance.isUnmounted && doHydrate());
-      }
-    },
-    get __asyncResolved() {
-      return resolvedComp;
-    },
-    setup() {
-      const instance = currentInstance;
-      markAsyncBoundary(instance);
-      if (resolvedComp) {
-        return () => createInnerComp(resolvedComp, instance);
-      }
-      const onError = (err) => {
-        pendingRequest = null;
-        handleError(
-          err,
-          instance,
-          13,
-          !errorComponent
-        );
-      };
-      if (suspensible && instance.suspense || isInSSRComponentSetup) {
-        return load().then((comp) => {
-          return () => createInnerComp(comp, instance);
-        }).catch((err) => {
-          onError(err);
-          return () => errorComponent ? createVNode(errorComponent, {
-            error: err
-          }) : null;
-        });
-      }
-      const loaded = ref(false);
-      const error = ref();
-      const delayed = ref(!!delay);
-      if (delay) {
-        setTimeout(() => {
-          delayed.value = false;
-        }, delay);
-      }
-      if (timeout != null) {
-        setTimeout(() => {
-          if (!loaded.value && !error.value) {
-            const err = new Error(
-              `Async component timed out after ${timeout}ms.`
-            );
-            onError(err);
-            error.value = err;
-          }
-        }, timeout);
-      }
-      load().then(() => {
-        loaded.value = true;
-        if (instance.parent && isKeepAlive(instance.parent.vnode)) {
-          instance.parent.update();
-        }
-      }).catch((err) => {
-        onError(err);
-        error.value = err;
-      });
-      return () => {
-        if (loaded.value && resolvedComp) {
-          return createInnerComp(resolvedComp, instance);
-        } else if (error.value && errorComponent) {
-          return createVNode(errorComponent, {
-            error: error.value
-          });
-        } else if (loadingComponent && !delayed.value) {
-          return createVNode(loadingComponent);
-        }
-      };
-    }
-  });
-}
-function createInnerComp(comp, parent) {
-  const { ref: ref2, props, children, ce } = parent.vnode;
-  const vnode = createVNode(comp, props, children);
-  vnode.ref = ref2;
-  vnode.ce = ce;
-  delete parent.vnode.ce;
-  return vnode;
-}
-
-const isKeepAlive = (vnode) => vnode.type.__isKeepAlive;
-const KeepAliveImpl = {
-  name: `KeepAlive`,
-  // Marker for special handling inside the renderer. We are not using a ===
-  // check directly on KeepAlive in the renderer, because importing it directly
-  // would prevent it from being tree-shaken.
-  __isKeepAlive: true,
-  props: {
-    include: [String, RegExp, Array],
-    exclude: [String, RegExp, Array],
-    max: [String, Number]
-  },
-  setup(props, { slots }) {
-    const instance = getCurrentInstance();
-    const sharedContext = instance.ctx;
-    if (!sharedContext.renderer) {
-      return () => {
-        const children = slots.default && slots.default();
-        return children && children.length === 1 ? children[0] : children;
-      };
-    }
-    const cache = /* @__PURE__ */ new Map();
-    const keys = /* @__PURE__ */ new Set();
-    let current = null;
-    {
-      instance.__v_cache = cache;
-    }
-    const parentSuspense = instance.suspense;
-    const {
-      renderer: {
-        p: patch,
-        m: move,
-        um: _unmount,
-        o: { createElement }
-      }
-    } = sharedContext;
-    const storageContainer = createElement("div");
-    sharedContext.activate = (vnode, container, anchor, namespace, optimized) => {
-      const instance2 = vnode.component;
-      move(vnode, container, anchor, 0, parentSuspense);
-      patch(
-        instance2.vnode,
-        vnode,
-        container,
-        anchor,
-        instance2,
-        parentSuspense,
-        namespace,
-        vnode.slotScopeIds,
-        optimized
-      );
-      queuePostRenderEffect(() => {
-        instance2.isDeactivated = false;
-        if (instance2.a) {
-          invokeArrayFns(instance2.a);
-        }
-        const vnodeHook = vnode.props && vnode.props.onVnodeMounted;
-        if (vnodeHook) {
-          invokeVNodeHook(vnodeHook, instance2.parent, vnode);
-        }
-      }, parentSuspense);
-      {
-        devtoolsComponentAdded(instance2);
-      }
-    };
-    sharedContext.deactivate = (vnode) => {
-      const instance2 = vnode.component;
-      invalidateMount(instance2.m);
-      invalidateMount(instance2.a);
-      move(vnode, storageContainer, null, 1, parentSuspense);
-      queuePostRenderEffect(() => {
-        if (instance2.da) {
-          invokeArrayFns(instance2.da);
-        }
-        const vnodeHook = vnode.props && vnode.props.onVnodeUnmounted;
-        if (vnodeHook) {
-          invokeVNodeHook(vnodeHook, instance2.parent, vnode);
-        }
-        instance2.isDeactivated = true;
-      }, parentSuspense);
-      {
-        devtoolsComponentAdded(instance2);
-      }
-    };
-    function unmount(vnode) {
-      resetShapeFlag(vnode);
-      _unmount(vnode, instance, parentSuspense, true);
-    }
-    function pruneCache(filter) {
-      cache.forEach((vnode, key) => {
-        const name = getComponentName(vnode.type);
-        if (name && !filter(name)) {
-          pruneCacheEntry(key);
-        }
-      });
-    }
-    function pruneCacheEntry(key) {
-      const cached = cache.get(key);
-      if (cached && (!current || !isSameVNodeType(cached, current))) {
-        unmount(cached);
-      } else if (current) {
-        resetShapeFlag(current);
-      }
-      cache.delete(key);
-      keys.delete(key);
-    }
-    watch(
-      () => [props.include, props.exclude],
-      ([include, exclude]) => {
-        include && pruneCache((name) => matches(include, name));
-        exclude && pruneCache((name) => !matches(exclude, name));
-      },
-      // prune post-render after `current` has been updated
-      { flush: "post", deep: true }
-    );
-    let pendingCacheKey = null;
-    const cacheSubtree = () => {
-      if (pendingCacheKey != null) {
-        if (isSuspense(instance.subTree.type)) {
-          queuePostRenderEffect(() => {
-            cache.set(pendingCacheKey, getInnerChild(instance.subTree));
-          }, instance.subTree.suspense);
-        } else {
-          cache.set(pendingCacheKey, getInnerChild(instance.subTree));
-        }
-      }
-    };
-    onMounted(cacheSubtree);
-    onUpdated(cacheSubtree);
-    onBeforeUnmount(() => {
-      cache.forEach((cached) => {
-        const { subTree, suspense } = instance;
-        const vnode = getInnerChild(subTree);
-        if (cached.type === vnode.type && cached.key === vnode.key) {
-          resetShapeFlag(vnode);
-          const da = vnode.component.da;
-          da && queuePostRenderEffect(da, suspense);
-          return;
-        }
-        unmount(cached);
-      });
-    });
-    return () => {
-      pendingCacheKey = null;
-      if (!slots.default) {
-        return current = null;
-      }
-      const children = slots.default();
-      const rawVNode = children[0];
-      if (children.length > 1) {
-        {
-          warn$1(`KeepAlive should contain exactly one component child.`);
-        }
-        current = null;
-        return children;
-      } else if (!isVNode(rawVNode) || !(rawVNode.shapeFlag & 4) && !(rawVNode.shapeFlag & 128)) {
-        current = null;
-        return rawVNode;
-      }
-      let vnode = getInnerChild(rawVNode);
-      if (vnode.type === Comment) {
-        current = null;
-        return vnode;
-      }
-      const comp = vnode.type;
-      const name = getComponentName(
-        isAsyncWrapper(vnode) ? vnode.type.__asyncResolved || {} : comp
-      );
-      const { include, exclude, max } = props;
-      if (include && (!name || !matches(include, name)) || exclude && name && matches(exclude, name)) {
-        vnode.shapeFlag &= ~256;
-        current = vnode;
-        return rawVNode;
-      }
-      const key = vnode.key == null ? comp : vnode.key;
-      const cachedVNode = cache.get(key);
-      if (vnode.el) {
-        vnode = cloneVNode(vnode);
-        if (rawVNode.shapeFlag & 128) {
-          rawVNode.ssContent = vnode;
-        }
-      }
-      pendingCacheKey = key;
-      if (cachedVNode) {
-        vnode.el = cachedVNode.el;
-        vnode.component = cachedVNode.component;
-        if (vnode.transition) {
-          setTransitionHooks(vnode, vnode.transition);
-        }
-        vnode.shapeFlag |= 512;
-        keys.delete(key);
-        keys.add(key);
-      } else {
-        keys.add(key);
-        if (max && keys.size > parseInt(max, 10)) {
-          pruneCacheEntry(keys.values().next().value);
-        }
-      }
-      vnode.shapeFlag |= 256;
-      current = vnode;
-      return isSuspense(rawVNode.type) ? rawVNode : vnode;
-    };
-  }
-};
-const KeepAlive = KeepAliveImpl;
-function matches(pattern, name) {
-  if (isArray(pattern)) {
-    return pattern.some((p) => matches(p, name));
-  } else if (isString(pattern)) {
-    return pattern.split(",").includes(name);
-  } else if (isRegExp(pattern)) {
-    pattern.lastIndex = 0;
-    return pattern.test(name);
-  }
-  return false;
-}
-function onActivated(hook, target) {
-  registerKeepAliveHook(hook, "a", target);
-}
-function onDeactivated(hook, target) {
-  registerKeepAliveHook(hook, "da", target);
-}
-function registerKeepAliveHook(hook, type, target = currentInstance) {
-  const wrappedHook = hook.__wdc || (hook.__wdc = () => {
-    let current = target;
-    while (current) {
-      if (current.isDeactivated) {
-        return;
-      }
-      current = current.parent;
-    }
-    return hook();
-  });
-  injectHook(type, wrappedHook, target);
-  if (target) {
-    let current = target.parent;
-    while (current && current.parent) {
-      if (isKeepAlive(current.parent.vnode)) {
-        injectToKeepAliveRoot(wrappedHook, type, target, current);
-      }
-      current = current.parent;
-    }
-  }
-}
-function injectToKeepAliveRoot(hook, type, target, keepAliveRoot) {
-  const injected = injectHook(
-    type,
-    hook,
-    keepAliveRoot,
-    true
-    /* prepend */
-  );
-  onUnmounted(() => {
-    remove(keepAliveRoot[type], injected);
-  }, target);
-}
-function resetShapeFlag(vnode) {
-  vnode.shapeFlag &= ~256;
-  vnode.shapeFlag &= ~512;
-}
-function getInnerChild(vnode) {
-  return vnode.shapeFlag & 128 ? vnode.ssContent : vnode;
-}
-
-function injectHook(type, hook, target = currentInstance, prepend = false) {
-  if (target) {
-    const hooks = target[type] || (target[type] = []);
-    const wrappedHook = hook.__weh || (hook.__weh = (...args) => {
-      pauseTracking();
-      const reset = setCurrentInstance(target);
-      const res = callWithAsyncErrorHandling(hook, target, type, args);
-      reset();
-      resetTracking();
-      return res;
-    });
-    if (prepend) {
-      hooks.unshift(wrappedHook);
-    } else {
-      hooks.push(wrappedHook);
-    }
-    return wrappedHook;
-  } else {
-    const apiName = toHandlerKey(ErrorTypeStrings$1[type].replace(/ hook$/, ""));
-    warn$1(
-      `${apiName} is called when there is no active component instance to be associated with. Lifecycle injection APIs can only be used during execution of setup().` + (` If you are using async setup(), make sure to register lifecycle hooks before the first await statement.` )
-    );
-  }
-}
-const createHook = (lifecycle) => (hook, target = currentInstance) => {
-  if (!isInSSRComponentSetup || lifecycle === "sp") {
-    injectHook(lifecycle, (...args) => hook(...args), target);
-  }
-};
-const onBeforeMount = createHook("bm");
-const onMounted = createHook("m");
-const onBeforeUpdate = createHook(
-  "bu"
-);
-const onUpdated = createHook("u");
-const onBeforeUnmount = createHook(
-  "bum"
-);
-const onUnmounted = createHook("um");
-const onServerPrefetch = createHook(
-  "sp"
-);
-const onRenderTriggered = createHook("rtg");
-const onRenderTracked = createHook("rtc");
-function onErrorCaptured(hook, target = currentInstance) {
-  injectHook("ec", hook, target);
-}
-
-const COMPONENTS = "components";
-const DIRECTIVES = "directives";
-function resolveComponent(name, maybeSelfReference) {
-  return resolveAsset(COMPONENTS, name, true, maybeSelfReference) || name;
-}
-const NULL_DYNAMIC_COMPONENT = Symbol.for("v-ndc");
-function resolveDynamicComponent(component) {
-  if (isString(component)) {
-    return resolveAsset(COMPONENTS, component, false) || component;
-  } else {
-    return component || NULL_DYNAMIC_COMPONENT;
-  }
-}
-function resolveDirective(name) {
-  return resolveAsset(DIRECTIVES, name);
-}
-function resolveAsset(type, name, warnMissing = true, maybeSelfReference = false) {
-  const instance = currentRenderingInstance || currentInstance;
-  if (instance) {
-    const Component = instance.type;
-    if (type === COMPONENTS) {
-      const selfName = getComponentName(
-        Component,
-        false
-      );
-      if (selfName && (selfName === name || selfName === camelize(name) || selfName === capitalize(camelize(name)))) {
-        return Component;
-      }
-    }
-    const res = (
-      // local registration
-      // check instance[type] first which is resolved for options API
-      resolve(instance[type] || Component[type], name) || // global registration
-      resolve(instance.appContext[type], name)
-    );
-    if (!res && maybeSelfReference) {
-      return Component;
-    }
-    if (warnMissing && !res) {
-      const extra = type === COMPONENTS ? `
-If this is a native custom element, make sure to exclude it from component resolution via compilerOptions.isCustomElement.` : ``;
-      warn$1(`Failed to resolve ${type.slice(0, -1)}: ${name}${extra}`);
-    }
-    return res;
-  } else {
-    warn$1(
-      `resolve${capitalize(type.slice(0, -1))} can only be used in render() or setup().`
-    );
-  }
-}
-function resolve(registry, name) {
-  return registry && (registry[name] || registry[camelize(name)] || registry[capitalize(camelize(name))]);
-}
-
-function renderList(source, renderItem, cache, index) {
-  let ret;
-  const cached = cache && cache[index];
-  const sourceIsArray = isArray(source);
-  if (sourceIsArray || isString(source)) {
-    const sourceIsReactiveArray = sourceIsArray && isReactive(source);
-    let needsWrap = false;
-    if (sourceIsReactiveArray) {
-      needsWrap = !isShallow(source);
-      source = shallowReadArray(source);
-    }
-    ret = new Array(source.length);
-    for (let i = 0, l = source.length; i < l; i++) {
-      ret[i] = renderItem(
-        needsWrap ? toReactive(source[i]) : source[i],
-        i,
-        void 0,
-        cached && cached[i]
-      );
-    }
-  } else if (typeof source === "number") {
-    if (!Number.isInteger(source)) {
-      warn$1(`The v-for range expect an integer value but got ${source}.`);
-    }
-    ret = new Array(source);
-    for (let i = 0; i < source; i++) {
-      ret[i] = renderItem(i + 1, i, void 0, cached && cached[i]);
-    }
-  } else if (isObject(source)) {
-    if (source[Symbol.iterator]) {
-      ret = Array.from(
-        source,
-        (item, i) => renderItem(item, i, void 0, cached && cached[i])
-      );
-    } else {
-      const keys = Object.keys(source);
-      ret = new Array(keys.length);
-      for (let i = 0, l = keys.length; i < l; i++) {
-        const key = keys[i];
-        ret[i] = renderItem(source[key], key, i, cached && cached[i]);
-      }
-    }
-  } else {
-    ret = [];
-  }
-  if (cache) {
-    cache[index] = ret;
-  }
-  return ret;
-}
-
-function createSlots(slots, dynamicSlots) {
-  for (let i = 0; i < dynamicSlots.length; i++) {
-    const slot = dynamicSlots[i];
-    if (isArray(slot)) {
-      for (let j = 0; j < slot.length; j++) {
-        slots[slot[j].name] = slot[j].fn;
-      }
-    } else if (slot) {
-      slots[slot.name] = slot.key ? (...args) => {
-        const res = slot.fn(...args);
-        if (res) res.key = slot.key;
-        return res;
-      } : slot.fn;
-    }
-  }
-  return slots;
-}
-
-function renderSlot(slots, name, props = {}, fallback, noSlotted) {
-  if (currentRenderingInstance.ce || currentRenderingInstance.parent && isAsyncWrapper(currentRenderingInstance.parent) && currentRenderingInstance.parent.ce) {
-    if (name !== "default") props.name = name;
-    return openBlock(), createBlock(
-      Fragment,
-      null,
-      [createVNode("slot", props, fallback && fallback())],
-      64
-    );
-  }
-  let slot = slots[name];
-  if (slot && slot.length > 1) {
-    warn$1(
-      `SSR-optimized slot function detected in a non-SSR-optimized render function. You need to mark this component with $dynamic-slots in the parent template.`
-    );
-    slot = () => [];
-  }
-  if (slot && slot._c) {
-    slot._d = false;
-  }
-  openBlock();
-  const validSlotContent = slot && ensureValidVNode(slot(props));
-  const slotKey = props.key || // slot content array of a dynamic conditional slot may have a branch
-  // key attached in the `createSlots` helper, respect that
-  validSlotContent && validSlotContent.key;
-  const rendered = createBlock(
-    Fragment,
-    {
-      key: (slotKey && !isSymbol(slotKey) ? slotKey : `_${name}`) + // #7256 force differentiate fallback content from actual content
-      (!validSlotContent && fallback ? "_fb" : "")
-    },
-    validSlotContent || (fallback ? fallback() : []),
-    validSlotContent && slots._ === 1 ? 64 : -2
-  );
-  if (!noSlotted && rendered.scopeId) {
-    rendered.slotScopeIds = [rendered.scopeId + "-s"];
-  }
-  if (slot && slot._c) {
-    slot._d = true;
-  }
-  return rendered;
-}
-function ensureValidVNode(vnodes) {
-  return vnodes.some((child) => {
-    if (!isVNode(child)) return true;
-    if (child.type === Comment) return false;
-    if (child.type === Fragment && !ensureValidVNode(child.children))
-      return false;
-    return true;
-  }) ? vnodes : null;
-}
-
-function toHandlers(obj, preserveCaseIfNecessary) {
-  const ret = {};
-  if (!isObject(obj)) {
-    warn$1(`v-on with no argument expects an object value.`);
-    return ret;
-  }
-  for (const key in obj) {
-    ret[preserveCaseIfNecessary && /[A-Z]/.test(key) ? `on:${key}` : toHandlerKey(key)] = obj[key];
-  }
-  return ret;
-}
-
-const getPublicInstance = (i) => {
-  if (!i) return null;
-  if (isStatefulComponent(i)) return getComponentPublicInstance(i);
-  return getPublicInstance(i.parent);
-};
-const publicPropertiesMap = (
-  // Move PURE marker to new line to workaround compiler discarding it
-  // due to type annotation
-  /* @__PURE__ */ extend(/* @__PURE__ */ Object.create(null), {
-    $: (i) => i,
-    $el: (i) => i.vnode.el,
-    $data: (i) => i.data,
-    $props: (i) => shallowReadonly(i.props) ,
-    $attrs: (i) => shallowReadonly(i.attrs) ,
-    $slots: (i) => shallowReadonly(i.slots) ,
-    $refs: (i) => shallowReadonly(i.refs) ,
-    $parent: (i) => getPublicInstance(i.parent),
-    $root: (i) => getPublicInstance(i.root),
-    $host: (i) => i.ce,
-    $emit: (i) => i.emit,
-    $options: (i) => resolveMergedOptions(i) ,
-    $forceUpdate: (i) => i.f || (i.f = () => {
-      queueJob(i.update);
-    }),
-    $nextTick: (i) => i.n || (i.n = nextTick.bind(i.proxy)),
-    $watch: (i) => instanceWatch.bind(i) 
-  })
-);
-const isReservedPrefix = (key) => key === "_" || key === "$";
-const hasSetupBinding = (state, key) => state !== EMPTY_OBJ && !state.__isScriptSetup && hasOwn(state, key);
-const PublicInstanceProxyHandlers = {
-  get({ _: instance }, key) {
-    if (key === "__v_skip") {
-      return true;
-    }
-    const { ctx, setupState, data, props, accessCache, type, appContext } = instance;
-    if (key === "__isVue") {
-      return true;
-    }
-    let normalizedProps;
-    if (key[0] !== "$") {
-      const n = accessCache[key];
-      if (n !== void 0) {
-        switch (n) {
-          case 1 /* SETUP */:
-            return setupState[key];
-          case 2 /* DATA */:
-            return data[key];
-          case 4 /* CONTEXT */:
-            return ctx[key];
-          case 3 /* PROPS */:
-            return props[key];
-        }
-      } else if (hasSetupBinding(setupState, key)) {
-        accessCache[key] = 1 /* SETUP */;
-        return setupState[key];
-      } else if (data !== EMPTY_OBJ && hasOwn(data, key)) {
-        accessCache[key] = 2 /* DATA */;
-        return data[key];
-      } else if (
-        // only cache other properties when instance has declared (thus stable)
-        // props
-        (normalizedProps = instance.propsOptions[0]) && hasOwn(normalizedProps, key)
-      ) {
-        accessCache[key] = 3 /* PROPS */;
-        return props[key];
-      } else if (ctx !== EMPTY_OBJ && hasOwn(ctx, key)) {
-        accessCache[key] = 4 /* CONTEXT */;
-        return ctx[key];
-      } else if (shouldCacheAccess) {
-        accessCache[key] = 0 /* OTHER */;
-      }
-    }
-    const publicGetter = publicPropertiesMap[key];
-    let cssModule, globalProperties;
-    if (publicGetter) {
-      if (key === "$attrs") {
-        track(instance.attrs, "get", "");
-        markAttrsAccessed();
-      } else if (key === "$slots") {
-        track(instance, "get", key);
-      }
-      return publicGetter(instance);
-    } else if (
-      // css module (injected by vue-loader)
-      (cssModule = type.__cssModules) && (cssModule = cssModule[key])
-    ) {
-      return cssModule;
-    } else if (ctx !== EMPTY_OBJ && hasOwn(ctx, key)) {
-      accessCache[key] = 4 /* CONTEXT */;
-      return ctx[key];
-    } else if (
-      // global properties
-      globalProperties = appContext.config.globalProperties, hasOwn(globalProperties, key)
-    ) {
-      {
-        return globalProperties[key];
-      }
-    } else if (currentRenderingInstance && (!isString(key) || // #1091 avoid internal isRef/isVNode checks on component instance leading
-    // to infinite warning loop
-    key.indexOf("__v") !== 0)) {
-      if (data !== EMPTY_OBJ && isReservedPrefix(key[0]) && hasOwn(data, key)) {
-        warn$1(
-          `Property ${JSON.stringify(
-            key
-          )} must be accessed via $data because it starts with a reserved character ("$" or "_") and is not proxied on the render context.`
-        );
-      } else if (instance === currentRenderingInstance) {
-        warn$1(
-          `Property ${JSON.stringify(key)} was accessed during render but is not defined on instance.`
-        );
-      }
-    }
-  },
-  set({ _: instance }, key, value) {
-    const { data, setupState, ctx } = instance;
-    if (hasSetupBinding(setupState, key)) {
-      setupState[key] = value;
-      return true;
-    } else if (setupState.__isScriptSetup && hasOwn(setupState, key)) {
-      warn$1(`Cannot mutate <script setup> binding "${key}" from Options API.`);
-      return false;
-    } else if (data !== EMPTY_OBJ && hasOwn(data, key)) {
-      data[key] = value;
-      return true;
-    } else if (hasOwn(instance.props, key)) {
-      warn$1(`Attempting to mutate prop "${key}". Props are readonly.`);
-      return false;
-    }
-    if (key[0] === "$" && key.slice(1) in instance) {
-      warn$1(
-        `Attempting to mutate public property "${key}". Properties starting with $ are reserved and readonly.`
-      );
-      return false;
-    } else {
-      if (key in instance.appContext.config.globalProperties) {
-        Object.defineProperty(ctx, key, {
-          enumerable: true,
-          configurable: true,
-          value
-        });
-      } else {
-        ctx[key] = value;
-      }
-    }
-    return true;
-  },
-  has({
-    _: { data, setupState, accessCache, ctx, appContext, propsOptions }
-  }, key) {
-    let normalizedProps;
-    return !!accessCache[key] || data !== EMPTY_OBJ && hasOwn(data, key) || hasSetupBinding(setupState, key) || (normalizedProps = propsOptions[0]) && hasOwn(normalizedProps, key) || hasOwn(ctx, key) || hasOwn(publicPropertiesMap, key) || hasOwn(appContext.config.globalProperties, key);
-  },
-  defineProperty(target, key, descriptor) {
-    if (descriptor.get != null) {
-      target._.accessCache[key] = 0;
-    } else if (hasOwn(descriptor, "value")) {
-      this.set(target, key, descriptor.value, null);
-    }
-    return Reflect.defineProperty(target, key, descriptor);
-  }
-};
-{
-  PublicInstanceProxyHandlers.ownKeys = (target) => {
-    warn$1(
-      `Avoid app logic that relies on enumerating keys on a component instance. The keys will be empty in production mode to avoid performance overhead.`
-    );
-    return Reflect.ownKeys(target);
-  };
-}
-const RuntimeCompiledPublicInstanceProxyHandlers = /* @__PURE__ */ extend({}, PublicInstanceProxyHandlers, {
-  get(target, key) {
-    if (key === Symbol.unscopables) {
-      return;
-    }
-    return PublicInstanceProxyHandlers.get(target, key, target);
-  },
-  has(_, key) {
-    const has = key[0] !== "_" && !isGloballyAllowed(key);
-    if (!has && PublicInstanceProxyHandlers.has(_, key)) {
-      warn$1(
-        `Property ${JSON.stringify(
-          key
-        )} should not start with _ which is a reserved prefix for Vue internals.`
-      );
-    }
-    return has;
-  }
-});
-function createDevRenderContext(instance) {
-  const target = {};
-  Object.defineProperty(target, `_`, {
-    configurable: true,
-    enumerable: false,
-    get: () => instance
-  });
-  Object.keys(publicPropertiesMap).forEach((key) => {
-    Object.defineProperty(target, key, {
-      configurable: true,
-      enumerable: false,
-      get: () => publicPropertiesMap[key](instance),
-      // intercepted by the proxy so no need for implementation,
-      // but needed to prevent set errors
-      set: NOOP
-    });
-  });
-  return target;
-}
-function exposePropsOnRenderContext(instance) {
-  const {
-    ctx,
-    propsOptions: [propsOptions]
-  } = instance;
-  if (propsOptions) {
-    Object.keys(propsOptions).forEach((key) => {
-      Object.defineProperty(ctx, key, {
-        enumerable: true,
-        configurable: true,
-        get: () => instance.props[key],
-        set: NOOP
-      });
-    });
-  }
-}
-function exposeSetupStateOnRenderContext(instance) {
-  const { ctx, setupState } = instance;
-  Object.keys(toRaw(setupState)).forEach((key) => {
-    if (!setupState.__isScriptSetup) {
-      if (isReservedPrefix(key[0])) {
-        warn$1(
-          `setup() return property ${JSON.stringify(
-            key
-          )} should not start with "$" or "_" which are reserved prefixes for Vue internals.`
-        );
-        return;
-      }
-      Object.defineProperty(ctx, key, {
-        enumerable: true,
-        configurable: true,
-        get: () => setupState[key],
-        set: NOOP
-      });
-    }
-  });
-}
-
-const warnRuntimeUsage = (method) => warn$1(
-  `${method}() is a compiler-hint helper that is only usable inside <script setup> of a single file component. Its arguments should be compiled away and passing it at runtime has no effect.`
-);
-function defineProps() {
-  {
-    warnRuntimeUsage(`defineProps`);
-  }
-  return null;
-}
-function defineEmits() {
-  {
-    warnRuntimeUsage(`defineEmits`);
-  }
-  return null;
-}
-function defineExpose(exposed) {
-  {
-    warnRuntimeUsage(`defineExpose`);
-  }
-}
-function defineOptions(options) {
-  {
-    warnRuntimeUsage(`defineOptions`);
-  }
-}
-function defineSlots() {
-  {
-    warnRuntimeUsage(`defineSlots`);
-  }
-  return null;
-}
-function defineModel() {
-  {
-    warnRuntimeUsage("defineModel");
-  }
-}
-function withDefaults(props, defaults) {
-  {
-    warnRuntimeUsage(`withDefaults`);
-  }
-  return null;
-}
-function useSlots() {
-  return getContext().slots;
-}
-function useAttrs() {
-  return getContext().attrs;
-}
-function getContext() {
-  const i = getCurrentInstance();
-  if (!i) {
-    warn$1(`useContext() called without active instance.`);
-  }
-  return i.setupContext || (i.setupContext = createSetupContext(i));
-}
-function normalizePropsOrEmits(props) {
-  return isArray(props) ? props.reduce(
-    (normalized, p) => (normalized[p] = null, normalized),
-    {}
-  ) : props;
-}
-function mergeDefaults(raw, defaults) {
-  const props = normalizePropsOrEmits(raw);
-  for (const key in defaults) {
-    if (key.startsWith("__skip")) continue;
-    let opt = props[key];
-    if (opt) {
-      if (isArray(opt) || isFunction(opt)) {
-        opt = props[key] = { type: opt, default: defaults[key] };
-      } else {
-        opt.default = defaults[key];
-      }
-    } else if (opt === null) {
-      opt = props[key] = { default: defaults[key] };
-    } else {
-      warn$1(`props default key "${key}" has no corresponding declaration.`);
-    }
-    if (opt && defaults[`__skip_${key}`]) {
-      opt.skipFactory = true;
-    }
-  }
-  return props;
-}
-function mergeModels(a, b) {
-  if (!a || !b) return a || b;
-  if (isArray(a) && isArray(b)) return a.concat(b);
-  return extend({}, normalizePropsOrEmits(a), normalizePropsOrEmits(b));
-}
-function createPropsRestProxy(props, excludedKeys) {
-  const ret = {};
-  for (const key in props) {
-    if (!excludedKeys.includes(key)) {
-      Object.defineProperty(ret, key, {
-        enumerable: true,
-        get: () => props[key]
-      });
-    }
-  }
-  return ret;
-}
-function withAsyncContext(getAwaitable) {
-  const ctx = getCurrentInstance();
-  if (!ctx) {
-    warn$1(
-      `withAsyncContext called without active current instance. This is likely a bug.`
-    );
-  }
-  let awaitable = getAwaitable();
-  unsetCurrentInstance();
-  if (isPromise(awaitable)) {
-    awaitable = awaitable.catch((e) => {
-      setCurrentInstance(ctx);
-      throw e;
-    });
-  }
-  return [awaitable, () => setCurrentInstance(ctx)];
-}
-
-function createDuplicateChecker() {
-  const cache = /* @__PURE__ */ Object.create(null);
-  return (type, key) => {
-    if (cache[key]) {
-      warn$1(`${type} property "${key}" is already defined in ${cache[key]}.`);
-    } else {
-      cache[key] = type;
-    }
-  };
-}
-let shouldCacheAccess = true;
-function applyOptions(instance) {
-  const options = resolveMergedOptions(instance);
-  const publicThis = instance.proxy;
-  const ctx = instance.ctx;
-  shouldCacheAccess = false;
-  if (options.beforeCreate) {
-    callHook$1(options.beforeCreate, instance, "bc");
-  }
-  const {
-    // state
-    data: dataOptions,
-    computed: computedOptions,
-    methods,
-    watch: watchOptions,
-    provide: provideOptions,
-    inject: injectOptions,
-    // lifecycle
-    created,
-    beforeMount,
-    mounted,
-    beforeUpdate,
-    updated,
-    activated,
-    deactivated,
-    beforeDestroy,
-    beforeUnmount,
-    destroyed,
-    unmounted,
-    render,
-    renderTracked,
-    renderTriggered,
-    errorCaptured,
-    serverPrefetch,
-    // public API
-    expose,
-    inheritAttrs,
-    // assets
-    components,
-    directives,
-    filters
-  } = options;
-  const checkDuplicateProperties = createDuplicateChecker() ;
-  {
-    const [propsOptions] = instance.propsOptions;
-    if (propsOptions) {
-      for (const key in propsOptions) {
-        checkDuplicateProperties("Props" /* PROPS */, key);
-      }
-    }
-  }
-  if (injectOptions) {
-    resolveInjections(injectOptions, ctx, checkDuplicateProperties);
-  }
-  if (methods) {
-    for (const key in methods) {
-      const methodHandler = methods[key];
-      if (isFunction(methodHandler)) {
-        {
-          Object.defineProperty(ctx, key, {
-            value: methodHandler.bind(publicThis),
-            configurable: true,
-            enumerable: true,
-            writable: true
-          });
-        }
-        {
-          checkDuplicateProperties("Methods" /* METHODS */, key);
-        }
-      } else {
-        warn$1(
-          `Method "${key}" has type "${typeof methodHandler}" in the component definition. Did you reference the function correctly?`
-        );
-      }
-    }
-  }
-  if (dataOptions) {
-    if (!isFunction(dataOptions)) {
-      warn$1(
-        `The data option must be a function. Plain object usage is no longer supported.`
-      );
-    }
-    const data = dataOptions.call(publicThis, publicThis);
-    if (isPromise(data)) {
-      warn$1(
-        `data() returned a Promise - note data() cannot be async; If you intend to perform data fetching before component renders, use async setup() + <Suspense>.`
-      );
-    }
-    if (!isObject(data)) {
-      warn$1(`data() should return an object.`);
-    } else {
-      instance.data = reactive(data);
-      {
-        for (const key in data) {
-          checkDuplicateProperties("Data" /* DATA */, key);
-          if (!isReservedPrefix(key[0])) {
-            Object.defineProperty(ctx, key, {
-              configurable: true,
-              enumerable: true,
-              get: () => data[key],
-              set: NOOP
-            });
-          }
-        }
-      }
-    }
-  }
-  shouldCacheAccess = true;
-  if (computedOptions) {
-    for (const key in computedOptions) {
-      const opt = computedOptions[key];
-      const get = isFunction(opt) ? opt.bind(publicThis, publicThis) : isFunction(opt.get) ? opt.get.bind(publicThis, publicThis) : NOOP;
-      if (get === NOOP) {
-        warn$1(`Computed property "${key}" has no getter.`);
-      }
-      const set = !isFunction(opt) && isFunction(opt.set) ? opt.set.bind(publicThis) : () => {
-        warn$1(
-          `Write operation failed: computed property "${key}" is readonly.`
-        );
-      } ;
-      const c = computed({
-        get,
-        set
-      });
-      Object.defineProperty(ctx, key, {
-        enumerable: true,
-        configurable: true,
-        get: () => c.value,
-        set: (v) => c.value = v
-      });
-      {
-        checkDuplicateProperties("Computed" /* COMPUTED */, key);
-      }
-    }
-  }
-  if (watchOptions) {
-    for (const key in watchOptions) {
-      createWatcher(watchOptions[key], ctx, publicThis, key);
-    }
-  }
-  if (provideOptions) {
-    const provides = isFunction(provideOptions) ? provideOptions.call(publicThis) : provideOptions;
-    Reflect.ownKeys(provides).forEach((key) => {
-      provide(key, provides[key]);
-    });
-  }
-  if (created) {
-    callHook$1(created, instance, "c");
-  }
-  function registerLifecycleHook(register, hook) {
-    if (isArray(hook)) {
-      hook.forEach((_hook) => register(_hook.bind(publicThis)));
-    } else if (hook) {
-      register(hook.bind(publicThis));
-    }
-  }
-  registerLifecycleHook(onBeforeMount, beforeMount);
-  registerLifecycleHook(onMounted, mounted);
-  registerLifecycleHook(onBeforeUpdate, beforeUpdate);
-  registerLifecycleHook(onUpdated, updated);
-  registerLifecycleHook(onActivated, activated);
-  registerLifecycleHook(onDeactivated, deactivated);
-  registerLifecycleHook(onErrorCaptured, errorCaptured);
-  registerLifecycleHook(onRenderTracked, renderTracked);
-  registerLifecycleHook(onRenderTriggered, renderTriggered);
-  registerLifecycleHook(onBeforeUnmount, beforeUnmount);
-  registerLifecycleHook(onUnmounted, unmounted);
-  registerLifecycleHook(onServerPrefetch, serverPrefetch);
-  if (isArray(expose)) {
-    if (expose.length) {
-      const exposed = instance.exposed || (instance.exposed = {});
-      expose.forEach((key) => {
-        Object.defineProperty(exposed, key, {
-          get: () => publicThis[key],
-          set: (val) => publicThis[key] = val
-        });
-      });
-    } else if (!instance.exposed) {
-      instance.exposed = {};
-    }
-  }
-  if (render && instance.render === NOOP) {
-    instance.render = render;
-  }
-  if (inheritAttrs != null) {
-    instance.inheritAttrs = inheritAttrs;
-  }
-  if (components) instance.components = components;
-  if (directives) instance.directives = directives;
-  if (serverPrefetch) {
-    markAsyncBoundary(instance);
-  }
-}
-function resolveInjections(injectOptions, ctx, checkDuplicateProperties = NOOP) {
-  if (isArray(injectOptions)) {
-    injectOptions = normalizeInject(injectOptions);
-  }
-  for (const key in injectOptions) {
-    const opt = injectOptions[key];
-    let injected;
-    if (isObject(opt)) {
-      if ("default" in opt) {
-        injected = inject(
-          opt.from || key,
-          opt.default,
-          true
-        );
-      } else {
-        injected = inject(opt.from || key);
-      }
-    } else {
-      injected = inject(opt);
-    }
-    if (isRef(injected)) {
-      Object.defineProperty(ctx, key, {
-        enumerable: true,
-        configurable: true,
-        get: () => injected.value,
-        set: (v) => injected.value = v
-      });
-    } else {
-      ctx[key] = injected;
-    }
-    {
-      checkDuplicateProperties("Inject" /* INJECT */, key);
-    }
-  }
-}
-function callHook$1(hook, instance, type) {
-  callWithAsyncErrorHandling(
-    isArray(hook) ? hook.map((h) => h.bind(instance.proxy)) : hook.bind(instance.proxy),
-    instance,
-    type
-  );
-}
-function createWatcher(raw, ctx, publicThis, key) {
-  let getter = key.includes(".") ? createPathGetter(publicThis, key) : () => publicThis[key];
-  if (isString(raw)) {
-    const handler = ctx[raw];
-    if (isFunction(handler)) {
-      {
-        watch(getter, handler);
-      }
-    } else {
-      warn$1(`Invalid watch handler specified by key "${raw}"`, handler);
-    }
-  } else if (isFunction(raw)) {
-    {
-      watch(getter, raw.bind(publicThis));
-    }
-  } else if (isObject(raw)) {
-    if (isArray(raw)) {
-      raw.forEach((r) => createWatcher(r, ctx, publicThis, key));
-    } else {
-      const handler = isFunction(raw.handler) ? raw.handler.bind(publicThis) : ctx[raw.handler];
-      if (isFunction(handler)) {
-        watch(getter, handler, raw);
-      } else {
-        warn$1(`Invalid watch handler specified by key "${raw.handler}"`, handler);
-      }
-    }
-  } else {
-    warn$1(`Invalid watch option: "${key}"`, raw);
-  }
-}
-function resolveMergedOptions(instance) {
-  const base = instance.type;
-  const { mixins, extends: extendsOptions } = base;
-  const {
-    mixins: globalMixins,
-    optionsCache: cache,
-    config: { optionMergeStrategies }
-  } = instance.appContext;
-  const cached = cache.get(base);
-  let resolved;
-  if (cached) {
-    resolved = cached;
-  } else if (!globalMixins.length && !mixins && !extendsOptions) {
-    {
-      resolved = base;
-    }
-  } else {
-    resolved = {};
-    if (globalMixins.length) {
-      globalMixins.forEach(
-        (m) => mergeOptions(resolved, m, optionMergeStrategies, true)
-      );
-    }
-    mergeOptions(resolved, base, optionMergeStrategies);
-  }
-  if (isObject(base)) {
-    cache.set(base, resolved);
-  }
-  return resolved;
-}
-function mergeOptions(to, from, strats, asMixin = false) {
-  const { mixins, extends: extendsOptions } = from;
-  if (extendsOptions) {
-    mergeOptions(to, extendsOptions, strats, true);
-  }
-  if (mixins) {
-    mixins.forEach(
-      (m) => mergeOptions(to, m, strats, true)
-    );
-  }
-  for (const key in from) {
-    if (asMixin && key === "expose") {
-      warn$1(
-        `"expose" option is ignored when declared in mixins or extends. It should only be declared in the base component itself.`
-      );
-    } else {
-      const strat = internalOptionMergeStrats[key] || strats && strats[key];
-      to[key] = strat ? strat(to[key], from[key]) : from[key];
-    }
-  }
-  return to;
-}
-const internalOptionMergeStrats = {
-  data: mergeDataFn,
-  props: mergeEmitsOrPropsOptions,
-  emits: mergeEmitsOrPropsOptions,
-  // objects
-  methods: mergeObjectOptions,
-  computed: mergeObjectOptions,
-  // lifecycle
-  beforeCreate: mergeAsArray$1,
-  created: mergeAsArray$1,
-  beforeMount: mergeAsArray$1,
-  mounted: mergeAsArray$1,
-  beforeUpdate: mergeAsArray$1,
-  updated: mergeAsArray$1,
-  beforeDestroy: mergeAsArray$1,
-  beforeUnmount: mergeAsArray$1,
-  destroyed: mergeAsArray$1,
-  unmounted: mergeAsArray$1,
-  activated: mergeAsArray$1,
-  deactivated: mergeAsArray$1,
-  errorCaptured: mergeAsArray$1,
-  serverPrefetch: mergeAsArray$1,
-  // assets
-  components: mergeObjectOptions,
-  directives: mergeObjectOptions,
-  // watch
-  watch: mergeWatchOptions,
-  // provide / inject
-  provide: mergeDataFn,
-  inject: mergeInject
-};
-function mergeDataFn(to, from) {
-  if (!from) {
-    return to;
-  }
-  if (!to) {
-    return from;
-  }
-  return function mergedDataFn() {
-    return (extend)(
-      isFunction(to) ? to.call(this, this) : to,
-      isFunction(from) ? from.call(this, this) : from
-    );
-  };
-}
-function mergeInject(to, from) {
-  return mergeObjectOptions(normalizeInject(to), normalizeInject(from));
-}
-function normalizeInject(raw) {
-  if (isArray(raw)) {
-    const res = {};
-    for (let i = 0; i < raw.length; i++) {
-      res[raw[i]] = raw[i];
-    }
-    return res;
-  }
-  return raw;
-}
-function mergeAsArray$1(to, from) {
-  return to ? [...new Set([].concat(to, from))] : from;
-}
-function mergeObjectOptions(to, from) {
-  return to ? extend(/* @__PURE__ */ Object.create(null), to, from) : from;
-}
-function mergeEmitsOrPropsOptions(to, from) {
-  if (to) {
-    if (isArray(to) && isArray(from)) {
-      return [.../* @__PURE__ */ new Set([...to, ...from])];
-    }
-    return extend(
-      /* @__PURE__ */ Object.create(null),
-      normalizePropsOrEmits(to),
-      normalizePropsOrEmits(from != null ? from : {})
-    );
-  } else {
-    return from;
-  }
-}
-function mergeWatchOptions(to, from) {
-  if (!to) return from;
-  if (!from) return to;
-  const merged = extend(/* @__PURE__ */ Object.create(null), to);
-  for (const key in from) {
-    merged[key] = mergeAsArray$1(to[key], from[key]);
-  }
-  return merged;
-}
-
-function createAppContext() {
-  return {
-    app: null,
-    config: {
-      isNativeTag: NO,
-      performance: false,
-      globalProperties: {},
-      optionMergeStrategies: {},
-      errorHandler: void 0,
-      warnHandler: void 0,
-      compilerOptions: {}
-    },
-    mixins: [],
-    components: {},
-    directives: {},
-    provides: /* @__PURE__ */ Object.create(null),
-    optionsCache: /* @__PURE__ */ new WeakMap(),
-    propsCache: /* @__PURE__ */ new WeakMap(),
-    emitsCache: /* @__PURE__ */ new WeakMap()
-  };
-}
-let uid$1 = 0;
-function createAppAPI(render, hydrate) {
-  return function createApp(rootComponent, rootProps = null) {
-    if (!isFunction(rootComponent)) {
-      rootComponent = extend({}, rootComponent);
-    }
-    if (rootProps != null && !isObject(rootProps)) {
-      warn$1(`root props passed to app.mount() must be an object.`);
-      rootProps = null;
-    }
-    const context = createAppContext();
-    const installedPlugins = /* @__PURE__ */ new WeakSet();
-    const pluginCleanupFns = [];
-    let isMounted = false;
-    const app = context.app = {
-      _uid: uid$1++,
-      _component: rootComponent,
-      _props: rootProps,
-      _container: null,
-      _context: context,
-      _instance: null,
-      version,
-      get config() {
-        return context.config;
-      },
-      set config(v) {
-        {
-          warn$1(
-            `app.config cannot be replaced. Modify individual options instead.`
-          );
-        }
-      },
-      use(plugin, ...options) {
-        if (installedPlugins.has(plugin)) {
-          warn$1(`Plugin has already been applied to target app.`);
-        } else if (plugin && isFunction(plugin.install)) {
-          installedPlugins.add(plugin);
-          plugin.install(app, ...options);
-        } else if (isFunction(plugin)) {
-          installedPlugins.add(plugin);
-          plugin(app, ...options);
-        } else {
-          warn$1(
-            `A plugin must either be a function or an object with an "install" function.`
-          );
-        }
-        return app;
-      },
-      mixin(mixin) {
-        {
-          if (!context.mixins.includes(mixin)) {
-            context.mixins.push(mixin);
-          } else {
-            warn$1(
-              "Mixin has already been applied to target app" + (mixin.name ? `: ${mixin.name}` : "")
-            );
-          }
-        }
-        return app;
-      },
-      component(name, component) {
-        {
-          validateComponentName(name, context.config);
-        }
-        if (!component) {
-          return context.components[name];
-        }
-        if (context.components[name]) {
-          warn$1(`Component "${name}" has already been registered in target app.`);
-        }
-        context.components[name] = component;
-        return app;
-      },
-      directive(name, directive) {
-        {
-          validateDirectiveName(name);
-        }
-        if (!directive) {
-          return context.directives[name];
-        }
-        if (context.directives[name]) {
-          warn$1(`Directive "${name}" has already been registered in target app.`);
-        }
-        context.directives[name] = directive;
-        return app;
-      },
-      mount(rootContainer, isHydrate, namespace) {
-        if (!isMounted) {
-          if (rootContainer.__vue_app__) {
-            warn$1(
-              `There is already an app instance mounted on the host container.
- If you want to mount another app on the same host container, you need to unmount the previous app by calling \`app.unmount()\` first.`
-            );
-          }
-          const vnode = app._ceVNode || createVNode(rootComponent, rootProps);
-          vnode.appContext = context;
-          if (namespace === true) {
-            namespace = "svg";
-          } else if (namespace === false) {
-            namespace = void 0;
-          }
-          {
-            context.reload = () => {
-              render(
-                cloneVNode(vnode),
-                rootContainer,
-                namespace
-              );
-            };
-          }
-          if (isHydrate && hydrate) {
-            hydrate(vnode, rootContainer);
-          } else {
-            render(vnode, rootContainer, namespace);
-          }
-          isMounted = true;
-          app._container = rootContainer;
-          rootContainer.__vue_app__ = app;
-          {
-            app._instance = vnode.component;
-            devtoolsInitApp(app, version);
-          }
-          return getComponentPublicInstance(vnode.component);
-        } else {
-          warn$1(
-            `App has already been mounted.
-If you want to remount the same app, move your app creation logic into a factory function and create fresh app instances for each mount - e.g. \`const createMyApp = () => createApp(App)\``
-          );
-        }
-      },
-      onUnmount(cleanupFn) {
-        if (typeof cleanupFn !== "function") {
-          warn$1(
-            `Expected function as first argument to app.onUnmount(), but got ${typeof cleanupFn}`
-          );
-        }
-        pluginCleanupFns.push(cleanupFn);
-      },
-      unmount() {
-        if (isMounted) {
-          callWithAsyncErrorHandling(
-            pluginCleanupFns,
-            app._instance,
-            16
-          );
-          render(null, app._container);
-          {
-            app._instance = null;
-            devtoolsUnmountApp(app);
-          }
-          delete app._container.__vue_app__;
-        } else {
-          warn$1(`Cannot unmount an app that is not mounted.`);
-        }
-      },
-      provide(key, value) {
-        if (key in context.provides) {
-          warn$1(
-            `App already provides property with key "${String(key)}". It will be overwritten with the new value.`
-          );
-        }
-        context.provides[key] = value;
-        return app;
-      },
-      runWithContext(fn) {
-        const lastApp = currentApp;
-        currentApp = app;
-        try {
-          return fn();
-        } finally {
-          currentApp = lastApp;
-        }
-      }
-    };
-    return app;
-  };
-}
-let currentApp = null;
-
-function provide(key, value) {
-  if (!currentInstance) {
-    {
-      warn$1(`provide() can only be used inside setup().`);
-    }
-  } else {
-    let provides = currentInstance.provides;
-    const parentProvides = currentInstance.parent && currentInstance.parent.provides;
-    if (parentProvides === provides) {
-      provides = currentInstance.provides = Object.create(parentProvides);
-    }
-    provides[key] = value;
-  }
-}
-function inject(key, defaultValue, treatDefaultAsFactory = false) {
-  const instance = currentInstance || currentRenderingInstance;
-  if (instance || currentApp) {
-    const provides = currentApp ? currentApp._context.provides : instance ? instance.parent == null ? instance.vnode.appContext && instance.vnode.appContext.provides : instance.parent.provides : void 0;
-    if (provides && key in provides) {
-      return provides[key];
-    } else if (arguments.length > 1) {
-      return treatDefaultAsFactory && isFunction(defaultValue) ? defaultValue.call(instance && instance.proxy) : defaultValue;
-    } else {
-      warn$1(`injection "${String(key)}" not found.`);
-    }
-  } else {
-    warn$1(`inject() can only be used inside setup() or functional components.`);
-  }
-}
-function hasInjectionContext() {
-  return !!(currentInstance || currentRenderingInstance || currentApp);
-}
-
-const internalObjectProto = {};
-const createInternalObject = () => Object.create(internalObjectProto);
-const isInternalObject = (obj) => Object.getPrototypeOf(obj) === internalObjectProto;
-
-function initProps(instance, rawProps, isStateful, isSSR = false) {
-  const props = {};
-  const attrs = createInternalObject();
-  instance.propsDefaults = /* @__PURE__ */ Object.create(null);
-  setFullProps(instance, rawProps, props, attrs);
-  for (const key in instance.propsOptions[0]) {
-    if (!(key in props)) {
-      props[key] = void 0;
-    }
-  }
-  {
-    validateProps(rawProps || {}, props, instance);
-  }
-  if (isStateful) {
-    instance.props = isSSR ? props : shallowReactive(props);
-  } else {
-    if (!instance.type.props) {
-      instance.props = attrs;
-    } else {
-      instance.props = props;
-    }
-  }
-  instance.attrs = attrs;
-}
-function isInHmrContext(instance) {
-  while (instance) {
-    if (instance.type.__hmrId) return true;
-    instance = instance.parent;
-  }
-}
-function updateProps(instance, rawProps, rawPrevProps, optimized) {
-  const {
-    props,
-    attrs,
-    vnode: { patchFlag }
-  } = instance;
-  const rawCurrentProps = toRaw(props);
-  const [options] = instance.propsOptions;
-  let hasAttrsChanged = false;
-  if (
-    // always force full diff in dev
-    // - #1942 if hmr is enabled with sfc component
-    // - vite#872 non-sfc component used by sfc component
-    !isInHmrContext(instance) && (optimized || patchFlag > 0) && !(patchFlag & 16)
-  ) {
-    if (patchFlag & 8) {
-      const propsToUpdate = instance.vnode.dynamicProps;
-      for (let i = 0; i < propsToUpdate.length; i++) {
-        let key = propsToUpdate[i];
-        if (isEmitListener(instance.emitsOptions, key)) {
-          continue;
-        }
-        const value = rawProps[key];
-        if (options) {
-          if (hasOwn(attrs, key)) {
-            if (value !== attrs[key]) {
-              attrs[key] = value;
-              hasAttrsChanged = true;
-            }
-          } else {
-            const camelizedKey = camelize(key);
-            props[camelizedKey] = resolvePropValue(
-              options,
-              rawCurrentProps,
-              camelizedKey,
-              value,
-              instance,
-              false
-            );
-          }
-        } else {
-          if (value !== attrs[key]) {
-            attrs[key] = value;
-            hasAttrsChanged = true;
-          }
-        }
-      }
-    }
-  } else {
-    if (setFullProps(instance, rawProps, props, attrs)) {
-      hasAttrsChanged = true;
-    }
-    let kebabKey;
-    for (const key in rawCurrentProps) {
-      if (!rawProps || // for camelCase
-      !hasOwn(rawProps, key) && // it's possible the original props was passed in as kebab-case
-      // and converted to camelCase (#955)
-      ((kebabKey = hyphenate(key)) === key || !hasOwn(rawProps, kebabKey))) {
-        if (options) {
-          if (rawPrevProps && // for camelCase
-          (rawPrevProps[key] !== void 0 || // for kebab-case
-          rawPrevProps[kebabKey] !== void 0)) {
-            props[key] = resolvePropValue(
-              options,
-              rawCurrentProps,
-              key,
-              void 0,
-              instance,
-              true
-            );
-          }
-        } else {
-          delete props[key];
-        }
-      }
-    }
-    if (attrs !== rawCurrentProps) {
-      for (const key in attrs) {
-        if (!rawProps || !hasOwn(rawProps, key) && true) {
-          delete attrs[key];
-          hasAttrsChanged = true;
-        }
-      }
-    }
-  }
-  if (hasAttrsChanged) {
-    trigger(instance.attrs, "set", "");
-  }
-  {
-    validateProps(rawProps || {}, props, instance);
-  }
-}
-function setFullProps(instance, rawProps, props, attrs) {
-  const [options, needCastKeys] = instance.propsOptions;
-  let hasAttrsChanged = false;
-  let rawCastValues;
-  if (rawProps) {
-    for (let key in rawProps) {
-      if (isReservedProp(key)) {
-        continue;
-      }
-      const value = rawProps[key];
-      let camelKey;
-      if (options && hasOwn(options, camelKey = camelize(key))) {
-        if (!needCastKeys || !needCastKeys.includes(camelKey)) {
-          props[camelKey] = value;
-        } else {
-          (rawCastValues || (rawCastValues = {}))[camelKey] = value;
-        }
-      } else if (!isEmitListener(instance.emitsOptions, key)) {
-        if (!(key in attrs) || value !== attrs[key]) {
-          attrs[key] = value;
-          hasAttrsChanged = true;
-        }
-      }
-    }
-  }
-  if (needCastKeys) {
-    const rawCurrentProps = toRaw(props);
-    const castValues = rawCastValues || EMPTY_OBJ;
-    for (let i = 0; i < needCastKeys.length; i++) {
-      const key = needCastKeys[i];
-      props[key] = resolvePropValue(
-        options,
-        rawCurrentProps,
-        key,
-        castValues[key],
-        instance,
-        !hasOwn(castValues, key)
-      );
-    }
-  }
-  return hasAttrsChanged;
-}
-function resolvePropValue(options, props, key, value, instance, isAbsent) {
-  const opt = options[key];
-  if (opt != null) {
-    const hasDefault = hasOwn(opt, "default");
-    if (hasDefault && value === void 0) {
-      const defaultValue = opt.default;
-      if (opt.type !== Function && !opt.skipFactory && isFunction(defaultValue)) {
-        const { propsDefaults } = instance;
-        if (key in propsDefaults) {
-          value = propsDefaults[key];
-        } else {
-          const reset = setCurrentInstance(instance);
-          value = propsDefaults[key] = defaultValue.call(
-            null,
-            props
-          );
-          reset();
-        }
-      } else {
-        value = defaultValue;
-      }
-      if (instance.ce) {
-        instance.ce._setProp(key, value);
-      }
-    }
-    if (opt[0 /* shouldCast */]) {
-      if (isAbsent && !hasDefault) {
-        value = false;
-      } else if (opt[1 /* shouldCastTrue */] && (value === "" || value === hyphenate(key))) {
-        value = true;
-      }
-    }
-  }
-  return value;
-}
-const mixinPropsCache = /* @__PURE__ */ new WeakMap();
-function normalizePropsOptions(comp, appContext, asMixin = false) {
-  const cache = asMixin ? mixinPropsCache : appContext.propsCache;
-  const cached = cache.get(comp);
-  if (cached) {
-    return cached;
-  }
-  const raw = comp.props;
-  const normalized = {};
-  const needCastKeys = [];
-  let hasExtends = false;
-  if (!isFunction(comp)) {
-    const extendProps = (raw2) => {
-      hasExtends = true;
-      const [props, keys] = normalizePropsOptions(raw2, appContext, true);
-      extend(normalized, props);
-      if (keys) needCastKeys.push(...keys);
-    };
-    if (!asMixin && appContext.mixins.length) {
-      appContext.mixins.forEach(extendProps);
-    }
-    if (comp.extends) {
-      extendProps(comp.extends);
-    }
-    if (comp.mixins) {
-      comp.mixins.forEach(extendProps);
-    }
-  }
-  if (!raw && !hasExtends) {
-    if (isObject(comp)) {
-      cache.set(comp, EMPTY_ARR);
-    }
-    return EMPTY_ARR;
-  }
-  if (isArray(raw)) {
-    for (let i = 0; i < raw.length; i++) {
-      if (!isString(raw[i])) {
-        warn$1(`props must be strings when using array syntax.`, raw[i]);
-      }
-      const normalizedKey = camelize(raw[i]);
-      if (validatePropName(normalizedKey)) {
-        normalized[normalizedKey] = EMPTY_OBJ;
-      }
-    }
-  } else if (raw) {
-    if (!isObject(raw)) {
-      warn$1(`invalid props options`, raw);
-    }
-    for (const key in raw) {
-      const normalizedKey = camelize(key);
-      if (validatePropName(normalizedKey)) {
-        const opt = raw[key];
-        const prop = normalized[normalizedKey] = isArray(opt) || isFunction(opt) ? { type: opt } : extend({}, opt);
-        const propType = prop.type;
-        let shouldCast = false;
-        let shouldCastTrue = true;
-        if (isArray(propType)) {
-          for (let index = 0; index < propType.length; ++index) {
-            const type = propType[index];
-            const typeName = isFunction(type) && type.name;
-            if (typeName === "Boolean") {
-              shouldCast = true;
-              break;
-            } else if (typeName === "String") {
-              shouldCastTrue = false;
-            }
-          }
-        } else {
-          shouldCast = isFunction(propType) && propType.name === "Boolean";
-        }
-        prop[0 /* shouldCast */] = shouldCast;
-        prop[1 /* shouldCastTrue */] = shouldCastTrue;
-        if (shouldCast || hasOwn(prop, "default")) {
-          needCastKeys.push(normalizedKey);
-        }
-      }
-    }
-  }
-  const res = [normalized, needCastKeys];
-  if (isObject(comp)) {
-    cache.set(comp, res);
-  }
-  return res;
-}
-function validatePropName(key) {
-  if (key[0] !== "$" && !isReservedProp(key)) {
-    return true;
-  } else {
-    warn$1(`Invalid prop name: "${key}" is a reserved property.`);
-  }
-  return false;
-}
-function getType(ctor) {
-  if (ctor === null) {
-    return "null";
-  }
-  if (typeof ctor === "function") {
-    return ctor.name || "";
-  } else if (typeof ctor === "object") {
-    const name = ctor.constructor && ctor.constructor.name;
-    return name || "";
-  }
-  return "";
-}
-function validateProps(rawProps, props, instance) {
-  const resolvedValues = toRaw(props);
-  const options = instance.propsOptions[0];
-  const camelizePropsKey = Object.keys(rawProps).map((key) => camelize(key));
-  for (const key in options) {
-    let opt = options[key];
-    if (opt == null) continue;
-    validateProp(
-      key,
-      resolvedValues[key],
-      opt,
-      shallowReadonly(resolvedValues) ,
-      !camelizePropsKey.includes(key)
-    );
-  }
-}
-function validateProp(name, value, prop, props, isAbsent) {
-  const { type, required, validator, skipCheck } = prop;
-  if (required && isAbsent) {
-    warn$1('Missing required prop: "' + name + '"');
-    return;
-  }
-  if (value == null && !required) {
-    return;
-  }
-  if (type != null && type !== true && !skipCheck) {
-    let isValid = false;
-    const types = isArray(type) ? type : [type];
-    const expectedTypes = [];
-    for (let i = 0; i < types.length && !isValid; i++) {
-      const { valid, expectedType } = assertType(value, types[i]);
-      expectedTypes.push(expectedType || "");
-      isValid = valid;
-    }
-    if (!isValid) {
-      warn$1(getInvalidTypeMessage(name, value, expectedTypes));
-      return;
-    }
-  }
-  if (validator && !validator(value, props)) {
-    warn$1('Invalid prop: custom validator check failed for prop "' + name + '".');
-  }
-}
-const isSimpleType = /* @__PURE__ */ makeMap(
-  "String,Number,Boolean,Function,Symbol,BigInt"
-);
-function assertType(value, type) {
-  let valid;
-  const expectedType = getType(type);
-  if (expectedType === "null") {
-    valid = value === null;
-  } else if (isSimpleType(expectedType)) {
-    const t = typeof value;
-    valid = t === expectedType.toLowerCase();
-    if (!valid && t === "object") {
-      valid = value instanceof type;
-    }
-  } else if (expectedType === "Object") {
-    valid = isObject(value);
-  } else if (expectedType === "Array") {
-    valid = isArray(value);
-  } else {
-    valid = value instanceof type;
-  }
-  return {
-    valid,
-    expectedType
-  };
-}
-function getInvalidTypeMessage(name, value, expectedTypes) {
-  if (expectedTypes.length === 0) {
-    return `Prop type [] for prop "${name}" won't match anything. Did you mean to use type Array instead?`;
-  }
-  let message = `Invalid prop: type check failed for prop "${name}". Expected ${expectedTypes.map(capitalize).join(" | ")}`;
-  const expectedType = expectedTypes[0];
-  const receivedType = toRawType(value);
-  const expectedValue = styleValue(value, expectedType);
-  const receivedValue = styleValue(value, receivedType);
-  if (expectedTypes.length === 1 && isExplicable(expectedType) && !isBoolean(expectedType, receivedType)) {
-    message += ` with value ${expectedValue}`;
-  }
-  message += `, got ${receivedType} `;
-  if (isExplicable(receivedType)) {
-    message += `with value ${receivedValue}.`;
-  }
-  return message;
-}
-function styleValue(value, type) {
-  if (type === "String") {
-    return `"${value}"`;
-  } else if (type === "Number") {
-    return `${Number(value)}`;
-  } else {
-    return `${value}`;
-  }
-}
-function isExplicable(type) {
-  const explicitTypes = ["string", "number", "boolean"];
-  return explicitTypes.some((elem) => type.toLowerCase() === elem);
-}
-function isBoolean(...args) {
-  return args.some((elem) => elem.toLowerCase() === "boolean");
-}
-
-const isInternalKey = (key) => key[0] === "_" || key === "$stable";
-const normalizeSlotValue = (value) => isArray(value) ? value.map(normalizeVNode) : [normalizeVNode(value)];
-const normalizeSlot = (key, rawSlot, ctx) => {
-  if (rawSlot._n) {
-    return rawSlot;
-  }
-  const normalized = withCtx((...args) => {
-    if (currentInstance && (!ctx || ctx.root === currentInstance.root)) {
-      warn$1(
-        `Slot "${key}" invoked outside of the render function: this will not track dependencies used in the slot. Invoke the slot function inside the render function instead.`
-      );
-    }
-    return normalizeSlotValue(rawSlot(...args));
-  }, ctx);
-  normalized._c = false;
-  return normalized;
-};
-const normalizeObjectSlots = (rawSlots, slots, instance) => {
-  const ctx = rawSlots._ctx;
-  for (const key in rawSlots) {
-    if (isInternalKey(key)) continue;
-    const value = rawSlots[key];
-    if (isFunction(value)) {
-      slots[key] = normalizeSlot(key, value, ctx);
-    } else if (value != null) {
-      {
-        warn$1(
-          `Non-function value encountered for slot "${key}". Prefer function slots for better performance.`
-        );
-      }
-      const normalized = normalizeSlotValue(value);
-      slots[key] = () => normalized;
-    }
-  }
-};
-const normalizeVNodeSlots = (instance, children) => {
-  if (!isKeepAlive(instance.vnode) && true) {
-    warn$1(
-      `Non-function value encountered for default slot. Prefer function slots for better performance.`
-    );
-  }
-  const normalized = normalizeSlotValue(children);
-  instance.slots.default = () => normalized;
-};
-const assignSlots = (slots, children, optimized) => {
-  for (const key in children) {
-    if (optimized || key !== "_") {
-      slots[key] = children[key];
-    }
-  }
-};
-const initSlots = (instance, children, optimized) => {
-  const slots = instance.slots = createInternalObject();
-  if (instance.vnode.shapeFlag & 32) {
-    const type = children._;
-    if (type) {
-      assignSlots(slots, children, optimized);
-      if (optimized) {
-        def(slots, "_", type, true);
-      }
-    } else {
-      normalizeObjectSlots(children, slots);
-    }
-  } else if (children) {
-    normalizeVNodeSlots(instance, children);
-  }
-};
-const updateSlots = (instance, children, optimized) => {
-  const { vnode, slots } = instance;
-  let needDeletionCheck = true;
-  let deletionComparisonTarget = EMPTY_OBJ;
-  if (vnode.shapeFlag & 32) {
-    const type = children._;
-    if (type) {
-      if (isHmrUpdating) {
-        assignSlots(slots, children, optimized);
-        trigger(instance, "set", "$slots");
-      } else if (optimized && type === 1) {
-        needDeletionCheck = false;
-      } else {
-        assignSlots(slots, children, optimized);
-      }
-    } else {
-      needDeletionCheck = !children.$stable;
-      normalizeObjectSlots(children, slots);
-    }
-    deletionComparisonTarget = children;
-  } else if (children) {
-    normalizeVNodeSlots(instance, children);
-    deletionComparisonTarget = { default: 1 };
-  }
-  if (needDeletionCheck) {
-    for (const key in slots) {
-      if (!isInternalKey(key) && deletionComparisonTarget[key] == null) {
-        delete slots[key];
-      }
-    }
-  }
-};
-
-let supported;
-let perf;
-function startMeasure(instance, type) {
-  if (instance.appContext.config.performance && isSupported()) {
-    perf.mark(`vue-${type}-${instance.uid}`);
-  }
-  {
-    devtoolsPerfStart(instance, type, isSupported() ? perf.now() : Date.now());
-  }
-}
-function endMeasure(instance, type) {
-  if (instance.appContext.config.performance && isSupported()) {
-    const startTag = `vue-${type}-${instance.uid}`;
-    const endTag = startTag + `:end`;
-    perf.mark(endTag);
-    perf.measure(
-      `<${formatComponentName(instance, instance.type)}> ${type}`,
-      startTag,
-      endTag
-    );
-    perf.clearMarks(startTag);
-    perf.clearMarks(endTag);
-  }
-  {
-    devtoolsPerfEnd(instance, type, isSupported() ? perf.now() : Date.now());
-  }
-}
-function isSupported() {
-  if (supported !== void 0) {
-    return supported;
-  }
-  if (typeof window !== "undefined" && window.performance) {
-    supported = true;
-    perf = window.performance;
-  } else {
-    supported = false;
-  }
-  return supported;
-}
-
-const queuePostRenderEffect = queueEffectWithSuspense ;
-function createRenderer(options) {
-  return baseCreateRenderer(options);
-}
-function createHydrationRenderer(options) {
-  return baseCreateRenderer(options, createHydrationFunctions);
-}
-function baseCreateRenderer(options, createHydrationFns) {
-  const target = getGlobalThis();
-  target.__VUE__ = true;
-  {
-    setDevtoolsHook$1(target.__VUE_DEVTOOLS_GLOBAL_HOOK__, target);
-  }
-  const {
-    insert: hostInsert,
-    remove: hostRemove,
-    patchProp: hostPatchProp,
-    createElement: hostCreateElement,
-    createText: hostCreateText,
-    createComment: hostCreateComment,
-    setText: hostSetText,
-    setElementText: hostSetElementText,
-    parentNode: hostParentNode,
-    nextSibling: hostNextSibling,
-    setScopeId: hostSetScopeId = NOOP,
-    insertStaticContent: hostInsertStaticContent
-  } = options;
-  const patch = (n1, n2, container, anchor = null, parentComponent = null, parentSuspense = null, namespace = void 0, slotScopeIds = null, optimized = isHmrUpdating ? false : !!n2.dynamicChildren) => {
-    if (n1 === n2) {
-      return;
-    }
-    if (n1 && !isSameVNodeType(n1, n2)) {
-      anchor = getNextHostNode(n1);
-      unmount(n1, parentComponent, parentSuspense, true);
-      n1 = null;
-    }
-    if (n2.patchFlag === -2) {
-      optimized = false;
-      n2.dynamicChildren = null;
-    }
-    const { type, ref, shapeFlag } = n2;
-    switch (type) {
-      case Text:
-        processText(n1, n2, container, anchor);
-        break;
-      case Comment:
-        processCommentNode(n1, n2, container, anchor);
-        break;
-      case Static:
-        if (n1 == null) {
-          mountStaticNode(n2, container, anchor, namespace);
-        } else {
-          patchStaticNode(n1, n2, container, namespace);
-        }
-        break;
-      case Fragment:
-        processFragment(
-          n1,
-          n2,
-          container,
-          anchor,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-        break;
-      default:
-        if (shapeFlag & 1) {
-          processElement(
-            n1,
-            n2,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-        } else if (shapeFlag & 6) {
-          processComponent(
-            n1,
-            n2,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-        } else if (shapeFlag & 64) {
-          type.process(
-            n1,
-            n2,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized,
-            internals
-          );
-        } else if (shapeFlag & 128) {
-          type.process(
-            n1,
-            n2,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized,
-            internals
-          );
-        } else {
-          warn$1("Invalid VNode type:", type, `(${typeof type})`);
-        }
-    }
-    if (ref != null && parentComponent) {
-      setRef(ref, n1 && n1.ref, parentSuspense, n2 || n1, !n2);
-    }
-  };
-  const processText = (n1, n2, container, anchor) => {
-    if (n1 == null) {
-      hostInsert(
-        n2.el = hostCreateText(n2.children),
-        container,
-        anchor
-      );
-    } else {
-      const el = n2.el = n1.el;
-      if (n2.children !== n1.children) {
-        hostSetText(el, n2.children);
-      }
-    }
-  };
-  const processCommentNode = (n1, n2, container, anchor) => {
-    if (n1 == null) {
-      hostInsert(
-        n2.el = hostCreateComment(n2.children || ""),
-        container,
-        anchor
-      );
-    } else {
-      n2.el = n1.el;
-    }
-  };
-  const mountStaticNode = (n2, container, anchor, namespace) => {
-    [n2.el, n2.anchor] = hostInsertStaticContent(
-      n2.children,
-      container,
-      anchor,
-      namespace,
-      n2.el,
-      n2.anchor
-    );
-  };
-  const patchStaticNode = (n1, n2, container, namespace) => {
-    if (n2.children !== n1.children) {
-      const anchor = hostNextSibling(n1.anchor);
-      removeStaticNode(n1);
-      [n2.el, n2.anchor] = hostInsertStaticContent(
-        n2.children,
-        container,
-        anchor,
-        namespace
-      );
-    } else {
-      n2.el = n1.el;
-      n2.anchor = n1.anchor;
-    }
-  };
-  const moveStaticNode = ({ el, anchor }, container, nextSibling) => {
-    let next;
-    while (el && el !== anchor) {
-      next = hostNextSibling(el);
-      hostInsert(el, container, nextSibling);
-      el = next;
-    }
-    hostInsert(anchor, container, nextSibling);
-  };
-  const removeStaticNode = ({ el, anchor }) => {
-    let next;
-    while (el && el !== anchor) {
-      next = hostNextSibling(el);
-      hostRemove(el);
-      el = next;
-    }
-    hostRemove(anchor);
-  };
-  const processElement = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    if (n2.type === "svg") {
-      namespace = "svg";
-    } else if (n2.type === "math") {
-      namespace = "mathml";
-    }
-    if (n1 == null) {
-      mountElement(
-        n2,
-        container,
-        anchor,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-    } else {
-      patchElement(
-        n1,
-        n2,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-    }
-  };
-  const mountElement = (vnode, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    let el;
-    let vnodeHook;
-    const { props, shapeFlag, transition, dirs } = vnode;
-    el = vnode.el = hostCreateElement(
-      vnode.type,
-      namespace,
-      props && props.is,
-      props
-    );
-    if (shapeFlag & 8) {
-      hostSetElementText(el, vnode.children);
-    } else if (shapeFlag & 16) {
-      mountChildren(
-        vnode.children,
-        el,
-        null,
-        parentComponent,
-        parentSuspense,
-        resolveChildrenNamespace(vnode, namespace),
-        slotScopeIds,
-        optimized
-      );
-    }
-    if (dirs) {
-      invokeDirectiveHook(vnode, null, parentComponent, "created");
-    }
-    setScopeId(el, vnode, vnode.scopeId, slotScopeIds, parentComponent);
-    if (props) {
-      for (const key in props) {
-        if (key !== "value" && !isReservedProp(key)) {
-          hostPatchProp(el, key, null, props[key], namespace, parentComponent);
-        }
-      }
-      if ("value" in props) {
-        hostPatchProp(el, "value", null, props.value, namespace);
-      }
-      if (vnodeHook = props.onVnodeBeforeMount) {
-        invokeVNodeHook(vnodeHook, parentComponent, vnode);
-      }
-    }
-    {
-      def(el, "__vnode", vnode, true);
-      def(el, "__vueParentComponent", parentComponent, true);
-    }
-    if (dirs) {
-      invokeDirectiveHook(vnode, null, parentComponent, "beforeMount");
-    }
-    const needCallTransitionHooks = needTransition(parentSuspense, transition);
-    if (needCallTransitionHooks) {
-      transition.beforeEnter(el);
-    }
-    hostInsert(el, container, anchor);
-    if ((vnodeHook = props && props.onVnodeMounted) || needCallTransitionHooks || dirs) {
-      queuePostRenderEffect(() => {
-        vnodeHook && invokeVNodeHook(vnodeHook, parentComponent, vnode);
-        needCallTransitionHooks && transition.enter(el);
-        dirs && invokeDirectiveHook(vnode, null, parentComponent, "mounted");
-      }, parentSuspense);
-    }
-  };
-  const setScopeId = (el, vnode, scopeId, slotScopeIds, parentComponent) => {
-    if (scopeId) {
-      hostSetScopeId(el, scopeId);
-    }
-    if (slotScopeIds) {
-      for (let i = 0; i < slotScopeIds.length; i++) {
-        hostSetScopeId(el, slotScopeIds[i]);
-      }
-    }
-    if (parentComponent) {
-      let subTree = parentComponent.subTree;
-      if (subTree.patchFlag > 0 && subTree.patchFlag & 2048) {
-        subTree = filterSingleRoot(subTree.children) || subTree;
-      }
-      if (vnode === subTree || isSuspense(subTree.type) && (subTree.ssContent === vnode || subTree.ssFallback === vnode)) {
-        const parentVNode = parentComponent.vnode;
-        setScopeId(
-          el,
-          parentVNode,
-          parentVNode.scopeId,
-          parentVNode.slotScopeIds,
-          parentComponent.parent
-        );
-      }
-    }
-  };
-  const mountChildren = (children, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, start = 0) => {
-    for (let i = start; i < children.length; i++) {
-      const child = children[i] = optimized ? cloneIfMounted(children[i]) : normalizeVNode(children[i]);
-      patch(
-        null,
-        child,
-        container,
-        anchor,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-    }
-  };
-  const patchElement = (n1, n2, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    const el = n2.el = n1.el;
-    {
-      el.__vnode = n2;
-    }
-    let { patchFlag, dynamicChildren, dirs } = n2;
-    patchFlag |= n1.patchFlag & 16;
-    const oldProps = n1.props || EMPTY_OBJ;
-    const newProps = n2.props || EMPTY_OBJ;
-    let vnodeHook;
-    parentComponent && toggleRecurse(parentComponent, false);
-    if (vnodeHook = newProps.onVnodeBeforeUpdate) {
-      invokeVNodeHook(vnodeHook, parentComponent, n2, n1);
-    }
-    if (dirs) {
-      invokeDirectiveHook(n2, n1, parentComponent, "beforeUpdate");
-    }
-    parentComponent && toggleRecurse(parentComponent, true);
-    if (isHmrUpdating) {
-      patchFlag = 0;
-      optimized = false;
-      dynamicChildren = null;
-    }
-    if (oldProps.innerHTML && newProps.innerHTML == null || oldProps.textContent && newProps.textContent == null) {
-      hostSetElementText(el, "");
-    }
-    if (dynamicChildren) {
-      patchBlockChildren(
-        n1.dynamicChildren,
-        dynamicChildren,
-        el,
-        parentComponent,
-        parentSuspense,
-        resolveChildrenNamespace(n2, namespace),
-        slotScopeIds
-      );
-      {
-        traverseStaticChildren(n1, n2);
-      }
-    } else if (!optimized) {
-      patchChildren(
-        n1,
-        n2,
-        el,
-        null,
-        parentComponent,
-        parentSuspense,
-        resolveChildrenNamespace(n2, namespace),
-        slotScopeIds,
-        false
-      );
-    }
-    if (patchFlag > 0) {
-      if (patchFlag & 16) {
-        patchProps(el, oldProps, newProps, parentComponent, namespace);
-      } else {
-        if (patchFlag & 2) {
-          if (oldProps.class !== newProps.class) {
-            hostPatchProp(el, "class", null, newProps.class, namespace);
-          }
-        }
-        if (patchFlag & 4) {
-          hostPatchProp(el, "style", oldProps.style, newProps.style, namespace);
-        }
-        if (patchFlag & 8) {
-          const propsToUpdate = n2.dynamicProps;
-          for (let i = 0; i < propsToUpdate.length; i++) {
-            const key = propsToUpdate[i];
-            const prev = oldProps[key];
-            const next = newProps[key];
-            if (next !== prev || key === "value") {
-              hostPatchProp(el, key, prev, next, namespace, parentComponent);
-            }
-          }
-        }
-      }
-      if (patchFlag & 1) {
-        if (n1.children !== n2.children) {
-          hostSetElementText(el, n2.children);
-        }
-      }
-    } else if (!optimized && dynamicChildren == null) {
-      patchProps(el, oldProps, newProps, parentComponent, namespace);
-    }
-    if ((vnodeHook = newProps.onVnodeUpdated) || dirs) {
-      queuePostRenderEffect(() => {
-        vnodeHook && invokeVNodeHook(vnodeHook, parentComponent, n2, n1);
-        dirs && invokeDirectiveHook(n2, n1, parentComponent, "updated");
-      }, parentSuspense);
-    }
-  };
-  const patchBlockChildren = (oldChildren, newChildren, fallbackContainer, parentComponent, parentSuspense, namespace, slotScopeIds) => {
-    for (let i = 0; i < newChildren.length; i++) {
-      const oldVNode = oldChildren[i];
-      const newVNode = newChildren[i];
-      const container = (
-        // oldVNode may be an errored async setup() component inside Suspense
-        // which will not have a mounted element
-        oldVNode.el && // - In the case of a Fragment, we need to provide the actual parent
-        // of the Fragment itself so it can move its children.
-        (oldVNode.type === Fragment || // - In the case of different nodes, there is going to be a replacement
-        // which also requires the correct parent container
-        !isSameVNodeType(oldVNode, newVNode) || // - In the case of a component, it could contain anything.
-        oldVNode.shapeFlag & (6 | 64)) ? hostParentNode(oldVNode.el) : (
-          // In other cases, the parent container is not actually used so we
-          // just pass the block element here to avoid a DOM parentNode call.
-          fallbackContainer
-        )
-      );
-      patch(
-        oldVNode,
-        newVNode,
-        container,
-        null,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        true
-      );
-    }
-  };
-  const patchProps = (el, oldProps, newProps, parentComponent, namespace) => {
-    if (oldProps !== newProps) {
-      if (oldProps !== EMPTY_OBJ) {
-        for (const key in oldProps) {
-          if (!isReservedProp(key) && !(key in newProps)) {
-            hostPatchProp(
-              el,
-              key,
-              oldProps[key],
-              null,
-              namespace,
-              parentComponent
-            );
-          }
-        }
-      }
-      for (const key in newProps) {
-        if (isReservedProp(key)) continue;
-        const next = newProps[key];
-        const prev = oldProps[key];
-        if (next !== prev && key !== "value") {
-          hostPatchProp(el, key, prev, next, namespace, parentComponent);
-        }
-      }
-      if ("value" in newProps) {
-        hostPatchProp(el, "value", oldProps.value, newProps.value, namespace);
-      }
-    }
-  };
-  const processFragment = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    const fragmentStartAnchor = n2.el = n1 ? n1.el : hostCreateText("");
-    const fragmentEndAnchor = n2.anchor = n1 ? n1.anchor : hostCreateText("");
-    let { patchFlag, dynamicChildren, slotScopeIds: fragmentSlotScopeIds } = n2;
-    if (
-      // #5523 dev root fragment may inherit directives
-      isHmrUpdating || patchFlag & 2048
-    ) {
-      patchFlag = 0;
-      optimized = false;
-      dynamicChildren = null;
-    }
-    if (fragmentSlotScopeIds) {
-      slotScopeIds = slotScopeIds ? slotScopeIds.concat(fragmentSlotScopeIds) : fragmentSlotScopeIds;
-    }
-    if (n1 == null) {
-      hostInsert(fragmentStartAnchor, container, anchor);
-      hostInsert(fragmentEndAnchor, container, anchor);
-      mountChildren(
-        // #10007
-        // such fragment like `<></>` will be compiled into
-        // a fragment which doesn't have a children.
-        // In this case fallback to an empty array
-        n2.children || [],
-        container,
-        fragmentEndAnchor,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-    } else {
-      if (patchFlag > 0 && patchFlag & 64 && dynamicChildren && // #2715 the previous fragment could've been a BAILed one as a result
-      // of renderSlot() with no valid children
-      n1.dynamicChildren) {
-        patchBlockChildren(
-          n1.dynamicChildren,
-          dynamicChildren,
-          container,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds
-        );
-        {
-          traverseStaticChildren(n1, n2);
-        }
-      } else {
-        patchChildren(
-          n1,
-          n2,
-          container,
-          fragmentEndAnchor,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-      }
-    }
-  };
-  const processComponent = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    n2.slotScopeIds = slotScopeIds;
-    if (n1 == null) {
-      if (n2.shapeFlag & 512) {
-        parentComponent.ctx.activate(
-          n2,
-          container,
-          anchor,
-          namespace,
-          optimized
-        );
-      } else {
-        mountComponent(
-          n2,
-          container,
-          anchor,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          optimized
-        );
-      }
-    } else {
-      updateComponent(n1, n2, optimized);
-    }
-  };
-  const mountComponent = (initialVNode, container, anchor, parentComponent, parentSuspense, namespace, optimized) => {
-    const instance = (initialVNode.component = createComponentInstance(
-      initialVNode,
-      parentComponent,
-      parentSuspense
-    ));
-    if (instance.type.__hmrId) {
-      registerHMR(instance);
-    }
-    {
-      pushWarningContext(initialVNode);
-      startMeasure(instance, `mount`);
-    }
-    if (isKeepAlive(initialVNode)) {
-      instance.ctx.renderer = internals;
-    }
-    {
-      {
-        startMeasure(instance, `init`);
-      }
-      setupComponent(instance, false, optimized);
-      {
-        endMeasure(instance, `init`);
-      }
-    }
-    if (instance.asyncDep) {
-      if (isHmrUpdating) initialVNode.el = null;
-      parentSuspense && parentSuspense.registerDep(instance, setupRenderEffect, optimized);
-      if (!initialVNode.el) {
-        const placeholder = instance.subTree = createVNode(Comment);
-        processCommentNode(null, placeholder, container, anchor);
-      }
-    } else {
-      setupRenderEffect(
-        instance,
-        initialVNode,
-        container,
-        anchor,
-        parentSuspense,
-        namespace,
-        optimized
-      );
-    }
-    {
-      popWarningContext();
-      endMeasure(instance, `mount`);
-    }
-  };
-  const updateComponent = (n1, n2, optimized) => {
-    const instance = n2.component = n1.component;
-    if (shouldUpdateComponent(n1, n2, optimized)) {
-      if (instance.asyncDep && !instance.asyncResolved) {
-        {
-          pushWarningContext(n2);
-        }
-        updateComponentPreRender(instance, n2, optimized);
-        {
-          popWarningContext();
-        }
-        return;
-      } else {
-        instance.next = n2;
-        instance.update();
-      }
-    } else {
-      n2.el = n1.el;
-      instance.vnode = n2;
-    }
-  };
-  const setupRenderEffect = (instance, initialVNode, container, anchor, parentSuspense, namespace, optimized) => {
-    const componentUpdateFn = () => {
-      if (!instance.isMounted) {
-        let vnodeHook;
-        const { el, props } = initialVNode;
-        const { bm, m, parent, root, type } = instance;
-        const isAsyncWrapperVNode = isAsyncWrapper(initialVNode);
-        toggleRecurse(instance, false);
-        if (bm) {
-          invokeArrayFns(bm);
-        }
-        if (!isAsyncWrapperVNode && (vnodeHook = props && props.onVnodeBeforeMount)) {
-          invokeVNodeHook(vnodeHook, parent, initialVNode);
-        }
-        toggleRecurse(instance, true);
-        if (el && hydrateNode) {
-          const hydrateSubTree = () => {
-            {
-              startMeasure(instance, `render`);
-            }
-            instance.subTree = renderComponentRoot(instance);
-            {
-              endMeasure(instance, `render`);
-            }
-            {
-              startMeasure(instance, `hydrate`);
-            }
-            hydrateNode(
-              el,
-              instance.subTree,
-              instance,
-              parentSuspense,
-              null
-            );
-            {
-              endMeasure(instance, `hydrate`);
-            }
-          };
-          if (isAsyncWrapperVNode && type.__asyncHydrate) {
-            type.__asyncHydrate(
-              el,
-              instance,
-              hydrateSubTree
-            );
-          } else {
-            hydrateSubTree();
-          }
-        } else {
-          if (root.ce) {
-            root.ce._injectChildStyle(type);
-          }
-          {
-            startMeasure(instance, `render`);
-          }
-          const subTree = instance.subTree = renderComponentRoot(instance);
-          {
-            endMeasure(instance, `render`);
-          }
-          {
-            startMeasure(instance, `patch`);
-          }
-          patch(
-            null,
-            subTree,
-            container,
-            anchor,
-            instance,
-            parentSuspense,
-            namespace
-          );
-          {
-            endMeasure(instance, `patch`);
-          }
-          initialVNode.el = subTree.el;
-        }
-        if (m) {
-          queuePostRenderEffect(m, parentSuspense);
-        }
-        if (!isAsyncWrapperVNode && (vnodeHook = props && props.onVnodeMounted)) {
-          const scopedInitialVNode = initialVNode;
-          queuePostRenderEffect(
-            () => invokeVNodeHook(vnodeHook, parent, scopedInitialVNode),
-            parentSuspense
-          );
-        }
-        if (initialVNode.shapeFlag & 256 || parent && isAsyncWrapper(parent.vnode) && parent.vnode.shapeFlag & 256) {
-          instance.a && queuePostRenderEffect(instance.a, parentSuspense);
-        }
-        instance.isMounted = true;
-        {
-          devtoolsComponentAdded(instance);
-        }
-        initialVNode = container = anchor = null;
-      } else {
-        let { next, bu, u, parent, vnode } = instance;
-        {
-          const nonHydratedAsyncRoot = locateNonHydratedAsyncRoot(instance);
-          if (nonHydratedAsyncRoot) {
-            if (next) {
-              next.el = vnode.el;
-              updateComponentPreRender(instance, next, optimized);
-            }
-            nonHydratedAsyncRoot.asyncDep.then(() => {
-              if (!instance.isUnmounted) {
-                componentUpdateFn();
-              }
-            });
-            return;
-          }
-        }
-        let originNext = next;
-        let vnodeHook;
-        {
-          pushWarningContext(next || instance.vnode);
-        }
-        toggleRecurse(instance, false);
-        if (next) {
-          next.el = vnode.el;
-          updateComponentPreRender(instance, next, optimized);
-        } else {
-          next = vnode;
-        }
-        if (bu) {
-          invokeArrayFns(bu);
-        }
-        if (vnodeHook = next.props && next.props.onVnodeBeforeUpdate) {
-          invokeVNodeHook(vnodeHook, parent, next, vnode);
-        }
-        toggleRecurse(instance, true);
-        {
-          startMeasure(instance, `render`);
-        }
-        const nextTree = renderComponentRoot(instance);
-        {
-          endMeasure(instance, `render`);
-        }
-        const prevTree = instance.subTree;
-        instance.subTree = nextTree;
-        {
-          startMeasure(instance, `patch`);
-        }
-        patch(
-          prevTree,
-          nextTree,
-          // parent may have changed if it's in a teleport
-          hostParentNode(prevTree.el),
-          // anchor may have changed if it's in a fragment
-          getNextHostNode(prevTree),
-          instance,
-          parentSuspense,
-          namespace
-        );
-        {
-          endMeasure(instance, `patch`);
-        }
-        next.el = nextTree.el;
-        if (originNext === null) {
-          updateHOCHostEl(instance, nextTree.el);
-        }
-        if (u) {
-          queuePostRenderEffect(u, parentSuspense);
-        }
-        if (vnodeHook = next.props && next.props.onVnodeUpdated) {
-          queuePostRenderEffect(
-            () => invokeVNodeHook(vnodeHook, parent, next, vnode),
-            parentSuspense
-          );
-        }
-        {
-          devtoolsComponentUpdated(instance);
-        }
-        {
-          popWarningContext();
-        }
-      }
-    };
-    instance.scope.on();
-    const effect = instance.effect = new ReactiveEffect(componentUpdateFn);
-    instance.scope.off();
-    const update = instance.update = effect.run.bind(effect);
-    const job = instance.job = effect.runIfDirty.bind(effect);
-    job.i = instance;
-    job.id = instance.uid;
-    effect.scheduler = () => queueJob(job);
-    toggleRecurse(instance, true);
-    {
-      effect.onTrack = instance.rtc ? (e) => invokeArrayFns(instance.rtc, e) : void 0;
-      effect.onTrigger = instance.rtg ? (e) => invokeArrayFns(instance.rtg, e) : void 0;
-    }
-    update();
-  };
-  const updateComponentPreRender = (instance, nextVNode, optimized) => {
-    nextVNode.component = instance;
-    const prevProps = instance.vnode.props;
-    instance.vnode = nextVNode;
-    instance.next = null;
-    updateProps(instance, nextVNode.props, prevProps, optimized);
-    updateSlots(instance, nextVNode.children, optimized);
-    pauseTracking();
-    flushPreFlushCbs(instance);
-    resetTracking();
-  };
-  const patchChildren = (n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized = false) => {
-    const c1 = n1 && n1.children;
-    const prevShapeFlag = n1 ? n1.shapeFlag : 0;
-    const c2 = n2.children;
-    const { patchFlag, shapeFlag } = n2;
-    if (patchFlag > 0) {
-      if (patchFlag & 128) {
-        patchKeyedChildren(
-          c1,
-          c2,
-          container,
-          anchor,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-        return;
-      } else if (patchFlag & 256) {
-        patchUnkeyedChildren(
-          c1,
-          c2,
-          container,
-          anchor,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-        return;
-      }
-    }
-    if (shapeFlag & 8) {
-      if (prevShapeFlag & 16) {
-        unmountChildren(c1, parentComponent, parentSuspense);
-      }
-      if (c2 !== c1) {
-        hostSetElementText(container, c2);
-      }
-    } else {
-      if (prevShapeFlag & 16) {
-        if (shapeFlag & 16) {
-          patchKeyedChildren(
-            c1,
-            c2,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-        } else {
-          unmountChildren(c1, parentComponent, parentSuspense, true);
-        }
-      } else {
-        if (prevShapeFlag & 8) {
-          hostSetElementText(container, "");
-        }
-        if (shapeFlag & 16) {
-          mountChildren(
-            c2,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-        }
-      }
-    }
-  };
-  const patchUnkeyedChildren = (c1, c2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    c1 = c1 || EMPTY_ARR;
-    c2 = c2 || EMPTY_ARR;
-    const oldLength = c1.length;
-    const newLength = c2.length;
-    const commonLength = Math.min(oldLength, newLength);
-    let i;
-    for (i = 0; i < commonLength; i++) {
-      const nextChild = c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]);
-      patch(
-        c1[i],
-        nextChild,
-        container,
-        null,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-    }
-    if (oldLength > newLength) {
-      unmountChildren(
-        c1,
-        parentComponent,
-        parentSuspense,
-        true,
-        false,
-        commonLength
-      );
-    } else {
-      mountChildren(
-        c2,
-        container,
-        anchor,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized,
-        commonLength
-      );
-    }
-  };
-  const patchKeyedChildren = (c1, c2, container, parentAnchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized) => {
-    let i = 0;
-    const l2 = c2.length;
-    let e1 = c1.length - 1;
-    let e2 = l2 - 1;
-    while (i <= e1 && i <= e2) {
-      const n1 = c1[i];
-      const n2 = c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]);
-      if (isSameVNodeType(n1, n2)) {
-        patch(
-          n1,
-          n2,
-          container,
-          null,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-      } else {
-        break;
-      }
-      i++;
-    }
-    while (i <= e1 && i <= e2) {
-      const n1 = c1[e1];
-      const n2 = c2[e2] = optimized ? cloneIfMounted(c2[e2]) : normalizeVNode(c2[e2]);
-      if (isSameVNodeType(n1, n2)) {
-        patch(
-          n1,
-          n2,
-          container,
-          null,
-          parentComponent,
-          parentSuspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-      } else {
-        break;
-      }
-      e1--;
-      e2--;
-    }
-    if (i > e1) {
-      if (i <= e2) {
-        const nextPos = e2 + 1;
-        const anchor = nextPos < l2 ? c2[nextPos].el : parentAnchor;
-        while (i <= e2) {
-          patch(
-            null,
-            c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]),
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-          i++;
-        }
-      }
-    } else if (i > e2) {
-      while (i <= e1) {
-        unmount(c1[i], parentComponent, parentSuspense, true);
-        i++;
-      }
-    } else {
-      const s1 = i;
-      const s2 = i;
-      const keyToNewIndexMap = /* @__PURE__ */ new Map();
-      for (i = s2; i <= e2; i++) {
-        const nextChild = c2[i] = optimized ? cloneIfMounted(c2[i]) : normalizeVNode(c2[i]);
-        if (nextChild.key != null) {
-          if (keyToNewIndexMap.has(nextChild.key)) {
-            warn$1(
-              `Duplicate keys found during update:`,
-              JSON.stringify(nextChild.key),
-              `Make sure keys are unique.`
-            );
-          }
-          keyToNewIndexMap.set(nextChild.key, i);
-        }
-      }
-      let j;
-      let patched = 0;
-      const toBePatched = e2 - s2 + 1;
-      let moved = false;
-      let maxNewIndexSoFar = 0;
-      const newIndexToOldIndexMap = new Array(toBePatched);
-      for (i = 0; i < toBePatched; i++) newIndexToOldIndexMap[i] = 0;
-      for (i = s1; i <= e1; i++) {
-        const prevChild = c1[i];
-        if (patched >= toBePatched) {
-          unmount(prevChild, parentComponent, parentSuspense, true);
-          continue;
-        }
-        let newIndex;
-        if (prevChild.key != null) {
-          newIndex = keyToNewIndexMap.get(prevChild.key);
-        } else {
-          for (j = s2; j <= e2; j++) {
-            if (newIndexToOldIndexMap[j - s2] === 0 && isSameVNodeType(prevChild, c2[j])) {
-              newIndex = j;
-              break;
-            }
-          }
-        }
-        if (newIndex === void 0) {
-          unmount(prevChild, parentComponent, parentSuspense, true);
-        } else {
-          newIndexToOldIndexMap[newIndex - s2] = i + 1;
-          if (newIndex >= maxNewIndexSoFar) {
-            maxNewIndexSoFar = newIndex;
-          } else {
-            moved = true;
-          }
-          patch(
-            prevChild,
-            c2[newIndex],
-            container,
-            null,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-          patched++;
-        }
-      }
-      const increasingNewIndexSequence = moved ? getSequence(newIndexToOldIndexMap) : EMPTY_ARR;
-      j = increasingNewIndexSequence.length - 1;
-      for (i = toBePatched - 1; i >= 0; i--) {
-        const nextIndex = s2 + i;
-        const nextChild = c2[nextIndex];
-        const anchor = nextIndex + 1 < l2 ? c2[nextIndex + 1].el : parentAnchor;
-        if (newIndexToOldIndexMap[i] === 0) {
-          patch(
-            null,
-            nextChild,
-            container,
-            anchor,
-            parentComponent,
-            parentSuspense,
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-        } else if (moved) {
-          if (j < 0 || i !== increasingNewIndexSequence[j]) {
-            move(nextChild, container, anchor, 2);
-          } else {
-            j--;
-          }
-        }
-      }
-    }
-  };
-  const move = (vnode, container, anchor, moveType, parentSuspense = null) => {
-    const { el, type, transition, children, shapeFlag } = vnode;
-    if (shapeFlag & 6) {
-      move(vnode.component.subTree, container, anchor, moveType);
-      return;
-    }
-    if (shapeFlag & 128) {
-      vnode.suspense.move(container, anchor, moveType);
-      return;
-    }
-    if (shapeFlag & 64) {
-      type.move(vnode, container, anchor, internals);
-      return;
-    }
-    if (type === Fragment) {
-      hostInsert(el, container, anchor);
-      for (let i = 0; i < children.length; i++) {
-        move(children[i], container, anchor, moveType);
-      }
-      hostInsert(vnode.anchor, container, anchor);
-      return;
-    }
-    if (type === Static) {
-      moveStaticNode(vnode, container, anchor);
-      return;
-    }
-    const needTransition2 = moveType !== 2 && shapeFlag & 1 && transition;
-    if (needTransition2) {
-      if (moveType === 0) {
-        transition.beforeEnter(el);
-        hostInsert(el, container, anchor);
-        queuePostRenderEffect(() => transition.enter(el), parentSuspense);
-      } else {
-        const { leave, delayLeave, afterLeave } = transition;
-        const remove2 = () => hostInsert(el, container, anchor);
-        const performLeave = () => {
-          leave(el, () => {
-            remove2();
-            afterLeave && afterLeave();
-          });
-        };
-        if (delayLeave) {
-          delayLeave(el, remove2, performLeave);
-        } else {
-          performLeave();
-        }
-      }
-    } else {
-      hostInsert(el, container, anchor);
-    }
-  };
-  const unmount = (vnode, parentComponent, parentSuspense, doRemove = false, optimized = false) => {
-    const {
-      type,
-      props,
-      ref,
-      children,
-      dynamicChildren,
-      shapeFlag,
-      patchFlag,
-      dirs,
-      cacheIndex
-    } = vnode;
-    if (patchFlag === -2) {
-      optimized = false;
-    }
-    if (ref != null) {
-      setRef(ref, null, parentSuspense, vnode, true);
-    }
-    if (cacheIndex != null) {
-      parentComponent.renderCache[cacheIndex] = void 0;
-    }
-    if (shapeFlag & 256) {
-      parentComponent.ctx.deactivate(vnode);
-      return;
-    }
-    const shouldInvokeDirs = shapeFlag & 1 && dirs;
-    const shouldInvokeVnodeHook = !isAsyncWrapper(vnode);
-    let vnodeHook;
-    if (shouldInvokeVnodeHook && (vnodeHook = props && props.onVnodeBeforeUnmount)) {
-      invokeVNodeHook(vnodeHook, parentComponent, vnode);
-    }
-    if (shapeFlag & 6) {
-      unmountComponent(vnode.component, parentSuspense, doRemove);
-    } else {
-      if (shapeFlag & 128) {
-        vnode.suspense.unmount(parentSuspense, doRemove);
-        return;
-      }
-      if (shouldInvokeDirs) {
-        invokeDirectiveHook(vnode, null, parentComponent, "beforeUnmount");
-      }
-      if (shapeFlag & 64) {
-        vnode.type.remove(
-          vnode,
-          parentComponent,
-          parentSuspense,
-          internals,
-          doRemove
-        );
-      } else if (dynamicChildren && // #5154
-      // when v-once is used inside a block, setBlockTracking(-1) marks the
-      // parent block with hasOnce: true
-      // so that it doesn't take the fast path during unmount - otherwise
-      // components nested in v-once are never unmounted.
-      !dynamicChildren.hasOnce && // #1153: fast path should not be taken for non-stable (v-for) fragments
-      (type !== Fragment || patchFlag > 0 && patchFlag & 64)) {
-        unmountChildren(
-          dynamicChildren,
-          parentComponent,
-          parentSuspense,
-          false,
-          true
-        );
-      } else if (type === Fragment && patchFlag & (128 | 256) || !optimized && shapeFlag & 16) {
-        unmountChildren(children, parentComponent, parentSuspense);
-      }
-      if (doRemove) {
-        remove(vnode);
-      }
-    }
-    if (shouldInvokeVnodeHook && (vnodeHook = props && props.onVnodeUnmounted) || shouldInvokeDirs) {
-      queuePostRenderEffect(() => {
-        vnodeHook && invokeVNodeHook(vnodeHook, parentComponent, vnode);
-        shouldInvokeDirs && invokeDirectiveHook(vnode, null, parentComponent, "unmounted");
-      }, parentSuspense);
-    }
-  };
-  const remove = (vnode) => {
-    const { type, el, anchor, transition } = vnode;
-    if (type === Fragment) {
-      if (vnode.patchFlag > 0 && vnode.patchFlag & 2048 && transition && !transition.persisted) {
-        vnode.children.forEach((child) => {
-          if (child.type === Comment) {
-            hostRemove(child.el);
-          } else {
-            remove(child);
-          }
-        });
-      } else {
-        removeFragment(el, anchor);
-      }
-      return;
-    }
-    if (type === Static) {
-      removeStaticNode(vnode);
-      return;
-    }
-    const performRemove = () => {
-      hostRemove(el);
-      if (transition && !transition.persisted && transition.afterLeave) {
-        transition.afterLeave();
-      }
-    };
-    if (vnode.shapeFlag & 1 && transition && !transition.persisted) {
-      const { leave, delayLeave } = transition;
-      const performLeave = () => leave(el, performRemove);
-      if (delayLeave) {
-        delayLeave(vnode.el, performRemove, performLeave);
-      } else {
-        performLeave();
-      }
-    } else {
-      performRemove();
-    }
-  };
-  const removeFragment = (cur, end) => {
-    let next;
-    while (cur !== end) {
-      next = hostNextSibling(cur);
-      hostRemove(cur);
-      cur = next;
-    }
-    hostRemove(end);
-  };
-  const unmountComponent = (instance, parentSuspense, doRemove) => {
-    if (instance.type.__hmrId) {
-      unregisterHMR(instance);
-    }
-    const { bum, scope, job, subTree, um, m, a } = instance;
-    invalidateMount(m);
-    invalidateMount(a);
-    if (bum) {
-      invokeArrayFns(bum);
-    }
-    scope.stop();
-    if (job) {
-      job.flags |= 8;
-      unmount(subTree, instance, parentSuspense, doRemove);
-    }
-    if (um) {
-      queuePostRenderEffect(um, parentSuspense);
-    }
-    queuePostRenderEffect(() => {
-      instance.isUnmounted = true;
-    }, parentSuspense);
-    if (parentSuspense && parentSuspense.pendingBranch && !parentSuspense.isUnmounted && instance.asyncDep && !instance.asyncResolved && instance.suspenseId === parentSuspense.pendingId) {
-      parentSuspense.deps--;
-      if (parentSuspense.deps === 0) {
-        parentSuspense.resolve();
-      }
-    }
-    {
-      devtoolsComponentRemoved(instance);
-    }
-  };
-  const unmountChildren = (children, parentComponent, parentSuspense, doRemove = false, optimized = false, start = 0) => {
-    for (let i = start; i < children.length; i++) {
-      unmount(children[i], parentComponent, parentSuspense, doRemove, optimized);
-    }
-  };
-  const getNextHostNode = (vnode) => {
-    if (vnode.shapeFlag & 6) {
-      return getNextHostNode(vnode.component.subTree);
-    }
-    if (vnode.shapeFlag & 128) {
-      return vnode.suspense.next();
-    }
-    const el = hostNextSibling(vnode.anchor || vnode.el);
-    const teleportEnd = el && el[TeleportEndKey];
-    return teleportEnd ? hostNextSibling(teleportEnd) : el;
-  };
-  let isFlushing = false;
-  const render = (vnode, container, namespace) => {
-    if (vnode == null) {
-      if (container._vnode) {
-        unmount(container._vnode, null, null, true);
-      }
-    } else {
-      patch(
-        container._vnode || null,
-        vnode,
-        container,
-        null,
-        null,
-        null,
-        namespace
-      );
-    }
-    container._vnode = vnode;
-    if (!isFlushing) {
-      isFlushing = true;
-      flushPreFlushCbs();
-      flushPostFlushCbs();
-      isFlushing = false;
-    }
-  };
-  const internals = {
-    p: patch,
-    um: unmount,
-    m: move,
-    r: remove,
-    mt: mountComponent,
-    mc: mountChildren,
-    pc: patchChildren,
-    pbc: patchBlockChildren,
-    n: getNextHostNode,
-    o: options
-  };
-  let hydrate;
-  let hydrateNode;
-  if (createHydrationFns) {
-    [hydrate, hydrateNode] = createHydrationFns(
-      internals
-    );
-  }
-  return {
-    render,
-    hydrate,
-    createApp: createAppAPI(render, hydrate)
-  };
-}
-function resolveChildrenNamespace({ type, props }, currentNamespace) {
-  return currentNamespace === "svg" && type === "foreignObject" || currentNamespace === "mathml" && type === "annotation-xml" && props && props.encoding && props.encoding.includes("html") ? void 0 : currentNamespace;
-}
-function toggleRecurse({ effect, job }, allowed) {
-  if (allowed) {
-    effect.flags |= 32;
-    job.flags |= 4;
-  } else {
-    effect.flags &= ~32;
-    job.flags &= ~4;
-  }
-}
-function needTransition(parentSuspense, transition) {
-  return (!parentSuspense || parentSuspense && !parentSuspense.pendingBranch) && transition && !transition.persisted;
-}
-function traverseStaticChildren(n1, n2, shallow = false) {
-  const ch1 = n1.children;
-  const ch2 = n2.children;
-  if (isArray(ch1) && isArray(ch2)) {
-    for (let i = 0; i < ch1.length; i++) {
-      const c1 = ch1[i];
-      let c2 = ch2[i];
-      if (c2.shapeFlag & 1 && !c2.dynamicChildren) {
-        if (c2.patchFlag <= 0 || c2.patchFlag === 32) {
-          c2 = ch2[i] = cloneIfMounted(ch2[i]);
-          c2.el = c1.el;
-        }
-        if (!shallow && c2.patchFlag !== -2)
-          traverseStaticChildren(c1, c2);
-      }
-      if (c2.type === Text) {
-        c2.el = c1.el;
-      }
-      if (c2.type === Comment && !c2.el) {
-        c2.el = c1.el;
-      }
-    }
-  }
-}
-function getSequence(arr) {
-  const p = arr.slice();
-  const result = [0];
-  let i, j, u, v, c;
-  const len = arr.length;
-  for (i = 0; i < len; i++) {
-    const arrI = arr[i];
-    if (arrI !== 0) {
-      j = result[result.length - 1];
-      if (arr[j] < arrI) {
-        p[i] = j;
-        result.push(i);
-        continue;
-      }
-      u = 0;
-      v = result.length - 1;
-      while (u < v) {
-        c = u + v >> 1;
-        if (arr[result[c]] < arrI) {
-          u = c + 1;
-        } else {
-          v = c;
-        }
-      }
-      if (arrI < arr[result[u]]) {
-        if (u > 0) {
-          p[i] = result[u - 1];
-        }
-        result[u] = i;
-      }
-    }
-  }
-  u = result.length;
-  v = result[u - 1];
-  while (u-- > 0) {
-    result[u] = v;
-    v = p[v];
-  }
-  return result;
-}
-function locateNonHydratedAsyncRoot(instance) {
-  const subComponent = instance.subTree.component;
-  if (subComponent) {
-    if (subComponent.asyncDep && !subComponent.asyncResolved) {
-      return subComponent;
-    } else {
-      return locateNonHydratedAsyncRoot(subComponent);
-    }
-  }
-}
-function invalidateMount(hooks) {
-  if (hooks) {
-    for (let i = 0; i < hooks.length; i++)
-      hooks[i].flags |= 8;
-  }
-}
-
-const ssrContextKey = Symbol.for("v-scx");
-const useSSRContext = () => {
-  {
-    const ctx = inject(ssrContextKey);
-    if (!ctx) {
-      warn$1(
-        `Server rendering context not provided. Make sure to only call useSSRContext() conditionally in the server build.`
-      );
-    }
-    return ctx;
-  }
-};
-
-function watchEffect(effect, options) {
-  return doWatch(effect, null, options);
-}
-function watchPostEffect(effect, options) {
-  return doWatch(
-    effect,
-    null,
-    extend({}, options, { flush: "post" }) 
-  );
-}
-function watchSyncEffect(effect, options) {
-  return doWatch(
-    effect,
-    null,
-    extend({}, options, { flush: "sync" }) 
-  );
-}
-function watch(source, cb, options) {
-  if (!isFunction(cb)) {
-    warn$1(
-      `\`watch(fn, options?)\` signature has been moved to a separate API. Use \`watchEffect(fn, options?)\` instead. \`watch\` now only supports \`watch(source, cb, options?) signature.`
-    );
-  }
-  return doWatch(source, cb, options);
-}
-function doWatch(source, cb, options = EMPTY_OBJ) {
-  const { immediate, deep, flush, once } = options;
-  if (!cb) {
-    if (immediate !== void 0) {
-      warn$1(
-        `watch() "immediate" option is only respected when using the watch(source, callback, options?) signature.`
-      );
-    }
-    if (deep !== void 0) {
-      warn$1(
-        `watch() "deep" option is only respected when using the watch(source, callback, options?) signature.`
-      );
-    }
-    if (once !== void 0) {
-      warn$1(
-        `watch() "once" option is only respected when using the watch(source, callback, options?) signature.`
-      );
-    }
-  }
-  const baseWatchOptions = extend({}, options);
-  baseWatchOptions.onWarn = warn$1;
-  const runsImmediately = cb && immediate || !cb && flush !== "post";
-  let ssrCleanup;
-  if (isInSSRComponentSetup) {
-    if (flush === "sync") {
-      const ctx = useSSRContext();
-      ssrCleanup = ctx.__watcherHandles || (ctx.__watcherHandles = []);
-    } else if (!runsImmediately) {
-      const watchStopHandle = () => {
-      };
-      watchStopHandle.stop = NOOP;
-      watchStopHandle.resume = NOOP;
-      watchStopHandle.pause = NOOP;
-      return watchStopHandle;
-    }
-  }
-  const instance = currentInstance;
-  baseWatchOptions.call = (fn, type, args) => callWithAsyncErrorHandling(fn, instance, type, args);
-  let isPre = false;
-  if (flush === "post") {
-    baseWatchOptions.scheduler = (job) => {
-      queuePostRenderEffect(job, instance && instance.suspense);
-    };
-  } else if (flush !== "sync") {
-    isPre = true;
-    baseWatchOptions.scheduler = (job, isFirstRun) => {
-      if (isFirstRun) {
-        job();
-      } else {
-        queueJob(job);
-      }
-    };
-  }
-  baseWatchOptions.augmentJob = (job) => {
-    if (cb) {
-      job.flags |= 4;
-    }
-    if (isPre) {
-      job.flags |= 2;
-      if (instance) {
-        job.id = instance.uid;
-        job.i = instance;
-      }
-    }
-  };
-  const watchHandle = watch$1(source, cb, baseWatchOptions);
-  if (isInSSRComponentSetup) {
-    if (ssrCleanup) {
-      ssrCleanup.push(watchHandle);
-    } else if (runsImmediately) {
-      watchHandle();
-    }
-  }
-  return watchHandle;
-}
-function instanceWatch(source, value, options) {
-  const publicThis = this.proxy;
-  const getter = isString(source) ? source.includes(".") ? createPathGetter(publicThis, source) : () => publicThis[source] : source.bind(publicThis, publicThis);
-  let cb;
-  if (isFunction(value)) {
-    cb = value;
-  } else {
-    cb = value.handler;
-    options = value;
-  }
-  const reset = setCurrentInstance(this);
-  const res = doWatch(getter, cb.bind(publicThis), options);
-  reset();
-  return res;
-}
-function createPathGetter(ctx, path) {
-  const segments = path.split(".");
-  return () => {
-    let cur = ctx;
-    for (let i = 0; i < segments.length && cur; i++) {
-      cur = cur[segments[i]];
-    }
-    return cur;
-  };
-}
-
-function useModel(props, name, options = EMPTY_OBJ) {
-  const i = getCurrentInstance();
-  if (!i) {
-    warn$1(`useModel() called without active instance.`);
-    return ref();
-  }
-  const camelizedName = camelize(name);
-  if (!i.propsOptions[0][camelizedName]) {
-    warn$1(`useModel() called with prop "${name}" which is not declared.`);
-    return ref();
-  }
-  const hyphenatedName = hyphenate(name);
-  const modifiers = getModelModifiers(props, camelizedName);
-  const res = customRef((track, trigger) => {
-    let localValue;
-    let prevSetValue = EMPTY_OBJ;
-    let prevEmittedValue;
-    watchSyncEffect(() => {
-      const propValue = props[camelizedName];
-      if (hasChanged(localValue, propValue)) {
-        localValue = propValue;
-        trigger();
-      }
-    });
-    return {
-      get() {
-        track();
-        return options.get ? options.get(localValue) : localValue;
-      },
-      set(value) {
-        const emittedValue = options.set ? options.set(value) : value;
-        if (!hasChanged(emittedValue, localValue) && !(prevSetValue !== EMPTY_OBJ && hasChanged(value, prevSetValue))) {
-          return;
-        }
-        const rawProps = i.vnode.props;
-        if (!(rawProps && // check if parent has passed v-model
-        (name in rawProps || camelizedName in rawProps || hyphenatedName in rawProps) && (`onUpdate:${name}` in rawProps || `onUpdate:${camelizedName}` in rawProps || `onUpdate:${hyphenatedName}` in rawProps))) {
-          localValue = value;
-          trigger();
-        }
-        i.emit(`update:${name}`, emittedValue);
-        if (hasChanged(value, emittedValue) && hasChanged(value, prevSetValue) && !hasChanged(emittedValue, prevEmittedValue)) {
-          trigger();
-        }
-        prevSetValue = value;
-        prevEmittedValue = emittedValue;
-      }
-    };
-  });
-  res[Symbol.iterator] = () => {
-    let i2 = 0;
-    return {
-      next() {
-        if (i2 < 2) {
-          return { value: i2++ ? modifiers || EMPTY_OBJ : res, done: false };
-        } else {
-          return { done: true };
-        }
-      }
-    };
-  };
-  return res;
-}
-const getModelModifiers = (props, modelName) => {
-  return modelName === "modelValue" || modelName === "model-value" ? props.modelModifiers : props[`${modelName}Modifiers`] || props[`${camelize(modelName)}Modifiers`] || props[`${hyphenate(modelName)}Modifiers`];
-};
-
-function emit(instance, event, ...rawArgs) {
-  if (instance.isUnmounted) return;
-  const props = instance.vnode.props || EMPTY_OBJ;
-  {
-    const {
-      emitsOptions,
-      propsOptions: [propsOptions]
-    } = instance;
-    if (emitsOptions) {
-      if (!(event in emitsOptions) && true) {
-        if (!propsOptions || !(toHandlerKey(camelize(event)) in propsOptions)) {
-          warn$1(
-            `Component emitted event "${event}" but it is neither declared in the emits option nor as an "${toHandlerKey(camelize(event))}" prop.`
-          );
-        }
-      } else {
-        const validator = emitsOptions[event];
-        if (isFunction(validator)) {
-          const isValid = validator(...rawArgs);
-          if (!isValid) {
-            warn$1(
-              `Invalid event arguments: event validation failed for event "${event}".`
-            );
-          }
-        }
-      }
-    }
-  }
-  let args = rawArgs;
-  const isModelListener = event.startsWith("update:");
-  const modifiers = isModelListener && getModelModifiers(props, event.slice(7));
-  if (modifiers) {
-    if (modifiers.trim) {
-      args = rawArgs.map((a) => isString(a) ? a.trim() : a);
-    }
-    if (modifiers.number) {
-      args = rawArgs.map(looseToNumber);
-    }
-  }
-  {
-    devtoolsComponentEmit(instance, event, args);
-  }
-  {
-    const lowerCaseEvent = event.toLowerCase();
-    if (lowerCaseEvent !== event && props[toHandlerKey(lowerCaseEvent)]) {
-      warn$1(
-        `Event "${lowerCaseEvent}" is emitted in component ${formatComponentName(
-          instance,
-          instance.type
-        )} but the handler is registered for "${event}". Note that HTML attributes are case-insensitive and you cannot use v-on to listen to camelCase events when using in-DOM templates. You should probably use "${hyphenate(
-          event
-        )}" instead of "${event}".`
-      );
-    }
-  }
-  let handlerName;
-  let handler = props[handlerName = toHandlerKey(event)] || // also try camelCase event handler (#2249)
-  props[handlerName = toHandlerKey(camelize(event))];
-  if (!handler && isModelListener) {
-    handler = props[handlerName = toHandlerKey(hyphenate(event))];
-  }
-  if (handler) {
-    callWithAsyncErrorHandling(
-      handler,
-      instance,
-      6,
-      args
-    );
-  }
-  const onceHandler = props[handlerName + `Once`];
-  if (onceHandler) {
-    if (!instance.emitted) {
-      instance.emitted = {};
-    } else if (instance.emitted[handlerName]) {
-      return;
-    }
-    instance.emitted[handlerName] = true;
-    callWithAsyncErrorHandling(
-      onceHandler,
-      instance,
-      6,
-      args
-    );
-  }
-}
-function normalizeEmitsOptions(comp, appContext, asMixin = false) {
-  const cache = appContext.emitsCache;
-  const cached = cache.get(comp);
-  if (cached !== void 0) {
-    return cached;
-  }
-  const raw = comp.emits;
-  let normalized = {};
-  let hasExtends = false;
-  if (!isFunction(comp)) {
-    const extendEmits = (raw2) => {
-      const normalizedFromExtend = normalizeEmitsOptions(raw2, appContext, true);
-      if (normalizedFromExtend) {
-        hasExtends = true;
-        extend(normalized, normalizedFromExtend);
-      }
-    };
-    if (!asMixin && appContext.mixins.length) {
-      appContext.mixins.forEach(extendEmits);
-    }
-    if (comp.extends) {
-      extendEmits(comp.extends);
-    }
-    if (comp.mixins) {
-      comp.mixins.forEach(extendEmits);
-    }
-  }
-  if (!raw && !hasExtends) {
-    if (isObject(comp)) {
-      cache.set(comp, null);
-    }
-    return null;
-  }
-  if (isArray(raw)) {
-    raw.forEach((key) => normalized[key] = null);
-  } else {
-    extend(normalized, raw);
-  }
-  if (isObject(comp)) {
-    cache.set(comp, normalized);
-  }
-  return normalized;
-}
-function isEmitListener(options, key) {
-  if (!options || !isOn(key)) {
-    return false;
-  }
-  key = key.slice(2).replace(/Once$/, "");
-  return hasOwn(options, key[0].toLowerCase() + key.slice(1)) || hasOwn(options, hyphenate(key)) || hasOwn(options, key);
-}
-
-let accessedAttrs = false;
-function markAttrsAccessed() {
-  accessedAttrs = true;
-}
-function renderComponentRoot(instance) {
-  const {
-    type: Component,
-    vnode,
-    proxy,
-    withProxy,
-    propsOptions: [propsOptions],
-    slots,
-    attrs,
-    emit,
-    render,
-    renderCache,
-    props,
-    data,
-    setupState,
-    ctx,
-    inheritAttrs
-  } = instance;
-  const prev = setCurrentRenderingInstance(instance);
-  let result;
-  let fallthroughAttrs;
-  {
-    accessedAttrs = false;
-  }
-  try {
-    if (vnode.shapeFlag & 4) {
-      const proxyToUse = withProxy || proxy;
-      const thisProxy = setupState.__isScriptSetup ? new Proxy(proxyToUse, {
-        get(target, key, receiver) {
-          warn$1(
-            `Property '${String(
-              key
-            )}' was accessed via 'this'. Avoid using 'this' in templates.`
-          );
-          return Reflect.get(target, key, receiver);
-        }
-      }) : proxyToUse;
-      result = normalizeVNode(
-        render.call(
-          thisProxy,
-          proxyToUse,
-          renderCache,
-          true ? shallowReadonly(props) : props,
-          setupState,
-          data,
-          ctx
-        )
-      );
-      fallthroughAttrs = attrs;
-    } else {
-      const render2 = Component;
-      if (attrs === props) {
-        markAttrsAccessed();
-      }
-      result = normalizeVNode(
-        render2.length > 1 ? render2(
-          true ? shallowReadonly(props) : props,
-          true ? {
-            get attrs() {
-              markAttrsAccessed();
-              return shallowReadonly(attrs);
-            },
-            slots,
-            emit
-          } : { attrs, slots, emit }
-        ) : render2(
-          true ? shallowReadonly(props) : props,
-          null
-        )
-      );
-      fallthroughAttrs = Component.props ? attrs : getFunctionalFallthrough(attrs);
-    }
-  } catch (err) {
-    blockStack.length = 0;
-    handleError(err, instance, 1);
-    result = createVNode(Comment);
-  }
-  let root = result;
-  let setRoot = void 0;
-  if (result.patchFlag > 0 && result.patchFlag & 2048) {
-    [root, setRoot] = getChildRoot(result);
-  }
-  if (fallthroughAttrs && inheritAttrs !== false) {
-    const keys = Object.keys(fallthroughAttrs);
-    const { shapeFlag } = root;
-    if (keys.length) {
-      if (shapeFlag & (1 | 6)) {
-        if (propsOptions && keys.some(isModelListener)) {
-          fallthroughAttrs = filterModelListeners(
-            fallthroughAttrs,
-            propsOptions
-          );
-        }
-        root = cloneVNode(root, fallthroughAttrs, false, true);
-      } else if (!accessedAttrs && root.type !== Comment) {
-        const allAttrs = Object.keys(attrs);
-        const eventAttrs = [];
-        const extraAttrs = [];
-        for (let i = 0, l = allAttrs.length; i < l; i++) {
-          const key = allAttrs[i];
-          if (isOn(key)) {
-            if (!isModelListener(key)) {
-              eventAttrs.push(key[2].toLowerCase() + key.slice(3));
-            }
-          } else {
-            extraAttrs.push(key);
-          }
-        }
-        if (extraAttrs.length) {
-          warn$1(
-            `Extraneous non-props attributes (${extraAttrs.join(", ")}) were passed to component but could not be automatically inherited because component renders fragment or text root nodes.`
-          );
-        }
-        if (eventAttrs.length) {
-          warn$1(
-            `Extraneous non-emits event listeners (${eventAttrs.join(", ")}) were passed to component but could not be automatically inherited because component renders fragment or text root nodes. If the listener is intended to be a component custom event listener only, declare it using the "emits" option.`
-          );
-        }
-      }
-    }
-  }
-  if (vnode.dirs) {
-    if (!isElementRoot(root)) {
-      warn$1(
-        `Runtime directive used on component with non-element root node. The directives will not function as intended.`
-      );
-    }
-    root = cloneVNode(root, null, false, true);
-    root.dirs = root.dirs ? root.dirs.concat(vnode.dirs) : vnode.dirs;
-  }
-  if (vnode.transition) {
-    if (!isElementRoot(root)) {
-      warn$1(
-        `Component inside <Transition> renders non-element root node that cannot be animated.`
-      );
-    }
-    setTransitionHooks(root, vnode.transition);
-  }
-  if (setRoot) {
-    setRoot(root);
-  } else {
-    result = root;
-  }
-  setCurrentRenderingInstance(prev);
-  return result;
-}
-const getChildRoot = (vnode) => {
-  const rawChildren = vnode.children;
-  const dynamicChildren = vnode.dynamicChildren;
-  const childRoot = filterSingleRoot(rawChildren, false);
-  if (!childRoot) {
-    return [vnode, void 0];
-  } else if (childRoot.patchFlag > 0 && childRoot.patchFlag & 2048) {
-    return getChildRoot(childRoot);
-  }
-  const index = rawChildren.indexOf(childRoot);
-  const dynamicIndex = dynamicChildren ? dynamicChildren.indexOf(childRoot) : -1;
-  const setRoot = (updatedRoot) => {
-    rawChildren[index] = updatedRoot;
-    if (dynamicChildren) {
-      if (dynamicIndex > -1) {
-        dynamicChildren[dynamicIndex] = updatedRoot;
-      } else if (updatedRoot.patchFlag > 0) {
-        vnode.dynamicChildren = [...dynamicChildren, updatedRoot];
-      }
-    }
-  };
-  return [normalizeVNode(childRoot), setRoot];
-};
-function filterSingleRoot(children, recurse = true) {
-  let singleRoot;
-  for (let i = 0; i < children.length; i++) {
-    const child = children[i];
-    if (isVNode(child)) {
-      if (child.type !== Comment || child.children === "v-if") {
-        if (singleRoot) {
-          return;
-        } else {
-          singleRoot = child;
-          if (recurse && singleRoot.patchFlag > 0 && singleRoot.patchFlag & 2048) {
-            return filterSingleRoot(singleRoot.children);
-          }
-        }
-      }
-    } else {
-      return;
-    }
-  }
-  return singleRoot;
-}
-const getFunctionalFallthrough = (attrs) => {
-  let res;
-  for (const key in attrs) {
-    if (key === "class" || key === "style" || isOn(key)) {
-      (res || (res = {}))[key] = attrs[key];
-    }
-  }
-  return res;
-};
-const filterModelListeners = (attrs, props) => {
-  const res = {};
-  for (const key in attrs) {
-    if (!isModelListener(key) || !(key.slice(9) in props)) {
-      res[key] = attrs[key];
-    }
-  }
-  return res;
-};
-const isElementRoot = (vnode) => {
-  return vnode.shapeFlag & (6 | 1) || vnode.type === Comment;
-};
-function shouldUpdateComponent(prevVNode, nextVNode, optimized) {
-  const { props: prevProps, children: prevChildren, component } = prevVNode;
-  const { props: nextProps, children: nextChildren, patchFlag } = nextVNode;
-  const emits = component.emitsOptions;
-  if ((prevChildren || nextChildren) && isHmrUpdating) {
-    return true;
-  }
-  if (nextVNode.dirs || nextVNode.transition) {
-    return true;
-  }
-  if (optimized && patchFlag >= 0) {
-    if (patchFlag & 1024) {
-      return true;
-    }
-    if (patchFlag & 16) {
-      if (!prevProps) {
-        return !!nextProps;
-      }
-      return hasPropsChanged(prevProps, nextProps, emits);
-    } else if (patchFlag & 8) {
-      const dynamicProps = nextVNode.dynamicProps;
-      for (let i = 0; i < dynamicProps.length; i++) {
-        const key = dynamicProps[i];
-        if (nextProps[key] !== prevProps[key] && !isEmitListener(emits, key)) {
-          return true;
-        }
-      }
-    }
-  } else {
-    if (prevChildren || nextChildren) {
-      if (!nextChildren || !nextChildren.$stable) {
-        return true;
-      }
-    }
-    if (prevProps === nextProps) {
-      return false;
-    }
-    if (!prevProps) {
-      return !!nextProps;
-    }
-    if (!nextProps) {
-      return true;
-    }
-    return hasPropsChanged(prevProps, nextProps, emits);
-  }
-  return false;
-}
-function hasPropsChanged(prevProps, nextProps, emitsOptions) {
-  const nextKeys = Object.keys(nextProps);
-  if (nextKeys.length !== Object.keys(prevProps).length) {
-    return true;
-  }
-  for (let i = 0; i < nextKeys.length; i++) {
-    const key = nextKeys[i];
-    if (nextProps[key] !== prevProps[key] && !isEmitListener(emitsOptions, key)) {
-      return true;
-    }
-  }
-  return false;
-}
-function updateHOCHostEl({ vnode, parent }, el) {
-  while (parent) {
-    const root = parent.subTree;
-    if (root.suspense && root.suspense.activeBranch === vnode) {
-      root.el = vnode.el;
-    }
-    if (root === vnode) {
-      (vnode = parent.vnode).el = el;
-      parent = parent.parent;
-    } else {
-      break;
-    }
-  }
-}
-
-const isSuspense = (type) => type.__isSuspense;
-let suspenseId = 0;
-const SuspenseImpl = {
-  name: "Suspense",
-  // In order to make Suspense tree-shakable, we need to avoid importing it
-  // directly in the renderer. The renderer checks for the __isSuspense flag
-  // on a vnode's type and calls the `process` method, passing in renderer
-  // internals.
-  __isSuspense: true,
-  process(n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, rendererInternals) {
-    if (n1 == null) {
-      mountSuspense(
-        n2,
-        container,
-        anchor,
-        parentComponent,
-        parentSuspense,
-        namespace,
-        slotScopeIds,
-        optimized,
-        rendererInternals
-      );
-    } else {
-      if (parentSuspense && parentSuspense.deps > 0 && !n1.suspense.isInFallback) {
-        n2.suspense = n1.suspense;
-        n2.suspense.vnode = n2;
-        n2.el = n1.el;
-        return;
-      }
-      patchSuspense(
-        n1,
-        n2,
-        container,
-        anchor,
-        parentComponent,
-        namespace,
-        slotScopeIds,
-        optimized,
-        rendererInternals
-      );
-    }
-  },
-  hydrate: hydrateSuspense,
-  normalize: normalizeSuspenseChildren
-};
-const Suspense = SuspenseImpl ;
-function triggerEvent(vnode, name) {
-  const eventListener = vnode.props && vnode.props[name];
-  if (isFunction(eventListener)) {
-    eventListener();
-  }
-}
-function mountSuspense(vnode, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, rendererInternals) {
-  const {
-    p: patch,
-    o: { createElement }
-  } = rendererInternals;
-  const hiddenContainer = createElement("div");
-  const suspense = vnode.suspense = createSuspenseBoundary(
-    vnode,
-    parentSuspense,
-    parentComponent,
-    container,
-    hiddenContainer,
-    anchor,
-    namespace,
-    slotScopeIds,
-    optimized,
-    rendererInternals
-  );
-  patch(
-    null,
-    suspense.pendingBranch = vnode.ssContent,
-    hiddenContainer,
-    null,
-    parentComponent,
-    suspense,
-    namespace,
-    slotScopeIds
-  );
-  if (suspense.deps > 0) {
-    triggerEvent(vnode, "onPending");
-    triggerEvent(vnode, "onFallback");
-    patch(
-      null,
-      vnode.ssFallback,
-      container,
-      anchor,
-      parentComponent,
-      null,
-      // fallback tree will not have suspense context
-      namespace,
-      slotScopeIds
-    );
-    setActiveBranch(suspense, vnode.ssFallback);
-  } else {
-    suspense.resolve(false, true);
-  }
-}
-function patchSuspense(n1, n2, container, anchor, parentComponent, namespace, slotScopeIds, optimized, { p: patch, um: unmount, o: { createElement } }) {
-  const suspense = n2.suspense = n1.suspense;
-  suspense.vnode = n2;
-  n2.el = n1.el;
-  const newBranch = n2.ssContent;
-  const newFallback = n2.ssFallback;
-  const { activeBranch, pendingBranch, isInFallback, isHydrating } = suspense;
-  if (pendingBranch) {
-    suspense.pendingBranch = newBranch;
-    if (isSameVNodeType(newBranch, pendingBranch)) {
-      patch(
-        pendingBranch,
-        newBranch,
-        suspense.hiddenContainer,
-        null,
-        parentComponent,
-        suspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-      if (suspense.deps <= 0) {
-        suspense.resolve();
-      } else if (isInFallback) {
-        if (!isHydrating) {
-          patch(
-            activeBranch,
-            newFallback,
-            container,
-            anchor,
-            parentComponent,
-            null,
-            // fallback tree will not have suspense context
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-          setActiveBranch(suspense, newFallback);
-        }
-      }
-    } else {
-      suspense.pendingId = suspenseId++;
-      if (isHydrating) {
-        suspense.isHydrating = false;
-        suspense.activeBranch = pendingBranch;
-      } else {
-        unmount(pendingBranch, parentComponent, suspense);
-      }
-      suspense.deps = 0;
-      suspense.effects.length = 0;
-      suspense.hiddenContainer = createElement("div");
-      if (isInFallback) {
-        patch(
-          null,
-          newBranch,
-          suspense.hiddenContainer,
-          null,
-          parentComponent,
-          suspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-        if (suspense.deps <= 0) {
-          suspense.resolve();
-        } else {
-          patch(
-            activeBranch,
-            newFallback,
-            container,
-            anchor,
-            parentComponent,
-            null,
-            // fallback tree will not have suspense context
-            namespace,
-            slotScopeIds,
-            optimized
-          );
-          setActiveBranch(suspense, newFallback);
-        }
-      } else if (activeBranch && isSameVNodeType(newBranch, activeBranch)) {
-        patch(
-          activeBranch,
-          newBranch,
-          container,
-          anchor,
-          parentComponent,
-          suspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-        suspense.resolve(true);
-      } else {
-        patch(
-          null,
-          newBranch,
-          suspense.hiddenContainer,
-          null,
-          parentComponent,
-          suspense,
-          namespace,
-          slotScopeIds,
-          optimized
-        );
-        if (suspense.deps <= 0) {
-          suspense.resolve();
-        }
-      }
-    }
-  } else {
-    if (activeBranch && isSameVNodeType(newBranch, activeBranch)) {
-      patch(
-        activeBranch,
-        newBranch,
-        container,
-        anchor,
-        parentComponent,
-        suspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-      setActiveBranch(suspense, newBranch);
-    } else {
-      triggerEvent(n2, "onPending");
-      suspense.pendingBranch = newBranch;
-      if (newBranch.shapeFlag & 512) {
-        suspense.pendingId = newBranch.component.suspenseId;
-      } else {
-        suspense.pendingId = suspenseId++;
-      }
-      patch(
-        null,
-        newBranch,
-        suspense.hiddenContainer,
-        null,
-        parentComponent,
-        suspense,
-        namespace,
-        slotScopeIds,
-        optimized
-      );
-      if (suspense.deps <= 0) {
-        suspense.resolve();
-      } else {
-        const { timeout, pendingId } = suspense;
-        if (timeout > 0) {
-          setTimeout(() => {
-            if (suspense.pendingId === pendingId) {
-              suspense.fallback(newFallback);
-            }
-          }, timeout);
-        } else if (timeout === 0) {
-          suspense.fallback(newFallback);
-        }
-      }
-    }
-  }
-}
-let hasWarned = false;
-function createSuspenseBoundary(vnode, parentSuspense, parentComponent, container, hiddenContainer, anchor, namespace, slotScopeIds, optimized, rendererInternals, isHydrating = false) {
-  if (!hasWarned) {
-    hasWarned = true;
-    console[console.info ? "info" : "log"](
-      `<Suspense> is an experimental feature and its API will likely change.`
-    );
-  }
-  const {
-    p: patch,
-    m: move,
-    um: unmount,
-    n: next,
-    o: { parentNode, remove }
-  } = rendererInternals;
-  let parentSuspenseId;
-  const isSuspensible = isVNodeSuspensible(vnode);
-  if (isSuspensible) {
-    if (parentSuspense && parentSuspense.pendingBranch) {
-      parentSuspenseId = parentSuspense.pendingId;
-      parentSuspense.deps++;
-    }
-  }
-  const timeout = vnode.props ? toNumber(vnode.props.timeout) : void 0;
-  {
-    assertNumber(timeout, `Suspense timeout`);
-  }
-  const initialAnchor = anchor;
-  const suspense = {
-    vnode,
-    parent: parentSuspense,
-    parentComponent,
-    namespace,
-    container,
-    hiddenContainer,
-    deps: 0,
-    pendingId: suspenseId++,
-    timeout: typeof timeout === "number" ? timeout : -1,
-    activeBranch: null,
-    pendingBranch: null,
-    isInFallback: !isHydrating,
-    isHydrating,
-    isUnmounted: false,
-    effects: [],
-    resolve(resume = false, sync = false) {
-      {
-        if (!resume && !suspense.pendingBranch) {
-          throw new Error(
-            `suspense.resolve() is called without a pending branch.`
-          );
-        }
-        if (suspense.isUnmounted) {
-          throw new Error(
-            `suspense.resolve() is called on an already unmounted suspense boundary.`
-          );
-        }
-      }
-      const {
-        vnode: vnode2,
-        activeBranch,
-        pendingBranch,
-        pendingId,
-        effects,
-        parentComponent: parentComponent2,
-        container: container2
-      } = suspense;
-      let delayEnter = false;
-      if (suspense.isHydrating) {
-        suspense.isHydrating = false;
-      } else if (!resume) {
-        delayEnter = activeBranch && pendingBranch.transition && pendingBranch.transition.mode === "out-in";
-        if (delayEnter) {
-          activeBranch.transition.afterLeave = () => {
-            if (pendingId === suspense.pendingId) {
-              move(
-                pendingBranch,
-                container2,
-                anchor === initialAnchor ? next(activeBranch) : anchor,
-                0
-              );
-              queuePostFlushCb(effects);
-            }
-          };
-        }
-        if (activeBranch) {
-          if (parentNode(activeBranch.el) === container2) {
-            anchor = next(activeBranch);
-          }
-          unmount(activeBranch, parentComponent2, suspense, true);
-        }
-        if (!delayEnter) {
-          move(pendingBranch, container2, anchor, 0);
-        }
-      }
-      setActiveBranch(suspense, pendingBranch);
-      suspense.pendingBranch = null;
-      suspense.isInFallback = false;
-      let parent = suspense.parent;
-      let hasUnresolvedAncestor = false;
-      while (parent) {
-        if (parent.pendingBranch) {
-          parent.effects.push(...effects);
-          hasUnresolvedAncestor = true;
-          break;
-        }
-        parent = parent.parent;
-      }
-      if (!hasUnresolvedAncestor && !delayEnter) {
-        queuePostFlushCb(effects);
-      }
-      suspense.effects = [];
-      if (isSuspensible) {
-        if (parentSuspense && parentSuspense.pendingBranch && parentSuspenseId === parentSuspense.pendingId) {
-          parentSuspense.deps--;
-          if (parentSuspense.deps === 0 && !sync) {
-            parentSuspense.resolve();
-          }
-        }
-      }
-      triggerEvent(vnode2, "onResolve");
-    },
-    fallback(fallbackVNode) {
-      if (!suspense.pendingBranch) {
-        return;
-      }
-      const { vnode: vnode2, activeBranch, parentComponent: parentComponent2, container: container2, namespace: namespace2 } = suspense;
-      triggerEvent(vnode2, "onFallback");
-      const anchor2 = next(activeBranch);
-      const mountFallback = () => {
-        if (!suspense.isInFallback) {
-          return;
-        }
-        patch(
-          null,
-          fallbackVNode,
-          container2,
-          anchor2,
-          parentComponent2,
-          null,
-          // fallback tree will not have suspense context
-          namespace2,
-          slotScopeIds,
-          optimized
-        );
-        setActiveBranch(suspense, fallbackVNode);
-      };
-      const delayEnter = fallbackVNode.transition && fallbackVNode.transition.mode === "out-in";
-      if (delayEnter) {
-        activeBranch.transition.afterLeave = mountFallback;
-      }
-      suspense.isInFallback = true;
-      unmount(
-        activeBranch,
-        parentComponent2,
-        null,
-        // no suspense so unmount hooks fire now
-        true
-        // shouldRemove
-      );
-      if (!delayEnter) {
-        mountFallback();
-      }
-    },
-    move(container2, anchor2, type) {
-      suspense.activeBranch && move(suspense.activeBranch, container2, anchor2, type);
-      suspense.container = container2;
-    },
-    next() {
-      return suspense.activeBranch && next(suspense.activeBranch);
-    },
-    registerDep(instance, setupRenderEffect, optimized2) {
-      const isInPendingSuspense = !!suspense.pendingBranch;
-      if (isInPendingSuspense) {
-        suspense.deps++;
-      }
-      const hydratedEl = instance.vnode.el;
-      instance.asyncDep.catch((err) => {
-        handleError(err, instance, 0);
-      }).then((asyncSetupResult) => {
-        if (instance.isUnmounted || suspense.isUnmounted || suspense.pendingId !== instance.suspenseId) {
-          return;
-        }
-        instance.asyncResolved = true;
-        const { vnode: vnode2 } = instance;
-        {
-          pushWarningContext(vnode2);
-        }
-        handleSetupResult(instance, asyncSetupResult, false);
-        if (hydratedEl) {
-          vnode2.el = hydratedEl;
-        }
-        const placeholder = !hydratedEl && instance.subTree.el;
-        setupRenderEffect(
-          instance,
-          vnode2,
-          // component may have been moved before resolve.
-          // if this is not a hydration, instance.subTree will be the comment
-          // placeholder.
-          parentNode(hydratedEl || instance.subTree.el),
-          // anchor will not be used if this is hydration, so only need to
-          // consider the comment placeholder case.
-          hydratedEl ? null : next(instance.subTree),
-          suspense,
-          namespace,
-          optimized2
-        );
-        if (placeholder) {
-          remove(placeholder);
-        }
-        updateHOCHostEl(instance, vnode2.el);
-        {
-          popWarningContext();
-        }
-        if (isInPendingSuspense && --suspense.deps === 0) {
-          suspense.resolve();
-        }
-      });
-    },
-    unmount(parentSuspense2, doRemove) {
-      suspense.isUnmounted = true;
-      if (suspense.activeBranch) {
-        unmount(
-          suspense.activeBranch,
-          parentComponent,
-          parentSuspense2,
-          doRemove
-        );
-      }
-      if (suspense.pendingBranch) {
-        unmount(
-          suspense.pendingBranch,
-          parentComponent,
-          parentSuspense2,
-          doRemove
-        );
-      }
-    }
-  };
-  return suspense;
-}
-function hydrateSuspense(node, vnode, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, rendererInternals, hydrateNode) {
-  const suspense = vnode.suspense = createSuspenseBoundary(
-    vnode,
-    parentSuspense,
-    parentComponent,
-    node.parentNode,
-    // eslint-disable-next-line no-restricted-globals
-    document.createElement("div"),
-    null,
-    namespace,
-    slotScopeIds,
-    optimized,
-    rendererInternals,
-    true
-  );
-  const result = hydrateNode(
-    node,
-    suspense.pendingBranch = vnode.ssContent,
-    parentComponent,
-    suspense,
-    slotScopeIds,
-    optimized
-  );
-  if (suspense.deps === 0) {
-    suspense.resolve(false, true);
-  }
-  return result;
-}
-function normalizeSuspenseChildren(vnode) {
-  const { shapeFlag, children } = vnode;
-  const isSlotChildren = shapeFlag & 32;
-  vnode.ssContent = normalizeSuspenseSlot(
-    isSlotChildren ? children.default : children
-  );
-  vnode.ssFallback = isSlotChildren ? normalizeSuspenseSlot(children.fallback) : createVNode(Comment);
-}
-function normalizeSuspenseSlot(s) {
-  let block;
-  if (isFunction(s)) {
-    const trackBlock = isBlockTreeEnabled && s._c;
-    if (trackBlock) {
-      s._d = false;
-      openBlock();
-    }
-    s = s();
-    if (trackBlock) {
-      s._d = true;
-      block = currentBlock;
-      closeBlock();
-    }
-  }
-  if (isArray(s)) {
-    const singleChild = filterSingleRoot(s);
-    if (!singleChild && s.filter((child) => child !== NULL_DYNAMIC_COMPONENT).length > 0) {
-      warn$1(`<Suspense> slots expect a single root node.`);
-    }
-    s = singleChild;
-  }
-  s = normalizeVNode(s);
-  if (block && !s.dynamicChildren) {
-    s.dynamicChildren = block.filter((c) => c !== s);
-  }
-  return s;
-}
-function queueEffectWithSuspense(fn, suspense) {
-  if (suspense && suspense.pendingBranch) {
-    if (isArray(fn)) {
-      suspense.effects.push(...fn);
-    } else {
-      suspense.effects.push(fn);
-    }
-  } else {
-    queuePostFlushCb(fn);
-  }
-}
-function setActiveBranch(suspense, branch) {
-  suspense.activeBranch = branch;
-  const { vnode, parentComponent } = suspense;
-  let el = branch.el;
-  while (!el && branch.component) {
-    branch = branch.component.subTree;
-    el = branch.el;
-  }
-  vnode.el = el;
-  if (parentComponent && parentComponent.subTree === vnode) {
-    parentComponent.vnode.el = el;
-    updateHOCHostEl(parentComponent, el);
-  }
-}
-function isVNodeSuspensible(vnode) {
-  const suspensible = vnode.props && vnode.props.suspensible;
-  return suspensible != null && suspensible !== false;
-}
-
-const Fragment = Symbol.for("v-fgt");
-const Text = Symbol.for("v-txt");
-const Comment = Symbol.for("v-cmt");
-const Static = Symbol.for("v-stc");
-const blockStack = [];
-let currentBlock = null;
-function openBlock(disableTracking = false) {
-  blockStack.push(currentBlock = disableTracking ? null : []);
-}
-function closeBlock() {
-  blockStack.pop();
-  currentBlock = blockStack[blockStack.length - 1] || null;
-}
-let isBlockTreeEnabled = 1;
-function setBlockTracking(value) {
-  isBlockTreeEnabled += value;
-  if (value < 0 && currentBlock) {
-    currentBlock.hasOnce = true;
-  }
-}
-function setupBlock(vnode) {
-  vnode.dynamicChildren = isBlockTreeEnabled > 0 ? currentBlock || EMPTY_ARR : null;
-  closeBlock();
-  if (isBlockTreeEnabled > 0 && currentBlock) {
-    currentBlock.push(vnode);
-  }
-  return vnode;
-}
-function createElementBlock(type, props, children, patchFlag, dynamicProps, shapeFlag) {
-  return setupBlock(
-    createBaseVNode(
-      type,
-      props,
-      children,
-      patchFlag,
-      dynamicProps,
-      shapeFlag,
-      true
-    )
-  );
-}
-function createBlock(type, props, children, patchFlag, dynamicProps) {
-  return setupBlock(
-    createVNode(
-      type,
-      props,
-      children,
-      patchFlag,
-      dynamicProps,
-      true
-    )
-  );
-}
-function isVNode(value) {
-  return value ? value.__v_isVNode === true : false;
-}
-function isSameVNodeType(n1, n2) {
-  if (n2.shapeFlag & 6 && n1.component) {
-    const dirtyInstances = hmrDirtyComponents.get(n2.type);
-    if (dirtyInstances && dirtyInstances.has(n1.component)) {
-      n1.shapeFlag &= ~256;
-      n2.shapeFlag &= ~512;
-      return false;
-    }
-  }
-  return n1.type === n2.type && n1.key === n2.key;
-}
-let vnodeArgsTransformer;
-function transformVNodeArgs(transformer) {
-  vnodeArgsTransformer = transformer;
-}
-const createVNodeWithArgsTransform = (...args) => {
-  return _createVNode(
-    ...vnodeArgsTransformer ? vnodeArgsTransformer(args, currentRenderingInstance) : args
-  );
-};
-const normalizeKey = ({ key }) => key != null ? key : null;
-const normalizeRef = ({
-  ref,
-  ref_key,
-  ref_for
-}) => {
-  if (typeof ref === "number") {
-    ref = "" + ref;
-  }
-  return ref != null ? isString(ref) || isRef(ref) || isFunction(ref) ? { i: currentRenderingInstance, r: ref, k: ref_key, f: !!ref_for } : ref : null;
-};
-function createBaseVNode(type, props = null, children = null, patchFlag = 0, dynamicProps = null, shapeFlag = type === Fragment ? 0 : 1, isBlockNode = false, needFullChildrenNormalization = false) {
-  const vnode = {
-    __v_isVNode: true,
-    __v_skip: true,
-    type,
-    props,
-    key: props && normalizeKey(props),
-    ref: props && normalizeRef(props),
-    scopeId: currentScopeId,
-    slotScopeIds: null,
-    children,
-    component: null,
-    suspense: null,
-    ssContent: null,
-    ssFallback: null,
-    dirs: null,
-    transition: null,
-    el: null,
-    anchor: null,
-    target: null,
-    targetStart: null,
-    targetAnchor: null,
-    staticCount: 0,
-    shapeFlag,
-    patchFlag,
-    dynamicProps,
-    dynamicChildren: null,
-    appContext: null,
-    ctx: currentRenderingInstance
-  };
-  if (needFullChildrenNormalization) {
-    normalizeChildren(vnode, children);
-    if (shapeFlag & 128) {
-      type.normalize(vnode);
-    }
-  } else if (children) {
-    vnode.shapeFlag |= isString(children) ? 8 : 16;
-  }
-  if (vnode.key !== vnode.key) {
-    warn$1(`VNode created with invalid key (NaN). VNode type:`, vnode.type);
-  }
-  if (isBlockTreeEnabled > 0 && // avoid a block node from tracking itself
-  !isBlockNode && // has current parent block
-  currentBlock && // presence of a patch flag indicates this node needs patching on updates.
-  // component nodes also should always be patched, because even if the
-  // component doesn't need to update, it needs to persist the instance on to
-  // the next vnode so that it can be properly unmounted later.
-  (vnode.patchFlag > 0 || shapeFlag & 6) && // the EVENTS flag is only for hydration and if it is the only flag, the
-  // vnode should not be considered dynamic due to handler caching.
-  vnode.patchFlag !== 32) {
-    currentBlock.push(vnode);
-  }
-  return vnode;
-}
-const createVNode = createVNodeWithArgsTransform ;
-function _createVNode(type, props = null, children = null, patchFlag = 0, dynamicProps = null, isBlockNode = false) {
-  if (!type || type === NULL_DYNAMIC_COMPONENT) {
-    if (!type) {
-      warn$1(`Invalid vnode type when creating vnode: ${type}.`);
-    }
-    type = Comment;
-  }
-  if (isVNode(type)) {
-    const cloned = cloneVNode(
-      type,
-      props,
-      true
-      /* mergeRef: true */
-    );
-    if (children) {
-      normalizeChildren(cloned, children);
-    }
-    if (isBlockTreeEnabled > 0 && !isBlockNode && currentBlock) {
-      if (cloned.shapeFlag & 6) {
-        currentBlock[currentBlock.indexOf(type)] = cloned;
-      } else {
-        currentBlock.push(cloned);
-      }
-    }
-    cloned.patchFlag = -2;
-    return cloned;
-  }
-  if (isClassComponent(type)) {
-    type = type.__vccOpts;
-  }
-  if (props) {
-    props = guardReactiveProps(props);
-    let { class: klass, style } = props;
-    if (klass && !isString(klass)) {
-      props.class = normalizeClass(klass);
-    }
-    if (isObject(style)) {
-      if (isProxy(style) && !isArray(style)) {
-        style = extend({}, style);
-      }
-      props.style = normalizeStyle(style);
-    }
-  }
-  const shapeFlag = isString(type) ? 1 : isSuspense(type) ? 128 : isTeleport(type) ? 64 : isObject(type) ? 4 : isFunction(type) ? 2 : 0;
-  if (shapeFlag & 4 && isProxy(type)) {
-    type = toRaw(type);
-    warn$1(
-      `Vue received a Component that was made a reactive object. This can lead to unnecessary performance overhead and should be avoided by marking the component with \`markRaw\` or using \`shallowRef\` instead of \`ref\`.`,
-      `
-Component that was made reactive: `,
-      type
-    );
-  }
-  return createBaseVNode(
-    type,
-    props,
-    children,
-    patchFlag,
-    dynamicProps,
-    shapeFlag,
-    isBlockNode,
-    true
-  );
-}
-function guardReactiveProps(props) {
-  if (!props) return null;
-  return isProxy(props) || isInternalObject(props) ? extend({}, props) : props;
-}
-function cloneVNode(vnode, extraProps, mergeRef = false, cloneTransition = false) {
-  const { props, ref, patchFlag, children, transition } = vnode;
-  const mergedProps = extraProps ? mergeProps(props || {}, extraProps) : props;
-  const cloned = {
-    __v_isVNode: true,
-    __v_skip: true,
-    type: vnode.type,
-    props: mergedProps,
-    key: mergedProps && normalizeKey(mergedProps),
-    ref: extraProps && extraProps.ref ? (
-      // #2078 in the case of <component :is="vnode" ref="extra"/>
-      // if the vnode itself already has a ref, cloneVNode will need to merge
-      // the refs so the single vnode can be set on multiple refs
-      mergeRef && ref ? isArray(ref) ? ref.concat(normalizeRef(extraProps)) : [ref, normalizeRef(extraProps)] : normalizeRef(extraProps)
-    ) : ref,
-    scopeId: vnode.scopeId,
-    slotScopeIds: vnode.slotScopeIds,
-    children: patchFlag === -1 && isArray(children) ? children.map(deepCloneVNode) : children,
-    target: vnode.target,
-    targetStart: vnode.targetStart,
-    targetAnchor: vnode.targetAnchor,
-    staticCount: vnode.staticCount,
-    shapeFlag: vnode.shapeFlag,
-    // if the vnode is cloned with extra props, we can no longer assume its
-    // existing patch flag to be reliable and need to add the FULL_PROPS flag.
-    // note: preserve flag for fragments since they use the flag for children
-    // fast paths only.
-    patchFlag: extraProps && vnode.type !== Fragment ? patchFlag === -1 ? 16 : patchFlag | 16 : patchFlag,
-    dynamicProps: vnode.dynamicProps,
-    dynamicChildren: vnode.dynamicChildren,
-    appContext: vnode.appContext,
-    dirs: vnode.dirs,
-    transition,
-    // These should technically only be non-null on mounted VNodes. However,
-    // they *should* be copied for kept-alive vnodes. So we just always copy
-    // them since them being non-null during a mount doesn't affect the logic as
-    // they will simply be overwritten.
-    component: vnode.component,
-    suspense: vnode.suspense,
-    ssContent: vnode.ssContent && cloneVNode(vnode.ssContent),
-    ssFallback: vnode.ssFallback && cloneVNode(vnode.ssFallback),
-    el: vnode.el,
-    anchor: vnode.anchor,
-    ctx: vnode.ctx,
-    ce: vnode.ce
-  };
-  if (transition && cloneTransition) {
-    setTransitionHooks(
-      cloned,
-      transition.clone(cloned)
-    );
-  }
-  return cloned;
-}
-function deepCloneVNode(vnode) {
-  const cloned = cloneVNode(vnode);
-  if (isArray(vnode.children)) {
-    cloned.children = vnode.children.map(deepCloneVNode);
-  }
-  return cloned;
-}
-function createTextVNode(text = " ", flag = 0) {
-  return createVNode(Text, null, text, flag);
-}
-function createStaticVNode(content, numberOfNodes) {
-  const vnode = createVNode(Static, null, content);
-  vnode.staticCount = numberOfNodes;
-  return vnode;
-}
-function createCommentVNode(text = "", asBlock = false) {
-  return asBlock ? (openBlock(), createBlock(Comment, null, text)) : createVNode(Comment, null, text);
-}
-function normalizeVNode(child) {
-  if (child == null || typeof child === "boolean") {
-    return createVNode(Comment);
-  } else if (isArray(child)) {
-    return createVNode(
-      Fragment,
-      null,
-      // #3666, avoid reference pollution when reusing vnode
-      child.slice()
-    );
-  } else if (isVNode(child)) {
-    return cloneIfMounted(child);
-  } else {
-    return createVNode(Text, null, String(child));
-  }
-}
-function cloneIfMounted(child) {
-  return child.el === null && child.patchFlag !== -1 || child.memo ? child : cloneVNode(child);
-}
-function normalizeChildren(vnode, children) {
-  let type = 0;
-  const { shapeFlag } = vnode;
-  if (children == null) {
-    children = null;
-  } else if (isArray(children)) {
-    type = 16;
-  } else if (typeof children === "object") {
-    if (shapeFlag & (1 | 64)) {
-      const slot = children.default;
-      if (slot) {
-        slot._c && (slot._d = false);
-        normalizeChildren(vnode, slot());
-        slot._c && (slot._d = true);
-      }
-      return;
-    } else {
-      type = 32;
-      const slotFlag = children._;
-      if (!slotFlag && !isInternalObject(children)) {
-        children._ctx = currentRenderingInstance;
-      } else if (slotFlag === 3 && currentRenderingInstance) {
-        if (currentRenderingInstance.slots._ === 1) {
-          children._ = 1;
-        } else {
-          children._ = 2;
-          vnode.patchFlag |= 1024;
-        }
-      }
-    }
-  } else if (isFunction(children)) {
-    children = { default: children, _ctx: currentRenderingInstance };
-    type = 32;
-  } else {
-    children = String(children);
-    if (shapeFlag & 64) {
-      type = 16;
-      children = [createTextVNode(children)];
-    } else {
-      type = 8;
-    }
-  }
-  vnode.children = children;
-  vnode.shapeFlag |= type;
-}
-function mergeProps(...args) {
-  const ret = {};
-  for (let i = 0; i < args.length; i++) {
-    const toMerge = args[i];
-    for (const key in toMerge) {
-      if (key === "class") {
-        if (ret.class !== toMerge.class) {
-          ret.class = normalizeClass([ret.class, toMerge.class]);
-        }
-      } else if (key === "style") {
-        ret.style = normalizeStyle([ret.style, toMerge.style]);
-      } else if (isOn(key)) {
-        const existing = ret[key];
-        const incoming = toMerge[key];
-        if (incoming && existing !== incoming && !(isArray(existing) && existing.includes(incoming))) {
-          ret[key] = existing ? [].concat(existing, incoming) : incoming;
-        }
-      } else if (key !== "") {
-        ret[key] = toMerge[key];
-      }
-    }
-  }
-  return ret;
-}
-function invokeVNodeHook(hook, instance, vnode, prevVNode = null) {
-  callWithAsyncErrorHandling(hook, instance, 7, [
-    vnode,
-    prevVNode
-  ]);
-}
-
-const emptyAppContext = createAppContext();
-let uid = 0;
-function createComponentInstance(vnode, parent, suspense) {
-  const type = vnode.type;
-  const appContext = (parent ? parent.appContext : vnode.appContext) || emptyAppContext;
-  const instance = {
-    uid: uid++,
-    vnode,
-    type,
-    parent,
-    appContext,
-    root: null,
-    // to be immediately set
-    next: null,
-    subTree: null,
-    // will be set synchronously right after creation
-    effect: null,
-    update: null,
-    // will be set synchronously right after creation
-    job: null,
-    scope: new EffectScope(
-      true
-      /* detached */
-    ),
-    render: null,
-    proxy: null,
-    exposed: null,
-    exposeProxy: null,
-    withProxy: null,
-    provides: parent ? parent.provides : Object.create(appContext.provides),
-    ids: parent ? parent.ids : ["", 0, 0],
-    accessCache: null,
-    renderCache: [],
-    // local resolved assets
-    components: null,
-    directives: null,
-    // resolved props and emits options
-    propsOptions: normalizePropsOptions(type, appContext),
-    emitsOptions: normalizeEmitsOptions(type, appContext),
-    // emit
-    emit: null,
-    // to be set immediately
-    emitted: null,
-    // props default value
-    propsDefaults: EMPTY_OBJ,
-    // inheritAttrs
-    inheritAttrs: type.inheritAttrs,
-    // state
-    ctx: EMPTY_OBJ,
-    data: EMPTY_OBJ,
-    props: EMPTY_OBJ,
-    attrs: EMPTY_OBJ,
-    slots: EMPTY_OBJ,
-    refs: EMPTY_OBJ,
-    setupState: EMPTY_OBJ,
-    setupContext: null,
-    // suspense related
-    suspense,
-    suspenseId: suspense ? suspense.pendingId : 0,
-    asyncDep: null,
-    asyncResolved: false,
-    // lifecycle hooks
-    // not using enums here because it results in computed properties
-    isMounted: false,
-    isUnmounted: false,
-    isDeactivated: false,
-    bc: null,
-    c: null,
-    bm: null,
-    m: null,
-    bu: null,
-    u: null,
-    um: null,
-    bum: null,
-    da: null,
-    a: null,
-    rtg: null,
-    rtc: null,
-    ec: null,
-    sp: null
-  };
-  {
-    instance.ctx = createDevRenderContext(instance);
-  }
-  instance.root = parent ? parent.root : instance;
-  instance.emit = emit.bind(null, instance);
-  if (vnode.ce) {
-    vnode.ce(instance);
-  }
-  return instance;
-}
-let currentInstance = null;
-const getCurrentInstance = () => currentInstance || currentRenderingInstance;
-let internalSetCurrentInstance;
-let setInSSRSetupState;
-{
-  const g = getGlobalThis();
-  const registerGlobalSetter = (key, setter) => {
-    let setters;
-    if (!(setters = g[key])) setters = g[key] = [];
-    setters.push(setter);
-    return (v) => {
-      if (setters.length > 1) setters.forEach((set) => set(v));
-      else setters[0](v);
-    };
-  };
-  internalSetCurrentInstance = registerGlobalSetter(
-    `__VUE_INSTANCE_SETTERS__`,
-    (v) => currentInstance = v
-  );
-  setInSSRSetupState = registerGlobalSetter(
-    `__VUE_SSR_SETTERS__`,
-    (v) => isInSSRComponentSetup = v
-  );
-}
-const setCurrentInstance = (instance) => {
-  const prev = currentInstance;
-  internalSetCurrentInstance(instance);
-  instance.scope.on();
-  return () => {
-    instance.scope.off();
-    internalSetCurrentInstance(prev);
-  };
-};
-const unsetCurrentInstance = () => {
-  currentInstance && currentInstance.scope.off();
-  internalSetCurrentInstance(null);
-};
-const isBuiltInTag = /* @__PURE__ */ makeMap("slot,component");
-function validateComponentName(name, { isNativeTag }) {
-  if (isBuiltInTag(name) || isNativeTag(name)) {
-    warn$1(
-      "Do not use built-in or reserved HTML elements as component id: " + name
-    );
-  }
-}
-function isStatefulComponent(instance) {
-  return instance.vnode.shapeFlag & 4;
-}
-let isInSSRComponentSetup = false;
-function setupComponent(instance, isSSR = false, optimized = false) {
-  isSSR && setInSSRSetupState(isSSR);
-  const { props, children } = instance.vnode;
-  const isStateful = isStatefulComponent(instance);
-  initProps(instance, props, isStateful, isSSR);
-  initSlots(instance, children, optimized);
-  const setupResult = isStateful ? setupStatefulComponent(instance, isSSR) : void 0;
-  isSSR && setInSSRSetupState(false);
-  return setupResult;
-}
-function setupStatefulComponent(instance, isSSR) {
-  var _a;
-  const Component = instance.type;
-  {
-    if (Component.name) {
-      validateComponentName(Component.name, instance.appContext.config);
-    }
-    if (Component.components) {
-      const names = Object.keys(Component.components);
-      for (let i = 0; i < names.length; i++) {
-        validateComponentName(names[i], instance.appContext.config);
-      }
-    }
-    if (Component.directives) {
-      const names = Object.keys(Component.directives);
-      for (let i = 0; i < names.length; i++) {
-        validateDirectiveName(names[i]);
-      }
-    }
-    if (Component.compilerOptions && isRuntimeOnly()) {
-      warn$1(
-        `"compilerOptions" is only supported when using a build of Vue that includes the runtime compiler. Since you are using a runtime-only build, the options should be passed via your build tool config instead.`
-      );
-    }
-  }
-  instance.accessCache = /* @__PURE__ */ Object.create(null);
-  instance.proxy = new Proxy(instance.ctx, PublicInstanceProxyHandlers);
-  {
-    exposePropsOnRenderContext(instance);
-  }
-  const { setup } = Component;
-  if (setup) {
-    pauseTracking();
-    const setupContext = instance.setupContext = setup.length > 1 ? createSetupContext(instance) : null;
-    const reset = setCurrentInstance(instance);
-    const setupResult = callWithErrorHandling(
-      setup,
-      instance,
-      0,
-      [
-        shallowReadonly(instance.props) ,
-        setupContext
-      ]
-    );
-    const isAsyncSetup = isPromise(setupResult);
-    resetTracking();
-    reset();
-    if ((isAsyncSetup || instance.sp) && !isAsyncWrapper(instance)) {
-      markAsyncBoundary(instance);
-    }
-    if (isAsyncSetup) {
-      setupResult.then(unsetCurrentInstance, unsetCurrentInstance);
-      if (isSSR) {
-        return setupResult.then((resolvedResult) => {
-          handleSetupResult(instance, resolvedResult, isSSR);
-        }).catch((e) => {
-          handleError(e, instance, 0);
-        });
-      } else {
-        instance.asyncDep = setupResult;
-        if (!instance.suspense) {
-          const name = (_a = Component.name) != null ? _a : "Anonymous";
-          warn$1(
-            `Component <${name}>: setup function returned a promise, but no <Suspense> boundary was found in the parent component tree. A component with async setup() must be nested in a <Suspense> in order to be rendered.`
-          );
-        }
-      }
-    } else {
-      handleSetupResult(instance, setupResult, isSSR);
-    }
-  } else {
-    finishComponentSetup(instance, isSSR);
-  }
-}
-function handleSetupResult(instance, setupResult, isSSR) {
-  if (isFunction(setupResult)) {
-    if (instance.type.__ssrInlineRender) {
-      instance.ssrRender = setupResult;
-    } else {
-      instance.render = setupResult;
-    }
-  } else if (isObject(setupResult)) {
-    if (isVNode(setupResult)) {
-      warn$1(
-        `setup() should not return VNodes directly - return a render function instead.`
-      );
-    }
-    {
-      instance.devtoolsRawSetupState = setupResult;
-    }
-    instance.setupState = proxyRefs(setupResult);
-    {
-      exposeSetupStateOnRenderContext(instance);
-    }
-  } else if (setupResult !== void 0) {
-    warn$1(
-      `setup() should return an object. Received: ${setupResult === null ? "null" : typeof setupResult}`
-    );
-  }
-  finishComponentSetup(instance, isSSR);
-}
-let compile$1;
-let installWithProxy;
-function registerRuntimeCompiler(_compile) {
-  compile$1 = _compile;
-  installWithProxy = (i) => {
-    if (i.render._rc) {
-      i.withProxy = new Proxy(i.ctx, RuntimeCompiledPublicInstanceProxyHandlers);
-    }
-  };
-}
-const isRuntimeOnly = () => !compile$1;
-function finishComponentSetup(instance, isSSR, skipOptions) {
-  const Component = instance.type;
-  if (!instance.render) {
-    if (!isSSR && compile$1 && !Component.render) {
-      const template = Component.template || resolveMergedOptions(instance).template;
-      if (template) {
-        {
-          startMeasure(instance, `compile`);
-        }
-        const { isCustomElement, compilerOptions } = instance.appContext.config;
-        const { delimiters, compilerOptions: componentCompilerOptions } = Component;
-        const finalCompilerOptions = extend(
-          extend(
-            {
-              isCustomElement,
-              delimiters
-            },
-            compilerOptions
-          ),
-          componentCompilerOptions
-        );
-        Component.render = compile$1(template, finalCompilerOptions);
-        {
-          endMeasure(instance, `compile`);
-        }
-      }
-    }
-    instance.render = Component.render || NOOP;
-    if (installWithProxy) {
-      installWithProxy(instance);
-    }
-  }
-  {
-    const reset = setCurrentInstance(instance);
-    pauseTracking();
-    try {
-      applyOptions(instance);
-    } finally {
-      resetTracking();
-      reset();
-    }
-  }
-  if (!Component.render && instance.render === NOOP && !isSSR) {
-    if (!compile$1 && Component.template) {
-      warn$1(
-        `Component provided template option but runtime compilation is not supported in this build of Vue.` + (` Use "vue.esm-browser.js" instead.` )
-      );
-    } else {
-      warn$1(`Component is missing template or render function: `, Component);
-    }
-  }
-}
-const attrsProxyHandlers = {
-  get(target, key) {
-    markAttrsAccessed();
-    track(target, "get", "");
-    return target[key];
-  },
-  set() {
-    warn$1(`setupContext.attrs is readonly.`);
-    return false;
-  },
-  deleteProperty() {
-    warn$1(`setupContext.attrs is readonly.`);
-    return false;
-  }
-} ;
-function getSlotsProxy(instance) {
-  return new Proxy(instance.slots, {
-    get(target, key) {
-      track(instance, "get", "$slots");
-      return target[key];
-    }
-  });
-}
-function createSetupContext(instance) {
-  const expose = (exposed) => {
-    {
-      if (instance.exposed) {
-        warn$1(`expose() should be called only once per setup().`);
-      }
-      if (exposed != null) {
-        let exposedType = typeof exposed;
-        if (exposedType === "object") {
-          if (isArray(exposed)) {
-            exposedType = "array";
-          } else if (isRef(exposed)) {
-            exposedType = "ref";
-          }
-        }
-        if (exposedType !== "object") {
-          warn$1(
-            `expose() should be passed a plain object, received ${exposedType}.`
-          );
-        }
-      }
-    }
-    instance.exposed = exposed || {};
-  };
-  {
-    let attrsProxy;
-    let slotsProxy;
-    return Object.freeze({
-      get attrs() {
-        return attrsProxy || (attrsProxy = new Proxy(instance.attrs, attrsProxyHandlers));
-      },
-      get slots() {
-        return slotsProxy || (slotsProxy = getSlotsProxy(instance));
-      },
-      get emit() {
-        return (event, ...args) => instance.emit(event, ...args);
-      },
-      expose
-    });
-  }
-}
-function getComponentPublicInstance(instance) {
-  if (instance.exposed) {
-    return instance.exposeProxy || (instance.exposeProxy = new Proxy(proxyRefs(markRaw(instance.exposed)), {
-      get(target, key) {
-        if (key in target) {
-          return target[key];
-        } else if (key in publicPropertiesMap) {
-          return publicPropertiesMap[key](instance);
-        }
-      },
-      has(target, key) {
-        return key in target || key in publicPropertiesMap;
-      }
-    }));
-  } else {
-    return instance.proxy;
-  }
-}
-const classifyRE = /(?:^|[-_])(\w)/g;
-const classify = (str) => str.replace(classifyRE, (c) => c.toUpperCase()).replace(/[-_]/g, "");
-function getComponentName(Component, includeInferred = true) {
-  return isFunction(Component) ? Component.displayName || Component.name : Component.name || includeInferred && Component.__name;
-}
-function formatComponentName(instance, Component, isRoot = false) {
-  let name = getComponentName(Component);
-  if (!name && Component.__file) {
-    const match = Component.__file.match(/([^/\\]+)\.\w+$/);
-    if (match) {
-      name = match[1];
-    }
-  }
-  if (!name && instance && instance.parent) {
-    const inferFromRegistry = (registry) => {
-      for (const key in registry) {
-        if (registry[key] === Component) {
-          return key;
-        }
-      }
-    };
-    name = inferFromRegistry(
-      instance.components || instance.parent.type.components
-    ) || inferFromRegistry(instance.appContext.components);
-  }
-  return name ? classify(name) : isRoot ? `App` : `Anonymous`;
-}
-function isClassComponent(value) {
-  return isFunction(value) && "__vccOpts" in value;
-}
-
-const computed = (getterOrOptions, debugOptions) => {
-  const c = computed$1(getterOrOptions, debugOptions, isInSSRComponentSetup);
-  {
-    const i = getCurrentInstance();
-    if (i && i.appContext.config.warnRecursiveComputed) {
-      c._warnRecursive = true;
-    }
-  }
-  return c;
-};
-
-function h(type, propsOrChildren, children) {
-  const l = arguments.length;
-  if (l === 2) {
-    if (isObject(propsOrChildren) && !isArray(propsOrChildren)) {
-      if (isVNode(propsOrChildren)) {
-        return createVNode(type, null, [propsOrChildren]);
-      }
-      return createVNode(type, propsOrChildren);
-    } else {
-      return createVNode(type, null, propsOrChildren);
-    }
-  } else {
-    if (l > 3) {
-      children = Array.prototype.slice.call(arguments, 2);
-    } else if (l === 3 && isVNode(children)) {
-      children = [children];
-    }
-    return createVNode(type, propsOrChildren, children);
-  }
-}
-
-function initCustomFormatter() {
-  if (typeof window === "undefined") {
-    return;
-  }
-  const vueStyle = { style: "color:#3ba776" };
-  const numberStyle = { style: "color:#1677ff" };
-  const stringStyle = { style: "color:#f5222d" };
-  const keywordStyle = { style: "color:#eb2f96" };
-  const formatter = {
-    __vue_custom_formatter: true,
-    header(obj) {
-      if (!isObject(obj)) {
-        return null;
-      }
-      if (obj.__isVue) {
-        return ["div", vueStyle, `VueInstance`];
-      } else if (isRef(obj)) {
-        return [
-          "div",
-          {},
-          ["span", vueStyle, genRefFlag(obj)],
-          "<",
-          // avoid debugger accessing value affecting behavior
-          formatValue("_value" in obj ? obj._value : obj),
-          `>`
-        ];
-      } else if (isReactive(obj)) {
-        return [
-          "div",
-          {},
-          ["span", vueStyle, isShallow(obj) ? "ShallowReactive" : "Reactive"],
-          "<",
-          formatValue(obj),
-          `>${isReadonly(obj) ? ` (readonly)` : ``}`
-        ];
-      } else if (isReadonly(obj)) {
-        return [
-          "div",
-          {},
-          ["span", vueStyle, isShallow(obj) ? "ShallowReadonly" : "Readonly"],
-          "<",
-          formatValue(obj),
-          ">"
-        ];
-      }
-      return null;
-    },
-    hasBody(obj) {
-      return obj && obj.__isVue;
-    },
-    body(obj) {
-      if (obj && obj.__isVue) {
-        return [
-          "div",
-          {},
-          ...formatInstance(obj.$)
-        ];
-      }
-    }
-  };
-  function formatInstance(instance) {
-    const blocks = [];
-    if (instance.type.props && instance.props) {
-      blocks.push(createInstanceBlock("props", toRaw(instance.props)));
-    }
-    if (instance.setupState !== EMPTY_OBJ) {
-      blocks.push(createInstanceBlock("setup", instance.setupState));
-    }
-    if (instance.data !== EMPTY_OBJ) {
-      blocks.push(createInstanceBlock("data", toRaw(instance.data)));
-    }
-    const computed = extractKeys(instance, "computed");
-    if (computed) {
-      blocks.push(createInstanceBlock("computed", computed));
-    }
-    const injected = extractKeys(instance, "inject");
-    if (injected) {
-      blocks.push(createInstanceBlock("injected", injected));
-    }
-    blocks.push([
-      "div",
-      {},
-      [
-        "span",
-        {
-          style: keywordStyle.style + ";opacity:0.66"
-        },
-        "$ (internal): "
-      ],
-      ["object", { object: instance }]
-    ]);
-    return blocks;
-  }
-  function createInstanceBlock(type, target) {
-    target = extend({}, target);
-    if (!Object.keys(target).length) {
-      return ["span", {}];
-    }
-    return [
-      "div",
-      { style: "line-height:1.25em;margin-bottom:0.6em" },
-      [
-        "div",
-        {
-          style: "color:#476582"
-        },
-        type
-      ],
-      [
-        "div",
-        {
-          style: "padding-left:1.25em"
-        },
-        ...Object.keys(target).map((key) => {
-          return [
-            "div",
-            {},
-            ["span", keywordStyle, key + ": "],
-            formatValue(target[key], false)
-          ];
-        })
-      ]
-    ];
-  }
-  function formatValue(v, asRaw = true) {
-    if (typeof v === "number") {
-      return ["span", numberStyle, v];
-    } else if (typeof v === "string") {
-      return ["span", stringStyle, JSON.stringify(v)];
-    } else if (typeof v === "boolean") {
-      return ["span", keywordStyle, v];
-    } else if (isObject(v)) {
-      return ["object", { object: asRaw ? toRaw(v) : v }];
-    } else {
-      return ["span", stringStyle, String(v)];
-    }
-  }
-  function extractKeys(instance, type) {
-    const Comp = instance.type;
-    if (isFunction(Comp)) {
-      return;
-    }
-    const extracted = {};
-    for (const key in instance.ctx) {
-      if (isKeyOfType(Comp, key, type)) {
-        extracted[key] = instance.ctx[key];
-      }
-    }
-    return extracted;
-  }
-  function isKeyOfType(Comp, key, type) {
-    const opts = Comp[type];
-    if (isArray(opts) && opts.includes(key) || isObject(opts) && key in opts) {
-      return true;
-    }
-    if (Comp.extends && isKeyOfType(Comp.extends, key, type)) {
-      return true;
-    }
-    if (Comp.mixins && Comp.mixins.some((m) => isKeyOfType(m, key, type))) {
-      return true;
-    }
-  }
-  function genRefFlag(v) {
-    if (isShallow(v)) {
-      return `ShallowRef`;
-    }
-    if (v.effect) {
-      return `ComputedRef`;
-    }
-    return `Ref`;
-  }
-  if (window.devtoolsFormatters) {
-    window.devtoolsFormatters.push(formatter);
-  } else {
-    window.devtoolsFormatters = [formatter];
-  }
-}
-
-function withMemo(memo, render, cache, index) {
-  const cached = cache[index];
-  if (cached && isMemoSame(cached, memo)) {
-    return cached;
-  }
-  const ret = render();
-  ret.memo = memo.slice();
-  ret.cacheIndex = index;
-  return cache[index] = ret;
-}
-function isMemoSame(cached, memo) {
-  const prev = cached.memo;
-  if (prev.length != memo.length) {
-    return false;
-  }
-  for (let i = 0; i < prev.length; i++) {
-    if (hasChanged(prev[i], memo[i])) {
-      return false;
-    }
-  }
-  if (isBlockTreeEnabled > 0 && currentBlock) {
-    currentBlock.push(cached);
-  }
-  return true;
-}
-
-const version = "3.5.12";
-const warn = warn$1 ;
-const ErrorTypeStrings = ErrorTypeStrings$1 ;
-const devtools = devtools$1 ;
-const setDevtoolsHook = setDevtoolsHook$1 ;
-const _ssrUtils = {
-  createComponentInstance,
-  setupComponent,
-  renderComponentRoot,
-  setCurrentRenderingInstance,
-  isVNode: isVNode,
-  normalizeVNode,
-  getComponentPublicInstance,
-  ensureValidVNode,
-  pushWarningContext,
-  popWarningContext
-};
-const ssrUtils = _ssrUtils ;
-const resolveFilter = null;
-const compatUtils = null;
-const DeprecationTypes = null;
-
-let policy = void 0;
-const tt = typeof window !== "undefined" && window.trustedTypes;
-if (tt) {
-  try {
-    policy = /* @__PURE__ */ tt.createPolicy("vue", {
-      createHTML: (val) => val
-    });
-  } catch (e) {
-    warn(`Error creating trusted types policy: ${e}`);
-  }
-}
-const unsafeToTrustedHTML = policy ? (val) => policy.createHTML(val) : (val) => val;
-const svgNS = "http://www.w3.org/2000/svg";
-const mathmlNS = "http://www.w3.org/1998/Math/MathML";
-const doc = typeof document !== "undefined" ? document : null;
-const templateContainer = doc && /* @__PURE__ */ doc.createElement("template");
-const nodeOps = {
-  insert: (child, parent, anchor) => {
-    parent.insertBefore(child, anchor || null);
-  },
-  remove: (child) => {
-    const parent = child.parentNode;
-    if (parent) {
-      parent.removeChild(child);
-    }
-  },
-  createElement: (tag, namespace, is, props) => {
-    const el = namespace === "svg" ? doc.createElementNS(svgNS, tag) : namespace === "mathml" ? doc.createElementNS(mathmlNS, tag) : is ? doc.createElement(tag, { is }) : doc.createElement(tag);
-    if (tag === "select" && props && props.multiple != null) {
-      el.setAttribute("multiple", props.multiple);
-    }
-    return el;
-  },
-  createText: (text) => doc.createTextNode(text),
-  createComment: (text) => doc.createComment(text),
-  setText: (node, text) => {
-    node.nodeValue = text;
-  },
-  setElementText: (el, text) => {
-    el.textContent = text;
-  },
-  parentNode: (node) => node.parentNode,
-  nextSibling: (node) => node.nextSibling,
-  querySelector: (selector) => doc.querySelector(selector),
-  setScopeId(el, id) {
-    el.setAttribute(id, "");
-  },
-  // __UNSAFE__
-  // Reason: innerHTML.
-  // Static content here can only come from compiled templates.
-  // As long as the user only uses trusted templates, this is safe.
-  insertStaticContent(content, parent, anchor, namespace, start, end) {
-    const before = anchor ? anchor.previousSibling : parent.lastChild;
-    if (start && (start === end || start.nextSibling)) {
-      while (true) {
-        parent.insertBefore(start.cloneNode(true), anchor);
-        if (start === end || !(start = start.nextSibling)) break;
-      }
-    } else {
-      templateContainer.innerHTML = unsafeToTrustedHTML(
-        namespace === "svg" ? `<svg>${content}</svg>` : namespace === "mathml" ? `<math>${content}</math>` : content
-      );
-      const template = templateContainer.content;
-      if (namespace === "svg" || namespace === "mathml") {
-        const wrapper = template.firstChild;
-        while (wrapper.firstChild) {
-          template.appendChild(wrapper.firstChild);
-        }
-        template.removeChild(wrapper);
-      }
-      parent.insertBefore(template, anchor);
-    }
-    return [
-      // first
-      before ? before.nextSibling : parent.firstChild,
-      // last
-      anchor ? anchor.previousSibling : parent.lastChild
-    ];
-  }
-};
-
-const TRANSITION$1 = "transition";
-const ANIMATION = "animation";
-const vtcKey = Symbol("_vtc");
-const DOMTransitionPropsValidators = {
-  name: String,
-  type: String,
-  css: {
-    type: Boolean,
-    default: true
-  },
-  duration: [String, Number, Object],
-  enterFromClass: String,
-  enterActiveClass: String,
-  enterToClass: String,
-  appearFromClass: String,
-  appearActiveClass: String,
-  appearToClass: String,
-  leaveFromClass: String,
-  leaveActiveClass: String,
-  leaveToClass: String
-};
-const TransitionPropsValidators = /* @__PURE__ */ extend(
-  {},
-  BaseTransitionPropsValidators,
-  DOMTransitionPropsValidators
-);
-const decorate$1 = (t) => {
-  t.displayName = "Transition";
-  t.props = TransitionPropsValidators;
-  return t;
-};
-const Transition = /* @__PURE__ */ decorate$1(
-  (props, { slots }) => h(BaseTransition, resolveTransitionProps(props), slots)
-);
-const callHook = (hook, args = []) => {
-  if (isArray(hook)) {
-    hook.forEach((h2) => h2(...args));
-  } else if (hook) {
-    hook(...args);
-  }
-};
-const hasExplicitCallback = (hook) => {
-  return hook ? isArray(hook) ? hook.some((h2) => h2.length > 1) : hook.length > 1 : false;
-};
-function resolveTransitionProps(rawProps) {
-  const baseProps = {};
-  for (const key in rawProps) {
-    if (!(key in DOMTransitionPropsValidators)) {
-      baseProps[key] = rawProps[key];
-    }
-  }
-  if (rawProps.css === false) {
-    return baseProps;
-  }
-  const {
-    name = "v",
-    type,
-    duration,
-    enterFromClass = `${name}-enter-from`,
-    enterActiveClass = `${name}-enter-active`,
-    enterToClass = `${name}-enter-to`,
-    appearFromClass = enterFromClass,
-    appearActiveClass = enterActiveClass,
-    appearToClass = enterToClass,
-    leaveFromClass = `${name}-leave-from`,
-    leaveActiveClass = `${name}-leave-active`,
-    leaveToClass = `${name}-leave-to`
-  } = rawProps;
-  const durations = normalizeDuration(duration);
-  const enterDuration = durations && durations[0];
-  const leaveDuration = durations && durations[1];
-  const {
-    onBeforeEnter,
-    onEnter,
-    onEnterCancelled,
-    onLeave,
-    onLeaveCancelled,
-    onBeforeAppear = onBeforeEnter,
-    onAppear = onEnter,
-    onAppearCancelled = onEnterCancelled
-  } = baseProps;
-  const finishEnter = (el, isAppear, done) => {
-    removeTransitionClass(el, isAppear ? appearToClass : enterToClass);
-    removeTransitionClass(el, isAppear ? appearActiveClass : enterActiveClass);
-    done && done();
-  };
-  const finishLeave = (el, done) => {
-    el._isLeaving = false;
-    removeTransitionClass(el, leaveFromClass);
-    removeTransitionClass(el, leaveToClass);
-    removeTransitionClass(el, leaveActiveClass);
-    done && done();
-  };
-  const makeEnterHook = (isAppear) => {
-    return (el, done) => {
-      const hook = isAppear ? onAppear : onEnter;
-      const resolve = () => finishEnter(el, isAppear, done);
-      callHook(hook, [el, resolve]);
-      nextFrame(() => {
-        removeTransitionClass(el, isAppear ? appearFromClass : enterFromClass);
-        addTransitionClass(el, isAppear ? appearToClass : enterToClass);
-        if (!hasExplicitCallback(hook)) {
-          whenTransitionEnds(el, type, enterDuration, resolve);
-        }
-      });
-    };
-  };
-  return extend(baseProps, {
-    onBeforeEnter(el) {
-      callHook(onBeforeEnter, [el]);
-      addTransitionClass(el, enterFromClass);
-      addTransitionClass(el, enterActiveClass);
-    },
-    onBeforeAppear(el) {
-      callHook(onBeforeAppear, [el]);
-      addTransitionClass(el, appearFromClass);
-      addTransitionClass(el, appearActiveClass);
-    },
-    onEnter: makeEnterHook(false),
-    onAppear: makeEnterHook(true),
-    onLeave(el, done) {
-      el._isLeaving = true;
-      const resolve = () => finishLeave(el, done);
-      addTransitionClass(el, leaveFromClass);
-      addTransitionClass(el, leaveActiveClass);
-      forceReflow();
-      nextFrame(() => {
-        if (!el._isLeaving) {
-          return;
-        }
-        removeTransitionClass(el, leaveFromClass);
-        addTransitionClass(el, leaveToClass);
-        if (!hasExplicitCallback(onLeave)) {
-          whenTransitionEnds(el, type, leaveDuration, resolve);
-        }
-      });
-      callHook(onLeave, [el, resolve]);
-    },
-    onEnterCancelled(el) {
-      finishEnter(el, false);
-      callHook(onEnterCancelled, [el]);
-    },
-    onAppearCancelled(el) {
-      finishEnter(el, true);
-      callHook(onAppearCancelled, [el]);
-    },
-    onLeaveCancelled(el) {
-      finishLeave(el);
-      callHook(onLeaveCancelled, [el]);
-    }
-  });
-}
-function normalizeDuration(duration) {
-  if (duration == null) {
-    return null;
-  } else if (isObject(duration)) {
-    return [NumberOf(duration.enter), NumberOf(duration.leave)];
-  } else {
-    const n = NumberOf(duration);
-    return [n, n];
-  }
-}
-function NumberOf(val) {
-  const res = toNumber(val);
-  {
-    assertNumber(res, "<transition> explicit duration");
-  }
-  return res;
-}
-function addTransitionClass(el, cls) {
-  cls.split(/\s+/).forEach((c) => c && el.classList.add(c));
-  (el[vtcKey] || (el[vtcKey] = /* @__PURE__ */ new Set())).add(cls);
-}
-function removeTransitionClass(el, cls) {
-  cls.split(/\s+/).forEach((c) => c && el.classList.remove(c));
-  const _vtc = el[vtcKey];
-  if (_vtc) {
-    _vtc.delete(cls);
-    if (!_vtc.size) {
-      el[vtcKey] = void 0;
-    }
-  }
-}
-function nextFrame(cb) {
-  requestAnimationFrame(() => {
-    requestAnimationFrame(cb);
-  });
-}
-let endId = 0;
-function whenTransitionEnds(el, expectedType, explicitTimeout, resolve) {
-  const id = el._endId = ++endId;
-  const resolveIfNotStale = () => {
-    if (id === el._endId) {
-      resolve();
-    }
-  };
-  if (explicitTimeout != null) {
-    return setTimeout(resolveIfNotStale, explicitTimeout);
-  }
-  const { type, timeout, propCount } = getTransitionInfo(el, expectedType);
-  if (!type) {
-    return resolve();
-  }
-  const endEvent = type + "end";
-  let ended = 0;
-  const end = () => {
-    el.removeEventListener(endEvent, onEnd);
-    resolveIfNotStale();
-  };
-  const onEnd = (e) => {
-    if (e.target === el && ++ended >= propCount) {
-      end();
-    }
-  };
-  setTimeout(() => {
-    if (ended < propCount) {
-      end();
-    }
-  }, timeout + 1);
-  el.addEventListener(endEvent, onEnd);
-}
-function getTransitionInfo(el, expectedType) {
-  const styles = window.getComputedStyle(el);
-  const getStyleProperties = (key) => (styles[key] || "").split(", ");
-  const transitionDelays = getStyleProperties(`${TRANSITION$1}Delay`);
-  const transitionDurations = getStyleProperties(`${TRANSITION$1}Duration`);
-  const transitionTimeout = getTimeout(transitionDelays, transitionDurations);
-  const animationDelays = getStyleProperties(`${ANIMATION}Delay`);
-  const animationDurations = getStyleProperties(`${ANIMATION}Duration`);
-  const animationTimeout = getTimeout(animationDelays, animationDurations);
-  let type = null;
-  let timeout = 0;
-  let propCount = 0;
-  if (expectedType === TRANSITION$1) {
-    if (transitionTimeout > 0) {
-      type = TRANSITION$1;
-      timeout = transitionTimeout;
-      propCount = transitionDurations.length;
-    }
-  } else if (expectedType === ANIMATION) {
-    if (animationTimeout > 0) {
-      type = ANIMATION;
-      timeout = animationTimeout;
-      propCount = animationDurations.length;
-    }
-  } else {
-    timeout = Math.max(transitionTimeout, animationTimeout);
-    type = timeout > 0 ? transitionTimeout > animationTimeout ? TRANSITION$1 : ANIMATION : null;
-    propCount = type ? type === TRANSITION$1 ? transitionDurations.length : animationDurations.length : 0;
-  }
-  const hasTransform = type === TRANSITION$1 && /\b(transform|all)(,|$)/.test(
-    getStyleProperties(`${TRANSITION$1}Property`).toString()
-  );
-  return {
-    type,
-    timeout,
-    propCount,
-    hasTransform
-  };
-}
-function getTimeout(delays, durations) {
-  while (delays.length < durations.length) {
-    delays = delays.concat(delays);
-  }
-  return Math.max(...durations.map((d, i) => toMs(d) + toMs(delays[i])));
-}
-function toMs(s) {
-  if (s === "auto") return 0;
-  return Number(s.slice(0, -1).replace(",", ".")) * 1e3;
-}
-function forceReflow() {
-  return document.body.offsetHeight;
-}
-
-function patchClass(el, value, isSVG) {
-  const transitionClasses = el[vtcKey];
-  if (transitionClasses) {
-    value = (value ? [value, ...transitionClasses] : [...transitionClasses]).join(" ");
-  }
-  if (value == null) {
-    el.removeAttribute("class");
-  } else if (isSVG) {
-    el.setAttribute("class", value);
-  } else {
-    el.className = value;
-  }
-}
-
-const vShowOriginalDisplay = Symbol("_vod");
-const vShowHidden = Symbol("_vsh");
-const vShow = {
-  beforeMount(el, { value }, { transition }) {
-    el[vShowOriginalDisplay] = el.style.display === "none" ? "" : el.style.display;
-    if (transition && value) {
-      transition.beforeEnter(el);
-    } else {
-      setDisplay(el, value);
-    }
-  },
-  mounted(el, { value }, { transition }) {
-    if (transition && value) {
-      transition.enter(el);
-    }
-  },
-  updated(el, { value, oldValue }, { transition }) {
-    if (!value === !oldValue) return;
-    if (transition) {
-      if (value) {
-        transition.beforeEnter(el);
-        setDisplay(el, true);
-        transition.enter(el);
-      } else {
-        transition.leave(el, () => {
-          setDisplay(el, false);
-        });
-      }
-    } else {
-      setDisplay(el, value);
-    }
-  },
-  beforeUnmount(el, { value }) {
-    setDisplay(el, value);
-  }
-};
-{
-  vShow.name = "show";
-}
-function setDisplay(el, value) {
-  el.style.display = value ? el[vShowOriginalDisplay] : "none";
-  el[vShowHidden] = !value;
-}
-function initVShowForSSR() {
-  vShow.getSSRProps = ({ value }) => {
-    if (!value) {
-      return { style: { display: "none" } };
-    }
-  };
-}
-
-const CSS_VAR_TEXT = Symbol("CSS_VAR_TEXT" );
-function useCssVars(getter) {
-  const instance = getCurrentInstance();
-  if (!instance) {
-    warn(`useCssVars is called without current active component instance.`);
-    return;
-  }
-  const updateTeleports = instance.ut = (vars = getter(instance.proxy)) => {
-    Array.from(
-      document.querySelectorAll(`[data-v-owner="${instance.uid}"]`)
-    ).forEach((node) => setVarsOnNode(node, vars));
-  };
-  {
-    instance.getCssVars = () => getter(instance.proxy);
-  }
-  const setVars = () => {
-    const vars = getter(instance.proxy);
-    if (instance.ce) {
-      setVarsOnNode(instance.ce, vars);
-    } else {
-      setVarsOnVNode(instance.subTree, vars);
-    }
-    updateTeleports(vars);
-  };
-  onBeforeMount(() => {
-    watchPostEffect(setVars);
-  });
-  onMounted(() => {
-    const ob = new MutationObserver(setVars);
-    ob.observe(instance.subTree.el.parentNode, { childList: true });
-    onUnmounted(() => ob.disconnect());
-  });
-}
-function setVarsOnVNode(vnode, vars) {
-  if (vnode.shapeFlag & 128) {
-    const suspense = vnode.suspense;
-    vnode = suspense.activeBranch;
-    if (suspense.pendingBranch && !suspense.isHydrating) {
-      suspense.effects.push(() => {
-        setVarsOnVNode(suspense.activeBranch, vars);
-      });
-    }
-  }
-  while (vnode.component) {
-    vnode = vnode.component.subTree;
-  }
-  if (vnode.shapeFlag & 1 && vnode.el) {
-    setVarsOnNode(vnode.el, vars);
-  } else if (vnode.type === Fragment) {
-    vnode.children.forEach((c) => setVarsOnVNode(c, vars));
-  } else if (vnode.type === Static) {
-    let { el, anchor } = vnode;
-    while (el) {
-      setVarsOnNode(el, vars);
-      if (el === anchor) break;
-      el = el.nextSibling;
-    }
-  }
-}
-function setVarsOnNode(el, vars) {
-  if (el.nodeType === 1) {
-    const style = el.style;
-    let cssText = "";
-    for (const key in vars) {
-      style.setProperty(`--${key}`, vars[key]);
-      cssText += `--${key}: ${vars[key]};`;
-    }
-    style[CSS_VAR_TEXT] = cssText;
-  }
-}
-
-const displayRE = /(^|;)\s*display\s*:/;
-function patchStyle(el, prev, next) {
-  const style = el.style;
-  const isCssString = isString(next);
-  let hasControlledDisplay = false;
-  if (next && !isCssString) {
-    if (prev) {
-      if (!isString(prev)) {
-        for (const key in prev) {
-          if (next[key] == null) {
-            setStyle(style, key, "");
-          }
-        }
-      } else {
-        for (const prevStyle of prev.split(";")) {
-          const key = prevStyle.slice(0, prevStyle.indexOf(":")).trim();
-          if (next[key] == null) {
-            setStyle(style, key, "");
-          }
-        }
-      }
-    }
-    for (const key in next) {
-      if (key === "display") {
-        hasControlledDisplay = true;
-      }
-      setStyle(style, key, next[key]);
-    }
-  } else {
-    if (isCssString) {
-      if (prev !== next) {
-        const cssVarText = style[CSS_VAR_TEXT];
-        if (cssVarText) {
-          next += ";" + cssVarText;
-        }
-        style.cssText = next;
-        hasControlledDisplay = displayRE.test(next);
-      }
-    } else if (prev) {
-      el.removeAttribute("style");
-    }
-  }
-  if (vShowOriginalDisplay in el) {
-    el[vShowOriginalDisplay] = hasControlledDisplay ? style.display : "";
-    if (el[vShowHidden]) {
-      style.display = "none";
-    }
-  }
-}
-const semicolonRE = /[^\\];\s*$/;
-const importantRE = /\s*!important$/;
-function setStyle(style, name, val) {
-  if (isArray(val)) {
-    val.forEach((v) => setStyle(style, name, v));
-  } else {
-    if (val == null) val = "";
-    {
-      if (semicolonRE.test(val)) {
-        warn(
-          `Unexpected semicolon at the end of '${name}' style value: '${val}'`
-        );
-      }
-    }
-    if (name.startsWith("--")) {
-      style.setProperty(name, val);
-    } else {
-      const prefixed = autoPrefix(style, name);
-      if (importantRE.test(val)) {
-        style.setProperty(
-          hyphenate(prefixed),
-          val.replace(importantRE, ""),
-          "important"
-        );
-      } else {
-        style[prefixed] = val;
-      }
-    }
-  }
-}
-const prefixes = ["Webkit", "Moz", "ms"];
-const prefixCache = {};
-function autoPrefix(style, rawName) {
-  const cached = prefixCache[rawName];
-  if (cached) {
-    return cached;
-  }
-  let name = camelize(rawName);
-  if (name !== "filter" && name in style) {
-    return prefixCache[rawName] = name;
-  }
-  name = capitalize(name);
-  for (let i = 0; i < prefixes.length; i++) {
-    const prefixed = prefixes[i] + name;
-    if (prefixed in style) {
-      return prefixCache[rawName] = prefixed;
-    }
-  }
-  return rawName;
-}
-
-const xlinkNS = "http://www.w3.org/1999/xlink";
-function patchAttr(el, key, value, isSVG, instance, isBoolean = isSpecialBooleanAttr(key)) {
-  if (isSVG && key.startsWith("xlink:")) {
-    if (value == null) {
-      el.removeAttributeNS(xlinkNS, key.slice(6, key.length));
-    } else {
-      el.setAttributeNS(xlinkNS, key, value);
-    }
-  } else {
-    if (value == null || isBoolean && !includeBooleanAttr(value)) {
-      el.removeAttribute(key);
-    } else {
-      el.setAttribute(
-        key,
-        isBoolean ? "" : isSymbol(value) ? String(value) : value
-      );
-    }
-  }
-}
-
-function patchDOMProp(el, key, value, parentComponent, attrName) {
-  if (key === "innerHTML" || key === "textContent") {
-    if (value != null) {
-      el[key] = key === "innerHTML" ? unsafeToTrustedHTML(value) : value;
-    }
-    return;
-  }
-  const tag = el.tagName;
-  if (key === "value" && tag !== "PROGRESS" && // custom elements may use _value internally
-  !tag.includes("-")) {
-    const oldValue = tag === "OPTION" ? el.getAttribute("value") || "" : el.value;
-    const newValue = value == null ? (
-      // #11647: value should be set as empty string for null and undefined,
-      // but <input type="checkbox"> should be set as 'on'.
-      el.type === "checkbox" ? "on" : ""
-    ) : String(value);
-    if (oldValue !== newValue || !("_value" in el)) {
-      el.value = newValue;
-    }
-    if (value == null) {
-      el.removeAttribute(key);
-    }
-    el._value = value;
-    return;
-  }
-  let needRemove = false;
-  if (value === "" || value == null) {
-    const type = typeof el[key];
-    if (type === "boolean") {
-      value = includeBooleanAttr(value);
-    } else if (value == null && type === "string") {
-      value = "";
-      needRemove = true;
-    } else if (type === "number") {
-      value = 0;
-      needRemove = true;
-    }
-  }
-  try {
-    el[key] = value;
-  } catch (e) {
-    if (!needRemove) {
-      warn(
-        `Failed setting prop "${key}" on <${tag.toLowerCase()}>: value ${value} is invalid.`,
-        e
-      );
-    }
-  }
-  needRemove && el.removeAttribute(attrName || key);
-}
-
-function addEventListener(el, event, handler, options) {
-  el.addEventListener(event, handler, options);
-}
-function removeEventListener(el, event, handler, options) {
-  el.removeEventListener(event, handler, options);
-}
-const veiKey = Symbol("_vei");
-function patchEvent(el, rawName, prevValue, nextValue, instance = null) {
-  const invokers = el[veiKey] || (el[veiKey] = {});
-  const existingInvoker = invokers[rawName];
-  if (nextValue && existingInvoker) {
-    existingInvoker.value = sanitizeEventValue(nextValue, rawName) ;
-  } else {
-    const [name, options] = parseName(rawName);
-    if (nextValue) {
-      const invoker = invokers[rawName] = createInvoker(
-        sanitizeEventValue(nextValue, rawName) ,
-        instance
-      );
-      addEventListener(el, name, invoker, options);
-    } else if (existingInvoker) {
-      removeEventListener(el, name, existingInvoker, options);
-      invokers[rawName] = void 0;
-    }
-  }
-}
-const optionsModifierRE = /(?:Once|Passive|Capture)$/;
-function parseName(name) {
-  let options;
-  if (optionsModifierRE.test(name)) {
-    options = {};
-    let m;
-    while (m = name.match(optionsModifierRE)) {
-      name = name.slice(0, name.length - m[0].length);
-      options[m[0].toLowerCase()] = true;
-    }
-  }
-  const event = name[2] === ":" ? name.slice(3) : hyphenate(name.slice(2));
-  return [event, options];
-}
-let cachedNow = 0;
-const p = /* @__PURE__ */ Promise.resolve();
-const getNow = () => cachedNow || (p.then(() => cachedNow = 0), cachedNow = Date.now());
-function createInvoker(initialValue, instance) {
-  const invoker = (e) => {
-    if (!e._vts) {
-      e._vts = Date.now();
-    } else if (e._vts <= invoker.attached) {
-      return;
-    }
-    callWithAsyncErrorHandling(
-      patchStopImmediatePropagation(e, invoker.value),
-      instance,
-      5,
-      [e]
-    );
-  };
-  invoker.value = initialValue;
-  invoker.attached = getNow();
-  return invoker;
-}
-function sanitizeEventValue(value, propName) {
-  if (isFunction(value) || isArray(value)) {
-    return value;
-  }
-  warn(
-    `Wrong type passed as event handler to ${propName} - did you forget @ or : in front of your prop?
-Expected function or array of functions, received type ${typeof value}.`
-  );
-  return NOOP;
-}
-function patchStopImmediatePropagation(e, value) {
-  if (isArray(value)) {
-    const originalStop = e.stopImmediatePropagation;
-    e.stopImmediatePropagation = () => {
-      originalStop.call(e);
-      e._stopped = true;
-    };
-    return value.map(
-      (fn) => (e2) => !e2._stopped && fn && fn(e2)
-    );
-  } else {
-    return value;
-  }
-}
-
-const isNativeOn = (key) => key.charCodeAt(0) === 111 && key.charCodeAt(1) === 110 && // lowercase letter
-key.charCodeAt(2) > 96 && key.charCodeAt(2) < 123;
-const patchProp = (el, key, prevValue, nextValue, namespace, parentComponent) => {
-  const isSVG = namespace === "svg";
-  if (key === "class") {
-    patchClass(el, nextValue, isSVG);
-  } else if (key === "style") {
-    patchStyle(el, prevValue, nextValue);
-  } else if (isOn(key)) {
-    if (!isModelListener(key)) {
-      patchEvent(el, key, prevValue, nextValue, parentComponent);
-    }
-  } else if (key[0] === "." ? (key = key.slice(1), true) : key[0] === "^" ? (key = key.slice(1), false) : shouldSetAsProp(el, key, nextValue, isSVG)) {
-    patchDOMProp(el, key, nextValue);
-    if (!el.tagName.includes("-") && (key === "value" || key === "checked" || key === "selected")) {
-      patchAttr(el, key, nextValue, isSVG, parentComponent, key !== "value");
-    }
-  } else if (
-    // #11081 force set props for possible async custom element
-    el._isVueCE && (/[A-Z]/.test(key) || !isString(nextValue))
-  ) {
-    patchDOMProp(el, camelize(key), nextValue, parentComponent, key);
-  } else {
-    if (key === "true-value") {
-      el._trueValue = nextValue;
-    } else if (key === "false-value") {
-      el._falseValue = nextValue;
-    }
-    patchAttr(el, key, nextValue, isSVG);
-  }
-};
-function shouldSetAsProp(el, key, value, isSVG) {
-  if (isSVG) {
-    if (key === "innerHTML" || key === "textContent") {
-      return true;
-    }
-    if (key in el && isNativeOn(key) && isFunction(value)) {
-      return true;
-    }
-    return false;
-  }
-  if (key === "spellcheck" || key === "draggable" || key === "translate") {
-    return false;
-  }
-  if (key === "form") {
-    return false;
-  }
-  if (key === "list" && el.tagName === "INPUT") {
-    return false;
-  }
-  if (key === "type" && el.tagName === "TEXTAREA") {
-    return false;
-  }
-  if (key === "width" || key === "height") {
-    const tag = el.tagName;
-    if (tag === "IMG" || tag === "VIDEO" || tag === "CANVAS" || tag === "SOURCE") {
-      return false;
-    }
-  }
-  if (isNativeOn(key) && isString(value)) {
-    return false;
-  }
-  return key in el;
-}
-
-const REMOVAL = {};
-/*! #__NO_SIDE_EFFECTS__ */
-// @__NO_SIDE_EFFECTS__
-function defineCustomElement(options, extraOptions, _createApp) {
-  const Comp = defineComponent(options, extraOptions);
-  if (isPlainObject(Comp)) extend(Comp, extraOptions);
-  class VueCustomElement extends VueElement {
-    constructor(initialProps) {
-      super(Comp, initialProps, _createApp);
-    }
-  }
-  VueCustomElement.def = Comp;
-  return VueCustomElement;
-}
-/*! #__NO_SIDE_EFFECTS__ */
-const defineSSRCustomElement = /* @__NO_SIDE_EFFECTS__ */ (options, extraOptions) => {
-  return /* @__PURE__ */ defineCustomElement(options, extraOptions, createSSRApp);
-};
-const BaseClass = typeof HTMLElement !== "undefined" ? HTMLElement : class {
-};
-class VueElement extends BaseClass {
-  constructor(_def, _props = {}, _createApp = createApp) {
-    super();
-    this._def = _def;
-    this._props = _props;
-    this._createApp = _createApp;
-    this._isVueCE = true;
-    /**
-     * @internal
-     */
-    this._instance = null;
-    /**
-     * @internal
-     */
-    this._app = null;
-    /**
-     * @internal
-     */
-    this._nonce = this._def.nonce;
-    this._connected = false;
-    this._resolved = false;
-    this._numberProps = null;
-    this._styleChildren = /* @__PURE__ */ new WeakSet();
-    this._ob = null;
-    if (this.shadowRoot && _createApp !== createApp) {
-      this._root = this.shadowRoot;
-    } else {
-      if (this.shadowRoot) {
-        warn(
-          `Custom element has pre-rendered declarative shadow root but is not defined as hydratable. Use \`defineSSRCustomElement\`.`
-        );
-      }
-      if (_def.shadowRoot !== false) {
-        this.attachShadow({ mode: "open" });
-        this._root = this.shadowRoot;
-      } else {
-        this._root = this;
-      }
-    }
-    if (!this._def.__asyncLoader) {
-      this._resolveProps(this._def);
-    }
-  }
-  connectedCallback() {
-    if (!this.isConnected) return;
-    if (!this.shadowRoot) {
-      this._parseSlots();
-    }
-    this._connected = true;
-    let parent = this;
-    while (parent = parent && (parent.parentNode || parent.host)) {
-      if (parent instanceof VueElement) {
-        this._parent = parent;
-        break;
-      }
-    }
-    if (!this._instance) {
-      if (this._resolved) {
-        this._setParent();
-        this._update();
-      } else {
-        if (parent && parent._pendingResolve) {
-          this._pendingResolve = parent._pendingResolve.then(() => {
-            this._pendingResolve = void 0;
-            this._resolveDef();
-          });
-        } else {
-          this._resolveDef();
-        }
-      }
-    }
-  }
-  _setParent(parent = this._parent) {
-    if (parent) {
-      this._instance.parent = parent._instance;
-      this._instance.provides = parent._instance.provides;
-    }
-  }
-  disconnectedCallback() {
-    this._connected = false;
-    nextTick(() => {
-      if (!this._connected) {
-        if (this._ob) {
-          this._ob.disconnect();
-          this._ob = null;
-        }
-        this._app && this._app.unmount();
-        if (this._instance) this._instance.ce = void 0;
-        this._app = this._instance = null;
-      }
-    });
-  }
-  /**
-   * resolve inner component definition (handle possible async component)
-   */
-  _resolveDef() {
-    if (this._pendingResolve) {
-      return;
-    }
-    for (let i = 0; i < this.attributes.length; i++) {
-      this._setAttr(this.attributes[i].name);
-    }
-    this._ob = new MutationObserver((mutations) => {
-      for (const m of mutations) {
-        this._setAttr(m.attributeName);
-      }
-    });
-    this._ob.observe(this, { attributes: true });
-    const resolve = (def, isAsync = false) => {
-      this._resolved = true;
-      this._pendingResolve = void 0;
-      const { props, styles } = def;
-      let numberProps;
-      if (props && !isArray(props)) {
-        for (const key in props) {
-          const opt = props[key];
-          if (opt === Number || opt && opt.type === Number) {
-            if (key in this._props) {
-              this._props[key] = toNumber(this._props[key]);
-            }
-            (numberProps || (numberProps = /* @__PURE__ */ Object.create(null)))[camelize(key)] = true;
-          }
-        }
-      }
-      this._numberProps = numberProps;
-      if (isAsync) {
-        this._resolveProps(def);
-      }
-      if (this.shadowRoot) {
-        this._applyStyles(styles);
-      } else if (styles) {
-        warn(
-          "Custom element style injection is not supported when using shadowRoot: false"
-        );
-      }
-      this._mount(def);
-    };
-    const asyncDef = this._def.__asyncLoader;
-    if (asyncDef) {
-      this._pendingResolve = asyncDef().then(
-        (def) => resolve(this._def = def, true)
-      );
-    } else {
-      resolve(this._def);
-    }
-  }
-  _mount(def) {
-    if (!def.name) {
-      def.name = "VueElement";
-    }
-    this._app = this._createApp(def);
-    if (def.configureApp) {
-      def.configureApp(this._app);
-    }
-    this._app._ceVNode = this._createVNode();
-    this._app.mount(this._root);
-    const exposed = this._instance && this._instance.exposed;
-    if (!exposed) return;
-    for (const key in exposed) {
-      if (!hasOwn(this, key)) {
-        Object.defineProperty(this, key, {
-          // unwrap ref to be consistent with public instance behavior
-          get: () => unref(exposed[key])
-        });
-      } else {
-        warn(`Exposed property "${key}" already exists on custom element.`);
-      }
-    }
-  }
-  _resolveProps(def) {
-    const { props } = def;
-    const declaredPropKeys = isArray(props) ? props : Object.keys(props || {});
-    for (const key of Object.keys(this)) {
-      if (key[0] !== "_" && declaredPropKeys.includes(key)) {
-        this._setProp(key, this[key]);
-      }
-    }
-    for (const key of declaredPropKeys.map(camelize)) {
-      Object.defineProperty(this, key, {
-        get() {
-          return this._getProp(key);
-        },
-        set(val) {
-          this._setProp(key, val, true, true);
-        }
-      });
-    }
-  }
-  _setAttr(key) {
-    if (key.startsWith("data-v-")) return;
-    const has = this.hasAttribute(key);
-    let value = has ? this.getAttribute(key) : REMOVAL;
-    const camelKey = camelize(key);
-    if (has && this._numberProps && this._numberProps[camelKey]) {
-      value = toNumber(value);
-    }
-    this._setProp(camelKey, value, false, true);
-  }
-  /**
-   * @internal
-   */
-  _getProp(key) {
-    return this._props[key];
-  }
-  /**
-   * @internal
-   */
-  _setProp(key, val, shouldReflect = true, shouldUpdate = false) {
-    if (val !== this._props[key]) {
-      if (val === REMOVAL) {
-        delete this._props[key];
-      } else {
-        this._props[key] = val;
-        if (key === "key" && this._app) {
-          this._app._ceVNode.key = val;
-        }
-      }
-      if (shouldUpdate && this._instance) {
-        this._update();
-      }
-      if (shouldReflect) {
-        if (val === true) {
-          this.setAttribute(hyphenate(key), "");
-        } else if (typeof val === "string" || typeof val === "number") {
-          this.setAttribute(hyphenate(key), val + "");
-        } else if (!val) {
-          this.removeAttribute(hyphenate(key));
-        }
-      }
-    }
-  }
-  _update() {
-    render(this._createVNode(), this._root);
-  }
-  _createVNode() {
-    const baseProps = {};
-    if (!this.shadowRoot) {
-      baseProps.onVnodeMounted = baseProps.onVnodeUpdated = this._renderSlots.bind(this);
-    }
-    const vnode = createVNode(this._def, extend(baseProps, this._props));
-    if (!this._instance) {
-      vnode.ce = (instance) => {
-        this._instance = instance;
-        instance.ce = this;
-        instance.isCE = true;
-        {
-          instance.ceReload = (newStyles) => {
-            if (this._styles) {
-              this._styles.forEach((s) => this._root.removeChild(s));
-              this._styles.length = 0;
-            }
-            this._applyStyles(newStyles);
-            this._instance = null;
-            this._update();
-          };
-        }
-        const dispatch = (event, args) => {
-          this.dispatchEvent(
-            new CustomEvent(
-              event,
-              isPlainObject(args[0]) ? extend({ detail: args }, args[0]) : { detail: args }
-            )
-          );
-        };
-        instance.emit = (event, ...args) => {
-          dispatch(event, args);
-          if (hyphenate(event) !== event) {
-            dispatch(hyphenate(event), args);
-          }
-        };
-        this._setParent();
-      };
-    }
-    return vnode;
-  }
-  _applyStyles(styles, owner) {
-    if (!styles) return;
-    if (owner) {
-      if (owner === this._def || this._styleChildren.has(owner)) {
-        return;
-      }
-      this._styleChildren.add(owner);
-    }
-    const nonce = this._nonce;
-    for (let i = styles.length - 1; i >= 0; i--) {
-      const s = document.createElement("style");
-      if (nonce) s.setAttribute("nonce", nonce);
-      s.textContent = styles[i];
-      this.shadowRoot.prepend(s);
-      {
-        if (owner) {
-          if (owner.__hmrId) {
-            if (!this._childStyles) this._childStyles = /* @__PURE__ */ new Map();
-            let entry = this._childStyles.get(owner.__hmrId);
-            if (!entry) {
-              this._childStyles.set(owner.__hmrId, entry = []);
-            }
-            entry.push(s);
-          }
-        } else {
-          (this._styles || (this._styles = [])).push(s);
-        }
-      }
-    }
-  }
-  /**
-   * Only called when shadowRoot is false
-   */
-  _parseSlots() {
-    const slots = this._slots = {};
-    let n;
-    while (n = this.firstChild) {
-      const slotName = n.nodeType === 1 && n.getAttribute("slot") || "default";
-      (slots[slotName] || (slots[slotName] = [])).push(n);
-      this.removeChild(n);
-    }
-  }
-  /**
-   * Only called when shadowRoot is false
-   */
-  _renderSlots() {
-    const outlets = (this._teleportTarget || this).querySelectorAll("slot");
-    const scopeId = this._instance.type.__scopeId;
-    for (let i = 0; i < outlets.length; i++) {
-      const o = outlets[i];
-      const slotName = o.getAttribute("name") || "default";
-      const content = this._slots[slotName];
-      const parent = o.parentNode;
-      if (content) {
-        for (const n of content) {
-          if (scopeId && n.nodeType === 1) {
-            const id = scopeId + "-s";
-            const walker = document.createTreeWalker(n, 1);
-            n.setAttribute(id, "");
-            let child;
-            while (child = walker.nextNode()) {
-              child.setAttribute(id, "");
-            }
-          }
-          parent.insertBefore(n, o);
-        }
-      } else {
-        while (o.firstChild) parent.insertBefore(o.firstChild, o);
-      }
-      parent.removeChild(o);
-    }
-  }
-  /**
-   * @internal
-   */
-  _injectChildStyle(comp) {
-    this._applyStyles(comp.styles, comp);
-  }
-  /**
-   * @internal
-   */
-  _removeChildStyle(comp) {
-    {
-      this._styleChildren.delete(comp);
-      if (this._childStyles && comp.__hmrId) {
-        const oldStyles = this._childStyles.get(comp.__hmrId);
-        if (oldStyles) {
-          oldStyles.forEach((s) => this._root.removeChild(s));
-          oldStyles.length = 0;
-        }
-      }
-    }
-  }
-}
-function useHost(caller) {
-  const instance = getCurrentInstance();
-  const el = instance && instance.ce;
-  if (el) {
-    return el;
-  } else {
-    if (!instance) {
-      warn(
-        `${caller || "useHost"} called without an active component instance.`
-      );
-    } else {
-      warn(
-        `${caller || "useHost"} can only be used in components defined via defineCustomElement.`
-      );
-    }
-  }
-  return null;
-}
-function useShadowRoot() {
-  const el = useHost("useShadowRoot") ;
-  return el && el.shadowRoot;
-}
-
-function useCssModule(name = "$style") {
-  {
-    const instance = getCurrentInstance();
-    if (!instance) {
-      warn(`useCssModule must be called inside setup()`);
-      return EMPTY_OBJ;
-    }
-    const modules = instance.type.__cssModules;
-    if (!modules) {
-      warn(`Current instance does not have CSS modules injected.`);
-      return EMPTY_OBJ;
-    }
-    const mod = modules[name];
-    if (!mod) {
-      warn(`Current instance does not have CSS module named "${name}".`);
-      return EMPTY_OBJ;
-    }
-    return mod;
-  }
-}
-
-const positionMap = /* @__PURE__ */ new WeakMap();
-const newPositionMap = /* @__PURE__ */ new WeakMap();
-const moveCbKey = Symbol("_moveCb");
-const enterCbKey = Symbol("_enterCb");
-const decorate = (t) => {
-  delete t.props.mode;
-  return t;
-};
-const TransitionGroupImpl = /* @__PURE__ */ decorate({
-  name: "TransitionGroup",
-  props: /* @__PURE__ */ extend({}, TransitionPropsValidators, {
-    tag: String,
-    moveClass: String
-  }),
-  setup(props, { slots }) {
-    const instance = getCurrentInstance();
-    const state = useTransitionState();
-    let prevChildren;
-    let children;
-    onUpdated(() => {
-      if (!prevChildren.length) {
-        return;
-      }
-      const moveClass = props.moveClass || `${props.name || "v"}-move`;
-      if (!hasCSSTransform(
-        prevChildren[0].el,
-        instance.vnode.el,
-        moveClass
-      )) {
-        return;
-      }
-      prevChildren.forEach(callPendingCbs);
-      prevChildren.forEach(recordPosition);
-      const movedChildren = prevChildren.filter(applyTranslation);
-      forceReflow();
-      movedChildren.forEach((c) => {
-        const el = c.el;
-        const style = el.style;
-        addTransitionClass(el, moveClass);
-        style.transform = style.webkitTransform = style.transitionDuration = "";
-        const cb = el[moveCbKey] = (e) => {
-          if (e && e.target !== el) {
-            return;
-          }
-          if (!e || /transform$/.test(e.propertyName)) {
-            el.removeEventListener("transitionend", cb);
-            el[moveCbKey] = null;
-            removeTransitionClass(el, moveClass);
-          }
-        };
-        el.addEventListener("transitionend", cb);
-      });
-    });
-    return () => {
-      const rawProps = toRaw(props);
-      const cssTransitionProps = resolveTransitionProps(rawProps);
-      let tag = rawProps.tag || Fragment;
-      prevChildren = [];
-      if (children) {
-        for (let i = 0; i < children.length; i++) {
-          const child = children[i];
-          if (child.el && child.el instanceof Element) {
-            prevChildren.push(child);
-            setTransitionHooks(
-              child,
-              resolveTransitionHooks(
-                child,
-                cssTransitionProps,
-                state,
-                instance
-              )
-            );
-            positionMap.set(
-              child,
-              child.el.getBoundingClientRect()
-            );
-          }
-        }
-      }
-      children = slots.default ? getTransitionRawChildren(slots.default()) : [];
-      for (let i = 0; i < children.length; i++) {
-        const child = children[i];
-        if (child.key != null) {
-          setTransitionHooks(
-            child,
-            resolveTransitionHooks(child, cssTransitionProps, state, instance)
-          );
-        } else if (child.type !== Text) {
-          warn(`<TransitionGroup> children must be keyed.`);
-        }
-      }
-      return createVNode(tag, null, children);
-    };
-  }
-});
-const TransitionGroup = TransitionGroupImpl;
-function callPendingCbs(c) {
-  const el = c.el;
-  if (el[moveCbKey]) {
-    el[moveCbKey]();
-  }
-  if (el[enterCbKey]) {
-    el[enterCbKey]();
-  }
-}
-function recordPosition(c) {
-  newPositionMap.set(c, c.el.getBoundingClientRect());
-}
-function applyTranslation(c) {
-  const oldPos = positionMap.get(c);
-  const newPos = newPositionMap.get(c);
-  const dx = oldPos.left - newPos.left;
-  const dy = oldPos.top - newPos.top;
-  if (dx || dy) {
-    const s = c.el.style;
-    s.transform = s.webkitTransform = `translate(${dx}px,${dy}px)`;
-    s.transitionDuration = "0s";
-    return c;
-  }
-}
-function hasCSSTransform(el, root, moveClass) {
-  const clone = el.cloneNode();
-  const _vtc = el[vtcKey];
-  if (_vtc) {
-    _vtc.forEach((cls) => {
-      cls.split(/\s+/).forEach((c) => c && clone.classList.remove(c));
-    });
-  }
-  moveClass.split(/\s+/).forEach((c) => c && clone.classList.add(c));
-  clone.style.display = "none";
-  const container = root.nodeType === 1 ? root : root.parentNode;
-  container.appendChild(clone);
-  const { hasTransform } = getTransitionInfo(clone);
-  container.removeChild(clone);
-  return hasTransform;
-}
-
-const getModelAssigner = (vnode) => {
-  const fn = vnode.props["onUpdate:modelValue"] || false;
-  return isArray(fn) ? (value) => invokeArrayFns(fn, value) : fn;
-};
-function onCompositionStart(e) {
-  e.target.composing = true;
-}
-function onCompositionEnd(e) {
-  const target = e.target;
-  if (target.composing) {
-    target.composing = false;
-    target.dispatchEvent(new Event("input"));
-  }
-}
-const assignKey = Symbol("_assign");
-const vModelText = {
-  created(el, { modifiers: { lazy, trim, number } }, vnode) {
-    el[assignKey] = getModelAssigner(vnode);
-    const castToNumber = number || vnode.props && vnode.props.type === "number";
-    addEventListener(el, lazy ? "change" : "input", (e) => {
-      if (e.target.composing) return;
-      let domValue = el.value;
-      if (trim) {
-        domValue = domValue.trim();
-      }
-      if (castToNumber) {
-        domValue = looseToNumber(domValue);
-      }
-      el[assignKey](domValue);
-    });
-    if (trim) {
-      addEventListener(el, "change", () => {
-        el.value = el.value.trim();
-      });
-    }
-    if (!lazy) {
-      addEventListener(el, "compositionstart", onCompositionStart);
-      addEventListener(el, "compositionend", onCompositionEnd);
-      addEventListener(el, "change", onCompositionEnd);
-    }
-  },
-  // set value on mounted so it's after min/max for type="range"
-  mounted(el, { value }) {
-    el.value = value == null ? "" : value;
-  },
-  beforeUpdate(el, { value, oldValue, modifiers: { lazy, trim, number } }, vnode) {
-    el[assignKey] = getModelAssigner(vnode);
-    if (el.composing) return;
-    const elValue = (number || el.type === "number") && !/^0\d/.test(el.value) ? looseToNumber(el.value) : el.value;
-    const newValue = value == null ? "" : value;
-    if (elValue === newValue) {
-      return;
-    }
-    if (document.activeElement === el && el.type !== "range") {
-      if (lazy && value === oldValue) {
-        return;
-      }
-      if (trim && el.value.trim() === newValue) {
-        return;
-      }
-    }
-    el.value = newValue;
-  }
-};
-const vModelCheckbox = {
-  // #4096 array checkboxes need to be deep traversed
-  deep: true,
-  created(el, _, vnode) {
-    el[assignKey] = getModelAssigner(vnode);
-    addEventListener(el, "change", () => {
-      const modelValue = el._modelValue;
-      const elementValue = getValue(el);
-      const checked = el.checked;
-      const assign = el[assignKey];
-      if (isArray(modelValue)) {
-        const index = looseIndexOf(modelValue, elementValue);
-        const found = index !== -1;
-        if (checked && !found) {
-          assign(modelValue.concat(elementValue));
-        } else if (!checked && found) {
-          const filtered = [...modelValue];
-          filtered.splice(index, 1);
-          assign(filtered);
-        }
-      } else if (isSet(modelValue)) {
-        const cloned = new Set(modelValue);
-        if (checked) {
-          cloned.add(elementValue);
-        } else {
-          cloned.delete(elementValue);
-        }
-        assign(cloned);
-      } else {
-        assign(getCheckboxValue(el, checked));
-      }
-    });
-  },
-  // set initial checked on mount to wait for true-value/false-value
-  mounted: setChecked,
-  beforeUpdate(el, binding, vnode) {
-    el[assignKey] = getModelAssigner(vnode);
-    setChecked(el, binding, vnode);
-  }
-};
-function setChecked(el, { value, oldValue }, vnode) {
-  el._modelValue = value;
-  let checked;
-  if (isArray(value)) {
-    checked = looseIndexOf(value, vnode.props.value) > -1;
-  } else if (isSet(value)) {
-    checked = value.has(vnode.props.value);
-  } else {
-    if (value === oldValue) return;
-    checked = looseEqual(value, getCheckboxValue(el, true));
-  }
-  if (el.checked !== checked) {
-    el.checked = checked;
-  }
-}
-const vModelRadio = {
-  created(el, { value }, vnode) {
-    el.checked = looseEqual(value, vnode.props.value);
-    el[assignKey] = getModelAssigner(vnode);
-    addEventListener(el, "change", () => {
-      el[assignKey](getValue(el));
-    });
-  },
-  beforeUpdate(el, { value, oldValue }, vnode) {
-    el[assignKey] = getModelAssigner(vnode);
-    if (value !== oldValue) {
-      el.checked = looseEqual(value, vnode.props.value);
-    }
-  }
-};
-const vModelSelect = {
-  // <select multiple> value need to be deep traversed
-  deep: true,
-  created(el, { value, modifiers: { number } }, vnode) {
-    const isSetModel = isSet(value);
-    addEventListener(el, "change", () => {
-      const selectedVal = Array.prototype.filter.call(el.options, (o) => o.selected).map(
-        (o) => number ? looseToNumber(getValue(o)) : getValue(o)
-      );
-      el[assignKey](
-        el.multiple ? isSetModel ? new Set(selectedVal) : selectedVal : selectedVal[0]
-      );
-      el._assigning = true;
-      nextTick(() => {
-        el._assigning = false;
-      });
-    });
-    el[assignKey] = getModelAssigner(vnode);
-  },
-  // set value in mounted & updated because <select> relies on its children
-  // <option>s.
-  mounted(el, { value }) {
-    setSelected(el, value);
-  },
-  beforeUpdate(el, _binding, vnode) {
-    el[assignKey] = getModelAssigner(vnode);
-  },
-  updated(el, { value }) {
-    if (!el._assigning) {
-      setSelected(el, value);
-    }
-  }
-};
-function setSelected(el, value) {
-  const isMultiple = el.multiple;
-  const isArrayValue = isArray(value);
-  if (isMultiple && !isArrayValue && !isSet(value)) {
-    warn(
-      `<select multiple v-model> expects an Array or Set value for its binding, but got ${Object.prototype.toString.call(value).slice(8, -1)}.`
-    );
-    return;
-  }
-  for (let i = 0, l = el.options.length; i < l; i++) {
-    const option = el.options[i];
-    const optionValue = getValue(option);
-    if (isMultiple) {
-      if (isArrayValue) {
-        const optionType = typeof optionValue;
-        if (optionType === "string" || optionType === "number") {
-          option.selected = value.some((v) => String(v) === String(optionValue));
-        } else {
-          option.selected = looseIndexOf(value, optionValue) > -1;
-        }
-      } else {
-        option.selected = value.has(optionValue);
-      }
-    } else if (looseEqual(getValue(option), value)) {
-      if (el.selectedIndex !== i) el.selectedIndex = i;
-      return;
-    }
-  }
-  if (!isMultiple && el.selectedIndex !== -1) {
-    el.selectedIndex = -1;
-  }
-}
-function getValue(el) {
-  return "_value" in el ? el._value : el.value;
-}
-function getCheckboxValue(el, checked) {
-  const key = checked ? "_trueValue" : "_falseValue";
-  return key in el ? el[key] : checked;
-}
-const vModelDynamic = {
-  created(el, binding, vnode) {
-    callModelHook(el, binding, vnode, null, "created");
-  },
-  mounted(el, binding, vnode) {
-    callModelHook(el, binding, vnode, null, "mounted");
-  },
-  beforeUpdate(el, binding, vnode, prevVNode) {
-    callModelHook(el, binding, vnode, prevVNode, "beforeUpdate");
-  },
-  updated(el, binding, vnode, prevVNode) {
-    callModelHook(el, binding, vnode, prevVNode, "updated");
-  }
-};
-function resolveDynamicModel(tagName, type) {
-  switch (tagName) {
-    case "SELECT":
-      return vModelSelect;
-    case "TEXTAREA":
-      return vModelText;
-    default:
-      switch (type) {
-        case "checkbox":
-          return vModelCheckbox;
-        case "radio":
-          return vModelRadio;
-        default:
-          return vModelText;
-      }
-  }
-}
-function callModelHook(el, binding, vnode, prevVNode, hook) {
-  const modelToUse = resolveDynamicModel(
-    el.tagName,
-    vnode.props && vnode.props.type
-  );
-  const fn = modelToUse[hook];
-  fn && fn(el, binding, vnode, prevVNode);
-}
-function initVModelForSSR() {
-  vModelText.getSSRProps = ({ value }) => ({ value });
-  vModelRadio.getSSRProps = ({ value }, vnode) => {
-    if (vnode.props && looseEqual(vnode.props.value, value)) {
-      return { checked: true };
-    }
-  };
-  vModelCheckbox.getSSRProps = ({ value }, vnode) => {
-    if (isArray(value)) {
-      if (vnode.props && looseIndexOf(value, vnode.props.value) > -1) {
-        return { checked: true };
-      }
-    } else if (isSet(value)) {
-      if (vnode.props && value.has(vnode.props.value)) {
-        return { checked: true };
-      }
-    } else if (value) {
-      return { checked: true };
-    }
-  };
-  vModelDynamic.getSSRProps = (binding, vnode) => {
-    if (typeof vnode.type !== "string") {
-      return;
-    }
-    const modelToUse = resolveDynamicModel(
-      // resolveDynamicModel expects an uppercase tag name, but vnode.type is lowercase
-      vnode.type.toUpperCase(),
-      vnode.props && vnode.props.type
-    );
-    if (modelToUse.getSSRProps) {
-      return modelToUse.getSSRProps(binding, vnode);
-    }
-  };
-}
-
-const systemModifiers = ["ctrl", "shift", "alt", "meta"];
-const modifierGuards = {
-  stop: (e) => e.stopPropagation(),
-  prevent: (e) => e.preventDefault(),
-  self: (e) => e.target !== e.currentTarget,
-  ctrl: (e) => !e.ctrlKey,
-  shift: (e) => !e.shiftKey,
-  alt: (e) => !e.altKey,
-  meta: (e) => !e.metaKey,
-  left: (e) => "button" in e && e.button !== 0,
-  middle: (e) => "button" in e && e.button !== 1,
-  right: (e) => "button" in e && e.button !== 2,
-  exact: (e, modifiers) => systemModifiers.some((m) => e[`${m}Key`] && !modifiers.includes(m))
-};
-const withModifiers = (fn, modifiers) => {
-  const cache = fn._withMods || (fn._withMods = {});
-  const cacheKey = modifiers.join(".");
-  return cache[cacheKey] || (cache[cacheKey] = (event, ...args) => {
-    for (let i = 0; i < modifiers.length; i++) {
-      const guard = modifierGuards[modifiers[i]];
-      if (guard && guard(event, modifiers)) return;
-    }
-    return fn(event, ...args);
-  });
-};
-const keyNames = {
-  esc: "escape",
-  space: " ",
-  up: "arrow-up",
-  left: "arrow-left",
-  right: "arrow-right",
-  down: "arrow-down",
-  delete: "backspace"
-};
-const withKeys = (fn, modifiers) => {
-  const cache = fn._withKeys || (fn._withKeys = {});
-  const cacheKey = modifiers.join(".");
-  return cache[cacheKey] || (cache[cacheKey] = (event) => {
-    if (!("key" in event)) {
-      return;
-    }
-    const eventKey = hyphenate(event.key);
-    if (modifiers.some(
-      (k) => k === eventKey || keyNames[k] === eventKey
-    )) {
-      return fn(event);
-    }
-  });
-};
-
-const rendererOptions = /* @__PURE__ */ extend({ patchProp }, nodeOps);
-let renderer;
-let enabledHydration = false;
-function ensureRenderer() {
-  return renderer || (renderer = createRenderer(rendererOptions));
-}
-function ensureHydrationRenderer() {
-  renderer = enabledHydration ? renderer : createHydrationRenderer(rendererOptions);
-  enabledHydration = true;
-  return renderer;
-}
-const render = (...args) => {
-  ensureRenderer().render(...args);
-};
-const hydrate = (...args) => {
-  ensureHydrationRenderer().hydrate(...args);
-};
-const createApp = (...args) => {
-  const app = ensureRenderer().createApp(...args);
-  {
-    injectNativeTagCheck(app);
-    injectCompilerOptionsCheck(app);
-  }
-  const { mount } = app;
-  app.mount = (containerOrSelector) => {
-    const container = normalizeContainer(containerOrSelector);
-    if (!container) return;
-    const component = app._component;
-    if (!isFunction(component) && !component.render && !component.template) {
-      component.template = container.innerHTML;
-    }
-    if (container.nodeType === 1) {
-      container.textContent = "";
-    }
-    const proxy = mount(container, false, resolveRootNamespace(container));
-    if (container instanceof Element) {
-      container.removeAttribute("v-cloak");
-      container.setAttribute("data-v-app", "");
-    }
-    return proxy;
-  };
-  return app;
-};
-const createSSRApp = (...args) => {
-  const app = ensureHydrationRenderer().createApp(...args);
-  {
-    injectNativeTagCheck(app);
-    injectCompilerOptionsCheck(app);
-  }
-  const { mount } = app;
-  app.mount = (containerOrSelector) => {
-    const container = normalizeContainer(containerOrSelector);
-    if (container) {
-      return mount(container, true, resolveRootNamespace(container));
-    }
-  };
-  return app;
-};
-function resolveRootNamespace(container) {
-  if (container instanceof SVGElement) {
-    return "svg";
-  }
-  if (typeof MathMLElement === "function" && container instanceof MathMLElement) {
-    return "mathml";
-  }
-}
-function injectNativeTagCheck(app) {
-  Object.defineProperty(app.config, "isNativeTag", {
-    value: (tag) => isHTMLTag(tag) || isSVGTag(tag) || isMathMLTag(tag),
-    writable: false
-  });
-}
-function injectCompilerOptionsCheck(app) {
-  if (isRuntimeOnly()) {
-    const isCustomElement = app.config.isCustomElement;
-    Object.defineProperty(app.config, "isCustomElement", {
-      get() {
-        return isCustomElement;
-      },
-      set() {
-        warn(
-          `The \`isCustomElement\` config option is deprecated. Use \`compilerOptions.isCustomElement\` instead.`
-        );
-      }
-    });
-    const compilerOptions = app.config.compilerOptions;
-    const msg = `The \`compilerOptions\` config option is only respected when using a build of Vue.js that includes the runtime compiler (aka "full build"). Since you are using the runtime-only build, \`compilerOptions\` must be passed to \`@vue/compiler-dom\` in the build setup instead.
-- For vue-loader: pass it via vue-loader's \`compilerOptions\` loader option.
-- For vue-cli: see https://cli.vuejs.org/guide/webpack.html#modifying-options-of-a-loader
-- For vite: pass it via @vitejs/plugin-vue options. See https://github.com/vitejs/vite-plugin-vue/tree/main/packages/plugin-vue#example-for-passing-options-to-vuecompiler-sfc`;
-    Object.defineProperty(app.config, "compilerOptions", {
-      get() {
-        warn(msg);
-        return compilerOptions;
-      },
-      set() {
-        warn(msg);
-      }
-    });
-  }
-}
-function normalizeContainer(container) {
-  if (isString(container)) {
-    const res = document.querySelector(container);
-    if (!res) {
-      warn(
-        `Failed to mount app: mount target selector "${container}" returned null.`
-      );
-    }
-    return res;
-  }
-  if (window.ShadowRoot && container instanceof window.ShadowRoot && container.mode === "closed") {
-    warn(
-      `mounting on a ShadowRoot with \`{mode: "closed"}\` may lead to unpredictable bugs`
-    );
-  }
-  return container;
-}
-let ssrDirectiveInitialized = false;
-const initDirectivesForSSR = () => {
-  if (!ssrDirectiveInitialized) {
-    ssrDirectiveInitialized = true;
-    initVModelForSSR();
-    initVShowForSSR();
-  }
-} ;
-
-var runtimeDom = /*#__PURE__*/Object.freeze({
-  __proto__: null,
-  BaseTransition: BaseTransition,
-  BaseTransitionPropsValidators: BaseTransitionPropsValidators,
-  Comment: Comment,
-  DeprecationTypes: DeprecationTypes,
-  EffectScope: EffectScope,
-  ErrorCodes: ErrorCodes,
-  ErrorTypeStrings: ErrorTypeStrings,
-  Fragment: Fragment,
-  KeepAlive: KeepAlive,
-  ReactiveEffect: ReactiveEffect,
-  Static: Static,
-  Suspense: Suspense,
-  Teleport: Teleport,
-  Text: Text,
-  TrackOpTypes: TrackOpTypes,
-  Transition: Transition,
-  TransitionGroup: TransitionGroup,
-  TriggerOpTypes: TriggerOpTypes,
-  VueElement: VueElement,
-  assertNumber: assertNumber,
-  callWithAsyncErrorHandling: callWithAsyncErrorHandling,
-  callWithErrorHandling: callWithErrorHandling,
-  camelize: camelize,
-  capitalize: capitalize,
-  cloneVNode: cloneVNode,
-  compatUtils: compatUtils,
-  computed: computed,
-  createApp: createApp,
-  createBlock: createBlock,
-  createCommentVNode: createCommentVNode,
-  createElementBlock: createElementBlock,
-  createElementVNode: createBaseVNode,
-  createHydrationRenderer: createHydrationRenderer,
-  createPropsRestProxy: createPropsRestProxy,
-  createRenderer: createRenderer,
-  createSSRApp: createSSRApp,
-  createSlots: createSlots,
-  createStaticVNode: createStaticVNode,
-  createTextVNode: createTextVNode,
-  createVNode: createVNode,
-  customRef: customRef,
-  defineAsyncComponent: defineAsyncComponent,
-  defineComponent: defineComponent,
-  defineCustomElement: defineCustomElement,
-  defineEmits: defineEmits,
-  defineExpose: defineExpose,
-  defineModel: defineModel,
-  defineOptions: defineOptions,
-  defineProps: defineProps,
-  defineSSRCustomElement: defineSSRCustomElement,
-  defineSlots: defineSlots,
-  devtools: devtools,
-  effect: effect,
-  effectScope: effectScope,
-  getCurrentInstance: getCurrentInstance,
-  getCurrentScope: getCurrentScope,
-  getCurrentWatcher: getCurrentWatcher,
-  getTransitionRawChildren: getTransitionRawChildren,
-  guardReactiveProps: guardReactiveProps,
-  h: h,
-  handleError: handleError,
-  hasInjectionContext: hasInjectionContext,
-  hydrate: hydrate,
-  hydrateOnIdle: hydrateOnIdle,
-  hydrateOnInteraction: hydrateOnInteraction,
-  hydrateOnMediaQuery: hydrateOnMediaQuery,
-  hydrateOnVisible: hydrateOnVisible,
-  initCustomFormatter: initCustomFormatter,
-  initDirectivesForSSR: initDirectivesForSSR,
-  inject: inject,
-  isMemoSame: isMemoSame,
-  isProxy: isProxy,
-  isReactive: isReactive,
-  isReadonly: isReadonly,
-  isRef: isRef,
-  isRuntimeOnly: isRuntimeOnly,
-  isShallow: isShallow,
-  isVNode: isVNode,
-  markRaw: markRaw,
-  mergeDefaults: mergeDefaults,
-  mergeModels: mergeModels,
-  mergeProps: mergeProps,
-  nextTick: nextTick,
-  normalizeClass: normalizeClass,
-  normalizeProps: normalizeProps,
-  normalizeStyle: normalizeStyle,
-  onActivated: onActivated,
-  onBeforeMount: onBeforeMount,
-  onBeforeUnmount: onBeforeUnmount,
-  onBeforeUpdate: onBeforeUpdate,
-  onDeactivated: onDeactivated,
-  onErrorCaptured: onErrorCaptured,
-  onMounted: onMounted,
-  onRenderTracked: onRenderTracked,
-  onRenderTriggered: onRenderTriggered,
-  onScopeDispose: onScopeDispose,
-  onServerPrefetch: onServerPrefetch,
-  onUnmounted: onUnmounted,
-  onUpdated: onUpdated,
-  onWatcherCleanup: onWatcherCleanup,
-  openBlock: openBlock,
-  popScopeId: popScopeId,
-  provide: provide,
-  proxyRefs: proxyRefs,
-  pushScopeId: pushScopeId,
-  queuePostFlushCb: queuePostFlushCb,
-  reactive: reactive,
-  readonly: readonly,
-  ref: ref,
-  registerRuntimeCompiler: registerRuntimeCompiler,
-  render: render,
-  renderList: renderList,
-  renderSlot: renderSlot,
-  resolveComponent: resolveComponent,
-  resolveDirective: resolveDirective,
-  resolveDynamicComponent: resolveDynamicComponent,
-  resolveFilter: resolveFilter,
-  resolveTransitionHooks: resolveTransitionHooks,
-  setBlockTracking: setBlockTracking,
-  setDevtoolsHook: setDevtoolsHook,
-  setTransitionHooks: setTransitionHooks,
-  shallowReactive: shallowReactive,
-  shallowReadonly: shallowReadonly,
-  shallowRef: shallowRef,
-  ssrContextKey: ssrContextKey,
-  ssrUtils: ssrUtils,
-  stop: stop,
-  toDisplayString: toDisplayString,
-  toHandlerKey: toHandlerKey,
-  toHandlers: toHandlers,
-  toRaw: toRaw,
-  toRef: toRef,
-  toRefs: toRefs,
-  toValue: toValue,
-  transformVNodeArgs: transformVNodeArgs,
-  triggerRef: triggerRef,
-  unref: unref,
-  useAttrs: useAttrs,
-  useCssModule: useCssModule,
-  useCssVars: useCssVars,
-  useHost: useHost,
-  useId: useId,
-  useModel: useModel,
-  useSSRContext: useSSRContext,
-  useShadowRoot: useShadowRoot,
-  useSlots: useSlots,
-  useTemplateRef: useTemplateRef,
-  useTransitionState: useTransitionState,
-  vModelCheckbox: vModelCheckbox,
-  vModelDynamic: vModelDynamic,
-  vModelRadio: vModelRadio,
-  vModelSelect: vModelSelect,
-  vModelText: vModelText,
-  vShow: vShow,
-  version: version,
-  warn: warn,
-  watch: watch,
-  watchEffect: watchEffect,
-  watchPostEffect: watchPostEffect,
-  watchSyncEffect: watchSyncEffect,
-  withAsyncContext: withAsyncContext,
-  withCtx: withCtx,
-  withDefaults: withDefaults,
-  withDirectives: withDirectives,
-  withKeys: withKeys,
-  withMemo: withMemo,
-  withModifiers: withModifiers,
-  withScopeId: withScopeId
-});
-
-function initDev() {
-  {
-    {
-      console.info(
-        `You are running a development build of Vue.
-Make sure to use the production build (*.prod.js) when deploying for production.`
-      );
-    }
-    initCustomFormatter();
-  }
-}
-
-const FRAGMENT = Symbol(`Fragment` );
-const TELEPORT = Symbol(`Teleport` );
-const SUSPENSE = Symbol(`Suspense` );
-const KEEP_ALIVE = Symbol(`KeepAlive` );
-const BASE_TRANSITION = Symbol(
-  `BaseTransition` 
-);
-const OPEN_BLOCK = Symbol(`openBlock` );
-const CREATE_BLOCK = Symbol(`createBlock` );
-const CREATE_ELEMENT_BLOCK = Symbol(
-  `createElementBlock` 
-);
-const CREATE_VNODE = Symbol(`createVNode` );
-const CREATE_ELEMENT_VNODE = Symbol(
-  `createElementVNode` 
-);
-const CREATE_COMMENT = Symbol(
-  `createCommentVNode` 
-);
-const CREATE_TEXT = Symbol(
-  `createTextVNode` 
-);
-const CREATE_STATIC = Symbol(
-  `createStaticVNode` 
-);
-const RESOLVE_COMPONENT = Symbol(
-  `resolveComponent` 
-);
-const RESOLVE_DYNAMIC_COMPONENT = Symbol(
-  `resolveDynamicComponent` 
-);
-const RESOLVE_DIRECTIVE = Symbol(
-  `resolveDirective` 
-);
-const RESOLVE_FILTER = Symbol(
-  `resolveFilter` 
-);
-const WITH_DIRECTIVES = Symbol(
-  `withDirectives` 
-);
-const RENDER_LIST = Symbol(`renderList` );
-const RENDER_SLOT = Symbol(`renderSlot` );
-const CREATE_SLOTS = Symbol(`createSlots` );
-const TO_DISPLAY_STRING = Symbol(
-  `toDisplayString` 
-);
-const MERGE_PROPS = Symbol(`mergeProps` );
-const NORMALIZE_CLASS = Symbol(
-  `normalizeClass` 
-);
-const NORMALIZE_STYLE = Symbol(
-  `normalizeStyle` 
-);
-const NORMALIZE_PROPS = Symbol(
-  `normalizeProps` 
-);
-const GUARD_REACTIVE_PROPS = Symbol(
-  `guardReactiveProps` 
-);
-const TO_HANDLERS = Symbol(`toHandlers` );
-const CAMELIZE = Symbol(`camelize` );
-const CAPITALIZE = Symbol(`capitalize` );
-const TO_HANDLER_KEY = Symbol(
-  `toHandlerKey` 
-);
-const SET_BLOCK_TRACKING = Symbol(
-  `setBlockTracking` 
-);
-const PUSH_SCOPE_ID = Symbol(`pushScopeId` );
-const POP_SCOPE_ID = Symbol(`popScopeId` );
-const WITH_CTX = Symbol(`withCtx` );
-const UNREF = Symbol(`unref` );
-const IS_REF = Symbol(`isRef` );
-const WITH_MEMO = Symbol(`withMemo` );
-const IS_MEMO_SAME = Symbol(`isMemoSame` );
-const helperNameMap = {
-  [FRAGMENT]: `Fragment`,
-  [TELEPORT]: `Teleport`,
-  [SUSPENSE]: `Suspense`,
-  [KEEP_ALIVE]: `KeepAlive`,
-  [BASE_TRANSITION]: `BaseTransition`,
-  [OPEN_BLOCK]: `openBlock`,
-  [CREATE_BLOCK]: `createBlock`,
-  [CREATE_ELEMENT_BLOCK]: `createElementBlock`,
-  [CREATE_VNODE]: `createVNode`,
-  [CREATE_ELEMENT_VNODE]: `createElementVNode`,
-  [CREATE_COMMENT]: `createCommentVNode`,
-  [CREATE_TEXT]: `createTextVNode`,
-  [CREATE_STATIC]: `createStaticVNode`,
-  [RESOLVE_COMPONENT]: `resolveComponent`,
-  [RESOLVE_DYNAMIC_COMPONENT]: `resolveDynamicComponent`,
-  [RESOLVE_DIRECTIVE]: `resolveDirective`,
-  [RESOLVE_FILTER]: `resolveFilter`,
-  [WITH_DIRECTIVES]: `withDirectives`,
-  [RENDER_LIST]: `renderList`,
-  [RENDER_SLOT]: `renderSlot`,
-  [CREATE_SLOTS]: `createSlots`,
-  [TO_DISPLAY_STRING]: `toDisplayString`,
-  [MERGE_PROPS]: `mergeProps`,
-  [NORMALIZE_CLASS]: `normalizeClass`,
-  [NORMALIZE_STYLE]: `normalizeStyle`,
-  [NORMALIZE_PROPS]: `normalizeProps`,
-  [GUARD_REACTIVE_PROPS]: `guardReactiveProps`,
-  [TO_HANDLERS]: `toHandlers`,
-  [CAMELIZE]: `camelize`,
-  [CAPITALIZE]: `capitalize`,
-  [TO_HANDLER_KEY]: `toHandlerKey`,
-  [SET_BLOCK_TRACKING]: `setBlockTracking`,
-  [PUSH_SCOPE_ID]: `pushScopeId`,
-  [POP_SCOPE_ID]: `popScopeId`,
-  [WITH_CTX]: `withCtx`,
-  [UNREF]: `unref`,
-  [IS_REF]: `isRef`,
-  [WITH_MEMO]: `withMemo`,
-  [IS_MEMO_SAME]: `isMemoSame`
-};
-function registerRuntimeHelpers(helpers) {
-  Object.getOwnPropertySymbols(helpers).forEach((s) => {
-    helperNameMap[s] = helpers[s];
-  });
-}
-
-const locStub = {
-  start: { line: 1, column: 1, offset: 0 },
-  end: { line: 1, column: 1, offset: 0 },
-  source: ""
-};
-function createRoot(children, source = "") {
-  return {
-    type: 0,
-    source,
-    children,
-    helpers: /* @__PURE__ */ new Set(),
-    components: [],
-    directives: [],
-    hoists: [],
-    imports: [],
-    cached: [],
-    temps: 0,
-    codegenNode: void 0,
-    loc: locStub
-  };
-}
-function createVNodeCall(context, tag, props, children, patchFlag, dynamicProps, directives, isBlock = false, disableTracking = false, isComponent = false, loc = locStub) {
-  if (context) {
-    if (isBlock) {
-      context.helper(OPEN_BLOCK);
-      context.helper(getVNodeBlockHelper(context.inSSR, isComponent));
-    } else {
-      context.helper(getVNodeHelper(context.inSSR, isComponent));
-    }
-    if (directives) {
-      context.helper(WITH_DIRECTIVES);
-    }
-  }
-  return {
-    type: 13,
-    tag,
-    props,
-    children,
-    patchFlag,
-    dynamicProps,
-    directives,
-    isBlock,
-    disableTracking,
-    isComponent,
-    loc
-  };
-}
-function createArrayExpression(elements, loc = locStub) {
-  return {
-    type: 17,
-    loc,
-    elements
-  };
-}
-function createObjectExpression(properties, loc = locStub) {
-  return {
-    type: 15,
-    loc,
-    properties
-  };
-}
-function createObjectProperty(key, value) {
-  return {
-    type: 16,
-    loc: locStub,
-    key: isString(key) ? createSimpleExpression(key, true) : key,
-    value
-  };
-}
-function createSimpleExpression(content, isStatic = false, loc = locStub, constType = 0) {
-  return {
-    type: 4,
-    loc,
-    content,
-    isStatic,
-    constType: isStatic ? 3 : constType
-  };
-}
-function createCompoundExpression(children, loc = locStub) {
-  return {
-    type: 8,
-    loc,
-    children
-  };
-}
-function createCallExpression(callee, args = [], loc = locStub) {
-  return {
-    type: 14,
-    loc,
-    callee,
-    arguments: args
-  };
-}
-function createFunctionExpression(params, returns = void 0, newline = false, isSlot = false, loc = locStub) {
-  return {
-    type: 18,
-    params,
-    returns,
-    newline,
-    isSlot,
-    loc
-  };
-}
-function createConditionalExpression(test, consequent, alternate, newline = true) {
-  return {
-    type: 19,
-    test,
-    consequent,
-    alternate,
-    newline,
-    loc: locStub
-  };
-}
-function createCacheExpression(index, value, needPauseTracking = false) {
-  return {
-    type: 20,
-    index,
-    value,
-    needPauseTracking,
-    needArraySpread: false,
-    loc: locStub
-  };
-}
-function createBlockStatement(body) {
-  return {
-    type: 21,
-    body,
-    loc: locStub
-  };
-}
-function getVNodeHelper(ssr, isComponent) {
-  return ssr || isComponent ? CREATE_VNODE : CREATE_ELEMENT_VNODE;
-}
-function getVNodeBlockHelper(ssr, isComponent) {
-  return ssr || isComponent ? CREATE_BLOCK : CREATE_ELEMENT_BLOCK;
-}
-function convertToBlock(node, { helper, removeHelper, inSSR }) {
-  if (!node.isBlock) {
-    node.isBlock = true;
-    removeHelper(getVNodeHelper(inSSR, node.isComponent));
-    helper(OPEN_BLOCK);
-    helper(getVNodeBlockHelper(inSSR, node.isComponent));
-  }
-}
-
-const defaultDelimitersOpen = new Uint8Array([123, 123]);
-const defaultDelimitersClose = new Uint8Array([125, 125]);
-function isTagStartChar(c) {
-  return c >= 97 && c <= 122 || c >= 65 && c <= 90;
-}
-function isWhitespace(c) {
-  return c === 32 || c === 10 || c === 9 || c === 12 || c === 13;
-}
-function isEndOfTagSection(c) {
-  return c === 47 || c === 62 || isWhitespace(c);
-}
-function toCharCodes(str) {
-  const ret = new Uint8Array(str.length);
-  for (let i = 0; i < str.length; i++) {
-    ret[i] = str.charCodeAt(i);
-  }
-  return ret;
-}
-const Sequences = {
-  Cdata: new Uint8Array([67, 68, 65, 84, 65, 91]),
-  // CDATA[
-  CdataEnd: new Uint8Array([93, 93, 62]),
-  // ]]>
-  CommentEnd: new Uint8Array([45, 45, 62]),
-  // `-->`
-  ScriptEnd: new Uint8Array([60, 47, 115, 99, 114, 105, 112, 116]),
-  // `<\/script`
-  StyleEnd: new Uint8Array([60, 47, 115, 116, 121, 108, 101]),
-  // `</style`
-  TitleEnd: new Uint8Array([60, 47, 116, 105, 116, 108, 101]),
-  // `</title`
-  TextareaEnd: new Uint8Array([
-    60,
-    47,
-    116,
-    101,
-    120,
-    116,
-    97,
-    114,
-    101,
-    97
-  ])
-  // `</textarea
-};
-class Tokenizer {
-  constructor(stack, cbs) {
-    this.stack = stack;
-    this.cbs = cbs;
-    /** The current state the tokenizer is in. */
-    this.state = 1;
-    /** The read buffer. */
-    this.buffer = "";
-    /** The beginning of the section that is currently being read. */
-    this.sectionStart = 0;
-    /** The index within the buffer that we are currently looking at. */
-    this.index = 0;
-    /** The start of the last entity. */
-    this.entityStart = 0;
-    /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
-    this.baseState = 1;
-    /** For special parsing behavior inside of script and style tags. */
-    this.inRCDATA = false;
-    /** For disabling RCDATA tags handling */
-    this.inXML = false;
-    /** For disabling interpolation parsing in v-pre */
-    this.inVPre = false;
-    /** Record newline positions for fast line / column calculation */
-    this.newlines = [];
-    this.mode = 0;
-    this.delimiterOpen = defaultDelimitersOpen;
-    this.delimiterClose = defaultDelimitersClose;
-    this.delimiterIndex = -1;
-    this.currentSequence = void 0;
-    this.sequenceIndex = 0;
-  }
-  get inSFCRoot() {
-    return this.mode === 2 && this.stack.length === 0;
-  }
-  reset() {
-    this.state = 1;
-    this.mode = 0;
-    this.buffer = "";
-    this.sectionStart = 0;
-    this.index = 0;
-    this.baseState = 1;
-    this.inRCDATA = false;
-    this.currentSequence = void 0;
-    this.newlines.length = 0;
-    this.delimiterOpen = defaultDelimitersOpen;
-    this.delimiterClose = defaultDelimitersClose;
-  }
-  /**
-   * Generate Position object with line / column information using recorded
-   * newline positions. We know the index is always going to be an already
-   * processed index, so all the newlines up to this index should have been
-   * recorded.
-   */
-  getPos(index) {
-    let line = 1;
-    let column = index + 1;
-    for (let i = this.newlines.length - 1; i >= 0; i--) {
-      const newlineIndex = this.newlines[i];
-      if (index > newlineIndex) {
-        line = i + 2;
-        column = index - newlineIndex;
-        break;
-      }
-    }
-    return {
-      column,
-      line,
-      offset: index
-    };
-  }
-  peek() {
-    return this.buffer.charCodeAt(this.index + 1);
-  }
-  stateText(c) {
-    if (c === 60) {
-      if (this.index > this.sectionStart) {
-        this.cbs.ontext(this.sectionStart, this.index);
-      }
-      this.state = 5;
-      this.sectionStart = this.index;
-    } else if (!this.inVPre && c === this.delimiterOpen[0]) {
-      this.state = 2;
-      this.delimiterIndex = 0;
-      this.stateInterpolationOpen(c);
-    }
-  }
-  stateInterpolationOpen(c) {
-    if (c === this.delimiterOpen[this.delimiterIndex]) {
-      if (this.delimiterIndex === this.delimiterOpen.length - 1) {
-        const start = this.index + 1 - this.delimiterOpen.length;
-        if (start > this.sectionStart) {
-          this.cbs.ontext(this.sectionStart, start);
-        }
-        this.state = 3;
-        this.sectionStart = start;
-      } else {
-        this.delimiterIndex++;
-      }
-    } else if (this.inRCDATA) {
-      this.state = 32;
-      this.stateInRCDATA(c);
-    } else {
-      this.state = 1;
-      this.stateText(c);
-    }
-  }
-  stateInterpolation(c) {
-    if (c === this.delimiterClose[0]) {
-      this.state = 4;
-      this.delimiterIndex = 0;
-      this.stateInterpolationClose(c);
-    }
-  }
-  stateInterpolationClose(c) {
-    if (c === this.delimiterClose[this.delimiterIndex]) {
-      if (this.delimiterIndex === this.delimiterClose.length - 1) {
-        this.cbs.oninterpolation(this.sectionStart, this.index + 1);
-        if (this.inRCDATA) {
-          this.state = 32;
-        } else {
-          this.state = 1;
-        }
-        this.sectionStart = this.index + 1;
-      } else {
-        this.delimiterIndex++;
-      }
-    } else {
-      this.state = 3;
-      this.stateInterpolation(c);
-    }
-  }
-  stateSpecialStartSequence(c) {
-    const isEnd = this.sequenceIndex === this.currentSequence.length;
-    const isMatch = isEnd ? (
-      // If we are at the end of the sequence, make sure the tag name has ended
-      isEndOfTagSection(c)
-    ) : (
-      // Otherwise, do a case-insensitive comparison
-      (c | 32) === this.currentSequence[this.sequenceIndex]
-    );
-    if (!isMatch) {
-      this.inRCDATA = false;
-    } else if (!isEnd) {
-      this.sequenceIndex++;
-      return;
-    }
-    this.sequenceIndex = 0;
-    this.state = 6;
-    this.stateInTagName(c);
-  }
-  /** Look for an end tag. For <title> and <textarea>, also decode entities. */
-  stateInRCDATA(c) {
-    if (this.sequenceIndex === this.currentSequence.length) {
-      if (c === 62 || isWhitespace(c)) {
-        const endOfText = this.index - this.currentSequence.length;
-        if (this.sectionStart < endOfText) {
-          const actualIndex = this.index;
-          this.index = endOfText;
-          this.cbs.ontext(this.sectionStart, endOfText);
-          this.index = actualIndex;
-        }
-        this.sectionStart = endOfText + 2;
-        this.stateInClosingTagName(c);
-        this.inRCDATA = false;
-        return;
-      }
-      this.sequenceIndex = 0;
-    }
-    if ((c | 32) === this.currentSequence[this.sequenceIndex]) {
-      this.sequenceIndex += 1;
-    } else if (this.sequenceIndex === 0) {
-      if (this.currentSequence === Sequences.TitleEnd || this.currentSequence === Sequences.TextareaEnd && !this.inSFCRoot) {
-        if (!this.inVPre && c === this.delimiterOpen[0]) {
-          this.state = 2;
-          this.delimiterIndex = 0;
-          this.stateInterpolationOpen(c);
-        }
-      } else if (this.fastForwardTo(60)) {
-        this.sequenceIndex = 1;
-      }
-    } else {
-      this.sequenceIndex = Number(c === 60);
-    }
-  }
-  stateCDATASequence(c) {
-    if (c === Sequences.Cdata[this.sequenceIndex]) {
-      if (++this.sequenceIndex === Sequences.Cdata.length) {
-        this.state = 28;
-        this.currentSequence = Sequences.CdataEnd;
-        this.sequenceIndex = 0;
-        this.sectionStart = this.index + 1;
-      }
-    } else {
-      this.sequenceIndex = 0;
-      this.state = 23;
-      this.stateInDeclaration(c);
-    }
-  }
-  /**
-   * When we wait for one specific character, we can speed things up
-   * by skipping through the buffer until we find it.
-   *
-   * @returns Whether the character was found.
-   */
-  fastForwardTo(c) {
-    while (++this.index < this.buffer.length) {
-      const cc = this.buffer.charCodeAt(this.index);
-      if (cc === 10) {
-        this.newlines.push(this.index);
-      }
-      if (cc === c) {
-        return true;
-      }
-    }
-    this.index = this.buffer.length - 1;
-    return false;
-  }
-  /**
-   * Comments and CDATA end with `-->` and `]]>`.
-   *
-   * Their common qualities are:
-   * - Their end sequences have a distinct character they start with.
-   * - That character is then repeated, so we have to check multiple repeats.
-   * - All characters but the start character of the sequence can be skipped.
-   */
-  stateInCommentLike(c) {
-    if (c === this.currentSequence[this.sequenceIndex]) {
-      if (++this.sequenceIndex === this.currentSequence.length) {
-        if (this.currentSequence === Sequences.CdataEnd) {
-          this.cbs.oncdata(this.sectionStart, this.index - 2);
-        } else {
-          this.cbs.oncomment(this.sectionStart, this.index - 2);
-        }
-        this.sequenceIndex = 0;
-        this.sectionStart = this.index + 1;
-        this.state = 1;
-      }
-    } else if (this.sequenceIndex === 0) {
-      if (this.fastForwardTo(this.currentSequence[0])) {
-        this.sequenceIndex = 1;
-      }
-    } else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
-      this.sequenceIndex = 0;
-    }
-  }
-  startSpecial(sequence, offset) {
-    this.enterRCDATA(sequence, offset);
-    this.state = 31;
-  }
-  enterRCDATA(sequence, offset) {
-    this.inRCDATA = true;
-    this.currentSequence = sequence;
-    this.sequenceIndex = offset;
-  }
-  stateBeforeTagName(c) {
-    if (c === 33) {
-      this.state = 22;
-      this.sectionStart = this.index + 1;
-    } else if (c === 63) {
-      this.state = 24;
-      this.sectionStart = this.index + 1;
-    } else if (isTagStartChar(c)) {
-      this.sectionStart = this.index;
-      if (this.mode === 0) {
-        this.state = 6;
-      } else if (this.inSFCRoot) {
-        this.state = 34;
-      } else if (!this.inXML) {
-        if (c === 116) {
-          this.state = 30;
-        } else {
-          this.state = c === 115 ? 29 : 6;
-        }
-      } else {
-        this.state = 6;
-      }
-    } else if (c === 47) {
-      this.state = 8;
-    } else {
-      this.state = 1;
-      this.stateText(c);
-    }
-  }
-  stateInTagName(c) {
-    if (isEndOfTagSection(c)) {
-      this.handleTagName(c);
-    }
-  }
-  stateInSFCRootTagName(c) {
-    if (isEndOfTagSection(c)) {
-      const tag = this.buffer.slice(this.sectionStart, this.index);
-      if (tag !== "template") {
-        this.enterRCDATA(toCharCodes(`</` + tag), 0);
-      }
-      this.handleTagName(c);
-    }
-  }
-  handleTagName(c) {
-    this.cbs.onopentagname(this.sectionStart, this.index);
-    this.sectionStart = -1;
-    this.state = 11;
-    this.stateBeforeAttrName(c);
-  }
-  stateBeforeClosingTagName(c) {
-    if (isWhitespace(c)) ; else if (c === 62) {
-      {
-        this.cbs.onerr(14, this.index);
-      }
-      this.state = 1;
-      this.sectionStart = this.index + 1;
-    } else {
-      this.state = isTagStartChar(c) ? 9 : 27;
-      this.sectionStart = this.index;
-    }
-  }
-  stateInClosingTagName(c) {
-    if (c === 62 || isWhitespace(c)) {
-      this.cbs.onclosetag(this.sectionStart, this.index);
-      this.sectionStart = -1;
-      this.state = 10;
-      this.stateAfterClosingTagName(c);
-    }
-  }
-  stateAfterClosingTagName(c) {
-    if (c === 62) {
-      this.state = 1;
-      this.sectionStart = this.index + 1;
-    }
-  }
-  stateBeforeAttrName(c) {
-    if (c === 62) {
-      this.cbs.onopentagend(this.index);
-      if (this.inRCDATA) {
-        this.state = 32;
-      } else {
-        this.state = 1;
-      }
-      this.sectionStart = this.index + 1;
-    } else if (c === 47) {
-      this.state = 7;
-      if (this.peek() !== 62) {
-        this.cbs.onerr(22, this.index);
-      }
-    } else if (c === 60 && this.peek() === 47) {
-      this.cbs.onopentagend(this.index);
-      this.state = 5;
-      this.sectionStart = this.index;
-    } else if (!isWhitespace(c)) {
-      if (c === 61) {
-        this.cbs.onerr(
-          19,
-          this.index
-        );
-      }
-      this.handleAttrStart(c);
-    }
-  }
-  handleAttrStart(c) {
-    if (c === 118 && this.peek() === 45) {
-      this.state = 13;
-      this.sectionStart = this.index;
-    } else if (c === 46 || c === 58 || c === 64 || c === 35) {
-      this.cbs.ondirname(this.index, this.index + 1);
-      this.state = 14;
-      this.sectionStart = this.index + 1;
-    } else {
-      this.state = 12;
-      this.sectionStart = this.index;
-    }
-  }
-  stateInSelfClosingTag(c) {
-    if (c === 62) {
-      this.cbs.onselfclosingtag(this.index);
-      this.state = 1;
-      this.sectionStart = this.index + 1;
-      this.inRCDATA = false;
-    } else if (!isWhitespace(c)) {
-      this.state = 11;
-      this.stateBeforeAttrName(c);
-    }
-  }
-  stateInAttrName(c) {
-    if (c === 61 || isEndOfTagSection(c)) {
-      this.cbs.onattribname(this.sectionStart, this.index);
-      this.handleAttrNameEnd(c);
-    } else if (c === 34 || c === 39 || c === 60) {
-      this.cbs.onerr(
-        17,
-        this.index
-      );
-    }
-  }
-  stateInDirName(c) {
-    if (c === 61 || isEndOfTagSection(c)) {
-      this.cbs.ondirname(this.sectionStart, this.index);
-      this.handleAttrNameEnd(c);
-    } else if (c === 58) {
-      this.cbs.ondirname(this.sectionStart, this.index);
-      this.state = 14;
-      this.sectionStart = this.index + 1;
-    } else if (c === 46) {
-      this.cbs.ondirname(this.sectionStart, this.index);
-      this.state = 16;
-      this.sectionStart = this.index + 1;
-    }
-  }
-  stateInDirArg(c) {
-    if (c === 61 || isEndOfTagSection(c)) {
-      this.cbs.ondirarg(this.sectionStart, this.index);
-      this.handleAttrNameEnd(c);
-    } else if (c === 91) {
-      this.state = 15;
-    } else if (c === 46) {
-      this.cbs.ondirarg(this.sectionStart, this.index);
-      this.state = 16;
-      this.sectionStart = this.index + 1;
-    }
-  }
-  stateInDynamicDirArg(c) {
-    if (c === 93) {
-      this.state = 14;
-    } else if (c === 61 || isEndOfTagSection(c)) {
-      this.cbs.ondirarg(this.sectionStart, this.index + 1);
-      this.handleAttrNameEnd(c);
-      {
-        this.cbs.onerr(
-          27,
-          this.index
-        );
-      }
-    }
-  }
-  stateInDirModifier(c) {
-    if (c === 61 || isEndOfTagSection(c)) {
-      this.cbs.ondirmodifier(this.sectionStart, this.index);
-      this.handleAttrNameEnd(c);
-    } else if (c === 46) {
-      this.cbs.ondirmodifier(this.sectionStart, this.index);
-      this.sectionStart = this.index + 1;
-    }
-  }
-  handleAttrNameEnd(c) {
-    this.sectionStart = this.index;
-    this.state = 17;
-    this.cbs.onattribnameend(this.index);
-    this.stateAfterAttrName(c);
-  }
-  stateAfterAttrName(c) {
-    if (c === 61) {
-      this.state = 18;
-    } else if (c === 47 || c === 62) {
-      this.cbs.onattribend(0, this.sectionStart);
-      this.sectionStart = -1;
-      this.state = 11;
-      this.stateBeforeAttrName(c);
-    } else if (!isWhitespace(c)) {
-      this.cbs.onattribend(0, this.sectionStart);
-      this.handleAttrStart(c);
-    }
-  }
-  stateBeforeAttrValue(c) {
-    if (c === 34) {
-      this.state = 19;
-      this.sectionStart = this.index + 1;
-    } else if (c === 39) {
-      this.state = 20;
-      this.sectionStart = this.index + 1;
-    } else if (!isWhitespace(c)) {
-      this.sectionStart = this.index;
-      this.state = 21;
-      this.stateInAttrValueNoQuotes(c);
-    }
-  }
-  handleInAttrValue(c, quote) {
-    if (c === quote || this.fastForwardTo(quote)) {
-      this.cbs.onattribdata(this.sectionStart, this.index);
-      this.sectionStart = -1;
-      this.cbs.onattribend(
-        quote === 34 ? 3 : 2,
-        this.index + 1
-      );
-      this.state = 11;
-    }
-  }
-  stateInAttrValueDoubleQuotes(c) {
-    this.handleInAttrValue(c, 34);
-  }
-  stateInAttrValueSingleQuotes(c) {
-    this.handleInAttrValue(c, 39);
-  }
-  stateInAttrValueNoQuotes(c) {
-    if (isWhitespace(c) || c === 62) {
-      this.cbs.onattribdata(this.sectionStart, this.index);
-      this.sectionStart = -1;
-      this.cbs.onattribend(1, this.index);
-      this.state = 11;
-      this.stateBeforeAttrName(c);
-    } else if (c === 34 || c === 39 || c === 60 || c === 61 || c === 96) {
-      this.cbs.onerr(
-        18,
-        this.index
-      );
-    } else ;
-  }
-  stateBeforeDeclaration(c) {
-    if (c === 91) {
-      this.state = 26;
-      this.sequenceIndex = 0;
-    } else {
-      this.state = c === 45 ? 25 : 23;
-    }
-  }
-  stateInDeclaration(c) {
-    if (c === 62 || this.fastForwardTo(62)) {
-      this.state = 1;
-      this.sectionStart = this.index + 1;
-    }
-  }
-  stateInProcessingInstruction(c) {
-    if (c === 62 || this.fastForwardTo(62)) {
-      this.cbs.onprocessinginstruction(this.sectionStart, this.index);
-      this.state = 1;
-      this.sectionStart = this.index + 1;
-    }
-  }
-  stateBeforeComment(c) {
-    if (c === 45) {
-      this.state = 28;
-      this.currentSequence = Sequences.CommentEnd;
-      this.sequenceIndex = 2;
-      this.sectionStart = this.index + 1;
-    } else {
-      this.state = 23;
-    }
-  }
-  stateInSpecialComment(c) {
-    if (c === 62 || this.fastForwardTo(62)) {
-      this.cbs.oncomment(this.sectionStart, this.index);
-      this.state = 1;
-      this.sectionStart = this.index + 1;
-    }
-  }
-  stateBeforeSpecialS(c) {
-    if (c === Sequences.ScriptEnd[3]) {
-      this.startSpecial(Sequences.ScriptEnd, 4);
-    } else if (c === Sequences.StyleEnd[3]) {
-      this.startSpecial(Sequences.StyleEnd, 4);
-    } else {
-      this.state = 6;
-      this.stateInTagName(c);
-    }
-  }
-  stateBeforeSpecialT(c) {
-    if (c === Sequences.TitleEnd[3]) {
-      this.startSpecial(Sequences.TitleEnd, 4);
-    } else if (c === Sequences.TextareaEnd[3]) {
-      this.startSpecial(Sequences.TextareaEnd, 4);
-    } else {
-      this.state = 6;
-      this.stateInTagName(c);
-    }
-  }
-  startEntity() {
-  }
-  stateInEntity() {
-  }
-  /**
-   * Iterates through the buffer, calling the function corresponding to the current state.
-   *
-   * States that are more likely to be hit are higher up, as a performance improvement.
-   */
-  parse(input) {
-    this.buffer = input;
-    while (this.index < this.buffer.length) {
-      const c = this.buffer.charCodeAt(this.index);
-      if (c === 10) {
-        this.newlines.push(this.index);
-      }
-      switch (this.state) {
-        case 1: {
-          this.stateText(c);
-          break;
-        }
-        case 2: {
-          this.stateInterpolationOpen(c);
-          break;
-        }
-        case 3: {
-          this.stateInterpolation(c);
-          break;
-        }
-        case 4: {
-          this.stateInterpolationClose(c);
-          break;
-        }
-        case 31: {
-          this.stateSpecialStartSequence(c);
-          break;
-        }
-        case 32: {
-          this.stateInRCDATA(c);
-          break;
-        }
-        case 26: {
-          this.stateCDATASequence(c);
-          break;
-        }
-        case 19: {
-          this.stateInAttrValueDoubleQuotes(c);
-          break;
-        }
-        case 12: {
-          this.stateInAttrName(c);
-          break;
-        }
-        case 13: {
-          this.stateInDirName(c);
-          break;
-        }
-        case 14: {
-          this.stateInDirArg(c);
-          break;
-        }
-        case 15: {
-          this.stateInDynamicDirArg(c);
-          break;
-        }
-        case 16: {
-          this.stateInDirModifier(c);
-          break;
-        }
-        case 28: {
-          this.stateInCommentLike(c);
-          break;
-        }
-        case 27: {
-          this.stateInSpecialComment(c);
-          break;
-        }
-        case 11: {
-          this.stateBeforeAttrName(c);
-          break;
-        }
-        case 6: {
-          this.stateInTagName(c);
-          break;
-        }
-        case 34: {
-          this.stateInSFCRootTagName(c);
-          break;
-        }
-        case 9: {
-          this.stateInClosingTagName(c);
-          break;
-        }
-        case 5: {
-          this.stateBeforeTagName(c);
-          break;
-        }
-        case 17: {
-          this.stateAfterAttrName(c);
-          break;
-        }
-        case 20: {
-          this.stateInAttrValueSingleQuotes(c);
-          break;
-        }
-        case 18: {
-          this.stateBeforeAttrValue(c);
-          break;
-        }
-        case 8: {
-          this.stateBeforeClosingTagName(c);
-          break;
-        }
-        case 10: {
-          this.stateAfterClosingTagName(c);
-          break;
-        }
-        case 29: {
-          this.stateBeforeSpecialS(c);
-          break;
-        }
-        case 30: {
-          this.stateBeforeSpecialT(c);
-          break;
-        }
-        case 21: {
-          this.stateInAttrValueNoQuotes(c);
-          break;
-        }
-        case 7: {
-          this.stateInSelfClosingTag(c);
-          break;
-        }
-        case 23: {
-          this.stateInDeclaration(c);
-          break;
-        }
-        case 22: {
-          this.stateBeforeDeclaration(c);
-          break;
-        }
-        case 25: {
-          this.stateBeforeComment(c);
-          break;
-        }
-        case 24: {
-          this.stateInProcessingInstruction(c);
-          break;
-        }
-        case 33: {
-          this.stateInEntity();
-          break;
-        }
-      }
-      this.index++;
-    }
-    this.cleanup();
-    this.finish();
-  }
-  /**
-   * Remove data that has already been consumed from the buffer.
-   */
-  cleanup() {
-    if (this.sectionStart !== this.index) {
-      if (this.state === 1 || this.state === 32 && this.sequenceIndex === 0) {
-        this.cbs.ontext(this.sectionStart, this.index);
-        this.sectionStart = this.index;
-      } else if (this.state === 19 || this.state === 20 || this.state === 21) {
-        this.cbs.onattribdata(this.sectionStart, this.index);
-        this.sectionStart = this.index;
-      }
-    }
-  }
-  finish() {
-    this.handleTrailingData();
-    this.cbs.onend();
-  }
-  /** Handle any trailing data. */
-  handleTrailingData() {
-    const endIndex = this.buffer.length;
-    if (this.sectionStart >= endIndex) {
-      return;
-    }
-    if (this.state === 28) {
-      if (this.currentSequence === Sequences.CdataEnd) {
-        this.cbs.oncdata(this.sectionStart, endIndex);
-      } else {
-        this.cbs.oncomment(this.sectionStart, endIndex);
-      }
-    } else if (this.state === 6 || this.state === 11 || this.state === 18 || this.state === 17 || this.state === 12 || this.state === 13 || this.state === 14 || this.state === 15 || this.state === 16 || this.state === 20 || this.state === 19 || this.state === 21 || this.state === 9) ; else {
-      this.cbs.ontext(this.sectionStart, endIndex);
-    }
-  }
-  emitCodePoint(cp, consumed) {
-  }
-}
-
-function defaultOnError(error) {
-  throw error;
-}
-function defaultOnWarn(msg) {
-  console.warn(`[Vue warn] ${msg.message}`);
-}
-function createCompilerError(code, loc, messages, additionalMessage) {
-  const msg = (messages || errorMessages)[code] + (additionalMessage || ``) ;
-  const error = new SyntaxError(String(msg));
-  error.code = code;
-  error.loc = loc;
-  return error;
-}
-const errorMessages = {
-  // parse errors
-  [0]: "Illegal comment.",
-  [1]: "CDATA section is allowed only in XML context.",
-  [2]: "Duplicate attribute.",
-  [3]: "End tag cannot have attributes.",
-  [4]: "Illegal '/' in tags.",
-  [5]: "Unexpected EOF in tag.",
-  [6]: "Unexpected EOF in CDATA section.",
-  [7]: "Unexpected EOF in comment.",
-  [8]: "Unexpected EOF in script.",
-  [9]: "Unexpected EOF in tag.",
-  [10]: "Incorrectly closed comment.",
-  [11]: "Incorrectly opened comment.",
-  [12]: "Illegal tag name. Use '&lt;' to print '<'.",
-  [13]: "Attribute value was expected.",
-  [14]: "End tag name was expected.",
-  [15]: "Whitespace was expected.",
-  [16]: "Unexpected '<!--' in comment.",
-  [17]: `Attribute name cannot contain U+0022 ("), U+0027 ('), and U+003C (<).`,
-  [18]: "Unquoted attribute value cannot contain U+0022 (\"), U+0027 ('), U+003C (<), U+003D (=), and U+0060 (`).",
-  [19]: "Attribute name cannot start with '='.",
-  [21]: "'<?' is allowed only in XML context.",
-  [20]: `Unexpected null character.`,
-  [22]: "Illegal '/' in tags.",
-  // Vue-specific parse errors
-  [23]: "Invalid end tag.",
-  [24]: "Element is missing end tag.",
-  [25]: "Interpolation end sign was not found.",
-  [27]: "End bracket for dynamic directive argument was not found. Note that dynamic directive argument cannot contain spaces.",
-  [26]: "Legal directive name was expected.",
-  // transform errors
-  [28]: `v-if/v-else-if is missing expression.`,
-  [29]: `v-if/else branches must use unique keys.`,
-  [30]: `v-else/v-else-if has no adjacent v-if or v-else-if.`,
-  [31]: `v-for is missing expression.`,
-  [32]: `v-for has invalid expression.`,
-  [33]: `<template v-for> key should be placed on the <template> tag.`,
-  [34]: `v-bind is missing expression.`,
-  [52]: `v-bind with same-name shorthand only allows static argument.`,
-  [35]: `v-on is missing expression.`,
-  [36]: `Unexpected custom directive on <slot> outlet.`,
-  [37]: `Mixed v-slot usage on both the component and nested <template>. When there are multiple named slots, all slots should use <template> syntax to avoid scope ambiguity.`,
-  [38]: `Duplicate slot names found. `,
-  [39]: `Extraneous children found when component already has explicitly named default slot. These children will be ignored.`,
-  [40]: `v-slot can only be used on components or <template> tags.`,
-  [41]: `v-model is missing expression.`,
-  [42]: `v-model value must be a valid JavaScript member expression.`,
-  [43]: `v-model cannot be used on v-for or v-slot scope variables because they are not writable.`,
-  [44]: `v-model cannot be used on a prop, because local prop bindings are not writable.
-Use a v-bind binding combined with a v-on listener that emits update:x event instead.`,
-  [45]: `Error parsing JavaScript expression: `,
-  [46]: `<KeepAlive> expects exactly one child component.`,
-  [51]: `@vnode-* hooks in templates are no longer supported. Use the vue: prefix instead. For example, @vnode-mounted should be changed to @vue:mounted. @vnode-* hooks support has been removed in 3.4.`,
-  // generic errors
-  [47]: `"prefixIdentifiers" option is not supported in this build of compiler.`,
-  [48]: `ES module mode is not supported in this build of compiler.`,
-  [49]: `"cacheHandlers" option is only supported when the "prefixIdentifiers" option is enabled.`,
-  [50]: `"scopeId" option is only supported in module mode.`,
-  // just to fulfill types
-  [53]: ``
-};
-
-const isStaticExp = (p) => p.type === 4 && p.isStatic;
-function isCoreComponent(tag) {
-  switch (tag) {
-    case "Teleport":
-    case "teleport":
-      return TELEPORT;
-    case "Suspense":
-    case "suspense":
-      return SUSPENSE;
-    case "KeepAlive":
-    case "keep-alive":
-      return KEEP_ALIVE;
-    case "BaseTransition":
-    case "base-transition":
-      return BASE_TRANSITION;
-  }
-}
-const nonIdentifierRE = /^\d|[^\$\w\xA0-\uFFFF]/;
-const isSimpleIdentifier = (name) => !nonIdentifierRE.test(name);
-const validFirstIdentCharRE = /[A-Za-z_$\xA0-\uFFFF]/;
-const validIdentCharRE = /[\.\?\w$\xA0-\uFFFF]/;
-const whitespaceRE = /\s+[.[]\s*|\s*[.[]\s+/g;
-const getExpSource = (exp) => exp.type === 4 ? exp.content : exp.loc.source;
-const isMemberExpressionBrowser = (exp) => {
-  const path = getExpSource(exp).trim().replace(whitespaceRE, (s) => s.trim());
-  let state = 0 /* inMemberExp */;
-  let stateStack = [];
-  let currentOpenBracketCount = 0;
-  let currentOpenParensCount = 0;
-  let currentStringType = null;
-  for (let i = 0; i < path.length; i++) {
-    const char = path.charAt(i);
-    switch (state) {
-      case 0 /* inMemberExp */:
-        if (char === "[") {
-          stateStack.push(state);
-          state = 1 /* inBrackets */;
-          currentOpenBracketCount++;
-        } else if (char === "(") {
-          stateStack.push(state);
-          state = 2 /* inParens */;
-          currentOpenParensCount++;
-        } else if (!(i === 0 ? validFirstIdentCharRE : validIdentCharRE).test(char)) {
-          return false;
-        }
-        break;
-      case 1 /* inBrackets */:
-        if (char === `'` || char === `"` || char === "`") {
-          stateStack.push(state);
-          state = 3 /* inString */;
-          currentStringType = char;
-        } else if (char === `[`) {
-          currentOpenBracketCount++;
-        } else if (char === `]`) {
-          if (!--currentOpenBracketCount) {
-            state = stateStack.pop();
-          }
-        }
-        break;
-      case 2 /* inParens */:
-        if (char === `'` || char === `"` || char === "`") {
-          stateStack.push(state);
-          state = 3 /* inString */;
-          currentStringType = char;
-        } else if (char === `(`) {
-          currentOpenParensCount++;
-        } else if (char === `)`) {
-          if (i === path.length - 1) {
-            return false;
-          }
-          if (!--currentOpenParensCount) {
-            state = stateStack.pop();
-          }
-        }
-        break;
-      case 3 /* inString */:
-        if (char === currentStringType) {
-          state = stateStack.pop();
-          currentStringType = null;
-        }
-        break;
-    }
-  }
-  return !currentOpenBracketCount && !currentOpenParensCount;
-};
-const isMemberExpression = isMemberExpressionBrowser ;
-const fnExpRE = /^\s*(async\s*)?(\([^)]*?\)|[\w$_]+)\s*(:[^=]+)?=>|^\s*(async\s+)?function(?:\s+[\w$]+)?\s*\(/;
-const isFnExpressionBrowser = (exp) => fnExpRE.test(getExpSource(exp));
-const isFnExpression = isFnExpressionBrowser ;
-function assert(condition, msg) {
-  if (!condition) {
-    throw new Error(msg || `unexpected compiler condition`);
-  }
-}
-function findDir(node, name, allowEmpty = false) {
-  for (let i = 0; i < node.props.length; i++) {
-    const p = node.props[i];
-    if (p.type === 7 && (allowEmpty || p.exp) && (isString(name) ? p.name === name : name.test(p.name))) {
-      return p;
-    }
-  }
-}
-function findProp(node, name, dynamicOnly = false, allowEmpty = false) {
-  for (let i = 0; i < node.props.length; i++) {
-    const p = node.props[i];
-    if (p.type === 6) {
-      if (dynamicOnly) continue;
-      if (p.name === name && (p.value || allowEmpty)) {
-        return p;
-      }
-    } else if (p.name === "bind" && (p.exp || allowEmpty) && isStaticArgOf(p.arg, name)) {
-      return p;
-    }
-  }
-}
-function isStaticArgOf(arg, name) {
-  return !!(arg && isStaticExp(arg) && arg.content === name);
-}
-function hasDynamicKeyVBind(node) {
-  return node.props.some(
-    (p) => p.type === 7 && p.name === "bind" && (!p.arg || // v-bind="obj"
-    p.arg.type !== 4 || // v-bind:[_ctx.foo]
-    !p.arg.isStatic)
-    // v-bind:[foo]
-  );
-}
-function isText$1(node) {
-  return node.type === 5 || node.type === 2;
-}
-function isVSlot(p) {
-  return p.type === 7 && p.name === "slot";
-}
-function isTemplateNode(node) {
-  return node.type === 1 && node.tagType === 3;
-}
-function isSlotOutlet(node) {
-  return node.type === 1 && node.tagType === 2;
-}
-const propsHelperSet = /* @__PURE__ */ new Set([NORMALIZE_PROPS, GUARD_REACTIVE_PROPS]);
-function getUnnormalizedProps(props, callPath = []) {
-  if (props && !isString(props) && props.type === 14) {
-    const callee = props.callee;
-    if (!isString(callee) && propsHelperSet.has(callee)) {
-      return getUnnormalizedProps(
-        props.arguments[0],
-        callPath.concat(props)
-      );
-    }
-  }
-  return [props, callPath];
-}
-function injectProp(node, prop, context) {
-  let propsWithInjection;
-  let props = node.type === 13 ? node.props : node.arguments[2];
-  let callPath = [];
-  let parentCall;
-  if (props && !isString(props) && props.type === 14) {
-    const ret = getUnnormalizedProps(props);
-    props = ret[0];
-    callPath = ret[1];
-    parentCall = callPath[callPath.length - 1];
-  }
-  if (props == null || isString(props)) {
-    propsWithInjection = createObjectExpression([prop]);
-  } else if (props.type === 14) {
-    const first = props.arguments[0];
-    if (!isString(first) && first.type === 15) {
-      if (!hasProp(prop, first)) {
-        first.properties.unshift(prop);
-      }
-    } else {
-      if (props.callee === TO_HANDLERS) {
-        propsWithInjection = createCallExpression(context.helper(MERGE_PROPS), [
-          createObjectExpression([prop]),
-          props
-        ]);
-      } else {
-        props.arguments.unshift(createObjectExpression([prop]));
-      }
-    }
-    !propsWithInjection && (propsWithInjection = props);
-  } else if (props.type === 15) {
-    if (!hasProp(prop, props)) {
-      props.properties.unshift(prop);
-    }
-    propsWithInjection = props;
-  } else {
-    propsWithInjection = createCallExpression(context.helper(MERGE_PROPS), [
-      createObjectExpression([prop]),
-      props
-    ]);
-    if (parentCall && parentCall.callee === GUARD_REACTIVE_PROPS) {
-      parentCall = callPath[callPath.length - 2];
-    }
-  }
-  if (node.type === 13) {
-    if (parentCall) {
-      parentCall.arguments[0] = propsWithInjection;
-    } else {
-      node.props = propsWithInjection;
-    }
-  } else {
-    if (parentCall) {
-      parentCall.arguments[0] = propsWithInjection;
-    } else {
-      node.arguments[2] = propsWithInjection;
-    }
-  }
-}
-function hasProp(prop, props) {
-  let result = false;
-  if (prop.key.type === 4) {
-    const propKeyName = prop.key.content;
-    result = props.properties.some(
-      (p) => p.key.type === 4 && p.key.content === propKeyName
-    );
-  }
-  return result;
-}
-function toValidAssetId(name, type) {
-  return `_${type}_${name.replace(/[^\w]/g, (searchValue, replaceValue) => {
-    return searchValue === "-" ? "_" : name.charCodeAt(replaceValue).toString();
-  })}`;
-}
-function getMemoedVNodeCall(node) {
-  if (node.type === 14 && node.callee === WITH_MEMO) {
-    return node.arguments[1].returns;
-  } else {
-    return node;
-  }
-}
-const forAliasRE = /([\s\S]*?)\s+(?:in|of)\s+(\S[\s\S]*)/;
-
-const defaultParserOptions = {
-  parseMode: "base",
-  ns: 0,
-  delimiters: [`{{`, `}}`],
-  getNamespace: () => 0,
-  isVoidTag: NO,
-  isPreTag: NO,
-  isIgnoreNewlineTag: NO,
-  isCustomElement: NO,
-  onError: defaultOnError,
-  onWarn: defaultOnWarn,
-  comments: true,
-  prefixIdentifiers: false
-};
-let currentOptions = defaultParserOptions;
-let currentRoot = null;
-let currentInput = "";
-let currentOpenTag = null;
-let currentProp = null;
-let currentAttrValue = "";
-let currentAttrStartIndex = -1;
-let currentAttrEndIndex = -1;
-let inPre = 0;
-let inVPre = false;
-let currentVPreBoundary = null;
-const stack = [];
-const tokenizer = new Tokenizer(stack, {
-  onerr: emitError,
-  ontext(start, end) {
-    onText(getSlice(start, end), start, end);
-  },
-  ontextentity(char, start, end) {
-    onText(char, start, end);
-  },
-  oninterpolation(start, end) {
-    if (inVPre) {
-      return onText(getSlice(start, end), start, end);
-    }
-    let innerStart = start + tokenizer.delimiterOpen.length;
-    let innerEnd = end - tokenizer.delimiterClose.length;
-    while (isWhitespace(currentInput.charCodeAt(innerStart))) {
-      innerStart++;
-    }
-    while (isWhitespace(currentInput.charCodeAt(innerEnd - 1))) {
-      innerEnd--;
-    }
-    let exp = getSlice(innerStart, innerEnd);
-    if (exp.includes("&")) {
-      {
-        exp = currentOptions.decodeEntities(exp, false);
-      }
-    }
-    addNode({
-      type: 5,
-      content: createExp(exp, false, getLoc(innerStart, innerEnd)),
-      loc: getLoc(start, end)
-    });
-  },
-  onopentagname(start, end) {
-    const name = getSlice(start, end);
-    currentOpenTag = {
-      type: 1,
-      tag: name,
-      ns: currentOptions.getNamespace(name, stack[0], currentOptions.ns),
-      tagType: 0,
-      // will be refined on tag close
-      props: [],
-      children: [],
-      loc: getLoc(start - 1, end),
-      codegenNode: void 0
-    };
-  },
-  onopentagend(end) {
-    endOpenTag(end);
-  },
-  onclosetag(start, end) {
-    const name = getSlice(start, end);
-    if (!currentOptions.isVoidTag(name)) {
-      let found = false;
-      for (let i = 0; i < stack.length; i++) {
-        const e = stack[i];
-        if (e.tag.toLowerCase() === name.toLowerCase()) {
-          found = true;
-          if (i > 0) {
-            emitError(24, stack[0].loc.start.offset);
-          }
-          for (let j = 0; j <= i; j++) {
-            const el = stack.shift();
-            onCloseTag(el, end, j < i);
-          }
-          break;
-        }
-      }
-      if (!found) {
-        emitError(23, backTrack(start, 60));
-      }
-    }
-  },
-  onselfclosingtag(end) {
-    const name = currentOpenTag.tag;
-    currentOpenTag.isSelfClosing = true;
-    endOpenTag(end);
-    if (stack[0] && stack[0].tag === name) {
-      onCloseTag(stack.shift(), end);
-    }
-  },
-  onattribname(start, end) {
-    currentProp = {
-      type: 6,
-      name: getSlice(start, end),
-      nameLoc: getLoc(start, end),
-      value: void 0,
-      loc: getLoc(start)
-    };
-  },
-  ondirname(start, end) {
-    const raw = getSlice(start, end);
-    const name = raw === "." || raw === ":" ? "bind" : raw === "@" ? "on" : raw === "#" ? "slot" : raw.slice(2);
-    if (!inVPre && name === "") {
-      emitError(26, start);
-    }
-    if (inVPre || name === "") {
-      currentProp = {
-        type: 6,
-        name: raw,
-        nameLoc: getLoc(start, end),
-        value: void 0,
-        loc: getLoc(start)
-      };
-    } else {
-      currentProp = {
-        type: 7,
-        name,
-        rawName: raw,
-        exp: void 0,
-        arg: void 0,
-        modifiers: raw === "." ? [createSimpleExpression("prop")] : [],
-        loc: getLoc(start)
-      };
-      if (name === "pre") {
-        inVPre = tokenizer.inVPre = true;
-        currentVPreBoundary = currentOpenTag;
-        const props = currentOpenTag.props;
-        for (let i = 0; i < props.length; i++) {
-          if (props[i].type === 7) {
-            props[i] = dirToAttr(props[i]);
-          }
-        }
-      }
-    }
-  },
-  ondirarg(start, end) {
-    if (start === end) return;
-    const arg = getSlice(start, end);
-    if (inVPre) {
-      currentProp.name += arg;
-      setLocEnd(currentProp.nameLoc, end);
-    } else {
-      const isStatic = arg[0] !== `[`;
-      currentProp.arg = createExp(
-        isStatic ? arg : arg.slice(1, -1),
-        isStatic,
-        getLoc(start, end),
-        isStatic ? 3 : 0
-      );
-    }
-  },
-  ondirmodifier(start, end) {
-    const mod = getSlice(start, end);
-    if (inVPre) {
-      currentProp.name += "." + mod;
-      setLocEnd(currentProp.nameLoc, end);
-    } else if (currentProp.name === "slot") {
-      const arg = currentProp.arg;
-      if (arg) {
-        arg.content += "." + mod;
-        setLocEnd(arg.loc, end);
-      }
-    } else {
-      const exp = createSimpleExpression(mod, true, getLoc(start, end));
-      currentProp.modifiers.push(exp);
-    }
-  },
-  onattribdata(start, end) {
-    currentAttrValue += getSlice(start, end);
-    if (currentAttrStartIndex < 0) currentAttrStartIndex = start;
-    currentAttrEndIndex = end;
-  },
-  onattribentity(char, start, end) {
-    currentAttrValue += char;
-    if (currentAttrStartIndex < 0) currentAttrStartIndex = start;
-    currentAttrEndIndex = end;
-  },
-  onattribnameend(end) {
-    const start = currentProp.loc.start.offset;
-    const name = getSlice(start, end);
-    if (currentProp.type === 7) {
-      currentProp.rawName = name;
-    }
-    if (currentOpenTag.props.some(
-      (p) => (p.type === 7 ? p.rawName : p.name) === name
-    )) {
-      emitError(2, start);
-    }
-  },
-  onattribend(quote, end) {
-    if (currentOpenTag && currentProp) {
-      setLocEnd(currentProp.loc, end);
-      if (quote !== 0) {
-        if (currentAttrValue.includes("&")) {
-          currentAttrValue = currentOptions.decodeEntities(
-            currentAttrValue,
-            true
-          );
-        }
-        if (currentProp.type === 6) {
-          if (currentProp.name === "class") {
-            currentAttrValue = condense(currentAttrValue).trim();
-          }
-          if (quote === 1 && !currentAttrValue) {
-            emitError(13, end);
-          }
-          currentProp.value = {
-            type: 2,
-            content: currentAttrValue,
-            loc: quote === 1 ? getLoc(currentAttrStartIndex, currentAttrEndIndex) : getLoc(currentAttrStartIndex - 1, currentAttrEndIndex + 1)
-          };
-          if (tokenizer.inSFCRoot && currentOpenTag.tag === "template" && currentProp.name === "lang" && currentAttrValue && currentAttrValue !== "html") {
-            tokenizer.enterRCDATA(toCharCodes(`</template`), 0);
-          }
-        } else {
-          let expParseMode = 0 /* Normal */;
-          currentProp.exp = createExp(
-            currentAttrValue,
-            false,
-            getLoc(currentAttrStartIndex, currentAttrEndIndex),
-            0,
-            expParseMode
-          );
-          if (currentProp.name === "for") {
-            currentProp.forParseResult = parseForExpression(currentProp.exp);
-          }
-        }
-      }
-      if (currentProp.type !== 7 || currentProp.name !== "pre") {
-        currentOpenTag.props.push(currentProp);
-      }
-    }
-    currentAttrValue = "";
-    currentAttrStartIndex = currentAttrEndIndex = -1;
-  },
-  oncomment(start, end) {
-    if (currentOptions.comments) {
-      addNode({
-        type: 3,
-        content: getSlice(start, end),
-        loc: getLoc(start - 4, end + 3)
-      });
-    }
-  },
-  onend() {
-    const end = currentInput.length;
-    if (tokenizer.state !== 1) {
-      switch (tokenizer.state) {
-        case 5:
-        case 8:
-          emitError(5, end);
-          break;
-        case 3:
-        case 4:
-          emitError(
-            25,
-            tokenizer.sectionStart
-          );
-          break;
-        case 28:
-          if (tokenizer.currentSequence === Sequences.CdataEnd) {
-            emitError(6, end);
-          } else {
-            emitError(7, end);
-          }
-          break;
-        case 6:
-        case 7:
-        case 9:
-        case 11:
-        case 12:
-        case 13:
-        case 14:
-        case 15:
-        case 16:
-        case 17:
-        case 18:
-        case 19:
-        // "
-        case 20:
-        // '
-        case 21:
-          emitError(9, end);
-          break;
-      }
-    }
-    for (let index = 0; index < stack.length; index++) {
-      onCloseTag(stack[index], end - 1);
-      emitError(24, stack[index].loc.start.offset);
-    }
-  },
-  oncdata(start, end) {
-    if (stack[0].ns !== 0) {
-      onText(getSlice(start, end), start, end);
-    } else {
-      emitError(1, start - 9);
-    }
-  },
-  onprocessinginstruction(start) {
-    if ((stack[0] ? stack[0].ns : currentOptions.ns) === 0) {
-      emitError(
-        21,
-        start - 1
-      );
-    }
-  }
-});
-const forIteratorRE = /,([^,\}\]]*)(?:,([^,\}\]]*))?$/;
-const stripParensRE = /^\(|\)$/g;
-function parseForExpression(input) {
-  const loc = input.loc;
-  const exp = input.content;
-  const inMatch = exp.match(forAliasRE);
-  if (!inMatch) return;
-  const [, LHS, RHS] = inMatch;
-  const createAliasExpression = (content, offset, asParam = false) => {
-    const start = loc.start.offset + offset;
-    const end = start + content.length;
-    return createExp(
-      content,
-      false,
-      getLoc(start, end),
-      0,
-      asParam ? 1 /* Params */ : 0 /* Normal */
-    );
-  };
-  const result = {
-    source: createAliasExpression(RHS.trim(), exp.indexOf(RHS, LHS.length)),
-    value: void 0,
-    key: void 0,
-    index: void 0,
-    finalized: false
-  };
-  let valueContent = LHS.trim().replace(stripParensRE, "").trim();
-  const trimmedOffset = LHS.indexOf(valueContent);
-  const iteratorMatch = valueContent.match(forIteratorRE);
-  if (iteratorMatch) {
-    valueContent = valueContent.replace(forIteratorRE, "").trim();
-    const keyContent = iteratorMatch[1].trim();
-    let keyOffset;
-    if (keyContent) {
-      keyOffset = exp.indexOf(keyContent, trimmedOffset + valueContent.length);
-      result.key = createAliasExpression(keyContent, keyOffset, true);
-    }
-    if (iteratorMatch[2]) {
-      const indexContent = iteratorMatch[2].trim();
-      if (indexContent) {
-        result.index = createAliasExpression(
-          indexContent,
-          exp.indexOf(
-            indexContent,
-            result.key ? keyOffset + keyContent.length : trimmedOffset + valueContent.length
-          ),
-          true
-        );
-      }
-    }
-  }
-  if (valueContent) {
-    result.value = createAliasExpression(valueContent, trimmedOffset, true);
-  }
-  return result;
-}
-function getSlice(start, end) {
-  return currentInput.slice(start, end);
-}
-function endOpenTag(end) {
-  if (tokenizer.inSFCRoot) {
-    currentOpenTag.innerLoc = getLoc(end + 1, end + 1);
-  }
-  addNode(currentOpenTag);
-  const { tag, ns } = currentOpenTag;
-  if (ns === 0 && currentOptions.isPreTag(tag)) {
-    inPre++;
-  }
-  if (currentOptions.isVoidTag(tag)) {
-    onCloseTag(currentOpenTag, end);
-  } else {
-    stack.unshift(currentOpenTag);
-    if (ns === 1 || ns === 2) {
-      tokenizer.inXML = true;
-    }
-  }
-  currentOpenTag = null;
-}
-function onText(content, start, end) {
-  {
-    const tag = stack[0] && stack[0].tag;
-    if (tag !== "script" && tag !== "style" && content.includes("&")) {
-      content = currentOptions.decodeEntities(content, false);
-    }
-  }
-  const parent = stack[0] || currentRoot;
-  const lastNode = parent.children[parent.children.length - 1];
-  if (lastNode && lastNode.type === 2) {
-    lastNode.content += content;
-    setLocEnd(lastNode.loc, end);
-  } else {
-    parent.children.push({
-      type: 2,
-      content,
-      loc: getLoc(start, end)
-    });
-  }
-}
-function onCloseTag(el, end, isImplied = false) {
-  if (isImplied) {
-    setLocEnd(el.loc, backTrack(end, 60));
-  } else {
-    setLocEnd(el.loc, lookAhead(end, 62) + 1);
-  }
-  if (tokenizer.inSFCRoot) {
-    if (el.children.length) {
-      el.innerLoc.end = extend({}, el.children[el.children.length - 1].loc.end);
-    } else {
-      el.innerLoc.end = extend({}, el.innerLoc.start);
-    }
-    el.innerLoc.source = getSlice(
-      el.innerLoc.start.offset,
-      el.innerLoc.end.offset
-    );
-  }
-  const { tag, ns, children } = el;
-  if (!inVPre) {
-    if (tag === "slot") {
-      el.tagType = 2;
-    } else if (isFragmentTemplate(el)) {
-      el.tagType = 3;
-    } else if (isComponent(el)) {
-      el.tagType = 1;
-    }
-  }
-  if (!tokenizer.inRCDATA) {
-    el.children = condenseWhitespace(children);
-  }
-  if (ns === 0 && currentOptions.isIgnoreNewlineTag(tag)) {
-    const first = children[0];
-    if (first && first.type === 2) {
-      first.content = first.content.replace(/^\r?\n/, "");
-    }
-  }
-  if (ns === 0 && currentOptions.isPreTag(tag)) {
-    inPre--;
-  }
-  if (currentVPreBoundary === el) {
-    inVPre = tokenizer.inVPre = false;
-    currentVPreBoundary = null;
-  }
-  if (tokenizer.inXML && (stack[0] ? stack[0].ns : currentOptions.ns) === 0) {
-    tokenizer.inXML = false;
-  }
-}
-function lookAhead(index, c) {
-  let i = index;
-  while (currentInput.charCodeAt(i) !== c && i < currentInput.length - 1) i++;
-  return i;
-}
-function backTrack(index, c) {
-  let i = index;
-  while (currentInput.charCodeAt(i) !== c && i >= 0) i--;
-  return i;
-}
-const specialTemplateDir = /* @__PURE__ */ new Set(["if", "else", "else-if", "for", "slot"]);
-function isFragmentTemplate({ tag, props }) {
-  if (tag === "template") {
-    for (let i = 0; i < props.length; i++) {
-      if (props[i].type === 7 && specialTemplateDir.has(props[i].name)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-function isComponent({ tag, props }) {
-  if (currentOptions.isCustomElement(tag)) {
-    return false;
-  }
-  if (tag === "component" || isUpperCase(tag.charCodeAt(0)) || isCoreComponent(tag) || currentOptions.isBuiltInComponent && currentOptions.isBuiltInComponent(tag) || currentOptions.isNativeTag && !currentOptions.isNativeTag(tag)) {
-    return true;
-  }
-  for (let i = 0; i < props.length; i++) {
-    const p = props[i];
-    if (p.type === 6) {
-      if (p.name === "is" && p.value) {
-        if (p.value.content.startsWith("vue:")) {
-          return true;
-        }
-      }
-    }
-  }
-  return false;
-}
-function isUpperCase(c) {
-  return c > 64 && c < 91;
-}
-const windowsNewlineRE = /\r\n/g;
-function condenseWhitespace(nodes, tag) {
-  const shouldCondense = currentOptions.whitespace !== "preserve";
-  let removedWhitespace = false;
-  for (let i = 0; i < nodes.length; i++) {
-    const node = nodes[i];
-    if (node.type === 2) {
-      if (!inPre) {
-        if (isAllWhitespace(node.content)) {
-          const prev = nodes[i - 1] && nodes[i - 1].type;
-          const next = nodes[i + 1] && nodes[i + 1].type;
-          if (!prev || !next || shouldCondense && (prev === 3 && (next === 3 || next === 1) || prev === 1 && (next === 3 || next === 1 && hasNewlineChar(node.content)))) {
-            removedWhitespace = true;
-            nodes[i] = null;
-          } else {
-            node.content = " ";
-          }
-        } else if (shouldCondense) {
-          node.content = condense(node.content);
-        }
-      } else {
-        node.content = node.content.replace(windowsNewlineRE, "\n");
-      }
-    }
-  }
-  return removedWhitespace ? nodes.filter(Boolean) : nodes;
-}
-function isAllWhitespace(str) {
-  for (let i = 0; i < str.length; i++) {
-    if (!isWhitespace(str.charCodeAt(i))) {
-      return false;
-    }
-  }
-  return true;
-}
-function hasNewlineChar(str) {
-  for (let i = 0; i < str.length; i++) {
-    const c = str.charCodeAt(i);
-    if (c === 10 || c === 13) {
-      return true;
-    }
-  }
-  return false;
-}
-function condense(str) {
-  let ret = "";
-  let prevCharIsWhitespace = false;
-  for (let i = 0; i < str.length; i++) {
-    if (isWhitespace(str.charCodeAt(i))) {
-      if (!prevCharIsWhitespace) {
-        ret += " ";
-        prevCharIsWhitespace = true;
-      }
-    } else {
-      ret += str[i];
-      prevCharIsWhitespace = false;
-    }
-  }
-  return ret;
-}
-function addNode(node) {
-  (stack[0] || currentRoot).children.push(node);
-}
-function getLoc(start, end) {
-  return {
-    start: tokenizer.getPos(start),
-    // @ts-expect-error allow late attachment
-    end: end == null ? end : tokenizer.getPos(end),
-    // @ts-expect-error allow late attachment
-    source: end == null ? end : getSlice(start, end)
-  };
-}
-function cloneLoc(loc) {
-  return getLoc(loc.start.offset, loc.end.offset);
-}
-function setLocEnd(loc, end) {
-  loc.end = tokenizer.getPos(end);
-  loc.source = getSlice(loc.start.offset, end);
-}
-function dirToAttr(dir) {
-  const attr = {
-    type: 6,
-    name: dir.rawName,
-    nameLoc: getLoc(
-      dir.loc.start.offset,
-      dir.loc.start.offset + dir.rawName.length
-    ),
-    value: void 0,
-    loc: dir.loc
-  };
-  if (dir.exp) {
-    const loc = dir.exp.loc;
-    if (loc.end.offset < dir.loc.end.offset) {
-      loc.start.offset--;
-      loc.start.column--;
-      loc.end.offset++;
-      loc.end.column++;
-    }
-    attr.value = {
-      type: 2,
-      content: dir.exp.content,
-      loc
-    };
-  }
-  return attr;
-}
-function createExp(content, isStatic = false, loc, constType = 0, parseMode = 0 /* Normal */) {
-  const exp = createSimpleExpression(content, isStatic, loc, constType);
-  return exp;
-}
-function emitError(code, index, message) {
-  currentOptions.onError(
-    createCompilerError(code, getLoc(index, index), void 0, message)
-  );
-}
-function reset() {
-  tokenizer.reset();
-  currentOpenTag = null;
-  currentProp = null;
-  currentAttrValue = "";
-  currentAttrStartIndex = -1;
-  currentAttrEndIndex = -1;
-  stack.length = 0;
-}
-function baseParse(input, options) {
-  reset();
-  currentInput = input;
-  currentOptions = extend({}, defaultParserOptions);
-  if (options) {
-    let key;
-    for (key in options) {
-      if (options[key] != null) {
-        currentOptions[key] = options[key];
-      }
-    }
-  }
-  {
-    if (!currentOptions.decodeEntities) {
-      throw new Error(
-        `[@vue/compiler-core] decodeEntities option is required in browser builds.`
-      );
-    }
-  }
-  tokenizer.mode = currentOptions.parseMode === "html" ? 1 : currentOptions.parseMode === "sfc" ? 2 : 0;
-  tokenizer.inXML = currentOptions.ns === 1 || currentOptions.ns === 2;
-  const delimiters = options && options.delimiters;
-  if (delimiters) {
-    tokenizer.delimiterOpen = toCharCodes(delimiters[0]);
-    tokenizer.delimiterClose = toCharCodes(delimiters[1]);
-  }
-  const root = currentRoot = createRoot([], input);
-  tokenizer.parse(currentInput);
-  root.loc = getLoc(0, input.length);
-  root.children = condenseWhitespace(root.children);
-  currentRoot = null;
-  return root;
-}
-
-function cacheStatic(root, context) {
-  walk(
-    root,
-    void 0,
-    context,
-    // Root node is unfortunately non-hoistable due to potential parent
-    // fallthrough attributes.
-    isSingleElementRoot(root, root.children[0])
-  );
-}
-function isSingleElementRoot(root, child) {
-  const { children } = root;
-  return children.length === 1 && child.type === 1 && !isSlotOutlet(child);
-}
-function walk(node, parent, context, doNotHoistNode = false, inFor = false) {
-  const { children } = node;
-  const toCache = [];
-  for (let i = 0; i < children.length; i++) {
-    const child = children[i];
-    if (child.type === 1 && child.tagType === 0) {
-      const constantType = doNotHoistNode ? 0 : getConstantType(child, context);
-      if (constantType > 0) {
-        if (constantType >= 2) {
-          child.codegenNode.patchFlag = -1;
-          toCache.push(child);
-          continue;
-        }
-      } else {
-        const codegenNode = child.codegenNode;
-        if (codegenNode.type === 13) {
-          const flag = codegenNode.patchFlag;
-          if ((flag === void 0 || flag === 512 || flag === 1) && getGeneratedPropsConstantType(child, context) >= 2) {
-            const props = getNodeProps(child);
-            if (props) {
-              codegenNode.props = context.hoist(props);
-            }
-          }
-          if (codegenNode.dynamicProps) {
-            codegenNode.dynamicProps = context.hoist(codegenNode.dynamicProps);
-          }
-        }
-      }
-    } else if (child.type === 12) {
-      const constantType = doNotHoistNode ? 0 : getConstantType(child, context);
-      if (constantType >= 2) {
-        toCache.push(child);
-        continue;
-      }
-    }
-    if (child.type === 1) {
-      const isComponent = child.tagType === 1;
-      if (isComponent) {
-        context.scopes.vSlot++;
-      }
-      walk(child, node, context, false, inFor);
-      if (isComponent) {
-        context.scopes.vSlot--;
-      }
-    } else if (child.type === 11) {
-      walk(child, node, context, child.children.length === 1, true);
-    } else if (child.type === 9) {
-      for (let i2 = 0; i2 < child.branches.length; i2++) {
-        walk(
-          child.branches[i2],
-          node,
-          context,
-          child.branches[i2].children.length === 1,
-          inFor
-        );
-      }
-    }
-  }
-  let cachedAsArray = false;
-  if (toCache.length === children.length && node.type === 1) {
-    if (node.tagType === 0 && node.codegenNode && node.codegenNode.type === 13 && isArray(node.codegenNode.children)) {
-      node.codegenNode.children = getCacheExpression(
-        createArrayExpression(node.codegenNode.children)
-      );
-      cachedAsArray = true;
-    } else if (node.tagType === 1 && node.codegenNode && node.codegenNode.type === 13 && node.codegenNode.children && !isArray(node.codegenNode.children) && node.codegenNode.children.type === 15) {
-      const slot = getSlotNode(node.codegenNode, "default");
-      if (slot) {
-        slot.returns = getCacheExpression(
-          createArrayExpression(slot.returns)
-        );
-        cachedAsArray = true;
-      }
-    } else if (node.tagType === 3 && parent && parent.type === 1 && parent.tagType === 1 && parent.codegenNode && parent.codegenNode.type === 13 && parent.codegenNode.children && !isArray(parent.codegenNode.children) && parent.codegenNode.children.type === 15) {
-      const slotName = findDir(node, "slot", true);
-      const slot = slotName && slotName.arg && getSlotNode(parent.codegenNode, slotName.arg);
-      if (slot) {
-        slot.returns = getCacheExpression(
-          createArrayExpression(slot.returns)
-        );
-        cachedAsArray = true;
-      }
-    }
-  }
-  if (!cachedAsArray) {
-    for (const child of toCache) {
-      child.codegenNode = context.cache(child.codegenNode);
-    }
-  }
-  function getCacheExpression(value) {
-    const exp = context.cache(value);
-    if (inFor && context.hmr) {
-      exp.needArraySpread = true;
-    }
-    return exp;
-  }
-  function getSlotNode(node2, name) {
-    if (node2.children && !isArray(node2.children) && node2.children.type === 15) {
-      const slot = node2.children.properties.find(
-        (p) => p.key === name || p.key.content === name
-      );
-      return slot && slot.value;
-    }
-  }
-  if (toCache.length && context.transformHoist) {
-    context.transformHoist(children, context, node);
-  }
-}
-function getConstantType(node, context) {
-  const { constantCache } = context;
-  switch (node.type) {
-    case 1:
-      if (node.tagType !== 0) {
-        return 0;
-      }
-      const cached = constantCache.get(node);
-      if (cached !== void 0) {
-        return cached;
-      }
-      const codegenNode = node.codegenNode;
-      if (codegenNode.type !== 13) {
-        return 0;
-      }
-      if (codegenNode.isBlock && node.tag !== "svg" && node.tag !== "foreignObject" && node.tag !== "math") {
-        return 0;
-      }
-      if (codegenNode.patchFlag === void 0) {
-        let returnType2 = 3;
-        const generatedPropsType = getGeneratedPropsConstantType(node, context);
-        if (generatedPropsType === 0) {
-          constantCache.set(node, 0);
-          return 0;
-        }
-        if (generatedPropsType < returnType2) {
-          returnType2 = generatedPropsType;
-        }
-        for (let i = 0; i < node.children.length; i++) {
-          const childType = getConstantType(node.children[i], context);
-          if (childType === 0) {
-            constantCache.set(node, 0);
-            return 0;
-          }
-          if (childType < returnType2) {
-            returnType2 = childType;
-          }
-        }
-        if (returnType2 > 1) {
-          for (let i = 0; i < node.props.length; i++) {
-            const p = node.props[i];
-            if (p.type === 7 && p.name === "bind" && p.exp) {
-              const expType = getConstantType(p.exp, context);
-              if (expType === 0) {
-                constantCache.set(node, 0);
-                return 0;
-              }
-              if (expType < returnType2) {
-                returnType2 = expType;
-              }
-            }
-          }
-        }
-        if (codegenNode.isBlock) {
-          for (let i = 0; i < node.props.length; i++) {
-            const p = node.props[i];
-            if (p.type === 7) {
-              constantCache.set(node, 0);
-              return 0;
-            }
-          }
-          context.removeHelper(OPEN_BLOCK);
-          context.removeHelper(
-            getVNodeBlockHelper(context.inSSR, codegenNode.isComponent)
-          );
-          codegenNode.isBlock = false;
-          context.helper(getVNodeHelper(context.inSSR, codegenNode.isComponent));
-        }
-        constantCache.set(node, returnType2);
-        return returnType2;
-      } else {
-        constantCache.set(node, 0);
-        return 0;
-      }
-    case 2:
-    case 3:
-      return 3;
-    case 9:
-    case 11:
-    case 10:
-      return 0;
-    case 5:
-    case 12:
-      return getConstantType(node.content, context);
-    case 4:
-      return node.constType;
-    case 8:
-      let returnType = 3;
-      for (let i = 0; i < node.children.length; i++) {
-        const child = node.children[i];
-        if (isString(child) || isSymbol(child)) {
-          continue;
-        }
-        const childType = getConstantType(child, context);
-        if (childType === 0) {
-          return 0;
-        } else if (childType < returnType) {
-          returnType = childType;
-        }
-      }
-      return returnType;
-    case 20:
-      return 2;
-    default:
-      return 0;
-  }
-}
-const allowHoistedHelperSet = /* @__PURE__ */ new Set([
-  NORMALIZE_CLASS,
-  NORMALIZE_STYLE,
-  NORMALIZE_PROPS,
-  GUARD_REACTIVE_PROPS
-]);
-function getConstantTypeOfHelperCall(value, context) {
-  if (value.type === 14 && !isString(value.callee) && allowHoistedHelperSet.has(value.callee)) {
-    const arg = value.arguments[0];
-    if (arg.type === 4) {
-      return getConstantType(arg, context);
-    } else if (arg.type === 14) {
-      return getConstantTypeOfHelperCall(arg, context);
-    }
-  }
-  return 0;
-}
-function getGeneratedPropsConstantType(node, context) {
-  let returnType = 3;
-  const props = getNodeProps(node);
-  if (props && props.type === 15) {
-    const { properties } = props;
-    for (let i = 0; i < properties.length; i++) {
-      const { key, value } = properties[i];
-      const keyType = getConstantType(key, context);
-      if (keyType === 0) {
-        return keyType;
-      }
-      if (keyType < returnType) {
-        returnType = keyType;
-      }
-      let valueType;
-      if (value.type === 4) {
-        valueType = getConstantType(value, context);
-      } else if (value.type === 14) {
-        valueType = getConstantTypeOfHelperCall(value, context);
-      } else {
-        valueType = 0;
-      }
-      if (valueType === 0) {
-        return valueType;
-      }
-      if (valueType < returnType) {
-        returnType = valueType;
-      }
-    }
-  }
-  return returnType;
-}
-function getNodeProps(node) {
-  const codegenNode = node.codegenNode;
-  if (codegenNode.type === 13) {
-    return codegenNode.props;
-  }
-}
-
-function createTransformContext(root, {
-  filename = "",
-  prefixIdentifiers = false,
-  hoistStatic = false,
-  hmr = false,
-  cacheHandlers = false,
-  nodeTransforms = [],
-  directiveTransforms = {},
-  transformHoist = null,
-  isBuiltInComponent = NOOP,
-  isCustomElement = NOOP,
-  expressionPlugins = [],
-  scopeId = null,
-  slotted = true,
-  ssr = false,
-  inSSR = false,
-  ssrCssVars = ``,
-  bindingMetadata = EMPTY_OBJ,
-  inline = false,
-  isTS = false,
-  onError = defaultOnError,
-  onWarn = defaultOnWarn,
-  compatConfig
-}) {
-  const nameMatch = filename.replace(/\?.*$/, "").match(/([^/\\]+)\.\w+$/);
-  const context = {
-    // options
-    filename,
-    selfName: nameMatch && capitalize(camelize(nameMatch[1])),
-    prefixIdentifiers,
-    hoistStatic,
-    hmr,
-    cacheHandlers,
-    nodeTransforms,
-    directiveTransforms,
-    transformHoist,
-    isBuiltInComponent,
-    isCustomElement,
-    expressionPlugins,
-    scopeId,
-    slotted,
-    ssr,
-    inSSR,
-    ssrCssVars,
-    bindingMetadata,
-    inline,
-    isTS,
-    onError,
-    onWarn,
-    compatConfig,
-    // state
-    root,
-    helpers: /* @__PURE__ */ new Map(),
-    components: /* @__PURE__ */ new Set(),
-    directives: /* @__PURE__ */ new Set(),
-    hoists: [],
-    imports: [],
-    cached: [],
-    constantCache: /* @__PURE__ */ new WeakMap(),
-    temps: 0,
-    identifiers: /* @__PURE__ */ Object.create(null),
-    scopes: {
-      vFor: 0,
-      vSlot: 0,
-      vPre: 0,
-      vOnce: 0
-    },
-    parent: null,
-    grandParent: null,
-    currentNode: root,
-    childIndex: 0,
-    inVOnce: false,
-    // methods
-    helper(name) {
-      const count = context.helpers.get(name) || 0;
-      context.helpers.set(name, count + 1);
-      return name;
-    },
-    removeHelper(name) {
-      const count = context.helpers.get(name);
-      if (count) {
-        const currentCount = count - 1;
-        if (!currentCount) {
-          context.helpers.delete(name);
-        } else {
-          context.helpers.set(name, currentCount);
-        }
-      }
-    },
-    helperString(name) {
-      return `_${helperNameMap[context.helper(name)]}`;
-    },
-    replaceNode(node) {
-      {
-        if (!context.currentNode) {
-          throw new Error(`Node being replaced is already removed.`);
-        }
-        if (!context.parent) {
-          throw new Error(`Cannot replace root node.`);
-        }
-      }
-      context.parent.children[context.childIndex] = context.currentNode = node;
-    },
-    removeNode(node) {
-      if (!context.parent) {
-        throw new Error(`Cannot remove root node.`);
-      }
-      const list = context.parent.children;
-      const removalIndex = node ? list.indexOf(node) : context.currentNode ? context.childIndex : -1;
-      if (removalIndex < 0) {
-        throw new Error(`node being removed is not a child of current parent`);
-      }
-      if (!node || node === context.currentNode) {
-        context.currentNode = null;
-        context.onNodeRemoved();
-      } else {
-        if (context.childIndex > removalIndex) {
-          context.childIndex--;
-          context.onNodeRemoved();
-        }
-      }
-      context.parent.children.splice(removalIndex, 1);
-    },
-    onNodeRemoved: NOOP,
-    addIdentifiers(exp) {
-    },
-    removeIdentifiers(exp) {
-    },
-    hoist(exp) {
-      if (isString(exp)) exp = createSimpleExpression(exp);
-      context.hoists.push(exp);
-      const identifier = createSimpleExpression(
-        `_hoisted_${context.hoists.length}`,
-        false,
-        exp.loc,
-        2
-      );
-      identifier.hoisted = exp;
-      return identifier;
-    },
-    cache(exp, isVNode = false) {
-      const cacheExp = createCacheExpression(
-        context.cached.length,
-        exp,
-        isVNode
-      );
-      context.cached.push(cacheExp);
-      return cacheExp;
-    }
-  };
-  return context;
-}
-function transform(root, options) {
-  const context = createTransformContext(root, options);
-  traverseNode(root, context);
-  if (options.hoistStatic) {
-    cacheStatic(root, context);
-  }
-  if (!options.ssr) {
-    createRootCodegen(root, context);
-  }
-  root.helpers = /* @__PURE__ */ new Set([...context.helpers.keys()]);
-  root.components = [...context.components];
-  root.directives = [...context.directives];
-  root.imports = context.imports;
-  root.hoists = context.hoists;
-  root.temps = context.temps;
-  root.cached = context.cached;
-  root.transformed = true;
-}
-function createRootCodegen(root, context) {
-  const { helper } = context;
-  const { children } = root;
-  if (children.length === 1) {
-    const child = children[0];
-    if (isSingleElementRoot(root, child) && child.codegenNode) {
-      const codegenNode = child.codegenNode;
-      if (codegenNode.type === 13) {
-        convertToBlock(codegenNode, context);
-      }
-      root.codegenNode = codegenNode;
-    } else {
-      root.codegenNode = child;
-    }
-  } else if (children.length > 1) {
-    let patchFlag = 64;
-    if (children.filter((c) => c.type !== 3).length === 1) {
-      patchFlag |= 2048;
-    }
-    root.codegenNode = createVNodeCall(
-      context,
-      helper(FRAGMENT),
-      void 0,
-      root.children,
-      patchFlag,
-      void 0,
-      void 0,
-      true,
-      void 0,
-      false
-    );
-  } else ;
-}
-function traverseChildren(parent, context) {
-  let i = 0;
-  const nodeRemoved = () => {
-    i--;
-  };
-  for (; i < parent.children.length; i++) {
-    const child = parent.children[i];
-    if (isString(child)) continue;
-    context.grandParent = context.parent;
-    context.parent = parent;
-    context.childIndex = i;
-    context.onNodeRemoved = nodeRemoved;
-    traverseNode(child, context);
-  }
-}
-function traverseNode(node, context) {
-  context.currentNode = node;
-  const { nodeTransforms } = context;
-  const exitFns = [];
-  for (let i2 = 0; i2 < nodeTransforms.length; i2++) {
-    const onExit = nodeTransforms[i2](node, context);
-    if (onExit) {
-      if (isArray(onExit)) {
-        exitFns.push(...onExit);
-      } else {
-        exitFns.push(onExit);
-      }
-    }
-    if (!context.currentNode) {
-      return;
-    } else {
-      node = context.currentNode;
-    }
-  }
-  switch (node.type) {
-    case 3:
-      if (!context.ssr) {
-        context.helper(CREATE_COMMENT);
-      }
-      break;
-    case 5:
-      if (!context.ssr) {
-        context.helper(TO_DISPLAY_STRING);
-      }
-      break;
-    // for container types, further traverse downwards
-    case 9:
-      for (let i2 = 0; i2 < node.branches.length; i2++) {
-        traverseNode(node.branches[i2], context);
-      }
-      break;
-    case 10:
-    case 11:
-    case 1:
-    case 0:
-      traverseChildren(node, context);
-      break;
-  }
-  context.currentNode = node;
-  let i = exitFns.length;
-  while (i--) {
-    exitFns[i]();
-  }
-}
-function createStructuralDirectiveTransform(name, fn) {
-  const matches = isString(name) ? (n) => n === name : (n) => name.test(n);
-  return (node, context) => {
-    if (node.type === 1) {
-      const { props } = node;
-      if (node.tagType === 3 && props.some(isVSlot)) {
-        return;
-      }
-      const exitFns = [];
-      for (let i = 0; i < props.length; i++) {
-        const prop = props[i];
-        if (prop.type === 7 && matches(prop.name)) {
-          props.splice(i, 1);
-          i--;
-          const onExit = fn(node, prop, context);
-          if (onExit) exitFns.push(onExit);
-        }
-      }
-      return exitFns;
-    }
-  };
-}
-
-const PURE_ANNOTATION = `/*@__PURE__*/`;
-const aliasHelper = (s) => `${helperNameMap[s]}: _${helperNameMap[s]}`;
-function createCodegenContext(ast, {
-  mode = "function",
-  prefixIdentifiers = mode === "module",
-  sourceMap = false,
-  filename = `template.vue.html`,
-  scopeId = null,
-  optimizeImports = false,
-  runtimeGlobalName = `Vue`,
-  runtimeModuleName = `vue`,
-  ssrRuntimeModuleName = "vue/server-renderer",
-  ssr = false,
-  isTS = false,
-  inSSR = false
-}) {
-  const context = {
-    mode,
-    prefixIdentifiers,
-    sourceMap,
-    filename,
-    scopeId,
-    optimizeImports,
-    runtimeGlobalName,
-    runtimeModuleName,
-    ssrRuntimeModuleName,
-    ssr,
-    isTS,
-    inSSR,
-    source: ast.source,
-    code: ``,
-    column: 1,
-    line: 1,
-    offset: 0,
-    indentLevel: 0,
-    pure: false,
-    map: void 0,
-    helper(key) {
-      return `_${helperNameMap[key]}`;
-    },
-    push(code, newlineIndex = -2 /* None */, node) {
-      context.code += code;
-    },
-    indent() {
-      newline(++context.indentLevel);
-    },
-    deindent(withoutNewLine = false) {
-      if (withoutNewLine) {
-        --context.indentLevel;
-      } else {
-        newline(--context.indentLevel);
-      }
-    },
-    newline() {
-      newline(context.indentLevel);
-    }
-  };
-  function newline(n) {
-    context.push("\n" + `  `.repeat(n), 0 /* Start */);
-  }
-  return context;
-}
-function generate(ast, options = {}) {
-  const context = createCodegenContext(ast, options);
-  if (options.onContextCreated) options.onContextCreated(context);
-  const {
-    mode,
-    push,
-    prefixIdentifiers,
-    indent,
-    deindent,
-    newline,
-    scopeId,
-    ssr
-  } = context;
-  const helpers = Array.from(ast.helpers);
-  const hasHelpers = helpers.length > 0;
-  const useWithBlock = !prefixIdentifiers && mode !== "module";
-  const preambleContext = context;
-  {
-    genFunctionPreamble(ast, preambleContext);
-  }
-  const functionName = ssr ? `ssrRender` : `render`;
-  const args = ssr ? ["_ctx", "_push", "_parent", "_attrs"] : ["_ctx", "_cache"];
-  const signature = args.join(", ");
-  {
-    push(`function ${functionName}(${signature}) {`);
-  }
-  indent();
-  if (useWithBlock) {
-    push(`with (_ctx) {`);
-    indent();
-    if (hasHelpers) {
-      push(
-        `const { ${helpers.map(aliasHelper).join(", ")} } = _Vue
-`,
-        -1 /* End */
-      );
-      newline();
-    }
-  }
-  if (ast.components.length) {
-    genAssets(ast.components, "component", context);
-    if (ast.directives.length || ast.temps > 0) {
-      newline();
-    }
-  }
-  if (ast.directives.length) {
-    genAssets(ast.directives, "directive", context);
-    if (ast.temps > 0) {
-      newline();
-    }
-  }
-  if (ast.temps > 0) {
-    push(`let `);
-    for (let i = 0; i < ast.temps; i++) {
-      push(`${i > 0 ? `, ` : ``}_temp${i}`);
-    }
-  }
-  if (ast.components.length || ast.directives.length || ast.temps) {
-    push(`
-`, 0 /* Start */);
-    newline();
-  }
-  if (!ssr) {
-    push(`return `);
-  }
-  if (ast.codegenNode) {
-    genNode(ast.codegenNode, context);
-  } else {
-    push(`null`);
-  }
-  if (useWithBlock) {
-    deindent();
-    push(`}`);
-  }
-  deindent();
-  push(`}`);
-  return {
-    ast,
-    code: context.code,
-    preamble: ``,
-    map: context.map ? context.map.toJSON() : void 0
-  };
-}
-function genFunctionPreamble(ast, context) {
-  const {
-    ssr,
-    prefixIdentifiers,
-    push,
-    newline,
-    runtimeModuleName,
-    runtimeGlobalName,
-    ssrRuntimeModuleName
-  } = context;
-  const VueBinding = runtimeGlobalName;
-  const helpers = Array.from(ast.helpers);
-  if (helpers.length > 0) {
-    {
-      push(`const _Vue = ${VueBinding}
-`, -1 /* End */);
-      if (ast.hoists.length) {
-        const staticHelpers = [
-          CREATE_VNODE,
-          CREATE_ELEMENT_VNODE,
-          CREATE_COMMENT,
-          CREATE_TEXT,
-          CREATE_STATIC
-        ].filter((helper) => helpers.includes(helper)).map(aliasHelper).join(", ");
-        push(`const { ${staticHelpers} } = _Vue
-`, -1 /* End */);
-      }
-    }
-  }
-  genHoists(ast.hoists, context);
-  newline();
-  push(`return `);
-}
-function genAssets(assets, type, { helper, push, newline, isTS }) {
-  const resolver = helper(
-    type === "component" ? RESOLVE_COMPONENT : RESOLVE_DIRECTIVE
-  );
-  for (let i = 0; i < assets.length; i++) {
-    let id = assets[i];
-    const maybeSelfReference = id.endsWith("__self");
-    if (maybeSelfReference) {
-      id = id.slice(0, -6);
-    }
-    push(
-      `const ${toValidAssetId(id, type)} = ${resolver}(${JSON.stringify(id)}${maybeSelfReference ? `, true` : ``})${isTS ? `!` : ``}`
-    );
-    if (i < assets.length - 1) {
-      newline();
-    }
-  }
-}
-function genHoists(hoists, context) {
-  if (!hoists.length) {
-    return;
-  }
-  context.pure = true;
-  const { push, newline } = context;
-  newline();
-  for (let i = 0; i < hoists.length; i++) {
-    const exp = hoists[i];
-    if (exp) {
-      push(`const _hoisted_${i + 1} = `);
-      genNode(exp, context);
-      newline();
-    }
-  }
-  context.pure = false;
-}
-function isText(n) {
-  return isString(n) || n.type === 4 || n.type === 2 || n.type === 5 || n.type === 8;
-}
-function genNodeListAsArray(nodes, context) {
-  const multilines = nodes.length > 3 || nodes.some((n) => isArray(n) || !isText(n));
-  context.push(`[`);
-  multilines && context.indent();
-  genNodeList(nodes, context, multilines);
-  multilines && context.deindent();
-  context.push(`]`);
-}
-function genNodeList(nodes, context, multilines = false, comma = true) {
-  const { push, newline } = context;
-  for (let i = 0; i < nodes.length; i++) {
-    const node = nodes[i];
-    if (isString(node)) {
-      push(node, -3 /* Unknown */);
-    } else if (isArray(node)) {
-      genNodeListAsArray(node, context);
-    } else {
-      genNode(node, context);
-    }
-    if (i < nodes.length - 1) {
-      if (multilines) {
-        comma && push(",");
-        newline();
-      } else {
-        comma && push(", ");
-      }
-    }
-  }
-}
-function genNode(node, context) {
-  if (isString(node)) {
-    context.push(node, -3 /* Unknown */);
-    return;
-  }
-  if (isSymbol(node)) {
-    context.push(context.helper(node));
-    return;
-  }
-  switch (node.type) {
-    case 1:
-    case 9:
-    case 11:
-      assert(
-        node.codegenNode != null,
-        `Codegen node is missing for element/if/for node. Apply appropriate transforms first.`
-      );
-      genNode(node.codegenNode, context);
-      break;
-    case 2:
-      genText(node, context);
-      break;
-    case 4:
-      genExpression(node, context);
-      break;
-    case 5:
-      genInterpolation(node, context);
-      break;
-    case 12:
-      genNode(node.codegenNode, context);
-      break;
-    case 8:
-      genCompoundExpression(node, context);
-      break;
-    case 3:
-      genComment(node, context);
-      break;
-    case 13:
-      genVNodeCall(node, context);
-      break;
-    case 14:
-      genCallExpression(node, context);
-      break;
-    case 15:
-      genObjectExpression(node, context);
-      break;
-    case 17:
-      genArrayExpression(node, context);
-      break;
-    case 18:
-      genFunctionExpression(node, context);
-      break;
-    case 19:
-      genConditionalExpression(node, context);
-      break;
-    case 20:
-      genCacheExpression(node, context);
-      break;
-    case 21:
-      genNodeList(node.body, context, true, false);
-      break;
-    // SSR only types
-    case 22:
-      break;
-    case 23:
-      break;
-    case 24:
-      break;
-    case 25:
-      break;
-    case 26:
-      break;
-    /* v8 ignore start */
-    case 10:
-      break;
-    default:
-      {
-        assert(false, `unhandled codegen node type: ${node.type}`);
-        const exhaustiveCheck = node;
-        return exhaustiveCheck;
-      }
-  }
-}
-function genText(node, context) {
-  context.push(JSON.stringify(node.content), -3 /* Unknown */, node);
-}
-function genExpression(node, context) {
-  const { content, isStatic } = node;
-  context.push(
-    isStatic ? JSON.stringify(content) : content,
-    -3 /* Unknown */,
-    node
-  );
-}
-function genInterpolation(node, context) {
-  const { push, helper, pure } = context;
-  if (pure) push(PURE_ANNOTATION);
-  push(`${helper(TO_DISPLAY_STRING)}(`);
-  genNode(node.content, context);
-  push(`)`);
-}
-function genCompoundExpression(node, context) {
-  for (let i = 0; i < node.children.length; i++) {
-    const child = node.children[i];
-    if (isString(child)) {
-      context.push(child, -3 /* Unknown */);
-    } else {
-      genNode(child, context);
-    }
-  }
-}
-function genExpressionAsPropertyKey(node, context) {
-  const { push } = context;
-  if (node.type === 8) {
-    push(`[`);
-    genCompoundExpression(node, context);
-    push(`]`);
-  } else if (node.isStatic) {
-    const text = isSimpleIdentifier(node.content) ? node.content : JSON.stringify(node.content);
-    push(text, -2 /* None */, node);
-  } else {
-    push(`[${node.content}]`, -3 /* Unknown */, node);
-  }
-}
-function genComment(node, context) {
-  const { push, helper, pure } = context;
-  if (pure) {
-    push(PURE_ANNOTATION);
-  }
-  push(
-    `${helper(CREATE_COMMENT)}(${JSON.stringify(node.content)})`,
-    -3 /* Unknown */,
-    node
-  );
-}
-function genVNodeCall(node, context) {
-  const { push, helper, pure } = context;
-  const {
-    tag,
-    props,
-    children,
-    patchFlag,
-    dynamicProps,
-    directives,
-    isBlock,
-    disableTracking,
-    isComponent
-  } = node;
-  let patchFlagString;
-  if (patchFlag) {
-    {
-      if (patchFlag < 0) {
-        patchFlagString = patchFlag + ` /* ${PatchFlagNames[patchFlag]} */`;
-      } else {
-        const flagNames = Object.keys(PatchFlagNames).map(Number).filter((n) => n > 0 && patchFlag & n).map((n) => PatchFlagNames[n]).join(`, `);
-        patchFlagString = patchFlag + ` /* ${flagNames} */`;
-      }
-    }
-  }
-  if (directives) {
-    push(helper(WITH_DIRECTIVES) + `(`);
-  }
-  if (isBlock) {
-    push(`(${helper(OPEN_BLOCK)}(${disableTracking ? `true` : ``}), `);
-  }
-  if (pure) {
-    push(PURE_ANNOTATION);
-  }
-  const callHelper = isBlock ? getVNodeBlockHelper(context.inSSR, isComponent) : getVNodeHelper(context.inSSR, isComponent);
-  push(helper(callHelper) + `(`, -2 /* None */, node);
-  genNodeList(
-    genNullableArgs([tag, props, children, patchFlagString, dynamicProps]),
-    context
-  );
-  push(`)`);
-  if (isBlock) {
-    push(`)`);
-  }
-  if (directives) {
-    push(`, `);
-    genNode(directives, context);
-    push(`)`);
-  }
-}
-function genNullableArgs(args) {
-  let i = args.length;
-  while (i--) {
-    if (args[i] != null) break;
-  }
-  return args.slice(0, i + 1).map((arg) => arg || `null`);
-}
-function genCallExpression(node, context) {
-  const { push, helper, pure } = context;
-  const callee = isString(node.callee) ? node.callee : helper(node.callee);
-  if (pure) {
-    push(PURE_ANNOTATION);
-  }
-  push(callee + `(`, -2 /* None */, node);
-  genNodeList(node.arguments, context);
-  push(`)`);
-}
-function genObjectExpression(node, context) {
-  const { push, indent, deindent, newline } = context;
-  const { properties } = node;
-  if (!properties.length) {
-    push(`{}`, -2 /* None */, node);
-    return;
-  }
-  const multilines = properties.length > 1 || properties.some((p) => p.value.type !== 4);
-  push(multilines ? `{` : `{ `);
-  multilines && indent();
-  for (let i = 0; i < properties.length; i++) {
-    const { key, value } = properties[i];
-    genExpressionAsPropertyKey(key, context);
-    push(`: `);
-    genNode(value, context);
-    if (i < properties.length - 1) {
-      push(`,`);
-      newline();
-    }
-  }
-  multilines && deindent();
-  push(multilines ? `}` : ` }`);
-}
-function genArrayExpression(node, context) {
-  genNodeListAsArray(node.elements, context);
-}
-function genFunctionExpression(node, context) {
-  const { push, indent, deindent } = context;
-  const { params, returns, body, newline, isSlot } = node;
-  if (isSlot) {
-    push(`_${helperNameMap[WITH_CTX]}(`);
-  }
-  push(`(`, -2 /* None */, node);
-  if (isArray(params)) {
-    genNodeList(params, context);
-  } else if (params) {
-    genNode(params, context);
-  }
-  push(`) => `);
-  if (newline || body) {
-    push(`{`);
-    indent();
-  }
-  if (returns) {
-    if (newline) {
-      push(`return `);
-    }
-    if (isArray(returns)) {
-      genNodeListAsArray(returns, context);
-    } else {
-      genNode(returns, context);
-    }
-  } else if (body) {
-    genNode(body, context);
-  }
-  if (newline || body) {
-    deindent();
-    push(`}`);
-  }
-  if (isSlot) {
-    push(`)`);
-  }
-}
-function genConditionalExpression(node, context) {
-  const { test, consequent, alternate, newline: needNewline } = node;
-  const { push, indent, deindent, newline } = context;
-  if (test.type === 4) {
-    const needsParens = !isSimpleIdentifier(test.content);
-    needsParens && push(`(`);
-    genExpression(test, context);
-    needsParens && push(`)`);
-  } else {
-    push(`(`);
-    genNode(test, context);
-    push(`)`);
-  }
-  needNewline && indent();
-  context.indentLevel++;
-  needNewline || push(` `);
-  push(`? `);
-  genNode(consequent, context);
-  context.indentLevel--;
-  needNewline && newline();
-  needNewline || push(` `);
-  push(`: `);
-  const isNested = alternate.type === 19;
-  if (!isNested) {
-    context.indentLevel++;
-  }
-  genNode(alternate, context);
-  if (!isNested) {
-    context.indentLevel--;
-  }
-  needNewline && deindent(
-    true
-    /* without newline */
-  );
-}
-function genCacheExpression(node, context) {
-  const { push, helper, indent, deindent, newline } = context;
-  const { needPauseTracking, needArraySpread } = node;
-  if (needArraySpread) {
-    push(`[...(`);
-  }
-  push(`_cache[${node.index}] || (`);
-  if (needPauseTracking) {
-    indent();
-    push(`${helper(SET_BLOCK_TRACKING)}(-1),`);
-    newline();
-    push(`(`);
-  }
-  push(`_cache[${node.index}] = `);
-  genNode(node.value, context);
-  if (needPauseTracking) {
-    push(`).cacheIndex = ${node.index},`);
-    newline();
-    push(`${helper(SET_BLOCK_TRACKING)}(1),`);
-    newline();
-    push(`_cache[${node.index}]`);
-    deindent();
-  }
-  push(`)`);
-  if (needArraySpread) {
-    push(`)]`);
-  }
-}
-
-const prohibitedKeywordRE = new RegExp(
-  "\\b" + "arguments,await,break,case,catch,class,const,continue,debugger,default,delete,do,else,export,extends,finally,for,function,if,import,let,new,return,super,switch,throw,try,var,void,while,with,yield".split(",").join("\\b|\\b") + "\\b"
-);
-const stripStringRE = /'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|`(?:[^`\\]|\\.)*\$\{|\}(?:[^`\\]|\\.)*`|`(?:[^`\\]|\\.)*`/g;
-function validateBrowserExpression(node, context, asParams = false, asRawStatements = false) {
-  const exp = node.content;
-  if (!exp.trim()) {
-    return;
-  }
-  try {
-    new Function(
-      asRawStatements ? ` ${exp} ` : `return ${asParams ? `(${exp}) => {}` : `(${exp})`}`
-    );
-  } catch (e) {
-    let message = e.message;
-    const keywordMatch = exp.replace(stripStringRE, "").match(prohibitedKeywordRE);
-    if (keywordMatch) {
-      message = `avoid using JavaScript keyword as property name: "${keywordMatch[0]}"`;
-    }
-    context.onError(
-      createCompilerError(
-        45,
-        node.loc,
-        void 0,
-        message
-      )
-    );
-  }
-}
-
-const transformExpression = (node, context) => {
-  if (node.type === 5) {
-    node.content = processExpression(
-      node.content,
-      context
-    );
-  } else if (node.type === 1) {
-    for (let i = 0; i < node.props.length; i++) {
-      const dir = node.props[i];
-      if (dir.type === 7 && dir.name !== "for") {
-        const exp = dir.exp;
-        const arg = dir.arg;
-        if (exp && exp.type === 4 && !(dir.name === "on" && arg)) {
-          dir.exp = processExpression(
-            exp,
-            context,
-            // slot args must be processed as function params
-            dir.name === "slot"
-          );
-        }
-        if (arg && arg.type === 4 && !arg.isStatic) {
-          dir.arg = processExpression(arg, context);
-        }
-      }
-    }
-  }
-};
-function processExpression(node, context, asParams = false, asRawStatements = false, localVars = Object.create(context.identifiers)) {
-  {
-    {
-      validateBrowserExpression(node, context, asParams, asRawStatements);
-    }
-    return node;
-  }
-}
-
-const transformIf = createStructuralDirectiveTransform(
-  /^(if|else|else-if)$/,
-  (node, dir, context) => {
-    return processIf(node, dir, context, (ifNode, branch, isRoot) => {
-      const siblings = context.parent.children;
-      let i = siblings.indexOf(ifNode);
-      let key = 0;
-      while (i-- >= 0) {
-        const sibling = siblings[i];
-        if (sibling && sibling.type === 9) {
-          key += sibling.branches.length;
-        }
-      }
-      return () => {
-        if (isRoot) {
-          ifNode.codegenNode = createCodegenNodeForBranch(
-            branch,
-            key,
-            context
-          );
-        } else {
-          const parentCondition = getParentCondition(ifNode.codegenNode);
-          parentCondition.alternate = createCodegenNodeForBranch(
-            branch,
-            key + ifNode.branches.length - 1,
-            context
-          );
-        }
-      };
-    });
-  }
-);
-function processIf(node, dir, context, processCodegen) {
-  if (dir.name !== "else" && (!dir.exp || !dir.exp.content.trim())) {
-    const loc = dir.exp ? dir.exp.loc : node.loc;
-    context.onError(
-      createCompilerError(28, dir.loc)
-    );
-    dir.exp = createSimpleExpression(`true`, false, loc);
-  }
-  if (dir.exp) {
-    validateBrowserExpression(dir.exp, context);
-  }
-  if (dir.name === "if") {
-    const branch = createIfBranch(node, dir);
-    const ifNode = {
-      type: 9,
-      loc: cloneLoc(node.loc),
-      branches: [branch]
-    };
-    context.replaceNode(ifNode);
-    if (processCodegen) {
-      return processCodegen(ifNode, branch, true);
-    }
-  } else {
-    const siblings = context.parent.children;
-    const comments = [];
-    let i = siblings.indexOf(node);
-    while (i-- >= -1) {
-      const sibling = siblings[i];
-      if (sibling && sibling.type === 3) {
-        context.removeNode(sibling);
-        comments.unshift(sibling);
-        continue;
-      }
-      if (sibling && sibling.type === 2 && !sibling.content.trim().length) {
-        context.removeNode(sibling);
-        continue;
-      }
-      if (sibling && sibling.type === 9) {
-        if (dir.name === "else-if" && sibling.branches[sibling.branches.length - 1].condition === void 0) {
-          context.onError(
-            createCompilerError(30, node.loc)
-          );
-        }
-        context.removeNode();
-        const branch = createIfBranch(node, dir);
-        if (comments.length && // #3619 ignore comments if the v-if is direct child of <transition>
-        !(context.parent && context.parent.type === 1 && (context.parent.tag === "transition" || context.parent.tag === "Transition"))) {
-          branch.children = [...comments, ...branch.children];
-        }
-        {
-          const key = branch.userKey;
-          if (key) {
-            sibling.branches.forEach(({ userKey }) => {
-              if (isSameKey(userKey, key)) {
-                context.onError(
-                  createCompilerError(
-                    29,
-                    branch.userKey.loc
-                  )
-                );
-              }
-            });
-          }
-        }
-        sibling.branches.push(branch);
-        const onExit = processCodegen && processCodegen(sibling, branch, false);
-        traverseNode(branch, context);
-        if (onExit) onExit();
-        context.currentNode = null;
-      } else {
-        context.onError(
-          createCompilerError(30, node.loc)
-        );
-      }
-      break;
-    }
-  }
-}
-function createIfBranch(node, dir) {
-  const isTemplateIf = node.tagType === 3;
-  return {
-    type: 10,
-    loc: node.loc,
-    condition: dir.name === "else" ? void 0 : dir.exp,
-    children: isTemplateIf && !findDir(node, "for") ? node.children : [node],
-    userKey: findProp(node, `key`),
-    isTemplateIf
-  };
-}
-function createCodegenNodeForBranch(branch, keyIndex, context) {
-  if (branch.condition) {
-    return createConditionalExpression(
-      branch.condition,
-      createChildrenCodegenNode(branch, keyIndex, context),
-      // make sure to pass in asBlock: true so that the comment node call
-      // closes the current block.
-      createCallExpression(context.helper(CREATE_COMMENT), [
-        '"v-if"' ,
-        "true"
-      ])
-    );
-  } else {
-    return createChildrenCodegenNode(branch, keyIndex, context);
-  }
-}
-function createChildrenCodegenNode(branch, keyIndex, context) {
-  const { helper } = context;
-  const keyProperty = createObjectProperty(
-    `key`,
-    createSimpleExpression(
-      `${keyIndex}`,
-      false,
-      locStub,
-      2
-    )
-  );
-  const { children } = branch;
-  const firstChild = children[0];
-  const needFragmentWrapper = children.length !== 1 || firstChild.type !== 1;
-  if (needFragmentWrapper) {
-    if (children.length === 1 && firstChild.type === 11) {
-      const vnodeCall = firstChild.codegenNode;
-      injectProp(vnodeCall, keyProperty, context);
-      return vnodeCall;
-    } else {
-      let patchFlag = 64;
-      if (!branch.isTemplateIf && children.filter((c) => c.type !== 3).length === 1) {
-        patchFlag |= 2048;
-      }
-      return createVNodeCall(
-        context,
-        helper(FRAGMENT),
-        createObjectExpression([keyProperty]),
-        children,
-        patchFlag,
-        void 0,
-        void 0,
-        true,
-        false,
-        false,
-        branch.loc
-      );
-    }
-  } else {
-    const ret = firstChild.codegenNode;
-    const vnodeCall = getMemoedVNodeCall(ret);
-    if (vnodeCall.type === 13) {
-      convertToBlock(vnodeCall, context);
-    }
-    injectProp(vnodeCall, keyProperty, context);
-    return ret;
-  }
-}
-function isSameKey(a, b) {
-  if (!a || a.type !== b.type) {
-    return false;
-  }
-  if (a.type === 6) {
-    if (a.value.content !== b.value.content) {
-      return false;
-    }
-  } else {
-    const exp = a.exp;
-    const branchExp = b.exp;
-    if (exp.type !== branchExp.type) {
-      return false;
-    }
-    if (exp.type !== 4 || exp.isStatic !== branchExp.isStatic || exp.content !== branchExp.content) {
-      return false;
-    }
-  }
-  return true;
-}
-function getParentCondition(node) {
-  while (true) {
-    if (node.type === 19) {
-      if (node.alternate.type === 19) {
-        node = node.alternate;
-      } else {
-        return node;
-      }
-    } else if (node.type === 20) {
-      node = node.value;
-    }
-  }
-}
-
-const transformBind = (dir, _node, context) => {
-  const { modifiers, loc } = dir;
-  const arg = dir.arg;
-  let { exp } = dir;
-  if (exp && exp.type === 4 && !exp.content.trim()) {
-    {
-      exp = void 0;
-    }
-  }
-  if (!exp) {
-    if (arg.type !== 4 || !arg.isStatic) {
-      context.onError(
-        createCompilerError(
-          52,
-          arg.loc
-        )
-      );
-      return {
-        props: [
-          createObjectProperty(arg, createSimpleExpression("", true, loc))
-        ]
-      };
-    }
-    transformBindShorthand(dir);
-    exp = dir.exp;
-  }
-  if (arg.type !== 4) {
-    arg.children.unshift(`(`);
-    arg.children.push(`) || ""`);
-  } else if (!arg.isStatic) {
-    arg.content = `${arg.content} || ""`;
-  }
-  if (modifiers.some((mod) => mod.content === "camel")) {
-    if (arg.type === 4) {
-      if (arg.isStatic) {
-        arg.content = camelize(arg.content);
-      } else {
-        arg.content = `${context.helperString(CAMELIZE)}(${arg.content})`;
-      }
-    } else {
-      arg.children.unshift(`${context.helperString(CAMELIZE)}(`);
-      arg.children.push(`)`);
-    }
-  }
-  if (!context.inSSR) {
-    if (modifiers.some((mod) => mod.content === "prop")) {
-      injectPrefix(arg, ".");
-    }
-    if (modifiers.some((mod) => mod.content === "attr")) {
-      injectPrefix(arg, "^");
-    }
-  }
-  return {
-    props: [createObjectProperty(arg, exp)]
-  };
-};
-const transformBindShorthand = (dir, context) => {
-  const arg = dir.arg;
-  const propName = camelize(arg.content);
-  dir.exp = createSimpleExpression(propName, false, arg.loc);
-};
-const injectPrefix = (arg, prefix) => {
-  if (arg.type === 4) {
-    if (arg.isStatic) {
-      arg.content = prefix + arg.content;
-    } else {
-      arg.content = `\`${prefix}\${${arg.content}}\``;
-    }
-  } else {
-    arg.children.unshift(`'${prefix}' + (`);
-    arg.children.push(`)`);
-  }
-};
-
-const transformFor = createStructuralDirectiveTransform(
-  "for",
-  (node, dir, context) => {
-    const { helper, removeHelper } = context;
-    return processFor(node, dir, context, (forNode) => {
-      const renderExp = createCallExpression(helper(RENDER_LIST), [
-        forNode.source
-      ]);
-      const isTemplate = isTemplateNode(node);
-      const memo = findDir(node, "memo");
-      const keyProp = findProp(node, `key`, false, true);
-      if (keyProp && keyProp.type === 7 && !keyProp.exp) {
-        transformBindShorthand(keyProp);
-      }
-      const keyExp = keyProp && (keyProp.type === 6 ? keyProp.value ? createSimpleExpression(keyProp.value.content, true) : void 0 : keyProp.exp);
-      const keyProperty = keyProp && keyExp ? createObjectProperty(`key`, keyExp) : null;
-      const isStableFragment = forNode.source.type === 4 && forNode.source.constType > 0;
-      const fragmentFlag = isStableFragment ? 64 : keyProp ? 128 : 256;
-      forNode.codegenNode = createVNodeCall(
-        context,
-        helper(FRAGMENT),
-        void 0,
-        renderExp,
-        fragmentFlag,
-        void 0,
-        void 0,
-        true,
-        !isStableFragment,
-        false,
-        node.loc
-      );
-      return () => {
-        let childBlock;
-        const { children } = forNode;
-        if (isTemplate) {
-          node.children.some((c) => {
-            if (c.type === 1) {
-              const key = findProp(c, "key");
-              if (key) {
-                context.onError(
-                  createCompilerError(
-                    33,
-                    key.loc
-                  )
-                );
-                return true;
-              }
-            }
-          });
-        }
-        const needFragmentWrapper = children.length !== 1 || children[0].type !== 1;
-        const slotOutlet = isSlotOutlet(node) ? node : isTemplate && node.children.length === 1 && isSlotOutlet(node.children[0]) ? node.children[0] : null;
-        if (slotOutlet) {
-          childBlock = slotOutlet.codegenNode;
-          if (isTemplate && keyProperty) {
-            injectProp(childBlock, keyProperty, context);
-          }
-        } else if (needFragmentWrapper) {
-          childBlock = createVNodeCall(
-            context,
-            helper(FRAGMENT),
-            keyProperty ? createObjectExpression([keyProperty]) : void 0,
-            node.children,
-            64,
-            void 0,
-            void 0,
-            true,
-            void 0,
-            false
-          );
-        } else {
-          childBlock = children[0].codegenNode;
-          if (isTemplate && keyProperty) {
-            injectProp(childBlock, keyProperty, context);
-          }
-          if (childBlock.isBlock !== !isStableFragment) {
-            if (childBlock.isBlock) {
-              removeHelper(OPEN_BLOCK);
-              removeHelper(
-                getVNodeBlockHelper(context.inSSR, childBlock.isComponent)
-              );
-            } else {
-              removeHelper(
-                getVNodeHelper(context.inSSR, childBlock.isComponent)
-              );
-            }
-          }
-          childBlock.isBlock = !isStableFragment;
-          if (childBlock.isBlock) {
-            helper(OPEN_BLOCK);
-            helper(getVNodeBlockHelper(context.inSSR, childBlock.isComponent));
-          } else {
-            helper(getVNodeHelper(context.inSSR, childBlock.isComponent));
-          }
-        }
-        if (memo) {
-          const loop = createFunctionExpression(
-            createForLoopParams(forNode.parseResult, [
-              createSimpleExpression(`_cached`)
-            ])
-          );
-          loop.body = createBlockStatement([
-            createCompoundExpression([`const _memo = (`, memo.exp, `)`]),
-            createCompoundExpression([
-              `if (_cached`,
-              ...keyExp ? [` && _cached.key === `, keyExp] : [],
-              ` && ${context.helperString(
-                IS_MEMO_SAME
-              )}(_cached, _memo)) return _cached`
-            ]),
-            createCompoundExpression([`const _item = `, childBlock]),
-            createSimpleExpression(`_item.memo = _memo`),
-            createSimpleExpression(`return _item`)
-          ]);
-          renderExp.arguments.push(
-            loop,
-            createSimpleExpression(`_cache`),
-            createSimpleExpression(String(context.cached.length))
-          );
-          context.cached.push(null);
-        } else {
-          renderExp.arguments.push(
-            createFunctionExpression(
-              createForLoopParams(forNode.parseResult),
-              childBlock,
-              true
-            )
-          );
-        }
-      };
-    });
-  }
-);
-function processFor(node, dir, context, processCodegen) {
-  if (!dir.exp) {
-    context.onError(
-      createCompilerError(31, dir.loc)
-    );
-    return;
-  }
-  const parseResult = dir.forParseResult;
-  if (!parseResult) {
-    context.onError(
-      createCompilerError(32, dir.loc)
-    );
-    return;
-  }
-  finalizeForParseResult(parseResult, context);
-  const { addIdentifiers, removeIdentifiers, scopes } = context;
-  const { source, value, key, index } = parseResult;
-  const forNode = {
-    type: 11,
-    loc: dir.loc,
-    source,
-    valueAlias: value,
-    keyAlias: key,
-    objectIndexAlias: index,
-    parseResult,
-    children: isTemplateNode(node) ? node.children : [node]
-  };
-  context.replaceNode(forNode);
-  scopes.vFor++;
-  const onExit = processCodegen && processCodegen(forNode);
-  return () => {
-    scopes.vFor--;
-    if (onExit) onExit();
-  };
-}
-function finalizeForParseResult(result, context) {
-  if (result.finalized) return;
-  {
-    validateBrowserExpression(result.source, context);
-    if (result.key) {
-      validateBrowserExpression(
-        result.key,
-        context,
-        true
-      );
-    }
-    if (result.index) {
-      validateBrowserExpression(
-        result.index,
-        context,
-        true
-      );
-    }
-    if (result.value) {
-      validateBrowserExpression(
-        result.value,
-        context,
-        true
-      );
-    }
-  }
-  result.finalized = true;
-}
-function createForLoopParams({ value, key, index }, memoArgs = []) {
-  return createParamsList([value, key, index, ...memoArgs]);
-}
-function createParamsList(args) {
-  let i = args.length;
-  while (i--) {
-    if (args[i]) break;
-  }
-  return args.slice(0, i + 1).map((arg, i2) => arg || createSimpleExpression(`_`.repeat(i2 + 1), false));
-}
-
-const defaultFallback = createSimpleExpression(`undefined`, false);
-const trackSlotScopes = (node, context) => {
-  if (node.type === 1 && (node.tagType === 1 || node.tagType === 3)) {
-    const vSlot = findDir(node, "slot");
-    if (vSlot) {
-      vSlot.exp;
-      context.scopes.vSlot++;
-      return () => {
-        context.scopes.vSlot--;
-      };
-    }
-  }
-};
-const buildClientSlotFn = (props, _vForExp, children, loc) => createFunctionExpression(
-  props,
-  children,
-  false,
-  true,
-  children.length ? children[0].loc : loc
-);
-function buildSlots(node, context, buildSlotFn = buildClientSlotFn) {
-  context.helper(WITH_CTX);
-  const { children, loc } = node;
-  const slotsProperties = [];
-  const dynamicSlots = [];
-  let hasDynamicSlots = context.scopes.vSlot > 0 || context.scopes.vFor > 0;
-  const onComponentSlot = findDir(node, "slot", true);
-  if (onComponentSlot) {
-    const { arg, exp } = onComponentSlot;
-    if (arg && !isStaticExp(arg)) {
-      hasDynamicSlots = true;
-    }
-    slotsProperties.push(
-      createObjectProperty(
-        arg || createSimpleExpression("default", true),
-        buildSlotFn(exp, void 0, children, loc)
-      )
-    );
-  }
-  let hasTemplateSlots = false;
-  let hasNamedDefaultSlot = false;
-  const implicitDefaultChildren = [];
-  const seenSlotNames = /* @__PURE__ */ new Set();
-  let conditionalBranchIndex = 0;
-  for (let i = 0; i < children.length; i++) {
-    const slotElement = children[i];
-    let slotDir;
-    if (!isTemplateNode(slotElement) || !(slotDir = findDir(slotElement, "slot", true))) {
-      if (slotElement.type !== 3) {
-        implicitDefaultChildren.push(slotElement);
-      }
-      continue;
-    }
-    if (onComponentSlot) {
-      context.onError(
-        createCompilerError(37, slotDir.loc)
-      );
-      break;
-    }
-    hasTemplateSlots = true;
-    const { children: slotChildren, loc: slotLoc } = slotElement;
-    const {
-      arg: slotName = createSimpleExpression(`default`, true),
-      exp: slotProps,
-      loc: dirLoc
-    } = slotDir;
-    let staticSlotName;
-    if (isStaticExp(slotName)) {
-      staticSlotName = slotName ? slotName.content : `default`;
-    } else {
-      hasDynamicSlots = true;
-    }
-    const vFor = findDir(slotElement, "for");
-    const slotFunction = buildSlotFn(slotProps, vFor, slotChildren, slotLoc);
-    let vIf;
-    let vElse;
-    if (vIf = findDir(slotElement, "if")) {
-      hasDynamicSlots = true;
-      dynamicSlots.push(
-        createConditionalExpression(
-          vIf.exp,
-          buildDynamicSlot(slotName, slotFunction, conditionalBranchIndex++),
-          defaultFallback
-        )
-      );
-    } else if (vElse = findDir(
-      slotElement,
-      /^else(-if)?$/,
-      true
-      /* allowEmpty */
-    )) {
-      let j = i;
-      let prev;
-      while (j--) {
-        prev = children[j];
-        if (prev.type !== 3) {
-          break;
-        }
-      }
-      if (prev && isTemplateNode(prev) && findDir(prev, /^(else-)?if$/)) {
-        let conditional = dynamicSlots[dynamicSlots.length - 1];
-        while (conditional.alternate.type === 19) {
-          conditional = conditional.alternate;
-        }
-        conditional.alternate = vElse.exp ? createConditionalExpression(
-          vElse.exp,
-          buildDynamicSlot(
-            slotName,
-            slotFunction,
-            conditionalBranchIndex++
-          ),
-          defaultFallback
-        ) : buildDynamicSlot(slotName, slotFunction, conditionalBranchIndex++);
-      } else {
-        context.onError(
-          createCompilerError(30, vElse.loc)
-        );
-      }
-    } else if (vFor) {
-      hasDynamicSlots = true;
-      const parseResult = vFor.forParseResult;
-      if (parseResult) {
-        finalizeForParseResult(parseResult, context);
-        dynamicSlots.push(
-          createCallExpression(context.helper(RENDER_LIST), [
-            parseResult.source,
-            createFunctionExpression(
-              createForLoopParams(parseResult),
-              buildDynamicSlot(slotName, slotFunction),
-              true
-            )
-          ])
-        );
-      } else {
-        context.onError(
-          createCompilerError(
-            32,
-            vFor.loc
-          )
-        );
-      }
-    } else {
-      if (staticSlotName) {
-        if (seenSlotNames.has(staticSlotName)) {
-          context.onError(
-            createCompilerError(
-              38,
-              dirLoc
-            )
-          );
-          continue;
-        }
-        seenSlotNames.add(staticSlotName);
-        if (staticSlotName === "default") {
-          hasNamedDefaultSlot = true;
-        }
-      }
-      slotsProperties.push(createObjectProperty(slotName, slotFunction));
-    }
-  }
-  if (!onComponentSlot) {
-    const buildDefaultSlotProperty = (props, children2) => {
-      const fn = buildSlotFn(props, void 0, children2, loc);
-      return createObjectProperty(`default`, fn);
-    };
-    if (!hasTemplateSlots) {
-      slotsProperties.push(buildDefaultSlotProperty(void 0, children));
-    } else if (implicitDefaultChildren.length && // #3766
-    // with whitespace: 'preserve', whitespaces between slots will end up in
-    // implicitDefaultChildren. Ignore if all implicit children are whitespaces.
-    implicitDefaultChildren.some((node2) => isNonWhitespaceContent(node2))) {
-      if (hasNamedDefaultSlot) {
-        context.onError(
-          createCompilerError(
-            39,
-            implicitDefaultChildren[0].loc
-          )
-        );
-      } else {
-        slotsProperties.push(
-          buildDefaultSlotProperty(void 0, implicitDefaultChildren)
-        );
-      }
-    }
-  }
-  const slotFlag = hasDynamicSlots ? 2 : hasForwardedSlots(node.children) ? 3 : 1;
-  let slots = createObjectExpression(
-    slotsProperties.concat(
-      createObjectProperty(
-        `_`,
-        // 2 = compiled but dynamic = can skip normalization, but must run diff
-        // 1 = compiled and static = can skip normalization AND diff as optimized
-        createSimpleExpression(
-          slotFlag + (` /* ${slotFlagsText[slotFlag]} */` ),
-          false
-        )
-      )
-    ),
-    loc
-  );
-  if (dynamicSlots.length) {
-    slots = createCallExpression(context.helper(CREATE_SLOTS), [
-      slots,
-      createArrayExpression(dynamicSlots)
-    ]);
-  }
-  return {
-    slots,
-    hasDynamicSlots
-  };
-}
-function buildDynamicSlot(name, fn, index) {
-  const props = [
-    createObjectProperty(`name`, name),
-    createObjectProperty(`fn`, fn)
-  ];
-  if (index != null) {
-    props.push(
-      createObjectProperty(`key`, createSimpleExpression(String(index), true))
-    );
-  }
-  return createObjectExpression(props);
-}
-function hasForwardedSlots(children) {
-  for (let i = 0; i < children.length; i++) {
-    const child = children[i];
-    switch (child.type) {
-      case 1:
-        if (child.tagType === 2 || hasForwardedSlots(child.children)) {
-          return true;
-        }
-        break;
-      case 9:
-        if (hasForwardedSlots(child.branches)) return true;
-        break;
-      case 10:
-      case 11:
-        if (hasForwardedSlots(child.children)) return true;
-        break;
-    }
-  }
-  return false;
-}
-function isNonWhitespaceContent(node) {
-  if (node.type !== 2 && node.type !== 12)
-    return true;
-  return node.type === 2 ? !!node.content.trim() : isNonWhitespaceContent(node.content);
-}
-
-const directiveImportMap = /* @__PURE__ */ new WeakMap();
-const transformElement = (node, context) => {
-  return function postTransformElement() {
-    node = context.currentNode;
-    if (!(node.type === 1 && (node.tagType === 0 || node.tagType === 1))) {
-      return;
-    }
-    const { tag, props } = node;
-    const isComponent = node.tagType === 1;
-    let vnodeTag = isComponent ? resolveComponentType(node, context) : `"${tag}"`;
-    const isDynamicComponent = isObject(vnodeTag) && vnodeTag.callee === RESOLVE_DYNAMIC_COMPONENT;
-    let vnodeProps;
-    let vnodeChildren;
-    let patchFlag = 0;
-    let vnodeDynamicProps;
-    let dynamicPropNames;
-    let vnodeDirectives;
-    let shouldUseBlock = (
-      // dynamic component may resolve to plain elements
-      isDynamicComponent || vnodeTag === TELEPORT || vnodeTag === SUSPENSE || !isComponent && // <svg> and <foreignObject> must be forced into blocks so that block
-      // updates inside get proper isSVG flag at runtime. (#639, #643)
-      // This is technically web-specific, but splitting the logic out of core
-      // leads to too much unnecessary complexity.
-      (tag === "svg" || tag === "foreignObject" || tag === "math")
-    );
-    if (props.length > 0) {
-      const propsBuildResult = buildProps(
-        node,
-        context,
-        void 0,
-        isComponent,
-        isDynamicComponent
-      );
-      vnodeProps = propsBuildResult.props;
-      patchFlag = propsBuildResult.patchFlag;
-      dynamicPropNames = propsBuildResult.dynamicPropNames;
-      const directives = propsBuildResult.directives;
-      vnodeDirectives = directives && directives.length ? createArrayExpression(
-        directives.map((dir) => buildDirectiveArgs(dir, context))
-      ) : void 0;
-      if (propsBuildResult.shouldUseBlock) {
-        shouldUseBlock = true;
-      }
-    }
-    if (node.children.length > 0) {
-      if (vnodeTag === KEEP_ALIVE) {
-        shouldUseBlock = true;
-        patchFlag |= 1024;
-        if (node.children.length > 1) {
-          context.onError(
-            createCompilerError(46, {
-              start: node.children[0].loc.start,
-              end: node.children[node.children.length - 1].loc.end,
-              source: ""
-            })
-          );
-        }
-      }
-      const shouldBuildAsSlots = isComponent && // Teleport is not a real component and has dedicated runtime handling
-      vnodeTag !== TELEPORT && // explained above.
-      vnodeTag !== KEEP_ALIVE;
-      if (shouldBuildAsSlots) {
-        const { slots, hasDynamicSlots } = buildSlots(node, context);
-        vnodeChildren = slots;
-        if (hasDynamicSlots) {
-          patchFlag |= 1024;
-        }
-      } else if (node.children.length === 1 && vnodeTag !== TELEPORT) {
-        const child = node.children[0];
-        const type = child.type;
-        const hasDynamicTextChild = type === 5 || type === 8;
-        if (hasDynamicTextChild && getConstantType(child, context) === 0) {
-          patchFlag |= 1;
-        }
-        if (hasDynamicTextChild || type === 2) {
-          vnodeChildren = child;
-        } else {
-          vnodeChildren = node.children;
-        }
-      } else {
-        vnodeChildren = node.children;
-      }
-    }
-    if (dynamicPropNames && dynamicPropNames.length) {
-      vnodeDynamicProps = stringifyDynamicPropNames(dynamicPropNames);
-    }
-    node.codegenNode = createVNodeCall(
-      context,
-      vnodeTag,
-      vnodeProps,
-      vnodeChildren,
-      patchFlag === 0 ? void 0 : patchFlag,
-      vnodeDynamicProps,
-      vnodeDirectives,
-      !!shouldUseBlock,
-      false,
-      isComponent,
-      node.loc
-    );
-  };
-};
-function resolveComponentType(node, context, ssr = false) {
-  let { tag } = node;
-  const isExplicitDynamic = isComponentTag(tag);
-  const isProp = findProp(
-    node,
-    "is",
-    false,
-    true
-    /* allow empty */
-  );
-  if (isProp) {
-    if (isExplicitDynamic || false) {
-      let exp;
-      if (isProp.type === 6) {
-        exp = isProp.value && createSimpleExpression(isProp.value.content, true);
-      } else {
-        exp = isProp.exp;
-        if (!exp) {
-          exp = createSimpleExpression(`is`, false, isProp.arg.loc);
-        }
-      }
-      if (exp) {
-        return createCallExpression(context.helper(RESOLVE_DYNAMIC_COMPONENT), [
-          exp
-        ]);
-      }
-    } else if (isProp.type === 6 && isProp.value.content.startsWith("vue:")) {
-      tag = isProp.value.content.slice(4);
-    }
-  }
-  const builtIn = isCoreComponent(tag) || context.isBuiltInComponent(tag);
-  if (builtIn) {
-    if (!ssr) context.helper(builtIn);
-    return builtIn;
-  }
-  context.helper(RESOLVE_COMPONENT);
-  context.components.add(tag);
-  return toValidAssetId(tag, `component`);
-}
-function buildProps(node, context, props = node.props, isComponent, isDynamicComponent, ssr = false) {
-  const { tag, loc: elementLoc, children } = node;
-  let properties = [];
-  const mergeArgs = [];
-  const runtimeDirectives = [];
-  const hasChildren = children.length > 0;
-  let shouldUseBlock = false;
-  let patchFlag = 0;
-  let hasRef = false;
-  let hasClassBinding = false;
-  let hasStyleBinding = false;
-  let hasHydrationEventBinding = false;
-  let hasDynamicKeys = false;
-  let hasVnodeHook = false;
-  const dynamicPropNames = [];
-  const pushMergeArg = (arg) => {
-    if (properties.length) {
-      mergeArgs.push(
-        createObjectExpression(dedupeProperties(properties), elementLoc)
-      );
-      properties = [];
-    }
-    if (arg) mergeArgs.push(arg);
-  };
-  const pushRefVForMarker = () => {
-    if (context.scopes.vFor > 0) {
-      properties.push(
-        createObjectProperty(
-          createSimpleExpression("ref_for", true),
-          createSimpleExpression("true")
-        )
-      );
-    }
-  };
-  const analyzePatchFlag = ({ key, value }) => {
-    if (isStaticExp(key)) {
-      const name = key.content;
-      const isEventHandler = isOn(name);
-      if (isEventHandler && (!isComponent || isDynamicComponent) && // omit the flag for click handlers because hydration gives click
-      // dedicated fast path.
-      name.toLowerCase() !== "onclick" && // omit v-model handlers
-      name !== "onUpdate:modelValue" && // omit onVnodeXXX hooks
-      !isReservedProp(name)) {
-        hasHydrationEventBinding = true;
-      }
-      if (isEventHandler && isReservedProp(name)) {
-        hasVnodeHook = true;
-      }
-      if (isEventHandler && value.type === 14) {
-        value = value.arguments[0];
-      }
-      if (value.type === 20 || (value.type === 4 || value.type === 8) && getConstantType(value, context) > 0) {
-        return;
-      }
-      if (name === "ref") {
-        hasRef = true;
-      } else if (name === "class") {
-        hasClassBinding = true;
-      } else if (name === "style") {
-        hasStyleBinding = true;
-      } else if (name !== "key" && !dynamicPropNames.includes(name)) {
-        dynamicPropNames.push(name);
-      }
-      if (isComponent && (name === "class" || name === "style") && !dynamicPropNames.includes(name)) {
-        dynamicPropNames.push(name);
-      }
-    } else {
-      hasDynamicKeys = true;
-    }
-  };
-  for (let i = 0; i < props.length; i++) {
-    const prop = props[i];
-    if (prop.type === 6) {
-      const { loc, name, nameLoc, value } = prop;
-      let isStatic = true;
-      if (name === "ref") {
-        hasRef = true;
-        pushRefVForMarker();
-      }
-      if (name === "is" && (isComponentTag(tag) || value && value.content.startsWith("vue:") || false)) {
-        continue;
-      }
-      properties.push(
-        createObjectProperty(
-          createSimpleExpression(name, true, nameLoc),
-          createSimpleExpression(
-            value ? value.content : "",
-            isStatic,
-            value ? value.loc : loc
-          )
-        )
-      );
-    } else {
-      const { name, arg, exp, loc, modifiers } = prop;
-      const isVBind = name === "bind";
-      const isVOn = name === "on";
-      if (name === "slot") {
-        if (!isComponent) {
-          context.onError(
-            createCompilerError(40, loc)
-          );
-        }
-        continue;
-      }
-      if (name === "once" || name === "memo") {
-        continue;
-      }
-      if (name === "is" || isVBind && isStaticArgOf(arg, "is") && (isComponentTag(tag) || false)) {
-        continue;
-      }
-      if (isVOn && ssr) {
-        continue;
-      }
-      if (
-        // #938: elements with dynamic keys should be forced into blocks
-        isVBind && isStaticArgOf(arg, "key") || // inline before-update hooks need to force block so that it is invoked
-        // before children
-        isVOn && hasChildren && isStaticArgOf(arg, "vue:before-update")
-      ) {
-        shouldUseBlock = true;
-      }
-      if (isVBind && isStaticArgOf(arg, "ref")) {
-        pushRefVForMarker();
-      }
-      if (!arg && (isVBind || isVOn)) {
-        hasDynamicKeys = true;
-        if (exp) {
-          if (isVBind) {
-            pushRefVForMarker();
-            pushMergeArg();
-            mergeArgs.push(exp);
-          } else {
-            pushMergeArg({
-              type: 14,
-              loc,
-              callee: context.helper(TO_HANDLERS),
-              arguments: isComponent ? [exp] : [exp, `true`]
-            });
-          }
-        } else {
-          context.onError(
-            createCompilerError(
-              isVBind ? 34 : 35,
-              loc
-            )
-          );
-        }
-        continue;
-      }
-      if (isVBind && modifiers.some((mod) => mod.content === "prop")) {
-        patchFlag |= 32;
-      }
-      const directiveTransform = context.directiveTransforms[name];
-      if (directiveTransform) {
-        const { props: props2, needRuntime } = directiveTransform(prop, node, context);
-        !ssr && props2.forEach(analyzePatchFlag);
-        if (isVOn && arg && !isStaticExp(arg)) {
-          pushMergeArg(createObjectExpression(props2, elementLoc));
-        } else {
-          properties.push(...props2);
-        }
-        if (needRuntime) {
-          runtimeDirectives.push(prop);
-          if (isSymbol(needRuntime)) {
-            directiveImportMap.set(prop, needRuntime);
-          }
-        }
-      } else if (!isBuiltInDirective(name)) {
-        runtimeDirectives.push(prop);
-        if (hasChildren) {
-          shouldUseBlock = true;
-        }
-      }
-    }
-  }
-  let propsExpression = void 0;
-  if (mergeArgs.length) {
-    pushMergeArg();
-    if (mergeArgs.length > 1) {
-      propsExpression = createCallExpression(
-        context.helper(MERGE_PROPS),
-        mergeArgs,
-        elementLoc
-      );
-    } else {
-      propsExpression = mergeArgs[0];
-    }
-  } else if (properties.length) {
-    propsExpression = createObjectExpression(
-      dedupeProperties(properties),
-      elementLoc
-    );
-  }
-  if (hasDynamicKeys) {
-    patchFlag |= 16;
-  } else {
-    if (hasClassBinding && !isComponent) {
-      patchFlag |= 2;
-    }
-    if (hasStyleBinding && !isComponent) {
-      patchFlag |= 4;
-    }
-    if (dynamicPropNames.length) {
-      patchFlag |= 8;
-    }
-    if (hasHydrationEventBinding) {
-      patchFlag |= 32;
-    }
-  }
-  if (!shouldUseBlock && (patchFlag === 0 || patchFlag === 32) && (hasRef || hasVnodeHook || runtimeDirectives.length > 0)) {
-    patchFlag |= 512;
-  }
-  if (!context.inSSR && propsExpression) {
-    switch (propsExpression.type) {
-      case 15:
-        let classKeyIndex = -1;
-        let styleKeyIndex = -1;
-        let hasDynamicKey = false;
-        for (let i = 0; i < propsExpression.properties.length; i++) {
-          const key = propsExpression.properties[i].key;
-          if (isStaticExp(key)) {
-            if (key.content === "class") {
-              classKeyIndex = i;
-            } else if (key.content === "style") {
-              styleKeyIndex = i;
-            }
-          } else if (!key.isHandlerKey) {
-            hasDynamicKey = true;
-          }
-        }
-        const classProp = propsExpression.properties[classKeyIndex];
-        const styleProp = propsExpression.properties[styleKeyIndex];
-        if (!hasDynamicKey) {
-          if (classProp && !isStaticExp(classProp.value)) {
-            classProp.value = createCallExpression(
-              context.helper(NORMALIZE_CLASS),
-              [classProp.value]
-            );
-          }
-          if (styleProp && // the static style is compiled into an object,
-          // so use `hasStyleBinding` to ensure that it is a dynamic style binding
-          (hasStyleBinding || styleProp.value.type === 4 && styleProp.value.content.trim()[0] === `[` || // v-bind:style and style both exist,
-          // v-bind:style with static literal object
-          styleProp.value.type === 17)) {
-            styleProp.value = createCallExpression(
-              context.helper(NORMALIZE_STYLE),
-              [styleProp.value]
-            );
-          }
-        } else {
-          propsExpression = createCallExpression(
-            context.helper(NORMALIZE_PROPS),
-            [propsExpression]
-          );
-        }
-        break;
-      case 14:
-        break;
-      default:
-        propsExpression = createCallExpression(
-          context.helper(NORMALIZE_PROPS),
-          [
-            createCallExpression(context.helper(GUARD_REACTIVE_PROPS), [
-              propsExpression
-            ])
-          ]
-        );
-        break;
-    }
-  }
-  return {
-    props: propsExpression,
-    directives: runtimeDirectives,
-    patchFlag,
-    dynamicPropNames,
-    shouldUseBlock
-  };
-}
-function dedupeProperties(properties) {
-  const knownProps = /* @__PURE__ */ new Map();
-  const deduped = [];
-  for (let i = 0; i < properties.length; i++) {
-    const prop = properties[i];
-    if (prop.key.type === 8 || !prop.key.isStatic) {
-      deduped.push(prop);
-      continue;
-    }
-    const name = prop.key.content;
-    const existing = knownProps.get(name);
-    if (existing) {
-      if (name === "style" || name === "class" || isOn(name)) {
-        mergeAsArray(existing, prop);
-      }
-    } else {
-      knownProps.set(name, prop);
-      deduped.push(prop);
-    }
-  }
-  return deduped;
-}
-function mergeAsArray(existing, incoming) {
-  if (existing.value.type === 17) {
-    existing.value.elements.push(incoming.value);
-  } else {
-    existing.value = createArrayExpression(
-      [existing.value, incoming.value],
-      existing.loc
-    );
-  }
-}
-function buildDirectiveArgs(dir, context) {
-  const dirArgs = [];
-  const runtime = directiveImportMap.get(dir);
-  if (runtime) {
-    dirArgs.push(context.helperString(runtime));
-  } else {
-    {
-      context.helper(RESOLVE_DIRECTIVE);
-      context.directives.add(dir.name);
-      dirArgs.push(toValidAssetId(dir.name, `directive`));
-    }
-  }
-  const { loc } = dir;
-  if (dir.exp) dirArgs.push(dir.exp);
-  if (dir.arg) {
-    if (!dir.exp) {
-      dirArgs.push(`void 0`);
-    }
-    dirArgs.push(dir.arg);
-  }
-  if (Object.keys(dir.modifiers).length) {
-    if (!dir.arg) {
-      if (!dir.exp) {
-        dirArgs.push(`void 0`);
-      }
-      dirArgs.push(`void 0`);
-    }
-    const trueExpression = createSimpleExpression(`true`, false, loc);
-    dirArgs.push(
-      createObjectExpression(
-        dir.modifiers.map(
-          (modifier) => createObjectProperty(modifier, trueExpression)
-        ),
-        loc
-      )
-    );
-  }
-  return createArrayExpression(dirArgs, dir.loc);
-}
-function stringifyDynamicPropNames(props) {
-  let propsNamesString = `[`;
-  for (let i = 0, l = props.length; i < l; i++) {
-    propsNamesString += JSON.stringify(props[i]);
-    if (i < l - 1) propsNamesString += ", ";
-  }
-  return propsNamesString + `]`;
-}
-function isComponentTag(tag) {
-  return tag === "component" || tag === "Component";
-}
-
-const transformSlotOutlet = (node, context) => {
-  if (isSlotOutlet(node)) {
-    const { children, loc } = node;
-    const { slotName, slotProps } = processSlotOutlet(node, context);
-    const slotArgs = [
-      context.prefixIdentifiers ? `_ctx.$slots` : `$slots`,
-      slotName,
-      "{}",
-      "undefined",
-      "true"
-    ];
-    let expectedLen = 2;
-    if (slotProps) {
-      slotArgs[2] = slotProps;
-      expectedLen = 3;
-    }
-    if (children.length) {
-      slotArgs[3] = createFunctionExpression([], children, false, false, loc);
-      expectedLen = 4;
-    }
-    if (context.scopeId && !context.slotted) {
-      expectedLen = 5;
-    }
-    slotArgs.splice(expectedLen);
-    node.codegenNode = createCallExpression(
-      context.helper(RENDER_SLOT),
-      slotArgs,
-      loc
-    );
-  }
-};
-function processSlotOutlet(node, context) {
-  let slotName = `"default"`;
-  let slotProps = void 0;
-  const nonNameProps = [];
-  for (let i = 0; i < node.props.length; i++) {
-    const p = node.props[i];
-    if (p.type === 6) {
-      if (p.value) {
-        if (p.name === "name") {
-          slotName = JSON.stringify(p.value.content);
-        } else {
-          p.name = camelize(p.name);
-          nonNameProps.push(p);
-        }
-      }
-    } else {
-      if (p.name === "bind" && isStaticArgOf(p.arg, "name")) {
-        if (p.exp) {
-          slotName = p.exp;
-        } else if (p.arg && p.arg.type === 4) {
-          const name = camelize(p.arg.content);
-          slotName = p.exp = createSimpleExpression(name, false, p.arg.loc);
-        }
-      } else {
-        if (p.name === "bind" && p.arg && isStaticExp(p.arg)) {
-          p.arg.content = camelize(p.arg.content);
-        }
-        nonNameProps.push(p);
-      }
-    }
-  }
-  if (nonNameProps.length > 0) {
-    const { props, directives } = buildProps(
-      node,
-      context,
-      nonNameProps,
-      false,
-      false
-    );
-    slotProps = props;
-    if (directives.length) {
-      context.onError(
-        createCompilerError(
-          36,
-          directives[0].loc
-        )
-      );
-    }
-  }
-  return {
-    slotName,
-    slotProps
-  };
-}
-
-const transformOn$1 = (dir, node, context, augmentor) => {
-  const { loc, modifiers, arg } = dir;
-  if (!dir.exp && !modifiers.length) {
-    context.onError(createCompilerError(35, loc));
-  }
-  let eventName;
-  if (arg.type === 4) {
-    if (arg.isStatic) {
-      let rawName = arg.content;
-      if (rawName.startsWith("vnode")) {
-        context.onError(createCompilerError(51, arg.loc));
-      }
-      if (rawName.startsWith("vue:")) {
-        rawName = `vnode-${rawName.slice(4)}`;
-      }
-      const eventString = node.tagType !== 0 || rawName.startsWith("vnode") || !/[A-Z]/.test(rawName) ? (
-        // for non-element and vnode lifecycle event listeners, auto convert
-        // it to camelCase. See issue #2249
-        toHandlerKey(camelize(rawName))
-      ) : (
-        // preserve case for plain element listeners that have uppercase
-        // letters, as these may be custom elements' custom events
-        `on:${rawName}`
-      );
-      eventName = createSimpleExpression(eventString, true, arg.loc);
-    } else {
-      eventName = createCompoundExpression([
-        `${context.helperString(TO_HANDLER_KEY)}(`,
-        arg,
-        `)`
-      ]);
-    }
-  } else {
-    eventName = arg;
-    eventName.children.unshift(`${context.helperString(TO_HANDLER_KEY)}(`);
-    eventName.children.push(`)`);
-  }
-  let exp = dir.exp;
-  if (exp && !exp.content.trim()) {
-    exp = void 0;
-  }
-  let shouldCache = context.cacheHandlers && !exp && !context.inVOnce;
-  if (exp) {
-    const isMemberExp = isMemberExpression(exp);
-    const isInlineStatement = !(isMemberExp || isFnExpression(exp));
-    const hasMultipleStatements = exp.content.includes(`;`);
-    {
-      validateBrowserExpression(
-        exp,
-        context,
-        false,
-        hasMultipleStatements
-      );
-    }
-    if (isInlineStatement || shouldCache && isMemberExp) {
-      exp = createCompoundExpression([
-        `${isInlineStatement ? `$event` : `${``}(...args)`} => ${hasMultipleStatements ? `{` : `(`}`,
-        exp,
-        hasMultipleStatements ? `}` : `)`
-      ]);
-    }
-  }
-  let ret = {
-    props: [
-      createObjectProperty(
-        eventName,
-        exp || createSimpleExpression(`() => {}`, false, loc)
-      )
-    ]
-  };
-  if (augmentor) {
-    ret = augmentor(ret);
-  }
-  if (shouldCache) {
-    ret.props[0].value = context.cache(ret.props[0].value);
-  }
-  ret.props.forEach((p) => p.key.isHandlerKey = true);
-  return ret;
-};
-
-const transformText = (node, context) => {
-  if (node.type === 0 || node.type === 1 || node.type === 11 || node.type === 10) {
-    return () => {
-      const children = node.children;
-      let currentContainer = void 0;
-      let hasText = false;
-      for (let i = 0; i < children.length; i++) {
-        const child = children[i];
-        if (isText$1(child)) {
-          hasText = true;
-          for (let j = i + 1; j < children.length; j++) {
-            const next = children[j];
-            if (isText$1(next)) {
-              if (!currentContainer) {
-                currentContainer = children[i] = createCompoundExpression(
-                  [child],
-                  child.loc
-                );
-              }
-              currentContainer.children.push(` + `, next);
-              children.splice(j, 1);
-              j--;
-            } else {
-              currentContainer = void 0;
-              break;
-            }
-          }
-        }
-      }
-      if (!hasText || // if this is a plain element with a single text child, leave it
-      // as-is since the runtime has dedicated fast path for this by directly
-      // setting textContent of the element.
-      // for component root it's always normalized anyway.
-      children.length === 1 && (node.type === 0 || node.type === 1 && node.tagType === 0 && // #3756
-      // custom directives can potentially add DOM elements arbitrarily,
-      // we need to avoid setting textContent of the element at runtime
-      // to avoid accidentally overwriting the DOM elements added
-      // by the user through custom directives.
-      !node.props.find(
-        (p) => p.type === 7 && !context.directiveTransforms[p.name]
-      ) && // in compat mode, <template> tags with no special directives
-      // will be rendered as a fragment so its children must be
-      // converted into vnodes.
-      true)) {
-        return;
-      }
-      for (let i = 0; i < children.length; i++) {
-        const child = children[i];
-        if (isText$1(child) || child.type === 8) {
-          const callArgs = [];
-          if (child.type !== 2 || child.content !== " ") {
-            callArgs.push(child);
-          }
-          if (!context.ssr && getConstantType(child, context) === 0) {
-            callArgs.push(
-              1 + (` /* ${PatchFlagNames[1]} */` )
-            );
-          }
-          children[i] = {
-            type: 12,
-            content: child,
-            loc: child.loc,
-            codegenNode: createCallExpression(
-              context.helper(CREATE_TEXT),
-              callArgs
-            )
-          };
-        }
-      }
-    };
-  }
-};
-
-const seen$1 = /* @__PURE__ */ new WeakSet();
-const transformOnce = (node, context) => {
-  if (node.type === 1 && findDir(node, "once", true)) {
-    if (seen$1.has(node) || context.inVOnce || context.inSSR) {
-      return;
-    }
-    seen$1.add(node);
-    context.inVOnce = true;
-    context.helper(SET_BLOCK_TRACKING);
-    return () => {
-      context.inVOnce = false;
-      const cur = context.currentNode;
-      if (cur.codegenNode) {
-        cur.codegenNode = context.cache(
-          cur.codegenNode,
-          true
-          /* isVNode */
-        );
-      }
-    };
-  }
-};
-
-const transformModel$1 = (dir, node, context) => {
-  const { exp, arg } = dir;
-  if (!exp) {
-    context.onError(
-      createCompilerError(41, dir.loc)
-    );
-    return createTransformProps();
-  }
-  const rawExp = exp.loc.source.trim();
-  const expString = exp.type === 4 ? exp.content : rawExp;
-  const bindingType = context.bindingMetadata[rawExp];
-  if (bindingType === "props" || bindingType === "props-aliased") {
-    context.onError(createCompilerError(44, exp.loc));
-    return createTransformProps();
-  }
-  const maybeRef = false;
-  if (!expString.trim() || !isMemberExpression(exp) && !maybeRef) {
-    context.onError(
-      createCompilerError(42, exp.loc)
-    );
-    return createTransformProps();
-  }
-  const propName = arg ? arg : createSimpleExpression("modelValue", true);
-  const eventName = arg ? isStaticExp(arg) ? `onUpdate:${camelize(arg.content)}` : createCompoundExpression(['"onUpdate:" + ', arg]) : `onUpdate:modelValue`;
-  let assignmentExp;
-  const eventArg = context.isTS ? `($event: any)` : `$event`;
-  {
-    assignmentExp = createCompoundExpression([
-      `${eventArg} => ((`,
-      exp,
-      `) = $event)`
-    ]);
-  }
-  const props = [
-    // modelValue: foo
-    createObjectProperty(propName, dir.exp),
-    // "onUpdate:modelValue": $event => (foo = $event)
-    createObjectProperty(eventName, assignmentExp)
-  ];
-  if (dir.modifiers.length && node.tagType === 1) {
-    const modifiers = dir.modifiers.map((m) => m.content).map((m) => (isSimpleIdentifier(m) ? m : JSON.stringify(m)) + `: true`).join(`, `);
-    const modifiersKey = arg ? isStaticExp(arg) ? `${arg.content}Modifiers` : createCompoundExpression([arg, ' + "Modifiers"']) : `modelModifiers`;
-    props.push(
-      createObjectProperty(
-        modifiersKey,
-        createSimpleExpression(
-          `{ ${modifiers} }`,
-          false,
-          dir.loc,
-          2
-        )
-      )
-    );
-  }
-  return createTransformProps(props);
-};
-function createTransformProps(props = []) {
-  return { props };
-}
-
-const seen = /* @__PURE__ */ new WeakSet();
-const transformMemo = (node, context) => {
-  if (node.type === 1) {
-    const dir = findDir(node, "memo");
-    if (!dir || seen.has(node)) {
-      return;
-    }
-    seen.add(node);
-    return () => {
-      const codegenNode = node.codegenNode || context.currentNode.codegenNode;
-      if (codegenNode && codegenNode.type === 13) {
-        if (node.tagType !== 1) {
-          convertToBlock(codegenNode, context);
-        }
-        node.codegenNode = createCallExpression(context.helper(WITH_MEMO), [
-          dir.exp,
-          createFunctionExpression(void 0, codegenNode),
-          `_cache`,
-          String(context.cached.length)
-        ]);
-        context.cached.push(null);
-      }
-    };
-  }
-};
-
-function getBaseTransformPreset(prefixIdentifiers) {
-  return [
-    [
-      transformOnce,
-      transformIf,
-      transformMemo,
-      transformFor,
-      ...[],
-      ...[transformExpression] ,
-      transformSlotOutlet,
-      transformElement,
-      trackSlotScopes,
-      transformText
-    ],
-    {
-      on: transformOn$1,
-      bind: transformBind,
-      model: transformModel$1
-    }
-  ];
-}
-function baseCompile(source, options = {}) {
-  const onError = options.onError || defaultOnError;
-  const isModuleMode = options.mode === "module";
-  {
-    if (options.prefixIdentifiers === true) {
-      onError(createCompilerError(47));
-    } else if (isModuleMode) {
-      onError(createCompilerError(48));
-    }
-  }
-  const prefixIdentifiers = false;
-  if (options.cacheHandlers) {
-    onError(createCompilerError(49));
-  }
-  if (options.scopeId && !isModuleMode) {
-    onError(createCompilerError(50));
-  }
-  const resolvedOptions = extend({}, options, {
-    prefixIdentifiers
-  });
-  const ast = isString(source) ? baseParse(source, resolvedOptions) : source;
-  const [nodeTransforms, directiveTransforms] = getBaseTransformPreset();
-  transform(
-    ast,
-    extend({}, resolvedOptions, {
-      nodeTransforms: [
-        ...nodeTransforms,
-        ...options.nodeTransforms || []
-        // user transforms
-      ],
-      directiveTransforms: extend(
-        {},
-        directiveTransforms,
-        options.directiveTransforms || {}
-        // user transforms
-      )
-    })
-  );
-  return generate(ast, resolvedOptions);
-}
-
-const noopDirectiveTransform = () => ({ props: [] });
-
-const V_MODEL_RADIO = Symbol(`vModelRadio` );
-const V_MODEL_CHECKBOX = Symbol(
-  `vModelCheckbox` 
-);
-const V_MODEL_TEXT = Symbol(`vModelText` );
-const V_MODEL_SELECT = Symbol(
-  `vModelSelect` 
-);
-const V_MODEL_DYNAMIC = Symbol(
-  `vModelDynamic` 
-);
-const V_ON_WITH_MODIFIERS = Symbol(
-  `vOnModifiersGuard` 
-);
-const V_ON_WITH_KEYS = Symbol(
-  `vOnKeysGuard` 
-);
-const V_SHOW = Symbol(`vShow` );
-const TRANSITION = Symbol(`Transition` );
-const TRANSITION_GROUP = Symbol(
-  `TransitionGroup` 
-);
-registerRuntimeHelpers({
-  [V_MODEL_RADIO]: `vModelRadio`,
-  [V_MODEL_CHECKBOX]: `vModelCheckbox`,
-  [V_MODEL_TEXT]: `vModelText`,
-  [V_MODEL_SELECT]: `vModelSelect`,
-  [V_MODEL_DYNAMIC]: `vModelDynamic`,
-  [V_ON_WITH_MODIFIERS]: `withModifiers`,
-  [V_ON_WITH_KEYS]: `withKeys`,
-  [V_SHOW]: `vShow`,
-  [TRANSITION]: `Transition`,
-  [TRANSITION_GROUP]: `TransitionGroup`
-});
-
-let decoder;
-function decodeHtmlBrowser(raw, asAttr = false) {
-  if (!decoder) {
-    decoder = document.createElement("div");
-  }
-  if (asAttr) {
-    decoder.innerHTML = `<div foo="${raw.replace(/"/g, "&quot;")}">`;
-    return decoder.children[0].getAttribute("foo");
-  } else {
-    decoder.innerHTML = raw;
-    return decoder.textContent;
-  }
-}
-
-const parserOptions = {
-  parseMode: "html",
-  isVoidTag,
-  isNativeTag: (tag) => isHTMLTag(tag) || isSVGTag(tag) || isMathMLTag(tag),
-  isPreTag: (tag) => tag === "pre",
-  isIgnoreNewlineTag: (tag) => tag === "pre" || tag === "textarea",
-  decodeEntities: decodeHtmlBrowser ,
-  isBuiltInComponent: (tag) => {
-    if (tag === "Transition" || tag === "transition") {
-      return TRANSITION;
-    } else if (tag === "TransitionGroup" || tag === "transition-group") {
-      return TRANSITION_GROUP;
-    }
-  },
-  // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher
-  getNamespace(tag, parent, rootNamespace) {
-    let ns = parent ? parent.ns : rootNamespace;
-    if (parent && ns === 2) {
-      if (parent.tag === "annotation-xml") {
-        if (tag === "svg") {
-          return 1;
-        }
-        if (parent.props.some(
-          (a) => a.type === 6 && a.name === "encoding" && a.value != null && (a.value.content === "text/html" || a.value.content === "application/xhtml+xml")
-        )) {
-          ns = 0;
-        }
-      } else if (/^m(?:[ions]|text)$/.test(parent.tag) && tag !== "mglyph" && tag !== "malignmark") {
-        ns = 0;
-      }
-    } else if (parent && ns === 1) {
-      if (parent.tag === "foreignObject" || parent.tag === "desc" || parent.tag === "title") {
-        ns = 0;
-      }
-    }
-    if (ns === 0) {
-      if (tag === "svg") {
-        return 1;
-      }
-      if (tag === "math") {
-        return 2;
-      }
-    }
-    return ns;
-  }
-};
-
-const transformStyle = (node) => {
-  if (node.type === 1) {
-    node.props.forEach((p, i) => {
-      if (p.type === 6 && p.name === "style" && p.value) {
-        node.props[i] = {
-          type: 7,
-          name: `bind`,
-          arg: createSimpleExpression(`style`, true, p.loc),
-          exp: parseInlineCSS(p.value.content, p.loc),
-          modifiers: [],
-          loc: p.loc
-        };
-      }
-    });
-  }
-};
-const parseInlineCSS = (cssText, loc) => {
-  const normalized = parseStringStyle(cssText);
-  return createSimpleExpression(
-    JSON.stringify(normalized),
-    false,
-    loc,
-    3
-  );
-};
-
-function createDOMCompilerError(code, loc) {
-  return createCompilerError(
-    code,
-    loc,
-    DOMErrorMessages 
-  );
-}
-const DOMErrorMessages = {
-  [53]: `v-html is missing expression.`,
-  [54]: `v-html will override element children.`,
-  [55]: `v-text is missing expression.`,
-  [56]: `v-text will override element children.`,
-  [57]: `v-model can only be used on <input>, <textarea> and <select> elements.`,
-  [58]: `v-model argument is not supported on plain elements.`,
-  [59]: `v-model cannot be used on file inputs since they are read-only. Use a v-on:change listener instead.`,
-  [60]: `Unnecessary value binding used alongside v-model. It will interfere with v-model's behavior.`,
-  [61]: `v-show is missing expression.`,
-  [62]: `<Transition> expects exactly one child element or component.`,
-  [63]: `Tags with side effect (<script> and <style>) are ignored in client component templates.`
-};
-
-const transformVHtml = (dir, node, context) => {
-  const { exp, loc } = dir;
-  if (!exp) {
-    context.onError(
-      createDOMCompilerError(53, loc)
-    );
-  }
-  if (node.children.length) {
-    context.onError(
-      createDOMCompilerError(54, loc)
-    );
-    node.children.length = 0;
-  }
-  return {
-    props: [
-      createObjectProperty(
-        createSimpleExpression(`innerHTML`, true, loc),
-        exp || createSimpleExpression("", true)
-      )
-    ]
-  };
-};
-
-const transformVText = (dir, node, context) => {
-  const { exp, loc } = dir;
-  if (!exp) {
-    context.onError(
-      createDOMCompilerError(55, loc)
-    );
-  }
-  if (node.children.length) {
-    context.onError(
-      createDOMCompilerError(56, loc)
-    );
-    node.children.length = 0;
-  }
-  return {
-    props: [
-      createObjectProperty(
-        createSimpleExpression(`textContent`, true),
-        exp ? getConstantType(exp, context) > 0 ? exp : createCallExpression(
-          context.helperString(TO_DISPLAY_STRING),
-          [exp],
-          loc
-        ) : createSimpleExpression("", true)
-      )
-    ]
-  };
-};
-
-const transformModel = (dir, node, context) => {
-  const baseResult = transformModel$1(dir, node, context);
-  if (!baseResult.props.length || node.tagType === 1) {
-    return baseResult;
-  }
-  if (dir.arg) {
-    context.onError(
-      createDOMCompilerError(
-        58,
-        dir.arg.loc
-      )
-    );
-  }
-  function checkDuplicatedValue() {
-    const value = findDir(node, "bind");
-    if (value && isStaticArgOf(value.arg, "value")) {
-      context.onError(
-        createDOMCompilerError(
-          60,
-          value.loc
-        )
-      );
-    }
-  }
-  const { tag } = node;
-  const isCustomElement = context.isCustomElement(tag);
-  if (tag === "input" || tag === "textarea" || tag === "select" || isCustomElement) {
-    let directiveToUse = V_MODEL_TEXT;
-    let isInvalidType = false;
-    if (tag === "input" || isCustomElement) {
-      const type = findProp(node, `type`);
-      if (type) {
-        if (type.type === 7) {
-          directiveToUse = V_MODEL_DYNAMIC;
-        } else if (type.value) {
-          switch (type.value.content) {
-            case "radio":
-              directiveToUse = V_MODEL_RADIO;
-              break;
-            case "checkbox":
-              directiveToUse = V_MODEL_CHECKBOX;
-              break;
-            case "file":
-              isInvalidType = true;
-              context.onError(
-                createDOMCompilerError(
-                  59,
-                  dir.loc
-                )
-              );
-              break;
-            default:
-              checkDuplicatedValue();
-              break;
-          }
-        }
-      } else if (hasDynamicKeyVBind(node)) {
-        directiveToUse = V_MODEL_DYNAMIC;
-      } else {
-        checkDuplicatedValue();
-      }
-    } else if (tag === "select") {
-      directiveToUse = V_MODEL_SELECT;
-    } else {
-      checkDuplicatedValue();
-    }
-    if (!isInvalidType) {
-      baseResult.needRuntime = context.helper(directiveToUse);
-    }
-  } else {
-    context.onError(
-      createDOMCompilerError(
-        57,
-        dir.loc
-      )
-    );
-  }
-  baseResult.props = baseResult.props.filter(
-    (p) => !(p.key.type === 4 && p.key.content === "modelValue")
-  );
-  return baseResult;
-};
-
-const isEventOptionModifier = /* @__PURE__ */ makeMap(`passive,once,capture`);
-const isNonKeyModifier = /* @__PURE__ */ makeMap(
-  // event propagation management
-  `stop,prevent,self,ctrl,shift,alt,meta,exact,middle`
-);
-const maybeKeyModifier = /* @__PURE__ */ makeMap("left,right");
-const isKeyboardEvent = /* @__PURE__ */ makeMap(`onkeyup,onkeydown,onkeypress`);
-const resolveModifiers = (key, modifiers, context, loc) => {
-  const keyModifiers = [];
-  const nonKeyModifiers = [];
-  const eventOptionModifiers = [];
-  for (let i = 0; i < modifiers.length; i++) {
-    const modifier = modifiers[i].content;
-    if (isEventOptionModifier(modifier)) {
-      eventOptionModifiers.push(modifier);
-    } else {
-      if (maybeKeyModifier(modifier)) {
-        if (isStaticExp(key)) {
-          if (isKeyboardEvent(key.content.toLowerCase())) {
-            keyModifiers.push(modifier);
-          } else {
-            nonKeyModifiers.push(modifier);
-          }
-        } else {
-          keyModifiers.push(modifier);
-          nonKeyModifiers.push(modifier);
-        }
-      } else {
-        if (isNonKeyModifier(modifier)) {
-          nonKeyModifiers.push(modifier);
-        } else {
-          keyModifiers.push(modifier);
-        }
-      }
-    }
-  }
-  return {
-    keyModifiers,
-    nonKeyModifiers,
-    eventOptionModifiers
-  };
-};
-const transformClick = (key, event) => {
-  const isStaticClick = isStaticExp(key) && key.content.toLowerCase() === "onclick";
-  return isStaticClick ? createSimpleExpression(event, true) : key.type !== 4 ? createCompoundExpression([
-    `(`,
-    key,
-    `) === "onClick" ? "${event}" : (`,
-    key,
-    `)`
-  ]) : key;
-};
-const transformOn = (dir, node, context) => {
-  return transformOn$1(dir, node, context, (baseResult) => {
-    const { modifiers } = dir;
-    if (!modifiers.length) return baseResult;
-    let { key, value: handlerExp } = baseResult.props[0];
-    const { keyModifiers, nonKeyModifiers, eventOptionModifiers } = resolveModifiers(key, modifiers, context, dir.loc);
-    if (nonKeyModifiers.includes("right")) {
-      key = transformClick(key, `onContextmenu`);
-    }
-    if (nonKeyModifiers.includes("middle")) {
-      key = transformClick(key, `onMouseup`);
-    }
-    if (nonKeyModifiers.length) {
-      handlerExp = createCallExpression(context.helper(V_ON_WITH_MODIFIERS), [
-        handlerExp,
-        JSON.stringify(nonKeyModifiers)
-      ]);
-    }
-    if (keyModifiers.length && // if event name is dynamic, always wrap with keys guard
-    (!isStaticExp(key) || isKeyboardEvent(key.content.toLowerCase()))) {
-      handlerExp = createCallExpression(context.helper(V_ON_WITH_KEYS), [
-        handlerExp,
-        JSON.stringify(keyModifiers)
-      ]);
-    }
-    if (eventOptionModifiers.length) {
-      const modifierPostfix = eventOptionModifiers.map(capitalize).join("");
-      key = isStaticExp(key) ? createSimpleExpression(`${key.content}${modifierPostfix}`, true) : createCompoundExpression([`(`, key, `) + "${modifierPostfix}"`]);
-    }
-    return {
-      props: [createObjectProperty(key, handlerExp)]
-    };
-  });
-};
-
-const transformShow = (dir, node, context) => {
-  const { exp, loc } = dir;
-  if (!exp) {
-    context.onError(
-      createDOMCompilerError(61, loc)
-    );
-  }
-  return {
-    props: [],
-    needRuntime: context.helper(V_SHOW)
-  };
-};
-
-const transformTransition = (node, context) => {
-  if (node.type === 1 && node.tagType === 1) {
-    const component = context.isBuiltInComponent(node.tag);
-    if (component === TRANSITION) {
-      return () => {
-        if (!node.children.length) {
-          return;
-        }
-        if (hasMultipleChildren(node)) {
-          context.onError(
-            createDOMCompilerError(
-              62,
-              {
-                start: node.children[0].loc.start,
-                end: node.children[node.children.length - 1].loc.end,
-                source: ""
-              }
-            )
-          );
-        }
-        const child = node.children[0];
-        if (child.type === 1) {
-          for (const p of child.props) {
-            if (p.type === 7 && p.name === "show") {
-              node.props.push({
-                type: 6,
-                name: "persisted",
-                nameLoc: node.loc,
-                value: void 0,
-                loc: node.loc
-              });
-            }
-          }
-        }
-      };
-    }
-  }
-};
-function hasMultipleChildren(node) {
-  const children = node.children = node.children.filter(
-    (c) => c.type !== 3 && !(c.type === 2 && !c.content.trim())
-  );
-  const child = children[0];
-  return children.length !== 1 || child.type === 11 || child.type === 9 && child.branches.some(hasMultipleChildren);
-}
-
-const ignoreSideEffectTags = (node, context) => {
-  if (node.type === 1 && node.tagType === 0 && (node.tag === "script" || node.tag === "style")) {
-    context.onError(
-      createDOMCompilerError(
-        63,
-        node.loc
-      )
-    );
-    context.removeNode();
-  }
-};
-
-function isValidHTMLNesting(parent, child) {
-  if (parent in onlyValidChildren) {
-    return onlyValidChildren[parent].has(child);
-  }
-  if (child in onlyValidParents) {
-    return onlyValidParents[child].has(parent);
-  }
-  if (parent in knownInvalidChildren) {
-    if (knownInvalidChildren[parent].has(child)) return false;
-  }
-  if (child in knownInvalidParents) {
-    if (knownInvalidParents[child].has(parent)) return false;
-  }
-  return true;
-}
-const headings = /* @__PURE__ */ new Set(["h1", "h2", "h3", "h4", "h5", "h6"]);
-const emptySet = /* @__PURE__ */ new Set([]);
-const onlyValidChildren = {
-  head: /* @__PURE__ */ new Set([
-    "base",
-    "basefront",
-    "bgsound",
-    "link",
-    "meta",
-    "title",
-    "noscript",
-    "noframes",
-    "style",
-    "script",
-    "template"
-  ]),
-  optgroup: /* @__PURE__ */ new Set(["option"]),
-  select: /* @__PURE__ */ new Set(["optgroup", "option", "hr"]),
-  // table
-  table: /* @__PURE__ */ new Set(["caption", "colgroup", "tbody", "tfoot", "thead"]),
-  tr: /* @__PURE__ */ new Set(["td", "th"]),
-  colgroup: /* @__PURE__ */ new Set(["col"]),
-  tbody: /* @__PURE__ */ new Set(["tr"]),
-  thead: /* @__PURE__ */ new Set(["tr"]),
-  tfoot: /* @__PURE__ */ new Set(["tr"]),
-  // these elements can not have any children elements
-  script: emptySet,
-  iframe: emptySet,
-  option: emptySet,
-  textarea: emptySet,
-  style: emptySet,
-  title: emptySet
-};
-const onlyValidParents = {
-  // sections
-  html: emptySet,
-  body: /* @__PURE__ */ new Set(["html"]),
-  head: /* @__PURE__ */ new Set(["html"]),
-  // table
-  td: /* @__PURE__ */ new Set(["tr"]),
-  colgroup: /* @__PURE__ */ new Set(["table"]),
-  caption: /* @__PURE__ */ new Set(["table"]),
-  tbody: /* @__PURE__ */ new Set(["table"]),
-  tfoot: /* @__PURE__ */ new Set(["table"]),
-  col: /* @__PURE__ */ new Set(["colgroup"]),
-  th: /* @__PURE__ */ new Set(["tr"]),
-  thead: /* @__PURE__ */ new Set(["table"]),
-  tr: /* @__PURE__ */ new Set(["tbody", "thead", "tfoot"]),
-  // data list
-  dd: /* @__PURE__ */ new Set(["dl", "div"]),
-  dt: /* @__PURE__ */ new Set(["dl", "div"]),
-  // other
-  figcaption: /* @__PURE__ */ new Set(["figure"]),
-  // li: new Set(["ul", "ol"]),
-  summary: /* @__PURE__ */ new Set(["details"]),
-  area: /* @__PURE__ */ new Set(["map"])
-};
-const knownInvalidChildren = {
-  p: /* @__PURE__ */ new Set([
-    "address",
-    "article",
-    "aside",
-    "blockquote",
-    "center",
-    "details",
-    "dialog",
-    "dir",
-    "div",
-    "dl",
-    "fieldset",
-    "figure",
-    "footer",
-    "form",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "header",
-    "hgroup",
-    "hr",
-    "li",
-    "main",
-    "nav",
-    "menu",
-    "ol",
-    "p",
-    "pre",
-    "section",
-    "table",
-    "ul"
-  ]),
-  svg: /* @__PURE__ */ new Set([
-    "b",
-    "blockquote",
-    "br",
-    "code",
-    "dd",
-    "div",
-    "dl",
-    "dt",
-    "em",
-    "embed",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "hr",
-    "i",
-    "img",
-    "li",
-    "menu",
-    "meta",
-    "ol",
-    "p",
-    "pre",
-    "ruby",
-    "s",
-    "small",
-    "span",
-    "strong",
-    "sub",
-    "sup",
-    "table",
-    "u",
-    "ul",
-    "var"
-  ])
-};
-const knownInvalidParents = {
-  a: /* @__PURE__ */ new Set(["a"]),
-  button: /* @__PURE__ */ new Set(["button"]),
-  dd: /* @__PURE__ */ new Set(["dd", "dt"]),
-  dt: /* @__PURE__ */ new Set(["dd", "dt"]),
-  form: /* @__PURE__ */ new Set(["form"]),
-  li: /* @__PURE__ */ new Set(["li"]),
-  h1: headings,
-  h2: headings,
-  h3: headings,
-  h4: headings,
-  h5: headings,
-  h6: headings
-};
-
-const validateHtmlNesting = (node, context) => {
-  if (node.type === 1 && node.tagType === 0 && context.parent && context.parent.type === 1 && context.parent.tagType === 0 && !isValidHTMLNesting(context.parent.tag, node.tag)) {
-    const error = new SyntaxError(
-      `<${node.tag}> cannot be child of <${context.parent.tag}>, according to HTML specifications. This can cause hydration errors or potentially disrupt future functionality.`
-    );
-    error.loc = node.loc;
-    context.onWarn(error);
-  }
-};
-
-const DOMNodeTransforms = [
-  transformStyle,
-  ...[transformTransition, validateHtmlNesting] 
-];
-const DOMDirectiveTransforms = {
-  cloak: noopDirectiveTransform,
-  html: transformVHtml,
-  text: transformVText,
-  model: transformModel,
-  // override compiler-core
-  on: transformOn,
-  // override compiler-core
-  show: transformShow
-};
-function compile(src, options = {}) {
-  return baseCompile(
-    src,
-    extend({}, parserOptions, options, {
-      nodeTransforms: [
-        // ignore <script> and <tag>
-        // this is not put inside DOMNodeTransforms because that list is used
-        // by compiler-ssr to generate vnode fallback branches
-        ignoreSideEffectTags,
-        ...DOMNodeTransforms,
-        ...options.nodeTransforms || []
-      ],
-      directiveTransforms: extend(
-        {},
-        DOMDirectiveTransforms,
-        options.directiveTransforms || {}
-      ),
-      transformHoist: null 
-    })
-  );
-}
-
-{
-  initDev();
-}
-const compileCache = /* @__PURE__ */ Object.create(null);
-function compileToFunction(template, options) {
-  if (!isString(template)) {
-    if (template.nodeType) {
-      template = template.innerHTML;
-    } else {
-      warn(`invalid template option: `, template);
-      return NOOP;
-    }
-  }
-  const key = genCacheKey(template, options);
-  const cached = compileCache[key];
-  if (cached) {
-    return cached;
-  }
-  if (template[0] === "#") {
-    const el = document.querySelector(template);
-    if (!el) {
-      warn(`Template element not found or is empty: ${template}`);
-    }
-    template = el ? el.innerHTML : ``;
-  }
-  const opts = extend(
-    {
-      hoistStatic: true,
-      onError: onError ,
-      onWarn: (e) => onError(e, true) 
-    },
-    options
-  );
-  if (!opts.isCustomElement && typeof customElements !== "undefined") {
-    opts.isCustomElement = (tag) => !!customElements.get(tag);
-  }
-  const { code } = compile(template, opts);
-  function onError(err, asWarning = false) {
-    const message = asWarning ? err.message : `Template compilation error: ${err.message}`;
-    const codeFrame = err.loc && generateCodeFrame(
-      template,
-      err.loc.start.offset,
-      err.loc.end.offset
-    );
-    warn(codeFrame ? `${message}
-${codeFrame}` : message);
-  }
-  const render = new Function("Vue", code)(runtimeDom);
-  render._rc = true;
-  return compileCache[key] = render;
-}
-registerRuntimeCompiler(compileToFunction);
-
-export { BaseTransition, BaseTransitionPropsValidators, Comment, DeprecationTypes, EffectScope, ErrorCodes, ErrorTypeStrings, Fragment, KeepAlive, ReactiveEffect, Static, Suspense, Teleport, Text, TrackOpTypes, Transition, TransitionGroup, TriggerOpTypes, VueElement, assertNumber, callWithAsyncErrorHandling, callWithErrorHandling, camelize, capitalize, cloneVNode, compatUtils, compileToFunction as compile, computed, createApp, createBlock, createCommentVNode, createElementBlock, createBaseVNode as createElementVNode, createHydrationRenderer, createPropsRestProxy, createRenderer, createSSRApp, createSlots, createStaticVNode, createTextVNode, createVNode, customRef, defineAsyncComponent, defineComponent, defineCustomElement, defineEmits, defineExpose, defineModel, defineOptions, defineProps, defineSSRCustomElement, defineSlots, devtools, effect, effectScope, getCurrentInstance, getCurrentScope, getCurrentWatcher, getTransitionRawChildren, guardReactiveProps, h, handleError, hasInjectionContext, hydrate, hydrateOnIdle, hydrateOnInteraction, hydrateOnMediaQuery, hydrateOnVisible, initCustomFormatter, initDirectivesForSSR, inject, isMemoSame, isProxy, isReactive, isReadonly, isRef, isRuntimeOnly, isShallow, isVNode, markRaw, mergeDefaults, mergeModels, mergeProps, nextTick, normalizeClass, normalizeProps, normalizeStyle, onActivated, onBeforeMount, onBeforeUnmount, onBeforeUpdate, onDeactivated, onErrorCaptured, onMounted, onRenderTracked, onRenderTriggered, onScopeDispose, onServerPrefetch, onUnmounted, onUpdated, onWatcherCleanup, openBlock, popScopeId, provide, proxyRefs, pushScopeId, queuePostFlushCb, reactive, readonly, ref, registerRuntimeCompiler, render, renderList, renderSlot, resolveComponent, resolveDirective, resolveDynamicComponent, resolveFilter, resolveTransitionHooks, setBlockTracking, setDevtoolsHook, setTransitionHooks, shallowReactive, shallowReadonly, shallowRef, ssrContextKey, ssrUtils, stop, toDisplayString, toHandlerKey, toHandlers, toRaw, toRef, toRefs, toValue, transformVNodeArgs, triggerRef, unref, useAttrs, useCssModule, useCssVars, useHost, useId, useModel, useSSRContext, useShadowRoot, useSlots, useTemplateRef, useTransitionState, vModelCheckbox, vModelDynamic, vModelRadio, vModelSelect, vModelText, vShow, version, warn, watch, watchEffect, watchPostEffect, watchSyncEffect, withAsyncContext, withCtx, withDefaults, withDirectives, withKeys, withMemo, withModifiers, withScopeId };
-
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index c54260867..250729a44 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -1,204 +1,276 @@
+<!--
+  This is a single file build of the frontend.
+  It is automatically generated by the build process.
+  Do not edit this file directly.
+  To make changes, refer to the "Web UI" section in the README.
+-->
+<!DOCTYPE html>
 <html>
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
   <meta name="color-scheme" content="light dark">
   <title>🦙 llama.cpp - chat</title>
-
-  <!-- Note: dependencies can de updated using ./deps.sh script -->
-  <link href="./deps_daisyui.min.css" rel="stylesheet" type="text/css" />
-  <script src="./deps_tailwindcss.js"></script>
-  <style type="text/tailwindcss">
-    .markdown {
-      h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
-      pre {
-        @apply whitespace-pre-wrap rounded-lg p-2;
-        border: 1px solid currentColor;
-      }
-      /* TODO: fix markdown table */
-    }
-    /*
-      Note for daisyui: because we're using a subset of daisyui via CDN, many things won't be included
-      We can manually add the missing styles from https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/full.css
-    */
-    .bg-base-100 {background-color: var(--fallback-b1,oklch(var(--b1)/1))}
-    .bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
-    .bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
-    .text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
-    .show-on-hover {
-      @apply opacity-0 group-hover:opacity-100;
-    }
-    .btn-mini {
-      @apply cursor-pointer hover:shadow-md;
-    }
-    .chat-screen { max-width: 900px; }
-    /* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
-    .chat-bubble-base-300 {
-      --tw-bg-opacity: 1;
-      --tw-text-opacity: 1;
-      @apply bg-base-300 text-base-content;
-    }
-  </style>
+  <script type="module" crossorigin>(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))n(r);new MutationObserver(r=>{for(const i of r)if(i.type==="childList")for(const s of i.addedNodes)s.tagName==="LINK"&&s.rel==="modulepreload"&&n(s)}).observe(document,{childList:!0,subtree:!0});function u(r){const i={};return r.integrity&&(i.integrity=r.integrity),r.referrerPolicy&&(i.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?i.credentials="include":r.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function n(r){if(r.ep)return;r.ep=!0;const i=u(r);fetch(r.href,i)}})();/**
+* @vue/shared v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**//*! #__NO_SIDE_EFFECTS__ */function Be(e){const t=Object.create(null);for(const u of e.split(","))t[u]=1;return u=>u in t}const ee={},Pu=[],we=()=>{},dn=()=>!1,vu=e=>e.charCodeAt(0)===111&&e.charCodeAt(1)===110&&(e.charCodeAt(2)>122||e.charCodeAt(2)<97),Rs=e=>e.startsWith("onUpdate:"),te=Object.assign,Ms=(e,t)=>{const u=e.indexOf(t);u>-1&&e.splice(u,1)},Uf=Object.prototype.hasOwnProperty,re=(e,t)=>Uf.call(e,t),$=Array.isArray,Lu=e=>tn(e)==="[object Map]",Du=e=>tn(e)==="[object Set]",Ko=e=>tn(e)==="[object Date]",qf=e=>tn(e)==="[object RegExp]",W=e=>typeof e=="function",Z=e=>typeof e=="string",Ze=e=>typeof e=="symbol",oe=e=>e!==null&&typeof e=="object",Ps=e=>(oe(e)||W(e))&&W(e.then)&&W(e.catch),yl=Object.prototype.toString,tn=e=>yl.call(e),jf=e=>tn(e).slice(8,-1),ti=e=>tn(e)==="[object Object]",Ls=e=>Z(e)&&e!=="NaN"&&e[0]!=="-"&&""+parseInt(e,10)===e,tu=Be(",key,ref,ref_for,ref_key,onVnodeBeforeMount,onVnodeMounted,onVnodeBeforeUpdate,onVnodeUpdated,onVnodeBeforeUnmount,onVnodeUnmounted"),zf=Be("bind,cloak,else-if,else,for,html,if,model,on,once,pre,show,slot,text,memo"),ui=e=>{const t=Object.create(null);return u=>t[u]||(t[u]=e(u))},Kf=/-(\w)/g,de=ui(e=>e.replace(Kf,(t,u)=>u?u.toUpperCase():"")),Wf=/\B([A-Z])/g,qe=ui(e=>e.replace(Wf,"-$1").toLowerCase()),Tu=ui(e=>e.charAt(0).toUpperCase()+e.slice(1)),Bu=ui(e=>e?`on${Tu(e)}`:""),He=(e,t)=>!Object.is(e,t),Vu=(e,...t)=>{for(let u=0;u<e.length;u++)e[u](...t)},ju=(e,t,u,n=!1)=>{Object.defineProperty(e,t,{configurable:!0,enumerable:!1,writable:n,value:u})},Sr=e=>{const t=parseFloat(e);return isNaN(t)?e:t},vr=e=>{const t=Z(e)?Number(e):NaN;return isNaN(t)?e:t};let Wo;const gu=()=>Wo||(Wo=typeof globalThis<"u"?globalThis:typeof self<"u"?self:typeof window<"u"?window:typeof global<"u"?global:{});function Gf(e,t){return e+JSON.stringify(t,(u,n)=>typeof n=="function"?n.toString():n)}const Zf="Infinity,undefined,NaN,isFinite,isNaN,parseFloat,parseInt,decodeURI,decodeURIComponent,encodeURI,encodeURIComponent,Math,Number,Date,Array,Object,Boolean,String,RegExp,Map,Set,JSON,Intl,BigInt,console,Error,Symbol",Jf=Be(Zf);function un(e){if($(e)){const t={};for(let u=0;u<e.length;u++){const n=e[u],r=Z(n)?xl(n):un(n);if(r)for(const i in r)t[i]=r[i]}return t}else if(Z(e)||oe(e))return e}const Yf=/;(?![^(]*\))/g,Xf=/:([^]+)/,Qf=/\/\*[^]*?\*\//g;function xl(e){const t={};return e.replace(Qf,"").split(Yf).forEach(u=>{if(u){const n=u.split(Xf);n.length>1&&(t[n[0].trim()]=n[1].trim())}}),t}function ed(e){if(!e)return"";if(Z(e))return e;let t="";for(const u in e){const n=e[u];if(Z(n)||typeof n=="number"){const r=u.startsWith("--")?u:qe(u);t+=`${r}:${n};`}}return t}function nn(e){let t="";if(Z(e))t=e;else if($(e))for(let u=0;u<e.length;u++){const n=nn(e[u]);n&&(t+=n+" ")}else if(oe(e))for(const u in e)e[u]&&(t+=u+" ");return t.trim()}function td(e){if(!e)return null;let{class:t,style:u}=e;return t&&!Z(t)&&(e.class=nn(t)),u&&(e.style=un(u)),e}const ud="html,body,base,head,link,meta,style,title,address,article,aside,footer,header,hgroup,h1,h2,h3,h4,h5,h6,nav,section,div,dd,dl,dt,figcaption,figure,picture,hr,img,li,main,ol,p,pre,ul,a,b,abbr,bdi,bdo,br,cite,code,data,dfn,em,i,kbd,mark,q,rp,rt,ruby,s,samp,small,span,strong,sub,sup,time,u,var,wbr,area,audio,map,track,video,embed,object,param,source,canvas,script,noscript,del,ins,caption,col,colgroup,table,thead,tbody,td,th,tr,button,datalist,fieldset,form,input,label,legend,meter,optgroup,option,output,progress,select,textarea,details,dialog,menu,summary,template,blockquote,iframe,tfoot",nd="svg,animate,animateMotion,animateTransform,circle,clipPath,color-profile,defs,desc,discard,ellipse,feBlend,feColorMatrix,feComponentTransfer,feComposite,feConvolveMatrix,feDiffuseLighting,feDisplacementMap,feDistantLight,feDropShadow,feFlood,feFuncA,feFuncB,feFuncG,feFuncR,feGaussianBlur,feImage,feMerge,feMergeNode,feMorphology,feOffset,fePointLight,feSpecularLighting,feSpotLight,feTile,feTurbulence,filter,foreignObject,g,hatch,hatchpath,image,line,linearGradient,marker,mask,mesh,meshgradient,meshpatch,meshrow,metadata,mpath,path,pattern,polygon,polyline,radialGradient,rect,set,solidcolor,stop,switch,symbol,text,textPath,title,tspan,unknown,use,view",rd="annotation,annotation-xml,maction,maligngroup,malignmark,math,menclose,merror,mfenced,mfrac,mfraction,mglyph,mi,mlabeledtr,mlongdiv,mmultiscripts,mn,mo,mover,mpadded,mphantom,mprescripts,mroot,mrow,ms,mscarries,mscarry,msgroup,msline,mspace,msqrt,msrow,mstack,mstyle,msub,msubsup,msup,mtable,mtd,mtext,mtr,munder,munderover,none,semantics",id="area,base,br,col,embed,hr,img,input,link,meta,param,source,track,wbr",sd=Be(ud),od=Be(nd),cd=Be(rd),ld=Be(id),El="itemscope,allowfullscreen,formnovalidate,ismap,nomodule,novalidate,readonly",ad=Be(El),Go=Be(El+",async,autofocus,autoplay,controls,default,defer,disabled,hidden,inert,loop,open,required,reversed,scoped,seamless,checked,muted,multiple,selected");function Bs(e){return!!e||e===""}const fd=Be("accept,accept-charset,accesskey,action,align,allow,alt,async,autocapitalize,autocomplete,autofocus,autoplay,background,bgcolor,border,buffered,capture,challenge,charset,checked,cite,class,code,codebase,color,cols,colspan,content,contenteditable,contextmenu,controls,coords,crossorigin,csp,data,datetime,decoding,default,defer,dir,dirname,disabled,download,draggable,dropzone,enctype,enterkeyhint,for,form,formaction,formenctype,formmethod,formnovalidate,formtarget,headers,height,hidden,high,href,hreflang,http-equiv,icon,id,importance,inert,integrity,ismap,itemprop,keytype,kind,label,lang,language,loading,list,loop,low,manifest,max,maxlength,minlength,media,min,multiple,muted,name,novalidate,open,optimum,pattern,ping,placeholder,poster,preload,radiogroup,readonly,referrerpolicy,rel,required,reversed,rows,rowspan,sandbox,scope,scoped,selected,shape,size,sizes,slot,span,spellcheck,src,srcdoc,srclang,srcset,start,step,style,summary,tabindex,target,title,translate,type,usemap,value,width,wrap"),dd=Be("xmlns,accent-height,accumulate,additive,alignment-baseline,alphabetic,amplitude,arabic-form,ascent,attributeName,attributeType,azimuth,baseFrequency,baseline-shift,baseProfile,bbox,begin,bias,by,calcMode,cap-height,class,clip,clipPathUnits,clip-path,clip-rule,color,color-interpolation,color-interpolation-filters,color-profile,color-rendering,contentScriptType,contentStyleType,crossorigin,cursor,cx,cy,d,decelerate,descent,diffuseConstant,direction,display,divisor,dominant-baseline,dur,dx,dy,edgeMode,elevation,enable-background,end,exponent,fill,fill-opacity,fill-rule,filter,filterRes,filterUnits,flood-color,flood-opacity,font-family,font-size,font-size-adjust,font-stretch,font-style,font-variant,font-weight,format,from,fr,fx,fy,g1,g2,glyph-name,glyph-orientation-horizontal,glyph-orientation-vertical,glyphRef,gradientTransform,gradientUnits,hanging,height,href,hreflang,horiz-adv-x,horiz-origin-x,id,ideographic,image-rendering,in,in2,intercept,k,k1,k2,k3,k4,kernelMatrix,kernelUnitLength,kerning,keyPoints,keySplines,keyTimes,lang,lengthAdjust,letter-spacing,lighting-color,limitingConeAngle,local,marker-end,marker-mid,marker-start,markerHeight,markerUnits,markerWidth,mask,maskContentUnits,maskUnits,mathematical,max,media,method,min,mode,name,numOctaves,offset,opacity,operator,order,orient,orientation,origin,overflow,overline-position,overline-thickness,panose-1,paint-order,path,pathLength,patternContentUnits,patternTransform,patternUnits,ping,pointer-events,points,pointsAtX,pointsAtY,pointsAtZ,preserveAlpha,preserveAspectRatio,primitiveUnits,r,radius,referrerPolicy,refX,refY,rel,rendering-intent,repeatCount,repeatDur,requiredExtensions,requiredFeatures,restart,result,rotate,rx,ry,scale,seed,shape-rendering,slope,spacing,specularConstant,specularExponent,speed,spreadMethod,startOffset,stdDeviation,stemh,stemv,stitchTiles,stop-color,stop-opacity,strikethrough-position,strikethrough-thickness,string,stroke,stroke-dasharray,stroke-dashoffset,stroke-linecap,stroke-linejoin,stroke-miterlimit,stroke-opacity,stroke-width,style,surfaceScale,systemLanguage,tabindex,tableValues,target,targetX,targetY,text-anchor,text-decoration,text-rendering,textLength,to,transform,transform-origin,type,u1,u2,underline-position,underline-thickness,unicode,unicode-bidi,unicode-range,units-per-em,v-alphabetic,v-hanging,v-ideographic,v-mathematical,values,vector-effect,version,vert-adv-y,vert-origin-x,vert-origin-y,viewBox,viewTarget,visibility,width,widths,word-spacing,writing-mode,x,x-height,x1,x2,xChannelSelector,xlink:actuate,xlink:arcrole,xlink:href,xlink:role,xlink:show,xlink:title,xlink:type,xmlns:xlink,xml:base,xml:lang,xml:space,y,y1,y2,yChannelSelector,z,zoomAndPan");function hd(e){if(e==null)return!1;const t=typeof e;return t==="string"||t==="number"||t==="boolean"}const pd=/[ !"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g;function md(e,t){return e.replace(pd,u=>`\\${u}`)}function bd(e,t){if(e.length!==t.length)return!1;let u=!0;for(let n=0;u&&n<e.length;n++)u=su(e[n],t[n]);return u}function su(e,t){if(e===t)return!0;let u=Ko(e),n=Ko(t);if(u||n)return u&&n?e.getTime()===t.getTime():!1;if(u=Ze(e),n=Ze(t),u||n)return e===t;if(u=$(e),n=$(t),u||n)return u&&n?bd(e,t):!1;if(u=oe(e),n=oe(t),u||n){if(!u||!n)return!1;const r=Object.keys(e).length,i=Object.keys(t).length;if(r!==i)return!1;for(const s in e){const o=e.hasOwnProperty(s),c=t.hasOwnProperty(s);if(o&&!c||!o&&c||!su(e[s],t[s]))return!1}}return String(e)===String(t)}function ni(e,t){return e.findIndex(u=>su(u,t))}const Cl=e=>!!(e&&e.__v_isRef===!0),kl=e=>Z(e)?e:e==null?"":$(e)||oe(e)&&(e.toString===yl||!W(e.toString))?Cl(e)?kl(e.value):JSON.stringify(e,Al,2):String(e),Al=(e,t)=>Cl(t)?Al(e,t.value):Lu(t)?{[`Map(${t.size})`]:[...t.entries()].reduce((u,[n,r],i)=>(u[Ni(n,i)+" =>"]=r,u),{})}:Du(t)?{[`Set(${t.size})`]:[...t.values()].map(u=>Ni(u))}:Ze(t)?Ni(t):oe(t)&&!$(t)&&!ti(t)?String(t):t,Ni=(e,t="")=>{var u;return Ze(e)?`Symbol(${(u=e.description)!=null?u:t})`:e};/**
+* @vue/reactivity v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/let $e;class Vs{constructor(t=!1){this.detached=t,this._active=!0,this.effects=[],this.cleanups=[],this._isPaused=!1,this.parent=$e,!t&&$e&&(this.index=($e.scopes||($e.scopes=[])).push(this)-1)}get active(){return this._active}pause(){if(this._active){this._isPaused=!0;let t,u;if(this.scopes)for(t=0,u=this.scopes.length;t<u;t++)this.scopes[t].pause();for(t=0,u=this.effects.length;t<u;t++)this.effects[t].pause()}}resume(){if(this._active&&this._isPaused){this._isPaused=!1;let t,u;if(this.scopes)for(t=0,u=this.scopes.length;t<u;t++)this.scopes[t].resume();for(t=0,u=this.effects.length;t<u;t++)this.effects[t].resume()}}run(t){if(this._active){const u=$e;try{return $e=this,t()}finally{$e=u}}}on(){$e=this}off(){$e=this.parent}stop(t){if(this._active){this._active=!1;let u,n;for(u=0,n=this.effects.length;u<n;u++)this.effects[u].stop();for(this.effects.length=0,u=0,n=this.cleanups.length;u<n;u++)this.cleanups[u]();if(this.cleanups.length=0,this.scopes){for(u=0,n=this.scopes.length;u<n;u++)this.scopes[u].stop(!0);this.scopes.length=0}if(!this.detached&&this.parent&&!t){const r=this.parent.scopes.pop();r&&r!==this&&(this.parent.scopes[this.index]=r,r.index=this.index)}this.parent=void 0}}}function gd(e){return new Vs(e)}function Sl(){return $e}function _d(e,t=!1){$e&&$e.cleanups.push(e)}let le;const Ri=new WeakSet;class vn{constructor(t){this.fn=t,this.deps=void 0,this.depsTail=void 0,this.flags=5,this.next=void 0,this.cleanup=void 0,this.scheduler=void 0,$e&&$e.active&&$e.effects.push(this)}pause(){this.flags|=64}resume(){this.flags&64&&(this.flags&=-65,Ri.has(this)&&(Ri.delete(this),this.trigger()))}notify(){this.flags&2&&!(this.flags&32)||this.flags&8||Dl(this)}run(){if(!(this.flags&1))return this.fn();this.flags|=2,Zo(this),Tl(this);const t=le,u=bt;le=this,bt=!0;try{return this.fn()}finally{wl(this),le=t,bt=u,this.flags&=-3}}stop(){if(this.flags&1){for(let t=this.deps;t;t=t.nextDep)Us(t);this.deps=this.depsTail=void 0,Zo(this),this.onStop&&this.onStop(),this.flags&=-2}}trigger(){this.flags&64?Ri.add(this):this.scheduler?this.scheduler():this.runIfDirty()}runIfDirty(){ts(this)&&this.run()}get dirty(){return ts(this)}}let vl=0,gn,_n;function Dl(e,t=!1){if(e.flags|=8,t){e.next=_n,_n=e;return}e.next=gn,gn=e}function $s(){vl++}function Hs(){if(--vl>0)return;if(_n){let t=_n;for(_n=void 0;t;){const u=t.next;t.next=void 0,t.flags&=-9,t=u}}let e;for(;gn;){let t=gn;for(gn=void 0;t;){const u=t.next;if(t.next=void 0,t.flags&=-9,t.flags&1)try{t.trigger()}catch(n){e||(e=n)}t=u}}if(e)throw e}function Tl(e){for(let t=e.deps;t;t=t.nextDep)t.version=-1,t.prevActiveLink=t.dep.activeLink,t.dep.activeLink=t}function wl(e){let t,u=e.depsTail,n=u;for(;n;){const r=n.prevDep;n.version===-1?(n===u&&(u=r),Us(n),yd(n)):t=n,n.dep.activeLink=n.prevActiveLink,n.prevActiveLink=void 0,n=r}e.deps=t,e.depsTail=u}function ts(e){for(let t=e.deps;t;t=t.nextDep)if(t.dep.version!==t.version||t.dep.computed&&(Fl(t.dep.computed)||t.dep.version!==t.version))return!0;return!!e._dirty}function Fl(e){if(e.flags&4&&!(e.flags&16)||(e.flags&=-17,e.globalVersion===Dn))return;e.globalVersion=Dn;const t=e.dep;if(e.flags|=2,t.version>0&&!e.isSSR&&e.deps&&!ts(e)){e.flags&=-3;return}const u=le,n=bt;le=e,bt=!0;try{Tl(e);const r=e.fn(e._value);(t.version===0||He(r,e._value))&&(e._value=r,t.version++)}catch(r){throw t.version++,r}finally{le=u,bt=n,wl(e),e.flags&=-3}}function Us(e,t=!1){const{dep:u,prevSub:n,nextSub:r}=e;if(n&&(n.nextSub=r,e.prevSub=void 0),r&&(r.prevSub=n,e.nextSub=void 0),u.subs===e&&(u.subs=n,!n&&u.computed)){u.computed.flags&=-5;for(let i=u.computed.deps;i;i=i.nextDep)Us(i,!0)}!t&&!--u.sc&&u.map&&u.map.delete(u.key)}function yd(e){const{prevDep:t,nextDep:u}=e;t&&(t.nextDep=u,e.prevDep=void 0),u&&(u.prevDep=t,e.nextDep=void 0)}function xd(e,t){e.effect instanceof vn&&(e=e.effect.fn);const u=new vn(e);t&&te(u,t);try{u.run()}catch(r){throw u.stop(),r}const n=u.run.bind(u);return n.effect=u,n}function Ed(e){e.effect.stop()}let bt=!0;const Il=[];function $t(){Il.push(bt),bt=!1}function Ht(){const e=Il.pop();bt=e===void 0?!0:e}function Zo(e){const{cleanup:t}=e;if(e.cleanup=void 0,t){const u=le;le=void 0;try{t()}finally{le=u}}}let Dn=0;class Cd{constructor(t,u){this.sub=t,this.dep=u,this.version=u.version,this.nextDep=this.prevDep=this.nextSub=this.prevSub=this.prevActiveLink=void 0}}class ri{constructor(t){this.computed=t,this.version=0,this.activeLink=void 0,this.subs=void 0,this.map=void 0,this.key=void 0,this.sc=0}track(t){if(!le||!bt||le===this.computed)return;let u=this.activeLink;if(u===void 0||u.sub!==le)u=this.activeLink=new Cd(le,this),le.deps?(u.prevDep=le.depsTail,le.depsTail.nextDep=u,le.depsTail=u):le.deps=le.depsTail=u,Ol(u);else if(u.version===-1&&(u.version=this.version,u.nextDep)){const n=u.nextDep;n.prevDep=u.prevDep,u.prevDep&&(u.prevDep.nextDep=n),u.prevDep=le.depsTail,u.nextDep=void 0,le.depsTail.nextDep=u,le.depsTail=u,le.deps===u&&(le.deps=n)}return u}trigger(t){this.version++,Dn++,this.notify(t)}notify(t){$s();try{for(let u=this.subs;u;u=u.prevSub)u.sub.notify()&&u.sub.dep.notify()}finally{Hs()}}}function Ol(e){if(e.dep.sc++,e.sub.flags&4){const t=e.dep.computed;if(t&&!e.dep.subs){t.flags|=20;for(let n=t.deps;n;n=n.nextDep)Ol(n)}const u=e.dep.subs;u!==e&&(e.prevSub=u,u&&(u.nextSub=e)),e.dep.subs=e}}const Dr=new WeakMap,_u=Symbol(""),us=Symbol(""),Tn=Symbol("");function Re(e,t,u){if(bt&&le){let n=Dr.get(e);n||Dr.set(e,n=new Map);let r=n.get(u);r||(n.set(u,r=new ri),r.map=n,r.key=u),r.track()}}function Nt(e,t,u,n,r,i){const s=Dr.get(e);if(!s){Dn++;return}const o=c=>{c&&c.trigger()};if($s(),t==="clear")s.forEach(o);else{const c=$(e),l=c&&Ls(u);if(c&&u==="length"){const a=Number(n);s.forEach((f,p)=>{(p==="length"||p===Tn||!Ze(p)&&p>=a)&&o(f)})}else switch((u!==void 0||s.has(void 0))&&o(s.get(u)),l&&o(s.get(Tn)),t){case"add":c?l&&o(s.get("length")):(o(s.get(_u)),Lu(e)&&o(s.get(us)));break;case"delete":c||(o(s.get(_u)),Lu(e)&&o(s.get(us)));break;case"set":Lu(e)&&o(s.get(_u));break}}Hs()}function kd(e,t){const u=Dr.get(e);return u&&u.get(t)}function Ou(e){const t=Q(e);return t===e?t:(Re(t,"iterate",Tn),ut(e)?t:t.map(Me))}function ii(e){return Re(e=Q(e),"iterate",Tn),e}const Ad={__proto__:null,[Symbol.iterator](){return Mi(this,Symbol.iterator,Me)},concat(...e){return Ou(this).concat(...e.map(t=>$(t)?Ou(t):t))},entries(){return Mi(this,"entries",e=>(e[1]=Me(e[1]),e))},every(e,t){return wt(this,"every",e,t,void 0,arguments)},filter(e,t){return wt(this,"filter",e,t,u=>u.map(Me),arguments)},find(e,t){return wt(this,"find",e,t,Me,arguments)},findIndex(e,t){return wt(this,"findIndex",e,t,void 0,arguments)},findLast(e,t){return wt(this,"findLast",e,t,Me,arguments)},findLastIndex(e,t){return wt(this,"findLastIndex",e,t,void 0,arguments)},forEach(e,t){return wt(this,"forEach",e,t,void 0,arguments)},includes(...e){return Pi(this,"includes",e)},indexOf(...e){return Pi(this,"indexOf",e)},join(e){return Ou(this).join(e)},lastIndexOf(...e){return Pi(this,"lastIndexOf",e)},map(e,t){return wt(this,"map",e,t,void 0,arguments)},pop(){return cn(this,"pop")},push(...e){return cn(this,"push",e)},reduce(e,...t){return Jo(this,"reduce",e,t)},reduceRight(e,...t){return Jo(this,"reduceRight",e,t)},shift(){return cn(this,"shift")},some(e,t){return wt(this,"some",e,t,void 0,arguments)},splice(...e){return cn(this,"splice",e)},toReversed(){return Ou(this).toReversed()},toSorted(e){return Ou(this).toSorted(e)},toSpliced(...e){return Ou(this).toSpliced(...e)},unshift(...e){return cn(this,"unshift",e)},values(){return Mi(this,"values",Me)}};function Mi(e,t,u){const n=ii(e),r=n[t]();return n!==e&&!ut(e)&&(r._next=r.next,r.next=()=>{const i=r._next();return i.value&&(i.value=u(i.value)),i}),r}const Sd=Array.prototype;function wt(e,t,u,n,r,i){const s=ii(e),o=s!==e&&!ut(e),c=s[t];if(c!==Sd[t]){const f=c.apply(e,i);return o?Me(f):f}let l=u;s!==e&&(o?l=function(f,p){return u.call(this,Me(f),p,e)}:u.length>2&&(l=function(f,p){return u.call(this,f,p,e)}));const a=c.call(s,l,n);return o&&r?r(a):a}function Jo(e,t,u,n){const r=ii(e);let i=u;return r!==e&&(ut(e)?u.length>3&&(i=function(s,o,c){return u.call(this,s,o,c,e)}):i=function(s,o,c){return u.call(this,s,Me(o),c,e)}),r[t](i,...n)}function Pi(e,t,u){const n=Q(e);Re(n,"iterate",Tn);const r=n[t](...u);return(r===-1||r===!1)&&li(u[0])?(u[0]=Q(u[0]),n[t](...u)):r}function cn(e,t,u=[]){$t(),$s();const n=Q(e)[t].apply(e,u);return Hs(),Ht(),n}const vd=Be("__proto__,__v_isRef,__isVue"),Nl=new Set(Object.getOwnPropertyNames(Symbol).filter(e=>e!=="arguments"&&e!=="caller").map(e=>Symbol[e]).filter(Ze));function Dd(e){Ze(e)||(e=String(e));const t=Q(this);return Re(t,"has",e),t.hasOwnProperty(e)}class Rl{constructor(t=!1,u=!1){this._isReadonly=t,this._isShallow=u}get(t,u,n){if(u==="__v_skip")return t.__v_skip;const r=this._isReadonly,i=this._isShallow;if(u==="__v_isReactive")return!r;if(u==="__v_isReadonly")return r;if(u==="__v_isShallow")return i;if(u==="__v_raw")return n===(r?i?$l:Vl:i?Bl:Ll).get(t)||Object.getPrototypeOf(t)===Object.getPrototypeOf(n)?t:void 0;const s=$(t);if(!r){let c;if(s&&(c=Ad[u]))return c;if(u==="hasOwnProperty")return Dd}const o=Reflect.get(t,u,Ce(t)?t:n);return(Ze(u)?Nl.has(u):vd(u))||(r||Re(t,"get",u),i)?o:Ce(o)?s&&Ls(u)?o:o.value:oe(o)?r?qs(o):oi(o):o}}class Ml extends Rl{constructor(t=!1){super(!1,t)}set(t,u,n,r){let i=t[u];if(!this._isShallow){const c=ou(i);if(!ut(n)&&!ou(n)&&(i=Q(i),n=Q(n)),!$(t)&&Ce(i)&&!Ce(n))return c?!1:(i.value=n,!0)}const s=$(t)&&Ls(u)?Number(u)<t.length:re(t,u),o=Reflect.set(t,u,n,Ce(t)?t:r);return t===Q(r)&&(s?He(n,i)&&Nt(t,"set",u,n):Nt(t,"add",u,n)),o}deleteProperty(t,u){const n=re(t,u);t[u];const r=Reflect.deleteProperty(t,u);return r&&n&&Nt(t,"delete",u,void 0),r}has(t,u){const n=Reflect.has(t,u);return(!Ze(u)||!Nl.has(u))&&Re(t,"has",u),n}ownKeys(t){return Re(t,"iterate",$(t)?"length":_u),Reflect.ownKeys(t)}}class Pl extends Rl{constructor(t=!1){super(!0,t)}set(t,u){return!0}deleteProperty(t,u){return!0}}const Td=new Ml,wd=new Pl,Fd=new Ml(!0),Id=new Pl(!0),ns=e=>e,rr=e=>Reflect.getPrototypeOf(e);function Od(e,t,u){return function(...n){const r=this.__v_raw,i=Q(r),s=Lu(i),o=e==="entries"||e===Symbol.iterator&&s,c=e==="keys"&&s,l=r[e](...n),a=u?ns:t?rs:Me;return!t&&Re(i,"iterate",c?us:_u),{next(){const{value:f,done:p}=l.next();return p?{value:f,done:p}:{value:o?[a(f[0]),a(f[1])]:a(f),done:p}},[Symbol.iterator](){return this}}}}function ir(e){return function(...t){return e==="delete"?!1:e==="clear"?void 0:this}}function Nd(e,t){const u={get(r){const i=this.__v_raw,s=Q(i),o=Q(r);e||(He(r,o)&&Re(s,"get",r),Re(s,"get",o));const{has:c}=rr(s),l=t?ns:e?rs:Me;if(c.call(s,r))return l(i.get(r));if(c.call(s,o))return l(i.get(o));i!==s&&i.get(r)},get size(){const r=this.__v_raw;return!e&&Re(Q(r),"iterate",_u),Reflect.get(r,"size",r)},has(r){const i=this.__v_raw,s=Q(i),o=Q(r);return e||(He(r,o)&&Re(s,"has",r),Re(s,"has",o)),r===o?i.has(r):i.has(r)||i.has(o)},forEach(r,i){const s=this,o=s.__v_raw,c=Q(o),l=t?ns:e?rs:Me;return!e&&Re(c,"iterate",_u),o.forEach((a,f)=>r.call(i,l(a),l(f),s))}};return te(u,e?{add:ir("add"),set:ir("set"),delete:ir("delete"),clear:ir("clear")}:{add(r){!t&&!ut(r)&&!ou(r)&&(r=Q(r));const i=Q(this);return rr(i).has.call(i,r)||(i.add(r),Nt(i,"add",r,r)),this},set(r,i){!t&&!ut(i)&&!ou(i)&&(i=Q(i));const s=Q(this),{has:o,get:c}=rr(s);let l=o.call(s,r);l||(r=Q(r),l=o.call(s,r));const a=c.call(s,r);return s.set(r,i),l?He(i,a)&&Nt(s,"set",r,i):Nt(s,"add",r,i),this},delete(r){const i=Q(this),{has:s,get:o}=rr(i);let c=s.call(i,r);c||(r=Q(r),c=s.call(i,r)),o&&o.call(i,r);const l=i.delete(r);return c&&Nt(i,"delete",r,void 0),l},clear(){const r=Q(this),i=r.size!==0,s=r.clear();return i&&Nt(r,"clear",void 0,void 0),s}}),["keys","values","entries",Symbol.iterator].forEach(r=>{u[r]=Od(r,e,t)}),u}function si(e,t){const u=Nd(e,t);return(n,r,i)=>r==="__v_isReactive"?!e:r==="__v_isReadonly"?e:r==="__v_raw"?n:Reflect.get(re(u,r)&&r in n?u:n,r,i)}const Rd={get:si(!1,!1)},Md={get:si(!1,!0)},Pd={get:si(!0,!1)},Ld={get:si(!0,!0)},Ll=new WeakMap,Bl=new WeakMap,Vl=new WeakMap,$l=new WeakMap;function Bd(e){switch(e){case"Object":case"Array":return 1;case"Map":case"Set":case"WeakMap":case"WeakSet":return 2;default:return 0}}function Vd(e){return e.__v_skip||!Object.isExtensible(e)?0:Bd(jf(e))}function oi(e){return ou(e)?e:ci(e,!1,Td,Rd,Ll)}function Hl(e){return ci(e,!1,Fd,Md,Bl)}function qs(e){return ci(e,!0,wd,Pd,Vl)}function $d(e){return ci(e,!0,Id,Ld,$l)}function ci(e,t,u,n,r){if(!oe(e)||e.__v_raw&&!(t&&e.__v_isReactive))return e;const i=r.get(e);if(i)return i;const s=Vd(e);if(s===0)return e;const o=new Proxy(e,s===2?n:u);return r.set(e,o),o}function uu(e){return ou(e)?uu(e.__v_raw):!!(e&&e.__v_isReactive)}function ou(e){return!!(e&&e.__v_isReadonly)}function ut(e){return!!(e&&e.__v_isShallow)}function li(e){return e?!!e.__v_raw:!1}function Q(e){const t=e&&e.__v_raw;return t?Q(t):e}function Ul(e){return!re(e,"__v_skip")&&Object.isExtensible(e)&&ju(e,"__v_skip",!0),e}const Me=e=>oe(e)?oi(e):e,rs=e=>oe(e)?qs(e):e;function Ce(e){return e?e.__v_isRef===!0:!1}function yn(e){return ql(e,!1)}function js(e){return ql(e,!0)}function ql(e,t){return Ce(e)?e:new Hd(e,t)}class Hd{constructor(t,u){this.dep=new ri,this.__v_isRef=!0,this.__v_isShallow=!1,this._rawValue=u?t:Q(t),this._value=u?t:Me(t),this.__v_isShallow=u}get value(){return this.dep.track(),this._value}set value(t){const u=this._rawValue,n=this.__v_isShallow||ut(t)||ou(t);t=n?t:Q(t),He(t,u)&&(this._rawValue=t,this._value=n?t:Me(t),this.dep.trigger())}}function Ud(e){e.dep&&e.dep.trigger()}function ai(e){return Ce(e)?e.value:e}function qd(e){return W(e)?e():ai(e)}const jd={get:(e,t,u)=>t==="__v_raw"?e:ai(Reflect.get(e,t,u)),set:(e,t,u,n)=>{const r=e[t];return Ce(r)&&!Ce(u)?(r.value=u,!0):Reflect.set(e,t,u,n)}};function zs(e){return uu(e)?e:new Proxy(e,jd)}class zd{constructor(t){this.__v_isRef=!0,this._value=void 0;const u=this.dep=new ri,{get:n,set:r}=t(u.track.bind(u),u.trigger.bind(u));this._get=n,this._set=r}get value(){return this._value=this._get()}set value(t){this._set(t)}}function jl(e){return new zd(e)}function Kd(e){const t=$(e)?new Array(e.length):{};for(const u in e)t[u]=zl(e,u);return t}class Wd{constructor(t,u,n){this._object=t,this._key=u,this._defaultValue=n,this.__v_isRef=!0,this._value=void 0}get value(){const t=this._object[this._key];return this._value=t===void 0?this._defaultValue:t}set value(t){this._object[this._key]=t}get dep(){return kd(Q(this._object),this._key)}}class Gd{constructor(t){this._getter=t,this.__v_isRef=!0,this.__v_isReadonly=!0,this._value=void 0}get value(){return this._value=this._getter()}}function Zd(e,t,u){return Ce(e)?e:W(e)?new Gd(e):oe(e)&&arguments.length>1?zl(e,t,u):yn(e)}function zl(e,t,u){const n=e[t];return Ce(n)?n:new Wd(e,t,u)}class Jd{constructor(t,u,n){this.fn=t,this.setter=u,this._value=void 0,this.dep=new ri(this),this.__v_isRef=!0,this.deps=void 0,this.depsTail=void 0,this.flags=16,this.globalVersion=Dn-1,this.next=void 0,this.effect=this,this.__v_isReadonly=!u,this.isSSR=n}notify(){if(this.flags|=16,!(this.flags&8)&&le!==this)return Dl(this,!0),!0}get value(){const t=this.dep.track();return Fl(this),t&&(t.version=this.dep.version),this._value}set value(t){this.setter&&this.setter(t)}}function Yd(e,t,u=!1){let n,r;return W(e)?n=e:(n=e.get,r=e.set),new Jd(n,r,u)}const Xd={GET:"get",HAS:"has",ITERATE:"iterate"},Qd={SET:"set",ADD:"add",DELETE:"delete",CLEAR:"clear"},sr={},Tr=new WeakMap;let Wt;function eh(){return Wt}function Kl(e,t=!1,u=Wt){if(u){let n=Tr.get(u);n||Tr.set(u,n=[]),n.push(e)}}function th(e,t,u=ee){const{immediate:n,deep:r,once:i,scheduler:s,augmentJob:o,call:c}=u,l=g=>r?g:ut(g)||r===!1||r===0?Rt(g,1):Rt(g);let a,f,p,d,h=!1,_=!1;if(Ce(e)?(f=()=>e.value,h=ut(e)):uu(e)?(f=()=>l(e),h=!0):$(e)?(_=!0,h=e.some(g=>uu(g)||ut(g)),f=()=>e.map(g=>{if(Ce(g))return g.value;if(uu(g))return l(g);if(W(g))return c?c(g,2):g()})):W(e)?t?f=c?()=>c(e,2):e:f=()=>{if(p){$t();try{p()}finally{Ht()}}const g=Wt;Wt=a;try{return c?c(e,3,[d]):e(d)}finally{Wt=g}}:f=we,t&&r){const g=f,x=r===!0?1/0:r;f=()=>Rt(g(),x)}const v=Sl(),A=()=>{a.stop(),v&&v.active&&Ms(v.effects,a)};if(i&&t){const g=t;t=(...x)=>{g(...x),A()}}let E=_?new Array(e.length).fill(sr):sr;const m=g=>{if(!(!(a.flags&1)||!a.dirty&&!g))if(t){const x=a.run();if(r||h||(_?x.some((k,I)=>He(k,E[I])):He(x,E))){p&&p();const k=Wt;Wt=a;try{const I=[x,E===sr?void 0:_&&E[0]===sr?[]:E,d];c?c(t,3,I):t(...I),E=x}finally{Wt=k}}}else a.run()};return o&&o(m),a=new vn(f),a.scheduler=s?()=>s(m,!1):m,d=g=>Kl(g,!1,a),p=a.onStop=()=>{const g=Tr.get(a);if(g){if(c)c(g,4);else for(const x of g)x();Tr.delete(a)}},t?n?m(!0):E=a.run():s?s(m.bind(null,!0),!0):a.run(),A.pause=a.pause.bind(a),A.resume=a.resume.bind(a),A.stop=A,A}function Rt(e,t=1/0,u){if(t<=0||!oe(e)||e.__v_skip||(u=u||new Set,u.has(e)))return e;if(u.add(e),t--,Ce(e))Rt(e.value,t,u);else if($(e))for(let n=0;n<e.length;n++)Rt(e[n],t,u);else if(Du(e)||Lu(e))e.forEach(n=>{Rt(n,t,u)});else if(ti(e)){for(const n in e)Rt(e[n],t,u);for(const n of Object.getOwnPropertySymbols(e))Object.prototype.propertyIsEnumerable.call(e,n)&&Rt(e[n],t,u)}return e}/**
+* @vue/runtime-core v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/const yu=[];function uh(e){yu.push(e)}function nh(){yu.pop()}let Li=!1;function zt(e,...t){if(Li)return;Li=!0,$t();const u=yu.length?yu[yu.length-1].component:null,n=u&&u.appContext.config.warnHandler,r=rh();if(n)wu(n,u,11,[e+t.map(i=>{var s,o;return(o=(s=i.toString)==null?void 0:s.call(i))!=null?o:JSON.stringify(i)}).join(""),u&&u.proxy,r.map(({vnode:i})=>`at <${e0(u,i.type)}>`).join(`
+`),r]);else{const i=[`[Vue warn]: ${e}`,...t];r.length&&i.push(`
+`,...ih(r)),console.warn(...i)}Ht(),Li=!1}function rh(){let e=yu[yu.length-1];if(!e)return[];const t=[];for(;e;){const u=t[0];u&&u.vnode===e?u.recurseCount++:t.push({vnode:e,recurseCount:0});const n=e.component&&e.component.parent;e=n&&n.vnode}return t}function ih(e){const t=[];return e.forEach((u,n)=>{t.push(...n===0?[]:[`
+`],...sh(u))}),t}function sh({vnode:e,recurseCount:t}){const u=t>0?`... (${t} recursive calls)`:"",n=e.component?e.component.parent==null:!1,r=` at <${e0(e.component,e.type,n)}`,i=">"+u;return e.props?[r,...oh(e.props),i]:[r+i]}function oh(e){const t=[],u=Object.keys(e);return u.slice(0,3).forEach(n=>{t.push(...Wl(n,e[n]))}),u.length>3&&t.push(" ..."),t}function Wl(e,t,u){return Z(t)?(t=JSON.stringify(t),u?t:[`${e}=${t}`]):typeof t=="number"||typeof t=="boolean"||t==null?u?t:[`${e}=${t}`]:Ce(t)?(t=Wl(e,Q(t.value),!0),u?t:[`${e}=Ref<`,t,">"]):W(t)?[`${e}=fn${t.name?`<${t.name}>`:""}`]:(t=Q(t),u?t:[`${e}=`,t])}function ch(e,t){}const lh={SETUP_FUNCTION:0,0:"SETUP_FUNCTION",RENDER_FUNCTION:1,1:"RENDER_FUNCTION",NATIVE_EVENT_HANDLER:5,5:"NATIVE_EVENT_HANDLER",COMPONENT_EVENT_HANDLER:6,6:"COMPONENT_EVENT_HANDLER",VNODE_HOOK:7,7:"VNODE_HOOK",DIRECTIVE_HOOK:8,8:"DIRECTIVE_HOOK",TRANSITION_HOOK:9,9:"TRANSITION_HOOK",APP_ERROR_HANDLER:10,10:"APP_ERROR_HANDLER",APP_WARN_HANDLER:11,11:"APP_WARN_HANDLER",FUNCTION_REF:12,12:"FUNCTION_REF",ASYNC_COMPONENT_LOADER:13,13:"ASYNC_COMPONENT_LOADER",SCHEDULER:14,14:"SCHEDULER",COMPONENT_UPDATE:15,15:"COMPONENT_UPDATE",APP_UNMOUNT_CLEANUP:16,16:"APP_UNMOUNT_CLEANUP"},ah={sp:"serverPrefetch hook",bc:"beforeCreate hook",c:"created hook",bm:"beforeMount hook",m:"mounted hook",bu:"beforeUpdate hook",u:"updated",bum:"beforeUnmount hook",um:"unmounted hook",a:"activated hook",da:"deactivated hook",ec:"errorCaptured hook",rtc:"renderTracked hook",rtg:"renderTriggered hook",0:"setup function",1:"render function",2:"watcher getter",3:"watcher callback",4:"watcher cleanup function",5:"native event handler",6:"component event handler",7:"vnode hook",8:"directive hook",9:"transition hook",10:"app errorHandler",11:"app warnHandler",12:"ref function",13:"async component loader",14:"scheduler flush",15:"component update",16:"app unmount cleanup function"};function wu(e,t,u,n){try{return n?e(...n):e()}catch(r){Fu(r,t,u)}}function lt(e,t,u,n){if(W(e)){const r=wu(e,t,u,n);return r&&Ps(r)&&r.catch(i=>{Fu(i,t,u)}),r}if($(e)){const r=[];for(let i=0;i<e.length;i++)r.push(lt(e[i],t,u,n));return r}}function Fu(e,t,u,n=!0){const r=t?t.vnode:null,{errorHandler:i,throwUnhandledErrorInProduction:s}=t&&t.appContext.config||ee;if(t){let o=t.parent;const c=t.proxy,l=`https://vuejs.org/error-reference/#runtime-${u}`;for(;o;){const a=o.ec;if(a){for(let f=0;f<a.length;f++)if(a[f](e,c,l)===!1)return}o=o.parent}if(i){$t(),wu(i,null,10,[e,c,l]),Ht();return}}fh(e,u,r,n,s)}function fh(e,t,u,n=!0,r=!1){if(r)throw e;console.error(e)}const Ue=[];let Ct=-1;const $u=[];let Gt=null,Mu=0;const Gl=Promise.resolve();let wr=null;function fi(e){const t=wr||Gl;return e?t.then(this?e.bind(this):e):t}function dh(e){let t=Ct+1,u=Ue.length;for(;t<u;){const n=t+u>>>1,r=Ue[n],i=Fn(r);i<e||i===e&&r.flags&2?t=n+1:u=n}return t}function Ks(e){if(!(e.flags&1)){const t=Fn(e),u=Ue[Ue.length-1];!u||!(e.flags&2)&&t>=Fn(u)?Ue.push(e):Ue.splice(dh(t),0,e),e.flags|=1,Zl()}}function Zl(){wr||(wr=Gl.then(Jl))}function wn(e){$(e)?$u.push(...e):Gt&&e.id===-1?Gt.splice(Mu+1,0,e):e.flags&1||($u.push(e),e.flags|=1),Zl()}function Yo(e,t,u=Ct+1){for(;u<Ue.length;u++){const n=Ue[u];if(n&&n.flags&2){if(e&&n.id!==e.uid)continue;Ue.splice(u,1),u--,n.flags&4&&(n.flags&=-2),n(),n.flags&4||(n.flags&=-2)}}}function Fr(e){if($u.length){const t=[...new Set($u)].sort((u,n)=>Fn(u)-Fn(n));if($u.length=0,Gt){Gt.push(...t);return}for(Gt=t,Mu=0;Mu<Gt.length;Mu++){const u=Gt[Mu];u.flags&4&&(u.flags&=-2),u.flags&8||u(),u.flags&=-2}Gt=null,Mu=0}}const Fn=e=>e.id==null?e.flags&2?-1:1/0:e.id;function Jl(e){try{for(Ct=0;Ct<Ue.length;Ct++){const t=Ue[Ct];t&&!(t.flags&8)&&(t.flags&4&&(t.flags&=-2),wu(t,t.i,t.i?15:14),t.flags&4||(t.flags&=-2))}}finally{for(;Ct<Ue.length;Ct++){const t=Ue[Ct];t&&(t.flags&=-2)}Ct=-1,Ue.length=0,Fr(),wr=null,(Ue.length||$u.length)&&Jl()}}let pt,hn=[],is=!1;function di(e,...t){pt?pt.emit(e,...t):is||hn.push({event:e,args:t})}function Ws(e,t){var u,n;pt=e,pt?(pt.enabled=!0,hn.forEach(({event:r,args:i})=>pt.emit(r,...i)),hn=[]):typeof window<"u"&&window.HTMLElement&&!((n=(u=window.navigator)==null?void 0:u.userAgent)!=null&&n.includes("jsdom"))?((t.__VUE_DEVTOOLS_HOOK_REPLAY__=t.__VUE_DEVTOOLS_HOOK_REPLAY__||[]).push(i=>{Ws(i,t)}),setTimeout(()=>{pt||(t.__VUE_DEVTOOLS_HOOK_REPLAY__=null,is=!0,hn=[])},3e3)):(is=!0,hn=[])}function hh(e,t){di("app:init",e,t,{Fragment:Ee,Text:Pt,Comment:ye,Static:ru})}function ph(e){di("app:unmount",e)}const ss=Gs("component:added"),Yl=Gs("component:updated"),mh=Gs("component:removed"),bh=e=>{pt&&typeof pt.cleanupBuffer=="function"&&!pt.cleanupBuffer(e)&&mh(e)};/*! #__NO_SIDE_EFFECTS__ */function Gs(e){return t=>{di(e,t.appContext.app,t.uid,t.parent?t.parent.uid:void 0,t)}}function gh(e,t,u){di("component:emit",e.appContext.app,e,t,u)}let ve=null,hi=null;function In(e){const t=ve;return ve=e,hi=e&&e.type.__scopeId||null,t}function _h(e){hi=e}function yh(){hi=null}const xh=e=>Zs;function Zs(e,t=ve,u){if(!t||e._n)return e;const n=(...r)=>{n._d&&ps(-1);const i=In(t);let s;try{s=e(...r)}finally{In(i),n._d&&ps(1)}return __VUE_PROD_DEVTOOLS__&&Yl(t),s};return n._n=!0,n._c=!0,n._d=!0,n}function Eh(e,t){if(ve===null)return e;const u=Jn(ve),n=e.dirs||(e.dirs=[]);for(let r=0;r<t.length;r++){let[i,s,o,c=ee]=t[r];i&&(W(i)&&(i={mounted:i,updated:i}),i.deep&&Rt(s),n.push({dir:i,instance:u,value:s,oldValue:void 0,arg:o,modifiers:c}))}return e}function kt(e,t,u,n){const r=e.dirs,i=t&&t.dirs;for(let s=0;s<r.length;s++){const o=r[s];i&&(o.oldValue=i[s].value);let c=o.dir[n];c&&($t(),lt(c,u,8,[e.el,o,e,t]),Ht())}}const Xl=Symbol("_vte"),Ql=e=>e.__isTeleport,xn=e=>e&&(e.disabled||e.disabled===""),Xo=e=>e&&(e.defer||e.defer===""),Qo=e=>typeof SVGElement<"u"&&e instanceof SVGElement,ec=e=>typeof MathMLElement=="function"&&e instanceof MathMLElement,os=(e,t)=>{const u=e&&e.to;return Z(u)?t?t(u):null:u},ea={name:"Teleport",__isTeleport:!0,process(e,t,u,n,r,i,s,o,c,l){const{mc:a,pc:f,pbc:p,o:{insert:d,querySelector:h,createText:_,createComment:v}}=l,A=xn(t.props);let{shapeFlag:E,children:m,dynamicChildren:g}=t;if(e==null){const x=t.el=_(""),k=t.anchor=_("");d(x,u,n),d(k,u,n);const I=(C,S)=>{E&16&&(r&&r.isCE&&(r.ce._teleportTarget=C),a(m,C,S,r,i,s,o,c))},w=()=>{const C=t.target=os(t.props,h),S=ta(C,t,_,d);C&&(s!=="svg"&&Qo(C)?s="svg":s!=="mathml"&&ec(C)&&(s="mathml"),A||(I(C,S),gr(t,!1)))};A&&(I(u,k),gr(t,!0)),Xo(t.props)?ke(()=>{w(),t.el.__isMounted=!0},i):w()}else{if(Xo(t.props)&&!e.el.__isMounted){ke(()=>{ea.process(e,t,u,n,r,i,s,o,c,l),delete e.el.__isMounted},i);return}t.el=e.el,t.targetStart=e.targetStart;const x=t.anchor=e.anchor,k=t.target=e.target,I=t.targetAnchor=e.targetAnchor,w=xn(e.props),C=w?u:k,S=w?x:I;if(s==="svg"||Qo(k)?s="svg":(s==="mathml"||ec(k))&&(s="mathml"),g?(p(e.dynamicChildren,g,C,r,i,s,o),io(e,t,!0)):c||f(e,t,C,S,r,i,s,o,!1),A)w?t.props&&e.props&&t.props.to!==e.props.to&&(t.props.to=e.props.to):or(t,u,x,l,1);else if((t.props&&t.props.to)!==(e.props&&e.props.to)){const R=t.target=os(t.props,h);R&&or(t,R,null,l,0)}else w&&or(t,k,I,l,1);gr(t,A)}},remove(e,t,u,{um:n,o:{remove:r}},i){const{shapeFlag:s,children:o,anchor:c,targetStart:l,targetAnchor:a,target:f,props:p}=e;if(f&&(r(l),r(a)),i&&r(c),s&16){const d=i||!xn(p);for(let h=0;h<o.length;h++){const _=o[h];n(_,t,u,d,!!_.dynamicChildren)}}},move:or,hydrate:Ch};function or(e,t,u,{o:{insert:n},m:r},i=2){i===0&&n(e.targetAnchor,t,u);const{el:s,anchor:o,shapeFlag:c,children:l,props:a}=e,f=i===2;if(f&&n(s,t,u),(!f||xn(a))&&c&16)for(let p=0;p<l.length;p++)r(l[p],t,u,2);f&&n(o,t,u)}function Ch(e,t,u,n,r,i,{o:{nextSibling:s,parentNode:o,querySelector:c,insert:l,createText:a}},f){const p=t.target=os(t.props,c);if(p){const d=xn(t.props),h=p._lpa||p.firstChild;if(t.shapeFlag&16)if(d)t.anchor=f(s(e),t,o(e),u,n,r,i),t.targetStart=h,t.targetAnchor=h&&s(h);else{t.anchor=s(e);let _=h;for(;_;){if(_&&_.nodeType===8){if(_.data==="teleport start anchor")t.targetStart=_;else if(_.data==="teleport anchor"){t.targetAnchor=_,p._lpa=t.targetAnchor&&s(t.targetAnchor);break}}_=s(_)}t.targetAnchor||ta(p,t,a,l),f(h&&s(h),t,p,u,n,r,i)}gr(t,d)}return t.anchor&&s(t.anchor)}const kh=ea;function gr(e,t){const u=e.ctx;if(u&&u.ut){let n,r;for(t?(n=e.el,r=e.anchor):(n=e.targetStart,r=e.targetAnchor);n&&n!==r;)n.nodeType===1&&n.setAttribute("data-v-owner",u.uid),n=n.nextSibling;u.ut()}}function ta(e,t,u,n){const r=t.targetStart=u(""),i=t.targetAnchor=u("");return r[Xl]=i,e&&(n(r,e),n(i,e)),i}const Zt=Symbol("_leaveCb"),cr=Symbol("_enterCb");function Js(){const e={isMounted:!1,isLeaving:!1,isUnmounting:!1,leavingVNodes:new Map};return Gn(()=>{e.isMounted=!0}),gi(()=>{e.isUnmounting=!0}),e}const it=[Function,Array],Ys={mode:String,appear:Boolean,persisted:Boolean,onBeforeEnter:it,onEnter:it,onAfterEnter:it,onEnterCancelled:it,onBeforeLeave:it,onLeave:it,onAfterLeave:it,onLeaveCancelled:it,onBeforeAppear:it,onAppear:it,onAfterAppear:it,onAppearCancelled:it},ua=e=>{const t=e.subTree;return t.component?ua(t.component):t},Ah={name:"BaseTransition",props:Ys,setup(e,{slots:t}){const u=ft(),n=Js();return()=>{const r=t.default&&pi(t.default(),!0);if(!r||!r.length)return;const i=na(r),s=Q(e),{mode:o}=s;if(n.isLeaving)return Bi(i);const c=tc(i);if(!c)return Bi(i);let l=zu(c,s,n,u,f=>l=f);c.type!==ye&&Bt(c,l);let a=u.subTree&&tc(u.subTree);if(a&&a.type!==ye&&!mt(c,a)&&ua(u).type!==ye){let f=zu(a,s,n,u);if(Bt(a,f),o==="out-in"&&c.type!==ye)return n.isLeaving=!0,f.afterLeave=()=>{n.isLeaving=!1,u.job.flags&8||u.update(),delete f.afterLeave,a=void 0},Bi(i);o==="in-out"&&c.type!==ye?f.delayLeave=(p,d,h)=>{const _=ia(n,a);_[String(a.key)]=a,p[Zt]=()=>{d(),p[Zt]=void 0,delete l.delayedLeave,a=void 0},l.delayedLeave=()=>{h(),delete l.delayedLeave,a=void 0}}:a=void 0}else a&&(a=void 0);return i}}};function na(e){let t=e[0];if(e.length>1){for(const u of e)if(u.type!==ye){t=u;break}}return t}const ra=Ah;function ia(e,t){const{leavingVNodes:u}=e;let n=u.get(t.type);return n||(n=Object.create(null),u.set(t.type,n)),n}function zu(e,t,u,n,r){const{appear:i,mode:s,persisted:o=!1,onBeforeEnter:c,onEnter:l,onAfterEnter:a,onEnterCancelled:f,onBeforeLeave:p,onLeave:d,onAfterLeave:h,onLeaveCancelled:_,onBeforeAppear:v,onAppear:A,onAfterAppear:E,onAppearCancelled:m}=t,g=String(e.key),x=ia(u,e),k=(C,S)=>{C&&lt(C,n,9,S)},I=(C,S)=>{const R=S[1];k(C,S),$(C)?C.every(T=>T.length<=1)&&R():C.length<=1&&R()},w={mode:s,persisted:o,beforeEnter(C){let S=c;if(!u.isMounted)if(i)S=v||c;else return;C[Zt]&&C[Zt](!0);const R=x[g];R&&mt(e,R)&&R.el[Zt]&&R.el[Zt](),k(S,[C])},enter(C){let S=l,R=a,T=f;if(!u.isMounted)if(i)S=A||l,R=E||a,T=m||f;else return;let P=!1;const q=C[cr]=z=>{P||(P=!0,z?k(T,[C]):k(R,[C]),w.delayedLeave&&w.delayedLeave(),C[cr]=void 0)};S?I(S,[C,q]):q()},leave(C,S){const R=String(e.key);if(C[cr]&&C[cr](!0),u.isUnmounting)return S();k(p,[C]);let T=!1;const P=C[Zt]=q=>{T||(T=!0,S(),q?k(_,[C]):k(h,[C]),C[Zt]=void 0,x[R]===e&&delete x[R])};x[R]=e,d?I(d,[C,P]):P()},clone(C){const S=zu(C,t,u,n,r);return r&&r(S),S}};return w}function Bi(e){if(Wn(e))return e=vt(e),e.children=null,e}function tc(e){if(!Wn(e))return Ql(e.type)&&e.children?na(e.children):e;const{shapeFlag:t,children:u}=e;if(u){if(t&16)return u[0];if(t&32&&W(u.default))return u.default()}}function Bt(e,t){e.shapeFlag&6&&e.component?(e.transition=t,Bt(e.component.subTree,t)):e.shapeFlag&128?(e.ssContent.transition=t.clone(e.ssContent),e.ssFallback.transition=t.clone(e.ssFallback)):e.transition=t}function pi(e,t=!1,u){let n=[],r=0;for(let i=0;i<e.length;i++){let s=e[i];const o=u==null?s.key:String(u)+String(s.key!=null?s.key:i);s.type===Ee?(s.patchFlag&128&&r++,n=n.concat(pi(s.children,t,o))):(t||s.type!==ye)&&n.push(o!=null?vt(s,{key:o}):s)}if(r>1)for(let i=0;i<n.length;i++)n[i].patchFlag=-2;return n}/*! #__NO_SIDE_EFFECTS__ */function Kn(e,t){return W(e)?te({name:e.name},t,{setup:e}):e}function Sh(){const e=ft();return e?(e.appContext.config.idPrefix||"v")+"-"+e.ids[0]+e.ids[1]++:""}function Xs(e){e.ids=[e.ids[0]+e.ids[2]+++"-",0,0]}function vh(e){const t=ft(),u=js(null);if(t){const r=t.refs===ee?t.refs={}:t.refs;Object.defineProperty(r,e,{enumerable:!0,get:()=>u.value,set:i=>u.value=i})}return u}function On(e,t,u,n,r=!1){if($(e)){e.forEach((h,_)=>On(h,t&&($(t)?t[_]:t),u,n,r));return}if(nu(n)&&!r){n.shapeFlag&512&&n.type.__asyncResolved&&n.component.subTree.component&&On(e,t,u,n.component.subTree);return}const i=n.shapeFlag&4?Jn(n.component):n.el,s=r?null:i,{i:o,r:c}=e,l=t&&t.r,a=o.refs===ee?o.refs={}:o.refs,f=o.setupState,p=Q(f),d=f===ee?()=>!1:h=>re(p,h);if(l!=null&&l!==c&&(Z(l)?(a[l]=null,d(l)&&(f[l]=null)):Ce(l)&&(l.value=null)),W(c))wu(c,o,12,[s,a]);else{const h=Z(c),_=Ce(c);if(h||_){const v=()=>{if(e.f){const A=h?d(c)?f[c]:a[c]:c.value;r?$(A)&&Ms(A,i):$(A)?A.includes(i)||A.push(i):h?(a[c]=[i],d(c)&&(f[c]=a[c])):(c.value=[i],e.k&&(a[e.k]=c.value))}else h?(a[c]=s,d(c)&&(f[c]=s)):_&&(c.value=s,e.k&&(a[e.k]=s))};s?(v.id=-1,ke(v,u)):v()}}}let uc=!1;const au=()=>{uc||(console.error("Hydration completed but contains mismatches."),uc=!0)},Dh=e=>e.namespaceURI.includes("svg")&&e.tagName!=="foreignObject",Th=e=>e.namespaceURI.includes("MathML"),lr=e=>{if(e.nodeType===1){if(Dh(e))return"svg";if(Th(e))return"mathml"}},pu=e=>e.nodeType===8;function wh(e){const{mt:t,p:u,o:{patchProp:n,createText:r,nextSibling:i,parentNode:s,remove:o,insert:c,createComment:l}}=e,a=(m,g)=>{if(!g.hasChildNodes()){__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Attempting to hydrate existing markup but container is empty. Performing full mount instead."),u(null,m,g),Fr(),g._vnode=m;return}f(g.firstChild,m,null,null,null),Fr(),g._vnode=m},f=(m,g,x,k,I,w=!1)=>{w=w||!!g.dynamicChildren;const C=pu(m)&&m.data==="[",S=()=>_(m,g,x,k,I,C),{type:R,ref:T,shapeFlag:P,patchFlag:q}=g;let z=m.nodeType;g.el=m,__VUE_PROD_DEVTOOLS__&&(ju(m,"__vnode",g,!0),ju(m,"__vueParentComponent",x,!0)),q===-2&&(w=!1,g.dynamicChildren=null);let H=null;switch(R){case Pt:z!==3?g.children===""?(c(g.el=r(""),s(m),m),H=m):H=S():(m.data!==g.children&&(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Hydration text mismatch in",m.parentNode,`
+  - rendered on server: ${JSON.stringify(m.data)}
+  - expected on client: ${JSON.stringify(g.children)}`),au(),m.data=g.children),H=i(m));break;case ye:E(m)?(H=i(m),A(g.el=m.content.firstChild,m,x)):z!==8||C?H=S():H=i(m);break;case ru:if(C&&(m=i(m),z=m.nodeType),z===1||z===3){H=m;const J=!g.children.length;for(let j=0;j<g.staticCount;j++)J&&(g.children+=H.nodeType===1?H.outerHTML:H.data),j===g.staticCount-1&&(g.anchor=H),H=i(H);return C?i(H):H}else S();break;case Ee:C?H=h(m,g,x,k,I,w):H=S();break;default:if(P&1)(z!==1||g.type.toLowerCase()!==m.tagName.toLowerCase())&&!E(m)?H=S():H=p(m,g,x,k,I,w);else if(P&6){g.slotScopeIds=I;const J=s(m);if(C?H=v(m):pu(m)&&m.data==="teleport start"?H=v(m,m.data,"teleport end"):H=i(m),t(g,J,null,x,k,lr(J),w),nu(g)&&!g.type.__asyncResolved){let j;C?(j=fe(Ee),j.anchor=H?H.previousSibling:J.lastChild):j=m.nodeType===3?oo(""):fe("div"),j.el=m,g.component.subTree=j}}else P&64?z!==8?H=S():H=g.type.hydrate(m,g,x,k,I,w,e,d):P&128?H=g.type.hydrate(m,g,x,k,lr(s(m)),I,w,e,f):__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Invalid HostVNode type:",R,`(${typeof R})`)}return T!=null&&On(T,null,k,g),H},p=(m,g,x,k,I,w)=>{w=w||!!g.dynamicChildren;const{type:C,props:S,patchFlag:R,shapeFlag:T,dirs:P,transition:q}=g,z=C==="input"||C==="option";if(z||R!==-1){P&&kt(g,null,x,"created");let H=!1;if(E(m)){H=Na(null,q)&&x&&x.vnode.props&&x.vnode.props.appear;const j=m.content.firstChild;H&&q.beforeEnter(j),A(j,m,x),g.el=m=j}if(T&16&&!(S&&(S.innerHTML||S.textContent))){let j=d(m.firstChild,g,m,x,k,I,w),be=!1;for(;j;){pn(m,1)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&!be&&(zt("Hydration children mismatch on",m,`
+Server rendered element contains more child nodes than client vdom.`),be=!0),au());const je=j;j=j.nextSibling,o(je)}}else if(T&8){let j=g.children;j[0]===`
+`&&(m.tagName==="PRE"||m.tagName==="TEXTAREA")&&(j=j.slice(1)),m.textContent!==j&&(pn(m,0)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Hydration text content mismatch on",m,`
+  - rendered on server: ${m.textContent}
+  - expected on client: ${g.children}`),au()),m.textContent=g.children)}if(S){if(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__||z||!w||R&48){const j=m.tagName.includes("-");for(const be in S)__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&!(P&&P.some(je=>je.dir.created))&&Fh(m,be,S[be],g,x)&&au(),(z&&(be.endsWith("value")||be==="indeterminate")||vu(be)&&!tu(be)||be[0]==="."||j)&&n(m,be,null,S[be],void 0,x)}else if(S.onClick)n(m,"onClick",null,S.onClick,void 0,x);else if(R&4&&uu(S.style))for(const j in S.style)S.style[j]}let J;(J=S&&S.onVnodeBeforeMount)&&Ke(J,x,g),P&&kt(g,null,x,"beforeMount"),((J=S&&S.onVnodeMounted)||P||H)&&Ua(()=>{J&&Ke(J,x,g),H&&q.enter(m),P&&kt(g,null,x,"mounted")},k)}return m.nextSibling},d=(m,g,x,k,I,w,C)=>{C=C||!!g.dynamicChildren;const S=g.children,R=S.length;let T=!1;for(let P=0;P<R;P++){const q=C?S[P]:S[P]=We(S[P]),z=q.type===Pt;m?(z&&!C&&P+1<R&&We(S[P+1]).type===Pt&&(c(r(m.data.slice(q.children.length)),x,i(m)),m.data=q.children),m=f(m,q,k,I,w,C)):z&&!q.children?c(q.el=r(""),x):(pn(x,1)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&!T&&(zt("Hydration children mismatch on",x,`
+Server rendered element contains fewer child nodes than client vdom.`),T=!0),au()),u(null,q,x,null,k,I,lr(x),w))}return m},h=(m,g,x,k,I,w)=>{const{slotScopeIds:C}=g;C&&(I=I?I.concat(C):C);const S=s(m),R=d(i(m),g,S,x,k,I,w);return R&&pu(R)&&R.data==="]"?i(g.anchor=R):(au(),c(g.anchor=l("]"),S,R),R)},_=(m,g,x,k,I,w)=>{if(pn(m.parentElement,1)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt(`Hydration node mismatch:
+- rendered on server:`,m,m.nodeType===3?"(text)":pu(m)&&m.data==="["?"(start of fragment)":"",`
+- expected on client:`,g.type),au()),g.el=null,w){const R=v(m);for(;;){const T=i(m);if(T&&T!==R)o(T);else break}}const C=i(m),S=s(m);return o(m),u(null,g,S,C,x,k,lr(S),I),x&&(x.vnode.el=g.el,xi(x,g.el)),C},v=(m,g="[",x="]")=>{let k=0;for(;m;)if(m=i(m),m&&pu(m)&&(m.data===g&&k++,m.data===x)){if(k===0)return i(m);k--}return m},A=(m,g,x)=>{const k=g.parentNode;k&&k.replaceChild(m,g);let I=x;for(;I;)I.vnode.el===g&&(I.vnode.el=I.subTree.el=m),I=I.parent},E=m=>m.nodeType===1&&m.tagName==="TEMPLATE";return[a,f]}function Fh(e,t,u,n,r){let i,s,o,c;if(t==="class")o=e.getAttribute("class"),c=nn(u),Ih(nc(o||""),nc(c))||(i=2,s="class");else if(t==="style"){o=e.getAttribute("style")||"",c=Z(u)?u:ed(un(u));const l=rc(o),a=rc(c);if(n.dirs)for(const{dir:f,value:p}of n.dirs)f.name==="show"&&!p&&a.set("display","none");r&&sa(r,n,a),Oh(l,a)||(i=3,s="style")}else(e instanceof SVGElement&&dd(t)||e instanceof HTMLElement&&(Go(t)||fd(t)))&&(Go(t)?(o=e.hasAttribute(t),c=Bs(u)):u==null?(o=e.hasAttribute(t),c=!1):(e.hasAttribute(t)?o=e.getAttribute(t):t==="value"&&e.tagName==="TEXTAREA"?o=e.value:o=!1,c=hd(u)?String(u):!1),o!==c&&(i=4,s=t));if(i!=null&&!pn(e,i)){const l=p=>p===!1?"(not rendered)":`${s}="${p}"`,a=`Hydration ${oa[i]} mismatch on`,f=`
+  - rendered on server: ${l(o)}
+  - expected on client: ${l(c)}
+  Note: this mismatch is check-only. The DOM will not be rectified in production due to performance overhead.
+  You should fix the source of the mismatch.`;return zt(a,e,f),!0}return!1}function nc(e){return new Set(e.trim().split(/\s+/))}function Ih(e,t){if(e.size!==t.size)return!1;for(const u of e)if(!t.has(u))return!1;return!0}function rc(e){const t=new Map;for(const u of e.split(";")){let[n,r]=u.split(":");n=n.trim(),r=r&&r.trim(),n&&r&&t.set(n,r)}return t}function Oh(e,t){if(e.size!==t.size)return!1;for(const[u,n]of e)if(n!==t.get(u))return!1;return!0}function sa(e,t,u){const n=e.subTree;if(e.getCssVars&&(t===n||n&&n.type===Ee&&n.children.includes(t))){const r=e.getCssVars();for(const i in r)u.set(`--${md(i)}`,String(r[i]))}t===n&&e.parent&&sa(e.parent,e.vnode,u)}const ic="data-allow-mismatch",oa={0:"text",1:"children",2:"class",3:"style",4:"attribute"};function pn(e,t){if(t===0||t===1)for(;e&&!e.hasAttribute(ic);)e=e.parentElement;const u=e&&e.getAttribute(ic);if(u==null)return!1;if(u==="")return!0;{const n=u.split(",");return t===0&&n.includes("children")?!0:u.split(",").includes(oa[t])}}const Nh=gu().requestIdleCallback||(e=>setTimeout(e,1)),Rh=gu().cancelIdleCallback||(e=>clearTimeout(e)),Mh=(e=1e4)=>t=>{const u=Nh(t,{timeout:e});return()=>Rh(u)};function Ph(e){const{top:t,left:u,bottom:n,right:r}=e.getBoundingClientRect(),{innerHeight:i,innerWidth:s}=window;return(t>0&&t<i||n>0&&n<i)&&(u>0&&u<s||r>0&&r<s)}const Lh=e=>(t,u)=>{const n=new IntersectionObserver(r=>{for(const i of r)if(i.isIntersecting){n.disconnect(),t();break}},e);return u(r=>{if(r instanceof Element){if(Ph(r))return t(),n.disconnect(),!1;n.observe(r)}}),()=>n.disconnect()},Bh=e=>t=>{if(e){const u=matchMedia(e);if(u.matches)t();else return u.addEventListener("change",t,{once:!0}),()=>u.removeEventListener("change",t)}},Vh=(e=[])=>(t,u)=>{Z(e)&&(e=[e]);let n=!1;const r=s=>{n||(n=!0,i(),t(),s.target.dispatchEvent(new s.constructor(s.type,s)))},i=()=>{u(s=>{for(const o of e)s.removeEventListener(o,r)})};return u(s=>{for(const o of e)s.addEventListener(o,r,{once:!0})}),i};function $h(e,t){if(pu(e)&&e.data==="["){let u=1,n=e.nextSibling;for(;n;){if(n.nodeType===1){if(t(n)===!1)break}else if(pu(n))if(n.data==="]"){if(--u===0)break}else n.data==="["&&u++;n=n.nextSibling}}else t(e)}const nu=e=>!!e.type.__asyncLoader;/*! #__NO_SIDE_EFFECTS__ */function Hh(e){W(e)&&(e={loader:e});const{loader:t,loadingComponent:u,errorComponent:n,delay:r=200,hydrate:i,timeout:s,suspensible:o=!0,onError:c}=e;let l=null,a,f=0;const p=()=>(f++,l=null,d()),d=()=>{let h;return l||(h=l=t().catch(_=>{if(_=_ instanceof Error?_:new Error(String(_)),c)return new Promise((v,A)=>{c(_,()=>v(p()),()=>A(_),f+1)});throw _}).then(_=>h!==l&&l?l:(_&&(_.__esModule||_[Symbol.toStringTag]==="Module")&&(_=_.default),a=_,_)))};return Kn({name:"AsyncComponentWrapper",__asyncLoader:d,__asyncHydrate(h,_,v){const A=i?()=>{const E=i(v,m=>$h(h,m));E&&(_.bum||(_.bum=[])).push(E)}:v;a?A():d().then(()=>!_.isUnmounted&&A())},get __asyncResolved(){return a},setup(){const h=Se;if(Xs(h),a)return()=>Vi(a,h);const _=m=>{l=null,Fu(m,h,13,!n)};if(o&&h.suspense||Ku)return d().then(m=>()=>Vi(m,h)).catch(m=>(_(m),()=>n?fe(n,{error:m}):null));const v=yn(!1),A=yn(),E=yn(!!r);return r&&setTimeout(()=>{E.value=!1},r),s!=null&&setTimeout(()=>{if(!v.value&&!A.value){const m=new Error(`Async component timed out after ${s}ms.`);_(m),A.value=m}},s),d().then(()=>{v.value=!0,h.parent&&Wn(h.parent.vnode)&&h.parent.update()}).catch(m=>{_(m),A.value=m}),()=>{if(v.value&&a)return Vi(a,h);if(A.value&&n)return fe(n,{error:A.value});if(u&&!E.value)return fe(u)}}})}function Vi(e,t){const{ref:u,props:n,children:r,ce:i}=t.vnode,s=fe(e,n,r);return s.ref=u,s.ce=i,delete t.vnode.ce,s}const Wn=e=>e.type.__isKeepAlive,Uh={name:"KeepAlive",__isKeepAlive:!0,props:{include:[String,RegExp,Array],exclude:[String,RegExp,Array],max:[String,Number]},setup(e,{slots:t}){const u=ft(),n=u.ctx;if(!n.renderer)return()=>{const E=t.default&&t.default();return E&&E.length===1?E[0]:E};const r=new Map,i=new Set;let s=null;__VUE_PROD_DEVTOOLS__&&(u.__v_cache=r);const o=u.suspense,{renderer:{p:c,m:l,um:a,o:{createElement:f}}}=n,p=f("div");n.activate=(E,m,g,x,k)=>{const I=E.component;l(E,m,g,0,o),c(I.vnode,E,m,g,I,o,x,E.slotScopeIds,k),ke(()=>{I.isDeactivated=!1,I.a&&Vu(I.a);const w=E.props&&E.props.onVnodeMounted;w&&Ke(w,I.parent,E)},o),__VUE_PROD_DEVTOOLS__&&ss(I)},n.deactivate=E=>{const m=E.component;Or(m.m),Or(m.a),l(E,p,null,1,o),ke(()=>{m.da&&Vu(m.da);const g=E.props&&E.props.onVnodeUnmounted;g&&Ke(g,m.parent,E),m.isDeactivated=!0},o),__VUE_PROD_DEVTOOLS__&&ss(m)};function d(E){$i(E),a(E,u,o,!0)}function h(E){r.forEach((m,g)=>{const x=Lr(m.type);x&&!E(x)&&_(g)})}function _(E){const m=r.get(E);m&&(!s||!mt(m,s))?d(m):s&&$i(s),r.delete(E),i.delete(E)}Hu(()=>[e.include,e.exclude],([E,m])=>{E&&h(g=>mn(E,g)),m&&h(g=>!mn(m,g))},{flush:"post",deep:!0});let v=null;const A=()=>{v!=null&&(Nr(u.subTree.type)?ke(()=>{r.set(v,ar(u.subTree))},u.subTree.suspense):r.set(v,ar(u.subTree)))};return Gn(A),bi(A),gi(()=>{r.forEach(E=>{const{subTree:m,suspense:g}=u,x=ar(m);if(E.type===x.type&&E.key===x.key){$i(x);const k=x.component.da;k&&ke(k,g);return}d(E)})}),()=>{if(v=null,!t.default)return s=null;const E=t.default(),m=E[0];if(E.length>1)return s=null,E;if(!Vt(m)||!(m.shapeFlag&4)&&!(m.shapeFlag&128))return s=null,m;let g=ar(m);if(g.type===ye)return s=null,g;const x=g.type,k=Lr(nu(g)?g.type.__asyncResolved||{}:x),{include:I,exclude:w,max:C}=e;if(I&&(!k||!mn(I,k))||w&&k&&mn(w,k))return g.shapeFlag&=-257,s=g,m;const S=g.key==null?x:g.key,R=r.get(S);return g.el&&(g=vt(g),m.shapeFlag&128&&(m.ssContent=g)),v=S,R?(g.el=R.el,g.component=R.component,g.transition&&Bt(g,g.transition),g.shapeFlag|=512,i.delete(S),i.add(S)):(i.add(S),C&&i.size>parseInt(C,10)&&_(i.values().next().value)),g.shapeFlag|=256,s=g,Nr(m.type)?m:g}}},qh=Uh;function mn(e,t){return $(e)?e.some(u=>mn(u,t)):Z(e)?e.split(",").includes(t):qf(e)?(e.lastIndex=0,e.test(t)):!1}function ca(e,t){aa(e,"a",t)}function la(e,t){aa(e,"da",t)}function aa(e,t,u=Se){const n=e.__wdc||(e.__wdc=()=>{let r=u;for(;r;){if(r.isDeactivated)return;r=r.parent}return e()});if(mi(t,n,u),u){let r=u.parent;for(;r&&r.parent;)Wn(r.parent.vnode)&&jh(n,t,u,r),r=r.parent}}function jh(e,t,u,n){const r=mi(t,e,n,!0);_i(()=>{Ms(n[t],r)},u)}function $i(e){e.shapeFlag&=-257,e.shapeFlag&=-513}function ar(e){return e.shapeFlag&128?e.ssContent:e}function mi(e,t,u=Se,n=!1){if(u){const r=u[e]||(u[e]=[]),i=t.__weh||(t.__weh=(...s)=>{$t();const o=Au(u),c=lt(t,u,e,s);return o(),Ht(),c});return n?r.unshift(i):r.push(i),i}}const Ut=e=>(t,u=Se)=>{(!Ku||e==="sp")&&mi(e,(...n)=>t(...n),u)},fa=Ut("bm"),Gn=Ut("m"),Qs=Ut("bu"),bi=Ut("u"),gi=Ut("bum"),_i=Ut("um"),da=Ut("sp"),ha=Ut("rtg"),pa=Ut("rtc");function ma(e,t=Se){mi("ec",e,t)}const eo="components",zh="directives";function Kh(e,t){return to(eo,e,!0,t)||e}const ba=Symbol.for("v-ndc");function Wh(e){return Z(e)?to(eo,e,!1)||e:e||ba}function Gh(e){return to(zh,e)}function to(e,t,u=!0,n=!1){const r=ve||Se;if(r){const i=r.type;if(e===eo){const o=Lr(i,!1);if(o&&(o===t||o===de(t)||o===Tu(de(t))))return i}const s=sc(r[e]||i[e],t)||sc(r.appContext[e],t);return!s&&n?i:s}}function sc(e,t){return e&&(e[t]||e[de(t)]||e[Tu(de(t))])}function Zh(e,t,u,n){let r;const i=u&&u[n],s=$(e);if(s||Z(e)){const o=s&&uu(e);let c=!1;o&&(c=!ut(e),e=ii(e)),r=new Array(e.length);for(let l=0,a=e.length;l<a;l++)r[l]=t(c?Me(e[l]):e[l],l,void 0,i&&i[l])}else if(typeof e=="number"){r=new Array(e);for(let o=0;o<e;o++)r[o]=t(o+1,o,void 0,i&&i[o])}else if(oe(e))if(e[Symbol.iterator])r=Array.from(e,(o,c)=>t(o,c,void 0,i&&i[c]));else{const o=Object.keys(e);r=new Array(o.length);for(let c=0,l=o.length;c<l;c++){const a=o[c];r[c]=t(e[a],a,c,i&&i[c])}}else r=[];return u&&(u[n]=r),r}function Jh(e,t){for(let u=0;u<t.length;u++){const n=t[u];if($(n))for(let r=0;r<n.length;r++)e[n[r].name]=n[r].fn;else n&&(e[n.name]=n.key?(...r)=>{const i=n.fn(...r);return i&&(i.key=n.key),i}:n.fn)}return e}function Yh(e,t,u={},n,r){if(ve.ce||ve.parent&&nu(ve.parent)&&ve.parent.ce)return t!=="default"&&(u.name=t),Mn(),Rr(Ee,null,[fe("slot",u,n&&n())],64);let i=e[t];i&&i._c&&(i._d=!1),Mn();const s=i&&uo(i(u)),o=u.key||s&&s.key,c=Rr(Ee,{key:(o&&!Ze(o)?o:`_${t}`)+(!s&&n?"_fb":"")},s||(n?n():[]),s&&e._===1?64:-2);return!r&&c.scopeId&&(c.slotScopeIds=[c.scopeId+"-s"]),i&&i._c&&(i._d=!0),c}function uo(e){return e.some(t=>Vt(t)?!(t.type===ye||t.type===Ee&&!uo(t.children)):!0)?e:null}function Xh(e,t){const u={};for(const n in e)u[t&&/[A-Z]/.test(n)?`on:${n}`:Bu(n)]=e[n];return u}const cs=e=>e?Za(e)?Jn(e):cs(e.parent):null,En=te(Object.create(null),{$:e=>e,$el:e=>e.vnode.el,$data:e=>e.data,$props:e=>e.props,$attrs:e=>e.attrs,$slots:e=>e.slots,$refs:e=>e.refs,$parent:e=>cs(e.parent),$root:e=>cs(e.root),$host:e=>e.ce,$emit:e=>e.emit,$options:e=>__VUE_OPTIONS_API__?no(e):e.type,$forceUpdate:e=>e.f||(e.f=()=>{Ks(e.update)}),$nextTick:e=>e.n||(e.n=fi.bind(e.proxy)),$watch:e=>__VUE_OPTIONS_API__?I1.bind(e):we}),Hi=(e,t)=>e!==ee&&!e.__isScriptSetup&&re(e,t),ls={get({_:e},t){if(t==="__v_skip")return!0;const{ctx:u,setupState:n,data:r,props:i,accessCache:s,type:o,appContext:c}=e;let l;if(t[0]!=="$"){const d=s[t];if(d!==void 0)switch(d){case 1:return n[t];case 2:return r[t];case 4:return u[t];case 3:return i[t]}else{if(Hi(n,t))return s[t]=1,n[t];if(r!==ee&&re(r,t))return s[t]=2,r[t];if((l=e.propsOptions[0])&&re(l,t))return s[t]=3,i[t];if(u!==ee&&re(u,t))return s[t]=4,u[t];(!__VUE_OPTIONS_API__||as)&&(s[t]=0)}}const a=En[t];let f,p;if(a)return t==="$attrs"&&Re(e.attrs,"get",""),a(e);if((f=o.__cssModules)&&(f=f[t]))return f;if(u!==ee&&re(u,t))return s[t]=4,u[t];if(p=c.config.globalProperties,re(p,t))return p[t]},set({_:e},t,u){const{data:n,setupState:r,ctx:i}=e;return Hi(r,t)?(r[t]=u,!0):n!==ee&&re(n,t)?(n[t]=u,!0):re(e.props,t)||t[0]==="$"&&t.slice(1)in e?!1:(i[t]=u,!0)},has({_:{data:e,setupState:t,accessCache:u,ctx:n,appContext:r,propsOptions:i}},s){let o;return!!u[s]||e!==ee&&re(e,s)||Hi(t,s)||(o=i[0])&&re(o,s)||re(n,s)||re(En,s)||re(r.config.globalProperties,s)},defineProperty(e,t,u){return u.get!=null?e._.accessCache[t]=0:re(u,"value")&&this.set(e,t,u.value,null),Reflect.defineProperty(e,t,u)}},Qh=te({},ls,{get(e,t){if(t!==Symbol.unscopables)return ls.get(e,t,e)},has(e,t){return t[0]!=="_"&&!Jf(t)}});function e1(){return null}function t1(){return null}function u1(e){}function n1(e){}function r1(){return null}function i1(){}function s1(e,t){return null}function o1(){return ga().slots}function c1(){return ga().attrs}function ga(){const e=ft();return e.setupContext||(e.setupContext=Qa(e))}function Nn(e){return $(e)?e.reduce((t,u)=>(t[u]=null,t),{}):e}function l1(e,t){const u=Nn(e);for(const n in t){if(n.startsWith("__skip"))continue;let r=u[n];r?$(r)||W(r)?r=u[n]={type:r,default:t[n]}:r.default=t[n]:r===null&&(r=u[n]={default:t[n]}),r&&t[`__skip_${n}`]&&(r.skipFactory=!0)}return u}function a1(e,t){return!e||!t?e||t:$(e)&&$(t)?e.concat(t):te({},Nn(e),Nn(t))}function f1(e,t){const u={};for(const n in e)t.includes(n)||Object.defineProperty(u,n,{enumerable:!0,get:()=>e[n]});return u}function d1(e){const t=ft();let u=e();return bs(),Ps(u)&&(u=u.catch(n=>{throw Au(t),n})),[u,()=>Au(t)]}let as=!0;function h1(e){const t=no(e),u=e.proxy,n=e.ctx;as=!1,t.beforeCreate&&oc(t.beforeCreate,e,"bc");const{data:r,computed:i,methods:s,watch:o,provide:c,inject:l,created:a,beforeMount:f,mounted:p,beforeUpdate:d,updated:h,activated:_,deactivated:v,beforeDestroy:A,beforeUnmount:E,destroyed:m,unmounted:g,render:x,renderTracked:k,renderTriggered:I,errorCaptured:w,serverPrefetch:C,expose:S,inheritAttrs:R,components:T,directives:P,filters:q}=t;if(l&&p1(l,n,null),s)for(const J in s){const j=s[J];W(j)&&(n[J]=j.bind(u))}if(r){const J=r.call(u,u);oe(J)&&(e.data=oi(J))}if(as=!0,i)for(const J in i){const j=i[J],be=W(j)?j.bind(u,u):W(j.get)?j.get.bind(u,u):we,je=!W(j)&&W(j.set)?j.set.bind(u):we,dt=lo({get:be,set:je});Object.defineProperty(n,J,{enumerable:!0,configurable:!0,get:()=>dt.value,set:yt=>dt.value=yt})}if(o)for(const J in o)_a(o[J],n,u,J);if(c){const J=W(c)?c.call(u):c;Reflect.ownKeys(J).forEach(j=>{xa(j,J[j])})}a&&oc(a,e,"c");function H(J,j){$(j)?j.forEach(be=>J(be.bind(u))):j&&J(j.bind(u))}if(H(fa,f),H(Gn,p),H(Qs,d),H(bi,h),H(ca,_),H(la,v),H(ma,w),H(pa,k),H(ha,I),H(gi,E),H(_i,g),H(da,C),$(S))if(S.length){const J=e.exposed||(e.exposed={});S.forEach(j=>{Object.defineProperty(J,j,{get:()=>u[j],set:be=>u[j]=be})})}else e.exposed||(e.exposed={});x&&e.render===we&&(e.render=x),R!=null&&(e.inheritAttrs=R),T&&(e.components=T),P&&(e.directives=P),C&&Xs(e)}function p1(e,t,u=we){$(e)&&(e=fs(e));for(const n in e){const r=e[n];let i;oe(r)?"default"in r?i=Cn(r.from||n,r.default,!0):i=Cn(r.from||n):i=Cn(r),Ce(i)?Object.defineProperty(t,n,{enumerable:!0,configurable:!0,get:()=>i.value,set:s=>i.value=s}):t[n]=i}}function oc(e,t,u){lt($(e)?e.map(n=>n.bind(t.proxy)):e.bind(t.proxy),t,u)}function _a(e,t,u,n){let r=n.includes(".")?Ba(u,n):()=>u[n];if(Z(e)){const i=t[e];W(i)&&Hu(r,i)}else if(W(e))Hu(r,e.bind(u));else if(oe(e))if($(e))e.forEach(i=>_a(i,t,u,n));else{const i=W(e.handler)?e.handler.bind(u):t[e.handler];W(i)&&Hu(r,i,e)}}function no(e){const t=e.type,{mixins:u,extends:n}=t,{mixins:r,optionsCache:i,config:{optionMergeStrategies:s}}=e.appContext,o=i.get(t);let c;return o?c=o:!r.length&&!u&&!n?c=t:(c={},r.length&&r.forEach(l=>Ir(c,l,s,!0)),Ir(c,t,s)),oe(t)&&i.set(t,c),c}function Ir(e,t,u,n=!1){const{mixins:r,extends:i}=t;i&&Ir(e,i,u,!0),r&&r.forEach(s=>Ir(e,s,u,!0));for(const s in t)if(!(n&&s==="expose")){const o=m1[s]||u&&u[s];e[s]=o?o(e[s],t[s]):t[s]}return e}const m1={data:cc,props:lc,emits:lc,methods:bn,computed:bn,beforeCreate:Ve,created:Ve,beforeMount:Ve,mounted:Ve,beforeUpdate:Ve,updated:Ve,beforeDestroy:Ve,beforeUnmount:Ve,destroyed:Ve,unmounted:Ve,activated:Ve,deactivated:Ve,errorCaptured:Ve,serverPrefetch:Ve,components:bn,directives:bn,watch:g1,provide:cc,inject:b1};function cc(e,t){return t?e?function(){return te(W(e)?e.call(this,this):e,W(t)?t.call(this,this):t)}:t:e}function b1(e,t){return bn(fs(e),fs(t))}function fs(e){if($(e)){const t={};for(let u=0;u<e.length;u++)t[e[u]]=e[u];return t}return e}function Ve(e,t){return e?[...new Set([].concat(e,t))]:t}function bn(e,t){return e?te(Object.create(null),e,t):t}function lc(e,t){return e?$(e)&&$(t)?[...new Set([...e,...t])]:te(Object.create(null),Nn(e),Nn(t??{})):t}function g1(e,t){if(!e)return t;if(!t)return e;const u=te(Object.create(null),e);for(const n in t)u[n]=Ve(e[n],t[n]);return u}function ya(){return{app:null,config:{isNativeTag:dn,performance:!1,globalProperties:{},optionMergeStrategies:{},errorHandler:void 0,warnHandler:void 0,compilerOptions:{}},mixins:[],components:{},directives:{},provides:Object.create(null),optionsCache:new WeakMap,propsCache:new WeakMap,emitsCache:new WeakMap}}let _1=0;function y1(e,t){return function(n,r=null){W(n)||(n=te({},n)),r!=null&&!oe(r)&&(r=null);const i=ya(),s=new WeakSet,o=[];let c=!1;const l=i.app={_uid:_1++,_component:n,_props:r,_container:null,_context:i,_instance:null,version:ys,get config(){return i.config},set config(a){},use(a,...f){return s.has(a)||(a&&W(a.install)?(s.add(a),a.install(l,...f)):W(a)&&(s.add(a),a(l,...f))),l},mixin(a){return __VUE_OPTIONS_API__&&(i.mixins.includes(a)||i.mixins.push(a)),l},component(a,f){return f?(i.components[a]=f,l):i.components[a]},directive(a,f){return f?(i.directives[a]=f,l):i.directives[a]},mount(a,f,p){if(!c){const d=l._ceVNode||fe(n,r);return d.appContext=i,p===!0?p="svg":p===!1&&(p=void 0),f&&t?t(d,a):e(d,a,p),c=!0,l._container=a,a.__vue_app__=l,__VUE_PROD_DEVTOOLS__&&(l._instance=d.component,hh(l,ys)),Jn(d.component)}},onUnmount(a){o.push(a)},unmount(){c&&(lt(o,l._instance,16),e(null,l._container),__VUE_PROD_DEVTOOLS__&&(l._instance=null,ph(l)),delete l._container.__vue_app__)},provide(a,f){return i.provides[a]=f,l},runWithContext(a){const f=xu;xu=l;try{return a()}finally{xu=f}}};return l}}let xu=null;function xa(e,t){if(Se){let u=Se.provides;const n=Se.parent&&Se.parent.provides;n===u&&(u=Se.provides=Object.create(n)),u[e]=t}}function Cn(e,t,u=!1){const n=Se||ve;if(n||xu){const r=xu?xu._context.provides:n?n.parent==null?n.vnode.appContext&&n.vnode.appContext.provides:n.parent.provides:void 0;if(r&&e in r)return r[e];if(arguments.length>1)return u&&W(t)?t.call(n&&n.proxy):t}}function x1(){return!!(Se||ve||xu)}const Ea={},Ca=()=>Object.create(Ea),ka=e=>Object.getPrototypeOf(e)===Ea;function E1(e,t,u,n=!1){const r={},i=Ca();e.propsDefaults=Object.create(null),Aa(e,t,r,i);for(const s in e.propsOptions[0])s in r||(r[s]=void 0);u?e.props=n?r:Hl(r):e.type.props?e.props=r:e.props=i,e.attrs=i}function C1(e,t,u,n){const{props:r,attrs:i,vnode:{patchFlag:s}}=e,o=Q(r),[c]=e.propsOptions;let l=!1;if((n||s>0)&&!(s&16)){if(s&8){const a=e.vnode.dynamicProps;for(let f=0;f<a.length;f++){let p=a[f];if(yi(e.emitsOptions,p))continue;const d=t[p];if(c)if(re(i,p))d!==i[p]&&(i[p]=d,l=!0);else{const h=de(p);r[h]=ds(c,o,h,d,e,!1)}else d!==i[p]&&(i[p]=d,l=!0)}}}else{Aa(e,t,r,i)&&(l=!0);let a;for(const f in o)(!t||!re(t,f)&&((a=qe(f))===f||!re(t,a)))&&(c?u&&(u[f]!==void 0||u[a]!==void 0)&&(r[f]=ds(c,o,f,void 0,e,!0)):delete r[f]);if(i!==o)for(const f in i)(!t||!re(t,f))&&(delete i[f],l=!0)}l&&Nt(e.attrs,"set","")}function Aa(e,t,u,n){const[r,i]=e.propsOptions;let s=!1,o;if(t)for(let c in t){if(tu(c))continue;const l=t[c];let a;r&&re(r,a=de(c))?!i||!i.includes(a)?u[a]=l:(o||(o={}))[a]=l:yi(e.emitsOptions,c)||(!(c in n)||l!==n[c])&&(n[c]=l,s=!0)}if(i){const c=Q(u),l=o||ee;for(let a=0;a<i.length;a++){const f=i[a];u[f]=ds(r,c,f,l[f],e,!re(l,f))}}return s}function ds(e,t,u,n,r,i){const s=e[u];if(s!=null){const o=re(s,"default");if(o&&n===void 0){const c=s.default;if(s.type!==Function&&!s.skipFactory&&W(c)){const{propsDefaults:l}=r;if(u in l)n=l[u];else{const a=Au(r);n=l[u]=c.call(null,t),a()}}else n=c;r.ce&&r.ce._setProp(u,n)}s[0]&&(i&&!o?n=!1:s[1]&&(n===""||n===qe(u))&&(n=!0))}return n}const k1=new WeakMap;function Sa(e,t,u=!1){const n=__VUE_OPTIONS_API__&&u?k1:t.propsCache,r=n.get(e);if(r)return r;const i=e.props,s={},o=[];let c=!1;if(__VUE_OPTIONS_API__&&!W(e)){const a=f=>{c=!0;const[p,d]=Sa(f,t,!0);te(s,p),d&&o.push(...d)};!u&&t.mixins.length&&t.mixins.forEach(a),e.extends&&a(e.extends),e.mixins&&e.mixins.forEach(a)}if(!i&&!c)return oe(e)&&n.set(e,Pu),Pu;if($(i))for(let a=0;a<i.length;a++){const f=de(i[a]);ac(f)&&(s[f]=ee)}else if(i)for(const a in i){const f=de(a);if(ac(f)){const p=i[a],d=s[f]=$(p)||W(p)?{type:p}:te({},p),h=d.type;let _=!1,v=!0;if($(h))for(let A=0;A<h.length;++A){const E=h[A],m=W(E)&&E.name;if(m==="Boolean"){_=!0;break}else m==="String"&&(v=!1)}else _=W(h)&&h.name==="Boolean";d[0]=_,d[1]=v,(_||re(d,"default"))&&o.push(f)}}const l=[s,o];return oe(e)&&n.set(e,l),l}function ac(e){return e[0]!=="$"&&!tu(e)}const va=e=>e[0]==="_"||e==="$stable",ro=e=>$(e)?e.map(We):[We(e)],A1=(e,t,u)=>{if(t._n)return t;const n=Zs((...r)=>ro(t(...r)),u);return n._c=!1,n},Da=(e,t,u)=>{const n=e._ctx;for(const r in e){if(va(r))continue;const i=e[r];if(W(i))t[r]=A1(r,i,n);else if(i!=null){const s=ro(i);t[r]=()=>s}}},Ta=(e,t)=>{const u=ro(t);e.slots.default=()=>u},wa=(e,t,u)=>{for(const n in t)(u||n!=="_")&&(e[n]=t[n])},S1=(e,t,u)=>{const n=e.slots=Ca();if(e.vnode.shapeFlag&32){const r=t._;r?(wa(n,t,u),u&&ju(n,"_",r,!0)):Da(t,n)}else t&&Ta(e,t)},v1=(e,t,u)=>{const{vnode:n,slots:r}=e;let i=!0,s=ee;if(n.shapeFlag&32){const o=t._;o?u&&o===1?i=!1:wa(r,t,u):(i=!t.$stable,Da(t,r)),s=t}else t&&(Ta(e,t),s={default:1});if(i)for(const o in r)!va(o)&&s[o]==null&&delete r[o]};function D1(){typeof __VUE_OPTIONS_API__!="boolean"&&(gu().__VUE_OPTIONS_API__=!0),typeof __VUE_PROD_DEVTOOLS__!="boolean"&&(gu().__VUE_PROD_DEVTOOLS__=!1),typeof __VUE_PROD_HYDRATION_MISMATCH_DETAILS__!="boolean"&&(gu().__VUE_PROD_HYDRATION_MISMATCH_DETAILS__=!1)}const ke=Ua;function Fa(e){return Oa(e)}function Ia(e){return Oa(e,wh)}function Oa(e,t){D1();const u=gu();u.__VUE__=!0,__VUE_PROD_DEVTOOLS__&&Ws(u.__VUE_DEVTOOLS_GLOBAL_HOOK__,u);const{insert:n,remove:r,patchProp:i,createElement:s,createText:o,createComment:c,setText:l,setElementText:a,parentNode:f,nextSibling:p,setScopeId:d=we,insertStaticContent:h}=e,_=(b,y,D,N=null,F=null,O=null,V=void 0,B=null,L=!!y.dynamicChildren)=>{if(b===y)return;b&&!mt(b,y)&&(N=nr(b),yt(b,F,O,!0),b=null),y.patchFlag===-2&&(L=!1,y.dynamicChildren=null);const{type:M,ref:G,shapeFlag:U}=y;switch(M){case Pt:v(b,y,D,N);break;case ye:A(b,y,D,N);break;case ru:b==null&&E(y,D,N,V);break;case Ee:T(b,y,D,N,F,O,V,B,L);break;default:U&1?x(b,y,D,N,F,O,V,B,L):U&6?P(b,y,D,N,F,O,V,B,L):(U&64||U&128)&&M.process(b,y,D,N,F,O,V,B,L,Iu)}G!=null&&F&&On(G,b&&b.ref,O,y||b,!y)},v=(b,y,D,N)=>{if(b==null)n(y.el=o(y.children),D,N);else{const F=y.el=b.el;y.children!==b.children&&l(F,y.children)}},A=(b,y,D,N)=>{b==null?n(y.el=c(y.children||""),D,N):y.el=b.el},E=(b,y,D,N)=>{[b.el,b.anchor]=h(b.children,y,D,N,b.el,b.anchor)},m=({el:b,anchor:y},D,N)=>{let F;for(;b&&b!==y;)F=p(b),n(b,D,N),b=F;n(y,D,N)},g=({el:b,anchor:y})=>{let D;for(;b&&b!==y;)D=p(b),r(b),b=D;r(y)},x=(b,y,D,N,F,O,V,B,L)=>{y.type==="svg"?V="svg":y.type==="math"&&(V="mathml"),b==null?k(y,D,N,F,O,V,B,L):C(b,y,F,O,V,B,L)},k=(b,y,D,N,F,O,V,B)=>{let L,M;const{props:G,shapeFlag:U,transition:K,dirs:Y}=b;if(L=b.el=s(b.type,O,G&&G.is,G),U&8?a(L,b.children):U&16&&w(b.children,L,null,N,F,Ui(b,O),V,B),Y&&kt(b,null,N,"created"),I(L,b,b.scopeId,V,N),G){for(const ce in G)ce!=="value"&&!tu(ce)&&i(L,ce,null,G[ce],O,N);"value"in G&&i(L,"value",null,G.value,O),(M=G.onVnodeBeforeMount)&&Ke(M,N,b)}__VUE_PROD_DEVTOOLS__&&(ju(L,"__vnode",b,!0),ju(L,"__vueParentComponent",N,!0)),Y&&kt(b,null,N,"beforeMount");const ue=Na(F,K);ue&&K.beforeEnter(L),n(L,y,D),((M=G&&G.onVnodeMounted)||ue||Y)&&ke(()=>{M&&Ke(M,N,b),ue&&K.enter(L),Y&&kt(b,null,N,"mounted")},F)},I=(b,y,D,N,F)=>{if(D&&d(b,D),N)for(let O=0;O<N.length;O++)d(b,N[O]);if(F){let O=F.subTree;if(y===O||Nr(O.type)&&(O.ssContent===y||O.ssFallback===y)){const V=F.vnode;I(b,V,V.scopeId,V.slotScopeIds,F.parent)}}},w=(b,y,D,N,F,O,V,B,L=0)=>{for(let M=L;M<b.length;M++){const G=b[M]=B?Jt(b[M]):We(b[M]);_(null,G,y,D,N,F,O,V,B)}},C=(b,y,D,N,F,O,V)=>{const B=y.el=b.el;__VUE_PROD_DEVTOOLS__&&(B.__vnode=y);let{patchFlag:L,dynamicChildren:M,dirs:G}=y;L|=b.patchFlag&16;const U=b.props||ee,K=y.props||ee;let Y;if(D&&fu(D,!1),(Y=K.onVnodeBeforeUpdate)&&Ke(Y,D,y,b),G&&kt(y,b,D,"beforeUpdate"),D&&fu(D,!0),(U.innerHTML&&K.innerHTML==null||U.textContent&&K.textContent==null)&&a(B,""),M?S(b.dynamicChildren,M,B,D,N,Ui(y,F),O):V||j(b,y,B,null,D,N,Ui(y,F),O,!1),L>0){if(L&16)R(B,U,K,D,F);else if(L&2&&U.class!==K.class&&i(B,"class",null,K.class,F),L&4&&i(B,"style",U.style,K.style,F),L&8){const ue=y.dynamicProps;for(let ce=0;ce<ue.length;ce++){const se=ue[ce],Ye=U[se],Ie=K[se];(Ie!==Ye||se==="value")&&i(B,se,Ye,Ie,F,D)}}L&1&&b.children!==y.children&&a(B,y.children)}else!V&&M==null&&R(B,U,K,D,F);((Y=K.onVnodeUpdated)||G)&&ke(()=>{Y&&Ke(Y,D,y,b),G&&kt(y,b,D,"updated")},N)},S=(b,y,D,N,F,O,V)=>{for(let B=0;B<y.length;B++){const L=b[B],M=y[B],G=L.el&&(L.type===Ee||!mt(L,M)||L.shapeFlag&70)?f(L.el):D;_(L,M,G,null,N,F,O,V,!0)}},R=(b,y,D,N,F)=>{if(y!==D){if(y!==ee)for(const O in y)!tu(O)&&!(O in D)&&i(b,O,y[O],null,F,N);for(const O in D){if(tu(O))continue;const V=D[O],B=y[O];V!==B&&O!=="value"&&i(b,O,B,V,F,N)}"value"in D&&i(b,"value",y.value,D.value,F)}},T=(b,y,D,N,F,O,V,B,L)=>{const M=y.el=b?b.el:o(""),G=y.anchor=b?b.anchor:o("");let{patchFlag:U,dynamicChildren:K,slotScopeIds:Y}=y;Y&&(B=B?B.concat(Y):Y),b==null?(n(M,D,N),n(G,D,N),w(y.children||[],D,G,F,O,V,B,L)):U>0&&U&64&&K&&b.dynamicChildren?(S(b.dynamicChildren,K,D,F,O,V,B),(y.key!=null||F&&y===F.subTree)&&io(b,y,!0)):j(b,y,D,G,F,O,V,B,L)},P=(b,y,D,N,F,O,V,B,L)=>{y.slotScopeIds=B,b==null?y.shapeFlag&512?F.ctx.activate(y,D,N,V,L):q(y,D,N,F,O,V,L):z(b,y,L)},q=(b,y,D,N,F,O,V)=>{const B=b.component=Ga(b,N,F);if(Wn(b)&&(B.ctx.renderer=Iu),Ja(B,!1,V),B.asyncDep){if(F&&F.registerDep(B,H,V),!b.el){const L=B.subTree=fe(ye);A(null,L,y,D)}}else H(B,b,y,D,F,O,V)},z=(b,y,D)=>{const N=y.component=b.component;if(L1(b,y,D))if(N.asyncDep&&!N.asyncResolved){J(N,y,D);return}else N.next=y,N.update();else y.el=b.el,N.vnode=y},H=(b,y,D,N,F,O,V)=>{const B=()=>{if(b.isMounted){let{next:U,bu:K,u:Y,parent:ue,vnode:ce}=b;{const Xe=Ra(b);if(Xe){U&&(U.el=ce.el,J(b,U,V)),Xe.asyncDep.then(()=>{b.isUnmounted||B()});return}}let se=U,Ye;fu(b,!1),U?(U.el=ce.el,J(b,U,V)):U=ce,K&&Vu(K),(Ye=U.props&&U.props.onVnodeBeforeUpdate)&&Ke(Ye,ue,U,ce),fu(b,!0);const Ie=_r(b),ht=b.subTree;b.subTree=Ie,_(ht,Ie,f(ht.el),nr(ht),b,F,O),U.el=Ie.el,se===null&&xi(b,Ie.el),Y&&ke(Y,F),(Ye=U.props&&U.props.onVnodeUpdated)&&ke(()=>Ke(Ye,ue,U,ce),F),__VUE_PROD_DEVTOOLS__&&Yl(b)}else{let U;const{el:K,props:Y}=y,{bm:ue,m:ce,parent:se,root:Ye,type:Ie}=b,ht=nu(y);if(fu(b,!1),ue&&Vu(ue),!ht&&(U=Y&&Y.onVnodeBeforeMount)&&Ke(U,se,y),fu(b,!0),K&&Oi){const Xe=()=>{b.subTree=_r(b),Oi(K,b.subTree,b,F,null)};ht&&Ie.__asyncHydrate?Ie.__asyncHydrate(K,b,Xe):Xe()}else{Ye.ce&&Ye.ce._injectChildStyle(Ie);const Xe=b.subTree=_r(b);_(null,Xe,D,N,b,F,O),y.el=Xe.el}if(ce&&ke(ce,F),!ht&&(U=Y&&Y.onVnodeMounted)){const Xe=y;ke(()=>Ke(U,se,Xe),F)}(y.shapeFlag&256||se&&nu(se.vnode)&&se.vnode.shapeFlag&256)&&b.a&&ke(b.a,F),b.isMounted=!0,__VUE_PROD_DEVTOOLS__&&ss(b),y=D=N=null}};b.scope.on();const L=b.effect=new vn(B);b.scope.off();const M=b.update=L.run.bind(L),G=b.job=L.runIfDirty.bind(L);G.i=b,G.id=b.uid,L.scheduler=()=>Ks(G),fu(b,!0),M()},J=(b,y,D)=>{y.component=b;const N=b.vnode.props;b.vnode=y,b.next=null,C1(b,y.props,N,D),v1(b,y.children,D),$t(),Yo(b),Ht()},j=(b,y,D,N,F,O,V,B,L=!1)=>{const M=b&&b.children,G=b?b.shapeFlag:0,U=y.children,{patchFlag:K,shapeFlag:Y}=y;if(K>0){if(K&128){je(M,U,D,N,F,O,V,B,L);return}else if(K&256){be(M,U,D,N,F,O,V,B,L);return}}Y&8?(G&16&&sn(M,F,O),U!==M&&a(D,U)):G&16?Y&16?je(M,U,D,N,F,O,V,B,L):sn(M,F,O,!0):(G&8&&a(D,""),Y&16&&w(U,D,N,F,O,V,B,L))},be=(b,y,D,N,F,O,V,B,L)=>{b=b||Pu,y=y||Pu;const M=b.length,G=y.length,U=Math.min(M,G);let K;for(K=0;K<U;K++){const Y=y[K]=L?Jt(y[K]):We(y[K]);_(b[K],Y,D,null,F,O,V,B,L)}M>G?sn(b,F,O,!0,!1,U):w(y,D,N,F,O,V,B,L,U)},je=(b,y,D,N,F,O,V,B,L)=>{let M=0;const G=y.length;let U=b.length-1,K=G-1;for(;M<=U&&M<=K;){const Y=b[M],ue=y[M]=L?Jt(y[M]):We(y[M]);if(mt(Y,ue))_(Y,ue,D,null,F,O,V,B,L);else break;M++}for(;M<=U&&M<=K;){const Y=b[U],ue=y[K]=L?Jt(y[K]):We(y[K]);if(mt(Y,ue))_(Y,ue,D,null,F,O,V,B,L);else break;U--,K--}if(M>U){if(M<=K){const Y=K+1,ue=Y<G?y[Y].el:N;for(;M<=K;)_(null,y[M]=L?Jt(y[M]):We(y[M]),D,ue,F,O,V,B,L),M++}}else if(M>K)for(;M<=U;)yt(b[M],F,O,!0),M++;else{const Y=M,ue=M,ce=new Map;for(M=ue;M<=K;M++){const Qe=y[M]=L?Jt(y[M]):We(y[M]);Qe.key!=null&&ce.set(Qe.key,M)}let se,Ye=0;const Ie=K-ue+1;let ht=!1,Xe=0;const on=new Array(Ie);for(M=0;M<Ie;M++)on[M]=0;for(M=Y;M<=U;M++){const Qe=b[M];if(Ye>=Ie){yt(Qe,F,O,!0);continue}let xt;if(Qe.key!=null)xt=ce.get(Qe.key);else for(se=ue;se<=K;se++)if(on[se-ue]===0&&mt(Qe,y[se])){xt=se;break}xt===void 0?yt(Qe,F,O,!0):(on[xt-ue]=M+1,xt>=Xe?Xe=xt:ht=!0,_(Qe,y[xt],D,null,F,O,V,B,L),Ye++)}const jo=ht?T1(on):Pu;for(se=jo.length-1,M=Ie-1;M>=0;M--){const Qe=ue+M,xt=y[Qe],zo=Qe+1<G?y[Qe+1].el:N;on[M]===0?_(null,xt,D,zo,F,O,V,B,L):ht&&(se<0||M!==jo[se]?dt(xt,D,zo,2):se--)}}},dt=(b,y,D,N,F=null)=>{const{el:O,type:V,transition:B,children:L,shapeFlag:M}=b;if(M&6){dt(b.component.subTree,y,D,N);return}if(M&128){b.suspense.move(y,D,N);return}if(M&64){V.move(b,y,D,Iu);return}if(V===Ee){n(O,y,D);for(let U=0;U<L.length;U++)dt(L[U],y,D,N);n(b.anchor,y,D);return}if(V===ru){m(b,y,D);return}if(N!==2&&M&1&&B)if(N===0)B.beforeEnter(O),n(O,y,D),ke(()=>B.enter(O),F);else{const{leave:U,delayLeave:K,afterLeave:Y}=B,ue=()=>n(O,y,D),ce=()=>{U(O,()=>{ue(),Y&&Y()})};K?K(O,ue,ce):ce()}else n(O,y,D)},yt=(b,y,D,N=!1,F=!1)=>{const{type:O,props:V,ref:B,children:L,dynamicChildren:M,shapeFlag:G,patchFlag:U,dirs:K,cacheIndex:Y}=b;if(U===-2&&(F=!1),B!=null&&On(B,null,D,b,!0),Y!=null&&(y.renderCache[Y]=void 0),G&256){y.ctx.deactivate(b);return}const ue=G&1&&K,ce=!nu(b);let se;if(ce&&(se=V&&V.onVnodeBeforeUnmount)&&Ke(se,y,b),G&6)Hf(b.component,D,N);else{if(G&128){b.suspense.unmount(D,N);return}ue&&kt(b,null,y,"beforeUnmount"),G&64?b.type.remove(b,y,D,Iu,N):M&&!M.hasOnce&&(O!==Ee||U>0&&U&64)?sn(M,y,D,!1,!0):(O===Ee&&U&384||!F&&G&16)&&sn(L,y,D),N&&Uo(b)}(ce&&(se=V&&V.onVnodeUnmounted)||ue)&&ke(()=>{se&&Ke(se,y,b),ue&&kt(b,null,y,"unmounted")},D)},Uo=b=>{const{type:y,el:D,anchor:N,transition:F}=b;if(y===Ee){$f(D,N);return}if(y===ru){g(b);return}const O=()=>{r(D),F&&!F.persisted&&F.afterLeave&&F.afterLeave()};if(b.shapeFlag&1&&F&&!F.persisted){const{leave:V,delayLeave:B}=F,L=()=>V(D,O);B?B(b.el,O,L):L()}else O()},$f=(b,y)=>{let D;for(;b!==y;)D=p(b),r(b),b=D;r(y)},Hf=(b,y,D)=>{const{bum:N,scope:F,job:O,subTree:V,um:B,m:L,a:M}=b;Or(L),Or(M),N&&Vu(N),F.stop(),O&&(O.flags|=8,yt(V,b,y,D)),B&&ke(B,y),ke(()=>{b.isUnmounted=!0},y),y&&y.pendingBranch&&!y.isUnmounted&&b.asyncDep&&!b.asyncResolved&&b.suspenseId===y.pendingId&&(y.deps--,y.deps===0&&y.resolve()),__VUE_PROD_DEVTOOLS__&&bh(b)},sn=(b,y,D,N=!1,F=!1,O=0)=>{for(let V=O;V<b.length;V++)yt(b[V],y,D,N,F)},nr=b=>{if(b.shapeFlag&6)return nr(b.component.subTree);if(b.shapeFlag&128)return b.suspense.next();const y=p(b.anchor||b.el),D=y&&y[Xl];return D?p(D):y};let Fi=!1;const qo=(b,y,D)=>{b==null?y._vnode&&yt(y._vnode,null,null,!0):_(y._vnode||null,b,y,null,null,null,D),y._vnode=b,Fi||(Fi=!0,Yo(),Fr(),Fi=!1)},Iu={p:_,um:yt,m:dt,r:Uo,mt:q,mc:w,pc:j,pbc:S,n:nr,o:e};let Ii,Oi;return t&&([Ii,Oi]=t(Iu)),{render:qo,hydrate:Ii,createApp:y1(qo,Ii)}}function Ui({type:e,props:t},u){return u==="svg"&&e==="foreignObject"||u==="mathml"&&e==="annotation-xml"&&t&&t.encoding&&t.encoding.includes("html")?void 0:u}function fu({effect:e,job:t},u){u?(e.flags|=32,t.flags|=4):(e.flags&=-33,t.flags&=-5)}function Na(e,t){return(!e||e&&!e.pendingBranch)&&t&&!t.persisted}function io(e,t,u=!1){const n=e.children,r=t.children;if($(n)&&$(r))for(let i=0;i<n.length;i++){const s=n[i];let o=r[i];o.shapeFlag&1&&!o.dynamicChildren&&((o.patchFlag<=0||o.patchFlag===32)&&(o=r[i]=Jt(r[i]),o.el=s.el),!u&&o.patchFlag!==-2&&io(s,o)),o.type===Pt&&(o.el=s.el)}}function T1(e){const t=e.slice(),u=[0];let n,r,i,s,o;const c=e.length;for(n=0;n<c;n++){const l=e[n];if(l!==0){if(r=u[u.length-1],e[r]<l){t[n]=r,u.push(n);continue}for(i=0,s=u.length-1;i<s;)o=i+s>>1,e[u[o]]<l?i=o+1:s=o;l<e[u[i]]&&(i>0&&(t[n]=u[i-1]),u[i]=n)}}for(i=u.length,s=u[i-1];i-- >0;)u[i]=s,s=t[s];return u}function Ra(e){const t=e.subTree.component;if(t)return t.asyncDep&&!t.asyncResolved?t:Ra(t)}function Or(e){if(e)for(let t=0;t<e.length;t++)e[t].flags|=8}const Ma=Symbol.for("v-scx"),Pa=()=>Cn(Ma);function w1(e,t){return Zn(e,null,t)}function F1(e,t){return Zn(e,null,{flush:"post"})}function La(e,t){return Zn(e,null,{flush:"sync"})}function Hu(e,t,u){return Zn(e,t,u)}function Zn(e,t,u=ee){const{immediate:n,deep:r,flush:i,once:s}=u,o=te({},u),c=t&&n||!t&&i!=="post";let l;if(Ku){if(i==="sync"){const d=Pa();l=d.__watcherHandles||(d.__watcherHandles=[])}else if(!c){const d=()=>{};return d.stop=we,d.resume=we,d.pause=we,d}}const a=Se;o.call=(d,h,_)=>lt(d,a,h,_);let f=!1;i==="post"?o.scheduler=d=>{ke(d,a&&a.suspense)}:i!=="sync"&&(f=!0,o.scheduler=(d,h)=>{h?d():Ks(d)}),o.augmentJob=d=>{t&&(d.flags|=4),f&&(d.flags|=2,a&&(d.id=a.uid,d.i=a))};const p=th(e,t,o);return Ku&&(l?l.push(p):c&&p()),p}function I1(e,t,u){const n=this.proxy,r=Z(e)?e.includes(".")?Ba(n,e):()=>n[e]:e.bind(n,n);let i;W(t)?i=t:(i=t.handler,u=t);const s=Au(this),o=Zn(r,i.bind(n),u);return s(),o}function Ba(e,t){const u=t.split(".");return()=>{let n=e;for(let r=0;r<u.length&&n;r++)n=n[u[r]];return n}}function O1(e,t,u=ee){const n=ft(),r=de(t),i=qe(t),s=Va(e,r),o=jl((c,l)=>{let a,f=ee,p;return La(()=>{const d=e[r];He(a,d)&&(a=d,l())}),{get(){return c(),u.get?u.get(a):a},set(d){const h=u.set?u.set(d):d;if(!He(h,a)&&!(f!==ee&&He(d,f)))return;const _=n.vnode.props;_&&(t in _||r in _||i in _)&&(`onUpdate:${t}`in _||`onUpdate:${r}`in _||`onUpdate:${i}`in _)||(a=d,l()),n.emit(`update:${t}`,h),He(d,h)&&He(d,f)&&!He(h,p)&&l(),f=d,p=h}}});return o[Symbol.iterator]=()=>{let c=0;return{next(){return c<2?{value:c++?s||ee:o,done:!1}:{done:!0}}}},o}const Va=(e,t)=>t==="modelValue"||t==="model-value"?e.modelModifiers:e[`${t}Modifiers`]||e[`${de(t)}Modifiers`]||e[`${qe(t)}Modifiers`];function N1(e,t,...u){if(e.isUnmounted)return;const n=e.vnode.props||ee;let r=u;const i=t.startsWith("update:"),s=i&&Va(n,t.slice(7));s&&(s.trim&&(r=u.map(a=>Z(a)?a.trim():a)),s.number&&(r=u.map(Sr))),__VUE_PROD_DEVTOOLS__&&gh(e,t,r);let o,c=n[o=Bu(t)]||n[o=Bu(de(t))];!c&&i&&(c=n[o=Bu(qe(t))]),c&&lt(c,e,6,r);const l=n[o+"Once"];if(l){if(!e.emitted)e.emitted={};else if(e.emitted[o])return;e.emitted[o]=!0,lt(l,e,6,r)}}function $a(e,t,u=!1){const n=t.emitsCache,r=n.get(e);if(r!==void 0)return r;const i=e.emits;let s={},o=!1;if(__VUE_OPTIONS_API__&&!W(e)){const c=l=>{const a=$a(l,t,!0);a&&(o=!0,te(s,a))};!u&&t.mixins.length&&t.mixins.forEach(c),e.extends&&c(e.extends),e.mixins&&e.mixins.forEach(c)}return!i&&!o?(oe(e)&&n.set(e,null),null):($(i)?i.forEach(c=>s[c]=null):te(s,i),oe(e)&&n.set(e,s),s)}function yi(e,t){return!e||!vu(t)?!1:(t=t.slice(2).replace(/Once$/,""),re(e,t[0].toLowerCase()+t.slice(1))||re(e,qe(t))||re(e,t))}function _r(e){const{type:t,vnode:u,proxy:n,withProxy:r,propsOptions:[i],slots:s,attrs:o,emit:c,render:l,renderCache:a,props:f,data:p,setupState:d,ctx:h,inheritAttrs:_}=e,v=In(e);let A,E;try{if(u.shapeFlag&4){const g=r||n,x=g;A=We(l.call(x,g,a,f,d,p,h)),E=o}else{const g=t;A=We(g.length>1?g(f,{attrs:o,slots:s,emit:c}):g(f,null)),E=t.props?o:M1(o)}}catch(g){kn.length=0,Fu(g,e,1),A=fe(ye)}let m=A;if(E&&_!==!1){const g=Object.keys(E),{shapeFlag:x}=m;g.length&&x&7&&(i&&g.some(Rs)&&(E=P1(E,i)),m=vt(m,E,!1,!0))}return u.dirs&&(m=vt(m,null,!1,!0),m.dirs=m.dirs?m.dirs.concat(u.dirs):u.dirs),u.transition&&Bt(m,u.transition),A=m,In(v),A}function R1(e,t=!0){let u;for(let n=0;n<e.length;n++){const r=e[n];if(Vt(r)){if(r.type!==ye||r.children==="v-if"){if(u)return;u=r}}else return}return u}const M1=e=>{let t;for(const u in e)(u==="class"||u==="style"||vu(u))&&((t||(t={}))[u]=e[u]);return t},P1=(e,t)=>{const u={};for(const n in e)(!Rs(n)||!(n.slice(9)in t))&&(u[n]=e[n]);return u};function L1(e,t,u){const{props:n,children:r,component:i}=e,{props:s,children:o,patchFlag:c}=t,l=i.emitsOptions;if(t.dirs||t.transition)return!0;if(u&&c>=0){if(c&1024)return!0;if(c&16)return n?fc(n,s,l):!!s;if(c&8){const a=t.dynamicProps;for(let f=0;f<a.length;f++){const p=a[f];if(s[p]!==n[p]&&!yi(l,p))return!0}}}else return(r||o)&&(!o||!o.$stable)?!0:n===s?!1:n?s?fc(n,s,l):!0:!!s;return!1}function fc(e,t,u){const n=Object.keys(t);if(n.length!==Object.keys(e).length)return!0;for(let r=0;r<n.length;r++){const i=n[r];if(t[i]!==e[i]&&!yi(u,i))return!0}return!1}function xi({vnode:e,parent:t},u){for(;t;){const n=t.subTree;if(n.suspense&&n.suspense.activeBranch===e&&(n.el=e.el),n===e)(e=t.vnode).el=u,t=t.parent;else break}}const Nr=e=>e.__isSuspense;let hs=0;const B1={name:"Suspense",__isSuspense:!0,process(e,t,u,n,r,i,s,o,c,l){if(e==null)$1(t,u,n,r,i,s,o,c,l);else{if(i&&i.deps>0&&!e.suspense.isInFallback){t.suspense=e.suspense,t.suspense.vnode=t,t.el=e.el;return}H1(e,t,u,n,r,s,o,c,l)}},hydrate:U1,normalize:q1},V1=B1;function Rn(e,t){const u=e.props&&e.props[t];W(u)&&u()}function $1(e,t,u,n,r,i,s,o,c){const{p:l,o:{createElement:a}}=c,f=a("div"),p=e.suspense=Ha(e,r,n,t,f,u,i,s,o,c);l(null,p.pendingBranch=e.ssContent,f,null,n,p,i,s),p.deps>0?(Rn(e,"onPending"),Rn(e,"onFallback"),l(null,e.ssFallback,t,u,n,null,i,s),Uu(p,e.ssFallback)):p.resolve(!1,!0)}function H1(e,t,u,n,r,i,s,o,{p:c,um:l,o:{createElement:a}}){const f=t.suspense=e.suspense;f.vnode=t,t.el=e.el;const p=t.ssContent,d=t.ssFallback,{activeBranch:h,pendingBranch:_,isInFallback:v,isHydrating:A}=f;if(_)f.pendingBranch=p,mt(p,_)?(c(_,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0?f.resolve():v&&(A||(c(h,d,u,n,r,null,i,s,o),Uu(f,d)))):(f.pendingId=hs++,A?(f.isHydrating=!1,f.activeBranch=_):l(_,r,f),f.deps=0,f.effects.length=0,f.hiddenContainer=a("div"),v?(c(null,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0?f.resolve():(c(h,d,u,n,r,null,i,s,o),Uu(f,d))):h&&mt(p,h)?(c(h,p,u,n,r,f,i,s,o),f.resolve(!0)):(c(null,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0&&f.resolve()));else if(h&&mt(p,h))c(h,p,u,n,r,f,i,s,o),Uu(f,p);else if(Rn(t,"onPending"),f.pendingBranch=p,p.shapeFlag&512?f.pendingId=p.component.suspenseId:f.pendingId=hs++,c(null,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0)f.resolve();else{const{timeout:E,pendingId:m}=f;E>0?setTimeout(()=>{f.pendingId===m&&f.fallback(d)},E):E===0&&f.fallback(d)}}function Ha(e,t,u,n,r,i,s,o,c,l,a=!1){const{p:f,m:p,um:d,n:h,o:{parentNode:_,remove:v}}=l;let A;const E=j1(e);E&&t&&t.pendingBranch&&(A=t.pendingId,t.deps++);const m=e.props?vr(e.props.timeout):void 0,g=i,x={vnode:e,parent:t,parentComponent:u,namespace:s,container:n,hiddenContainer:r,deps:0,pendingId:hs++,timeout:typeof m=="number"?m:-1,activeBranch:null,pendingBranch:null,isInFallback:!a,isHydrating:a,isUnmounted:!1,effects:[],resolve(k=!1,I=!1){const{vnode:w,activeBranch:C,pendingBranch:S,pendingId:R,effects:T,parentComponent:P,container:q}=x;let z=!1;x.isHydrating?x.isHydrating=!1:k||(z=C&&S.transition&&S.transition.mode==="out-in",z&&(C.transition.afterLeave=()=>{R===x.pendingId&&(p(S,q,i===g?h(C):i,0),wn(T))}),C&&(_(C.el)===q&&(i=h(C)),d(C,P,x,!0)),z||p(S,q,i,0)),Uu(x,S),x.pendingBranch=null,x.isInFallback=!1;let H=x.parent,J=!1;for(;H;){if(H.pendingBranch){H.effects.push(...T),J=!0;break}H=H.parent}!J&&!z&&wn(T),x.effects=[],E&&t&&t.pendingBranch&&A===t.pendingId&&(t.deps--,t.deps===0&&!I&&t.resolve()),Rn(w,"onResolve")},fallback(k){if(!x.pendingBranch)return;const{vnode:I,activeBranch:w,parentComponent:C,container:S,namespace:R}=x;Rn(I,"onFallback");const T=h(w),P=()=>{x.isInFallback&&(f(null,k,S,T,C,null,R,o,c),Uu(x,k))},q=k.transition&&k.transition.mode==="out-in";q&&(w.transition.afterLeave=P),x.isInFallback=!0,d(w,C,null,!0),q||P()},move(k,I,w){x.activeBranch&&p(x.activeBranch,k,I,w),x.container=k},next(){return x.activeBranch&&h(x.activeBranch)},registerDep(k,I,w){const C=!!x.pendingBranch;C&&x.deps++;const S=k.vnode.el;k.asyncDep.catch(R=>{Fu(R,k,0)}).then(R=>{if(k.isUnmounted||x.isUnmounted||x.pendingId!==k.suspenseId)return;k.asyncResolved=!0;const{vnode:T}=k;gs(k,R,!1),S&&(T.el=S);const P=!S&&k.subTree.el;I(k,T,_(S||k.subTree.el),S?null:h(k.subTree),x,s,w),P&&v(P),xi(k,T.el),C&&--x.deps===0&&x.resolve()})},unmount(k,I){x.isUnmounted=!0,x.activeBranch&&d(x.activeBranch,u,k,I),x.pendingBranch&&d(x.pendingBranch,u,k,I)}};return x}function U1(e,t,u,n,r,i,s,o,c){const l=t.suspense=Ha(t,n,u,e.parentNode,document.createElement("div"),null,r,i,s,o,!0),a=c(e,l.pendingBranch=t.ssContent,u,l,i,s);return l.deps===0&&l.resolve(!1,!0),a}function q1(e){const{shapeFlag:t,children:u}=e,n=t&32;e.ssContent=dc(n?u.default:u),e.ssFallback=n?dc(u.fallback):fe(ye)}function dc(e){let t;if(W(e)){const u=ku&&e._c;u&&(e._d=!1,Mn()),e=e(),u&&(e._d=!0,t=Pe,qa())}return $(e)&&(e=R1(e)),e=We(e),t&&!e.dynamicChildren&&(e.dynamicChildren=t.filter(u=>u!==e)),e}function Ua(e,t){t&&t.pendingBranch?$(e)?t.effects.push(...e):t.effects.push(e):wn(e)}function Uu(e,t){e.activeBranch=t;const{vnode:u,parentComponent:n}=e;let r=t.el;for(;!r&&t.component;)t=t.component.subTree,r=t.el;u.el=r,n&&n.subTree===u&&(n.vnode.el=r,xi(n,r))}function j1(e){const t=e.props&&e.props.suspensible;return t!=null&&t!==!1}const Ee=Symbol.for("v-fgt"),Pt=Symbol.for("v-txt"),ye=Symbol.for("v-cmt"),ru=Symbol.for("v-stc"),kn=[];let Pe=null;function Mn(e=!1){kn.push(Pe=e?null:[])}function qa(){kn.pop(),Pe=kn[kn.length-1]||null}let ku=1;function ps(e,t=!1){ku+=e,e<0&&Pe&&t&&(Pe.hasOnce=!0)}function ja(e){return e.dynamicChildren=ku>0?Pe||Pu:null,qa(),ku>0&&Pe&&Pe.push(e),e}function z1(e,t,u,n,r,i){return ja(so(e,t,u,n,r,i,!0))}function Rr(e,t,u,n,r){return ja(fe(e,t,u,n,r,!0))}function Vt(e){return e?e.__v_isVNode===!0:!1}function mt(e,t){return e.type===t.type&&e.key===t.key}function K1(e){}const za=({key:e})=>e??null,yr=({ref:e,ref_key:t,ref_for:u})=>(typeof e=="number"&&(e=""+e),e!=null?Z(e)||Ce(e)||W(e)?{i:ve,r:e,k:t,f:!!u}:e:null);function so(e,t=null,u=null,n=0,r=null,i=e===Ee?0:1,s=!1,o=!1){const c={__v_isVNode:!0,__v_skip:!0,type:e,props:t,key:t&&za(t),ref:t&&yr(t),scopeId:hi,slotScopeIds:null,children:u,component:null,suspense:null,ssContent:null,ssFallback:null,dirs:null,transition:null,el:null,anchor:null,target:null,targetStart:null,targetAnchor:null,staticCount:0,shapeFlag:i,patchFlag:n,dynamicProps:r,dynamicChildren:null,appContext:null,ctx:ve};return o?(co(c,u),i&128&&e.normalize(c)):u&&(c.shapeFlag|=Z(u)?8:16),ku>0&&!s&&Pe&&(c.patchFlag>0||i&6)&&c.patchFlag!==32&&Pe.push(c),c}const fe=W1;function W1(e,t=null,u=null,n=0,r=null,i=!1){if((!e||e===ba)&&(e=ye),Vt(e)){const o=vt(e,t,!0);return u&&co(o,u),ku>0&&!i&&Pe&&(o.shapeFlag&6?Pe[Pe.indexOf(e)]=o:Pe.push(o)),o.patchFlag=-2,o}if(np(e)&&(e=e.__vccOpts),t){t=Ka(t);let{class:o,style:c}=t;o&&!Z(o)&&(t.class=nn(o)),oe(c)&&(li(c)&&!$(c)&&(c=te({},c)),t.style=un(c))}const s=Z(e)?1:Nr(e)?128:Ql(e)?64:oe(e)?4:W(e)?2:0;return so(e,t,u,n,r,s,i,!0)}function Ka(e){return e?li(e)||ka(e)?te({},e):e:null}function vt(e,t,u=!1,n=!1){const{props:r,ref:i,patchFlag:s,children:o,transition:c}=e,l=t?Wa(r||{},t):r,a={__v_isVNode:!0,__v_skip:!0,type:e.type,props:l,key:l&&za(l),ref:t&&t.ref?u&&i?$(i)?i.concat(yr(t)):[i,yr(t)]:yr(t):i,scopeId:e.scopeId,slotScopeIds:e.slotScopeIds,children:o,target:e.target,targetStart:e.targetStart,targetAnchor:e.targetAnchor,staticCount:e.staticCount,shapeFlag:e.shapeFlag,patchFlag:t&&e.type!==Ee?s===-1?16:s|16:s,dynamicProps:e.dynamicProps,dynamicChildren:e.dynamicChildren,appContext:e.appContext,dirs:e.dirs,transition:c,component:e.component,suspense:e.suspense,ssContent:e.ssContent&&vt(e.ssContent),ssFallback:e.ssFallback&&vt(e.ssFallback),el:e.el,anchor:e.anchor,ctx:e.ctx,ce:e.ce};return c&&n&&Bt(a,c.clone(a)),a}function oo(e=" ",t=0){return fe(Pt,null,e,t)}function G1(e,t){const u=fe(ru,null,e);return u.staticCount=t,u}function Z1(e="",t=!1){return t?(Mn(),Rr(ye,null,e)):fe(ye,null,e)}function We(e){return e==null||typeof e=="boolean"?fe(ye):$(e)?fe(Ee,null,e.slice()):Vt(e)?Jt(e):fe(Pt,null,String(e))}function Jt(e){return e.el===null&&e.patchFlag!==-1||e.memo?e:vt(e)}function co(e,t){let u=0;const{shapeFlag:n}=e;if(t==null)t=null;else if($(t))u=16;else if(typeof t=="object")if(n&65){const r=t.default;r&&(r._c&&(r._d=!1),co(e,r()),r._c&&(r._d=!0));return}else{u=32;const r=t._;!r&&!ka(t)?t._ctx=ve:r===3&&ve&&(ve.slots._===1?t._=1:(t._=2,e.patchFlag|=1024))}else W(t)?(t={default:t,_ctx:ve},u=32):(t=String(t),n&64?(u=16,t=[oo(t)]):u=8);e.children=t,e.shapeFlag|=u}function Wa(...e){const t={};for(let u=0;u<e.length;u++){const n=e[u];for(const r in n)if(r==="class")t.class!==n.class&&(t.class=nn([t.class,n.class]));else if(r==="style")t.style=un([t.style,n.style]);else if(vu(r)){const i=t[r],s=n[r];s&&i!==s&&!($(i)&&i.includes(s))&&(t[r]=i?[].concat(i,s):s)}else r!==""&&(t[r]=n[r])}return t}function Ke(e,t,u,n=null){lt(e,t,7,[u,n])}const J1=ya();let Y1=0;function Ga(e,t,u){const n=e.type,r=(t?t.appContext:e.appContext)||J1,i={uid:Y1++,vnode:e,type:n,parent:t,appContext:r,root:null,next:null,subTree:null,effect:null,update:null,job:null,scope:new Vs(!0),render:null,proxy:null,exposed:null,exposeProxy:null,withProxy:null,provides:t?t.provides:Object.create(r.provides),ids:t?t.ids:["",0,0],accessCache:null,renderCache:[],components:null,directives:null,propsOptions:Sa(n,r),emitsOptions:$a(n,r),emit:null,emitted:null,propsDefaults:ee,inheritAttrs:n.inheritAttrs,ctx:ee,data:ee,props:ee,attrs:ee,slots:ee,refs:ee,setupState:ee,setupContext:null,suspense:u,suspenseId:u?u.pendingId:0,asyncDep:null,asyncResolved:!1,isMounted:!1,isUnmounted:!1,isDeactivated:!1,bc:null,c:null,bm:null,m:null,bu:null,u:null,um:null,bum:null,da:null,a:null,rtg:null,rtc:null,ec:null,sp:null};return i.ctx={_:i},i.root=t?t.root:i,i.emit=N1.bind(null,i),e.ce&&e.ce(i),i}let Se=null;const ft=()=>Se||ve;let Mr,ms;{const e=gu(),t=(u,n)=>{let r;return(r=e[u])||(r=e[u]=[]),r.push(n),i=>{r.length>1?r.forEach(s=>s(i)):r[0](i)}};Mr=t("__VUE_INSTANCE_SETTERS__",u=>Se=u),ms=t("__VUE_SSR_SETTERS__",u=>Ku=u)}const Au=e=>{const t=Se;return Mr(e),e.scope.on(),()=>{e.scope.off(),Mr(t)}},bs=()=>{Se&&Se.scope.off(),Mr(null)};function Za(e){return e.vnode.shapeFlag&4}let Ku=!1;function Ja(e,t=!1,u=!1){t&&ms(t);const{props:n,children:r}=e.vnode,i=Za(e);E1(e,n,i,t),S1(e,r,u);const s=i?X1(e,t):void 0;return t&&ms(!1),s}function X1(e,t){const u=e.type;e.accessCache=Object.create(null),e.proxy=new Proxy(e.ctx,ls);const{setup:n}=u;if(n){$t();const r=e.setupContext=n.length>1?Qa(e):null,i=Au(e),s=wu(n,e,0,[e.props,r]),o=Ps(s);if(Ht(),i(),(o||e.sp)&&!nu(e)&&Xs(e),o){if(s.then(bs,bs),t)return s.then(c=>{gs(e,c,t)}).catch(c=>{Fu(c,e,0)});e.asyncDep=s}else gs(e,s,t)}else Xa(e,t)}function gs(e,t,u){W(t)?e.type.__ssrInlineRender?e.ssrRender=t:e.render=t:oe(t)&&(__VUE_PROD_DEVTOOLS__&&(e.devtoolsRawSetupState=t),e.setupState=zs(t)),Xa(e,u)}let Pr,_s;function Ya(e){Pr=e,_s=t=>{t.render._rc&&(t.withProxy=new Proxy(t.ctx,Qh))}}const Q1=()=>!Pr;function Xa(e,t,u){const n=e.type;if(!e.render){if(!t&&Pr&&!n.render){const r=n.template||__VUE_OPTIONS_API__&&no(e).template;if(r){const{isCustomElement:i,compilerOptions:s}=e.appContext.config,{delimiters:o,compilerOptions:c}=n,l=te(te({isCustomElement:i,delimiters:o},s),c);n.render=Pr(r,l)}}e.render=n.render||we,_s&&_s(e)}if(__VUE_OPTIONS_API__){const r=Au(e);$t();try{h1(e)}finally{Ht(),r()}}}const ep={get(e,t){return Re(e,"get",""),e[t]}};function Qa(e){const t=u=>{e.exposed=u||{}};return{attrs:new Proxy(e.attrs,ep),slots:e.slots,emit:e.emit,expose:t}}function Jn(e){return e.exposed?e.exposeProxy||(e.exposeProxy=new Proxy(zs(Ul(e.exposed)),{get(t,u){if(u in t)return t[u];if(u in En)return En[u](e)},has(t,u){return u in t||u in En}})):e.proxy}const tp=/(?:^|[-_])(\w)/g,up=e=>e.replace(tp,t=>t.toUpperCase()).replace(/[-_]/g,"");function Lr(e,t=!0){return W(e)?e.displayName||e.name:e.name||t&&e.__name}function e0(e,t,u=!1){let n=Lr(t);if(!n&&t.__file){const r=t.__file.match(/([^/\\]+)\.\w+$/);r&&(n=r[1])}if(!n&&e&&e.parent){const r=i=>{for(const s in i)if(i[s]===t)return s};n=r(e.components||e.parent.type.components)||r(e.appContext.components)}return n?up(n):u?"App":"Anonymous"}function np(e){return W(e)&&"__vccOpts"in e}const lo=(e,t)=>Yd(e,t,Ku);function ao(e,t,u){const n=arguments.length;return n===2?oe(t)&&!$(t)?Vt(t)?fe(e,null,[t]):fe(e,t):fe(e,null,t):(n>3?u=Array.prototype.slice.call(arguments,2):n===3&&Vt(u)&&(u=[u]),fe(e,t,u))}function rp(){}function ip(e,t,u,n){const r=u[n];if(r&&t0(r,e))return r;const i=t();return i.memo=e.slice(),i.cacheIndex=n,u[n]=i}function t0(e,t){const u=e.memo;if(u.length!=t.length)return!1;for(let n=0;n<u.length;n++)if(He(u[n],t[n]))return!1;return ku>0&&Pe&&Pe.push(e),!0}const ys="3.5.13",sp=we,op=ah,cp=pt,lp=Ws,ap={createComponentInstance:Ga,setupComponent:Ja,renderComponentRoot:_r,setCurrentRenderingInstance:In,isVNode:Vt,normalizeVNode:We,getComponentPublicInstance:Jn,ensureValidVNode:uo,pushWarningContext:uh,popWarningContext:nh},fp=ap,dp=null,hp=null,pp=null;/**
+* @vue/runtime-dom v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/let xs;const hc=typeof window<"u"&&window.trustedTypes;if(hc)try{xs=hc.createPolicy("vue",{createHTML:e=>e})}catch{}const u0=xs?e=>xs.createHTML(e):e=>e,mp="http://www.w3.org/2000/svg",bp="http://www.w3.org/1998/Math/MathML",Ot=typeof document<"u"?document:null,pc=Ot&&Ot.createElement("template"),gp={insert:(e,t,u)=>{t.insertBefore(e,u||null)},remove:e=>{const t=e.parentNode;t&&t.removeChild(e)},createElement:(e,t,u,n)=>{const r=t==="svg"?Ot.createElementNS(mp,e):t==="mathml"?Ot.createElementNS(bp,e):u?Ot.createElement(e,{is:u}):Ot.createElement(e);return e==="select"&&n&&n.multiple!=null&&r.setAttribute("multiple",n.multiple),r},createText:e=>Ot.createTextNode(e),createComment:e=>Ot.createComment(e),setText:(e,t)=>{e.nodeValue=t},setElementText:(e,t)=>{e.textContent=t},parentNode:e=>e.parentNode,nextSibling:e=>e.nextSibling,querySelector:e=>Ot.querySelector(e),setScopeId(e,t){e.setAttribute(t,"")},insertStaticContent(e,t,u,n,r,i){const s=u?u.previousSibling:t.lastChild;if(r&&(r===i||r.nextSibling))for(;t.insertBefore(r.cloneNode(!0),u),!(r===i||!(r=r.nextSibling)););else{pc.innerHTML=u0(n==="svg"?`<svg>${e}</svg>`:n==="mathml"?`<math>${e}</math>`:e);const o=pc.content;if(n==="svg"||n==="mathml"){const c=o.firstChild;for(;c.firstChild;)o.appendChild(c.firstChild);o.removeChild(c)}t.insertBefore(o,u)}return[s?s.nextSibling:t.firstChild,u?u.previousSibling:t.lastChild]}},qt="transition",ln="animation",Wu=Symbol("_vtc"),n0={name:String,type:String,css:{type:Boolean,default:!0},duration:[String,Number,Object],enterFromClass:String,enterActiveClass:String,enterToClass:String,appearFromClass:String,appearActiveClass:String,appearToClass:String,leaveFromClass:String,leaveActiveClass:String,leaveToClass:String},r0=te({},Ys,n0),_p=e=>(e.displayName="Transition",e.props=r0,e),yp=_p((e,{slots:t})=>ao(ra,i0(e),t)),du=(e,t=[])=>{$(e)?e.forEach(u=>u(...t)):e&&e(...t)},mc=e=>e?$(e)?e.some(t=>t.length>1):e.length>1:!1;function i0(e){const t={};for(const T in e)T in n0||(t[T]=e[T]);if(e.css===!1)return t;const{name:u="v",type:n,duration:r,enterFromClass:i=`${u}-enter-from`,enterActiveClass:s=`${u}-enter-active`,enterToClass:o=`${u}-enter-to`,appearFromClass:c=i,appearActiveClass:l=s,appearToClass:a=o,leaveFromClass:f=`${u}-leave-from`,leaveActiveClass:p=`${u}-leave-active`,leaveToClass:d=`${u}-leave-to`}=e,h=xp(r),_=h&&h[0],v=h&&h[1],{onBeforeEnter:A,onEnter:E,onEnterCancelled:m,onLeave:g,onLeaveCancelled:x,onBeforeAppear:k=A,onAppear:I=E,onAppearCancelled:w=m}=t,C=(T,P,q,z)=>{T._enterCancelled=z,Kt(T,P?a:o),Kt(T,P?l:s),q&&q()},S=(T,P)=>{T._isLeaving=!1,Kt(T,f),Kt(T,d),Kt(T,p),P&&P()},R=T=>(P,q)=>{const z=T?I:E,H=()=>C(P,T,q);du(z,[P,H]),bc(()=>{Kt(P,T?c:i),Et(P,T?a:o),mc(z)||gc(P,n,_,H)})};return te(t,{onBeforeEnter(T){du(A,[T]),Et(T,i),Et(T,s)},onBeforeAppear(T){du(k,[T]),Et(T,c),Et(T,l)},onEnter:R(!1),onAppear:R(!0),onLeave(T,P){T._isLeaving=!0;const q=()=>S(T,P);Et(T,f),T._enterCancelled?(Et(T,p),Es()):(Es(),Et(T,p)),bc(()=>{T._isLeaving&&(Kt(T,f),Et(T,d),mc(g)||gc(T,n,v,q))}),du(g,[T,q])},onEnterCancelled(T){C(T,!1,void 0,!0),du(m,[T])},onAppearCancelled(T){C(T,!0,void 0,!0),du(w,[T])},onLeaveCancelled(T){S(T),du(x,[T])}})}function xp(e){if(e==null)return null;if(oe(e))return[qi(e.enter),qi(e.leave)];{const t=qi(e);return[t,t]}}function qi(e){return vr(e)}function Et(e,t){t.split(/\s+/).forEach(u=>u&&e.classList.add(u)),(e[Wu]||(e[Wu]=new Set)).add(t)}function Kt(e,t){t.split(/\s+/).forEach(n=>n&&e.classList.remove(n));const u=e[Wu];u&&(u.delete(t),u.size||(e[Wu]=void 0))}function bc(e){requestAnimationFrame(()=>{requestAnimationFrame(e)})}let Ep=0;function gc(e,t,u,n){const r=e._endId=++Ep,i=()=>{r===e._endId&&n()};if(u!=null)return setTimeout(i,u);const{type:s,timeout:o,propCount:c}=s0(e,t);if(!s)return n();const l=s+"end";let a=0;const f=()=>{e.removeEventListener(l,p),i()},p=d=>{d.target===e&&++a>=c&&f()};setTimeout(()=>{a<c&&f()},o+1),e.addEventListener(l,p)}function s0(e,t){const u=window.getComputedStyle(e),n=h=>(u[h]||"").split(", "),r=n(`${qt}Delay`),i=n(`${qt}Duration`),s=_c(r,i),o=n(`${ln}Delay`),c=n(`${ln}Duration`),l=_c(o,c);let a=null,f=0,p=0;t===qt?s>0&&(a=qt,f=s,p=i.length):t===ln?l>0&&(a=ln,f=l,p=c.length):(f=Math.max(s,l),a=f>0?s>l?qt:ln:null,p=a?a===qt?i.length:c.length:0);const d=a===qt&&/\b(transform|all)(,|$)/.test(n(`${qt}Property`).toString());return{type:a,timeout:f,propCount:p,hasTransform:d}}function _c(e,t){for(;e.length<t.length;)e=e.concat(e);return Math.max(...t.map((u,n)=>yc(u)+yc(e[n])))}function yc(e){return e==="auto"?0:Number(e.slice(0,-1).replace(",","."))*1e3}function Es(){return document.body.offsetHeight}function Cp(e,t,u){const n=e[Wu];n&&(t=(t?[t,...n]:[...n]).join(" ")),t==null?e.removeAttribute("class"):u?e.setAttribute("class",t):e.className=t}const Br=Symbol("_vod"),o0=Symbol("_vsh"),c0={beforeMount(e,{value:t},{transition:u}){e[Br]=e.style.display==="none"?"":e.style.display,u&&t?u.beforeEnter(e):an(e,t)},mounted(e,{value:t},{transition:u}){u&&t&&u.enter(e)},updated(e,{value:t,oldValue:u},{transition:n}){!t!=!u&&(n?t?(n.beforeEnter(e),an(e,!0),n.enter(e)):n.leave(e,()=>{an(e,!1)}):an(e,t))},beforeUnmount(e,{value:t}){an(e,t)}};function an(e,t){e.style.display=t?e[Br]:"none",e[o0]=!t}function kp(){c0.getSSRProps=({value:e})=>{if(!e)return{style:{display:"none"}}}}const l0=Symbol("");function Ap(e){const t=ft();if(!t)return;const u=t.ut=(r=e(t.proxy))=>{Array.from(document.querySelectorAll(`[data-v-owner="${t.uid}"]`)).forEach(i=>Vr(i,r))},n=()=>{const r=e(t.proxy);t.ce?Vr(t.ce,r):Cs(t.subTree,r),u(r)};Qs(()=>{wn(n)}),Gn(()=>{Hu(n,we,{flush:"post"});const r=new MutationObserver(n);r.observe(t.subTree.el.parentNode,{childList:!0}),_i(()=>r.disconnect())})}function Cs(e,t){if(e.shapeFlag&128){const u=e.suspense;e=u.activeBranch,u.pendingBranch&&!u.isHydrating&&u.effects.push(()=>{Cs(u.activeBranch,t)})}for(;e.component;)e=e.component.subTree;if(e.shapeFlag&1&&e.el)Vr(e.el,t);else if(e.type===Ee)e.children.forEach(u=>Cs(u,t));else if(e.type===ru){let{el:u,anchor:n}=e;for(;u&&(Vr(u,t),u!==n);)u=u.nextSibling}}function Vr(e,t){if(e.nodeType===1){const u=e.style;let n="";for(const r in t)u.setProperty(`--${r}`,t[r]),n+=`--${r}: ${t[r]};`;u[l0]=n}}const Sp=/(^|;)\s*display\s*:/;function vp(e,t,u){const n=e.style,r=Z(u);let i=!1;if(u&&!r){if(t)if(Z(t))for(const s of t.split(";")){const o=s.slice(0,s.indexOf(":")).trim();u[o]==null&&xr(n,o,"")}else for(const s in t)u[s]==null&&xr(n,s,"");for(const s in u)s==="display"&&(i=!0),xr(n,s,u[s])}else if(r){if(t!==u){const s=n[l0];s&&(u+=";"+s),n.cssText=u,i=Sp.test(u)}}else t&&e.removeAttribute("style");Br in e&&(e[Br]=i?n.display:"",e[o0]&&(n.display="none"))}const xc=/\s*!important$/;function xr(e,t,u){if($(u))u.forEach(n=>xr(e,t,n));else if(u==null&&(u=""),t.startsWith("--"))e.setProperty(t,u);else{const n=Dp(e,t);xc.test(u)?e.setProperty(qe(n),u.replace(xc,""),"important"):e[n]=u}}const Ec=["Webkit","Moz","ms"],ji={};function Dp(e,t){const u=ji[t];if(u)return u;let n=de(t);if(n!=="filter"&&n in e)return ji[t]=n;n=Tu(n);for(let r=0;r<Ec.length;r++){const i=Ec[r]+n;if(i in e)return ji[t]=i}return t}const Cc="http://www.w3.org/1999/xlink";function kc(e,t,u,n,r,i=ad(t)){n&&t.startsWith("xlink:")?u==null?e.removeAttributeNS(Cc,t.slice(6,t.length)):e.setAttributeNS(Cc,t,u):u==null||i&&!Bs(u)?e.removeAttribute(t):e.setAttribute(t,i?"":Ze(u)?String(u):u)}function Ac(e,t,u,n,r){if(t==="innerHTML"||t==="textContent"){u!=null&&(e[t]=t==="innerHTML"?u0(u):u);return}const i=e.tagName;if(t==="value"&&i!=="PROGRESS"&&!i.includes("-")){const o=i==="OPTION"?e.getAttribute("value")||"":e.value,c=u==null?e.type==="checkbox"?"on":"":String(u);(o!==c||!("_value"in e))&&(e.value=c),u==null&&e.removeAttribute(t),e._value=u;return}let s=!1;if(u===""||u==null){const o=typeof e[t];o==="boolean"?u=Bs(u):u==null&&o==="string"?(u="",s=!0):o==="number"&&(u=0,s=!0)}try{e[t]=u}catch{}s&&e.removeAttribute(r||t)}function Mt(e,t,u,n){e.addEventListener(t,u,n)}function Tp(e,t,u,n){e.removeEventListener(t,u,n)}const Sc=Symbol("_vei");function wp(e,t,u,n,r=null){const i=e[Sc]||(e[Sc]={}),s=i[t];if(n&&s)s.value=n;else{const[o,c]=Fp(t);if(n){const l=i[t]=Np(n,r);Mt(e,o,l,c)}else s&&(Tp(e,o,s,c),i[t]=void 0)}}const vc=/(?:Once|Passive|Capture)$/;function Fp(e){let t;if(vc.test(e)){t={};let n;for(;n=e.match(vc);)e=e.slice(0,e.length-n[0].length),t[n[0].toLowerCase()]=!0}return[e[2]===":"?e.slice(3):qe(e.slice(2)),t]}let zi=0;const Ip=Promise.resolve(),Op=()=>zi||(Ip.then(()=>zi=0),zi=Date.now());function Np(e,t){const u=n=>{if(!n._vts)n._vts=Date.now();else if(n._vts<=u.attached)return;lt(Rp(n,u.value),t,5,[n])};return u.value=e,u.attached=Op(),u}function Rp(e,t){if($(t)){const u=e.stopImmediatePropagation;return e.stopImmediatePropagation=()=>{u.call(e),e._stopped=!0},t.map(n=>r=>!r._stopped&&n&&n(r))}else return t}const Dc=e=>e.charCodeAt(0)===111&&e.charCodeAt(1)===110&&e.charCodeAt(2)>96&&e.charCodeAt(2)<123,Mp=(e,t,u,n,r,i)=>{const s=r==="svg";t==="class"?Cp(e,n,s):t==="style"?vp(e,u,n):vu(t)?Rs(t)||wp(e,t,u,n,i):(t[0]==="."?(t=t.slice(1),!0):t[0]==="^"?(t=t.slice(1),!1):Pp(e,t,n,s))?(Ac(e,t,n),!e.tagName.includes("-")&&(t==="value"||t==="checked"||t==="selected")&&kc(e,t,n,s,i,t!=="value")):e._isVueCE&&(/[A-Z]/.test(t)||!Z(n))?Ac(e,de(t),n,i,t):(t==="true-value"?e._trueValue=n:t==="false-value"&&(e._falseValue=n),kc(e,t,n,s))};function Pp(e,t,u,n){if(n)return!!(t==="innerHTML"||t==="textContent"||t in e&&Dc(t)&&W(u));if(t==="spellcheck"||t==="draggable"||t==="translate"||t==="form"||t==="list"&&e.tagName==="INPUT"||t==="type"&&e.tagName==="TEXTAREA")return!1;if(t==="width"||t==="height"){const r=e.tagName;if(r==="IMG"||r==="VIDEO"||r==="CANVAS"||r==="SOURCE")return!1}return Dc(t)&&Z(u)?!1:t in e}const Tc={};/*! #__NO_SIDE_EFFECTS__ */function a0(e,t,u){const n=Kn(e,t);ti(n)&&te(n,t);class r extends Ei{constructor(s){super(n,s,u)}}return r.def=n,r}/*! #__NO_SIDE_EFFECTS__ */const Lp=(e,t)=>a0(e,t,C0),Bp=typeof HTMLElement<"u"?HTMLElement:class{};class Ei extends Bp{constructor(t,u={},n=Ur){super(),this._def=t,this._props=u,this._createApp=n,this._isVueCE=!0,this._instance=null,this._app=null,this._nonce=this._def.nonce,this._connected=!1,this._resolved=!1,this._numberProps=null,this._styleChildren=new WeakSet,this._ob=null,this.shadowRoot&&n!==Ur?this._root=this.shadowRoot:t.shadowRoot!==!1?(this.attachShadow({mode:"open"}),this._root=this.shadowRoot):this._root=this,this._def.__asyncLoader||this._resolveProps(this._def)}connectedCallback(){if(!this.isConnected)return;this.shadowRoot||this._parseSlots(),this._connected=!0;let t=this;for(;t=t&&(t.parentNode||t.host);)if(t instanceof Ei){this._parent=t;break}this._instance||(this._resolved?(this._setParent(),this._update()):t&&t._pendingResolve?this._pendingResolve=t._pendingResolve.then(()=>{this._pendingResolve=void 0,this._resolveDef()}):this._resolveDef())}_setParent(t=this._parent){t&&(this._instance.parent=t._instance,this._instance.provides=t._instance.provides)}disconnectedCallback(){this._connected=!1,fi(()=>{this._connected||(this._ob&&(this._ob.disconnect(),this._ob=null),this._app&&this._app.unmount(),this._instance&&(this._instance.ce=void 0),this._app=this._instance=null)})}_resolveDef(){if(this._pendingResolve)return;for(let n=0;n<this.attributes.length;n++)this._setAttr(this.attributes[n].name);this._ob=new MutationObserver(n=>{for(const r of n)this._setAttr(r.attributeName)}),this._ob.observe(this,{attributes:!0});const t=(n,r=!1)=>{this._resolved=!0,this._pendingResolve=void 0;const{props:i,styles:s}=n;let o;if(i&&!$(i))for(const c in i){const l=i[c];(l===Number||l&&l.type===Number)&&(c in this._props&&(this._props[c]=vr(this._props[c])),(o||(o=Object.create(null)))[de(c)]=!0)}this._numberProps=o,r&&this._resolveProps(n),this.shadowRoot&&this._applyStyles(s),this._mount(n)},u=this._def.__asyncLoader;u?this._pendingResolve=u().then(n=>t(this._def=n,!0)):t(this._def)}_mount(t){__VUE_PROD_DEVTOOLS__&&!t.name&&(t.name="VueElement"),this._app=this._createApp(t),t.configureApp&&t.configureApp(this._app),this._app._ceVNode=this._createVNode(),this._app.mount(this._root);const u=this._instance&&this._instance.exposed;if(u)for(const n in u)re(this,n)||Object.defineProperty(this,n,{get:()=>ai(u[n])})}_resolveProps(t){const{props:u}=t,n=$(u)?u:Object.keys(u||{});for(const r of Object.keys(this))r[0]!=="_"&&n.includes(r)&&this._setProp(r,this[r]);for(const r of n.map(de))Object.defineProperty(this,r,{get(){return this._getProp(r)},set(i){this._setProp(r,i,!0,!0)}})}_setAttr(t){if(t.startsWith("data-v-"))return;const u=this.hasAttribute(t);let n=u?this.getAttribute(t):Tc;const r=de(t);u&&this._numberProps&&this._numberProps[r]&&(n=vr(n)),this._setProp(r,n,!1,!0)}_getProp(t){return this._props[t]}_setProp(t,u,n=!0,r=!1){if(u!==this._props[t]&&(u===Tc?delete this._props[t]:(this._props[t]=u,t==="key"&&this._app&&(this._app._ceVNode.key=u)),r&&this._instance&&this._update(),n)){const i=this._ob;i&&i.disconnect(),u===!0?this.setAttribute(qe(t),""):typeof u=="string"||typeof u=="number"?this.setAttribute(qe(t),u+""):u||this.removeAttribute(qe(t)),i&&i.observe(this,{attributes:!0})}}_update(){E0(this._createVNode(),this._root)}_createVNode(){const t={};this.shadowRoot||(t.onVnodeMounted=t.onVnodeUpdated=this._renderSlots.bind(this));const u=fe(this._def,te(t,this._props));return this._instance||(u.ce=n=>{this._instance=n,n.ce=this,n.isCE=!0;const r=(i,s)=>{this.dispatchEvent(new CustomEvent(i,ti(s[0])?te({detail:s},s[0]):{detail:s}))};n.emit=(i,...s)=>{r(i,s),qe(i)!==i&&r(qe(i),s)},this._setParent()}),u}_applyStyles(t,u){if(!t)return;if(u){if(u===this._def||this._styleChildren.has(u))return;this._styleChildren.add(u)}const n=this._nonce;for(let r=t.length-1;r>=0;r--){const i=document.createElement("style");n&&i.setAttribute("nonce",n),i.textContent=t[r],this.shadowRoot.prepend(i)}}_parseSlots(){const t=this._slots={};let u;for(;u=this.firstChild;){const n=u.nodeType===1&&u.getAttribute("slot")||"default";(t[n]||(t[n]=[])).push(u),this.removeChild(u)}}_renderSlots(){const t=(this._teleportTarget||this).querySelectorAll("slot"),u=this._instance.type.__scopeId;for(let n=0;n<t.length;n++){const r=t[n],i=r.getAttribute("name")||"default",s=this._slots[i],o=r.parentNode;if(s)for(const c of s){if(u&&c.nodeType===1){const l=u+"-s",a=document.createTreeWalker(c,1);c.setAttribute(l,"");let f;for(;f=a.nextNode();)f.setAttribute(l,"")}o.insertBefore(c,r)}else for(;r.firstChild;)o.insertBefore(r.firstChild,r);o.removeChild(r)}}_injectChildStyle(t){this._applyStyles(t.styles,t)}_removeChildStyle(t){}}function f0(e){const t=ft(),u=t&&t.ce;return u||null}function Vp(){const e=f0();return e&&e.shadowRoot}function $p(e="$style"){{const t=ft();if(!t)return ee;const u=t.type.__cssModules;if(!u)return ee;const n=u[e];return n||ee}}const d0=new WeakMap,h0=new WeakMap,$r=Symbol("_moveCb"),wc=Symbol("_enterCb"),Hp=e=>(delete e.props.mode,e),Up=Hp({name:"TransitionGroup",props:te({},r0,{tag:String,moveClass:String}),setup(e,{slots:t}){const u=ft(),n=Js();let r,i;return bi(()=>{if(!r.length)return;const s=e.moveClass||`${e.name||"v"}-move`;if(!Wp(r[0].el,u.vnode.el,s))return;r.forEach(jp),r.forEach(zp);const o=r.filter(Kp);Es(),o.forEach(c=>{const l=c.el,a=l.style;Et(l,s),a.transform=a.webkitTransform=a.transitionDuration="";const f=l[$r]=p=>{p&&p.target!==l||(!p||/transform$/.test(p.propertyName))&&(l.removeEventListener("transitionend",f),l[$r]=null,Kt(l,s))};l.addEventListener("transitionend",f)})}),()=>{const s=Q(e),o=i0(s);let c=s.tag||Ee;if(r=[],i)for(let l=0;l<i.length;l++){const a=i[l];a.el&&a.el instanceof Element&&(r.push(a),Bt(a,zu(a,o,n,u)),d0.set(a,a.el.getBoundingClientRect()))}i=t.default?pi(t.default()):[];for(let l=0;l<i.length;l++){const a=i[l];a.key!=null&&Bt(a,zu(a,o,n,u))}return fe(c,null,i)}}}),qp=Up;function jp(e){const t=e.el;t[$r]&&t[$r](),t[wc]&&t[wc]()}function zp(e){h0.set(e,e.el.getBoundingClientRect())}function Kp(e){const t=d0.get(e),u=h0.get(e),n=t.left-u.left,r=t.top-u.top;if(n||r){const i=e.el.style;return i.transform=i.webkitTransform=`translate(${n}px,${r}px)`,i.transitionDuration="0s",e}}function Wp(e,t,u){const n=e.cloneNode(),r=e[Wu];r&&r.forEach(o=>{o.split(/\s+/).forEach(c=>c&&n.classList.remove(c))}),u.split(/\s+/).forEach(o=>o&&n.classList.add(o)),n.style.display="none";const i=t.nodeType===1?t:t.parentNode;i.appendChild(n);const{hasTransform:s}=s0(n);return i.removeChild(n),s}const cu=e=>{const t=e.props["onUpdate:modelValue"]||!1;return $(t)?u=>Vu(t,u):t};function Gp(e){e.target.composing=!0}function Fc(e){const t=e.target;t.composing&&(t.composing=!1,t.dispatchEvent(new Event("input")))}const ct=Symbol("_assign"),Hr={created(e,{modifiers:{lazy:t,trim:u,number:n}},r){e[ct]=cu(r);const i=n||r.props&&r.props.type==="number";Mt(e,t?"change":"input",s=>{if(s.target.composing)return;let o=e.value;u&&(o=o.trim()),i&&(o=Sr(o)),e[ct](o)}),u&&Mt(e,"change",()=>{e.value=e.value.trim()}),t||(Mt(e,"compositionstart",Gp),Mt(e,"compositionend",Fc),Mt(e,"change",Fc))},mounted(e,{value:t}){e.value=t??""},beforeUpdate(e,{value:t,oldValue:u,modifiers:{lazy:n,trim:r,number:i}},s){if(e[ct]=cu(s),e.composing)return;const o=(i||e.type==="number")&&!/^0\d/.test(e.value)?Sr(e.value):e.value,c=t??"";o!==c&&(document.activeElement===e&&e.type!=="range"&&(n&&t===u||r&&e.value.trim()===c)||(e.value=c))}},fo={deep:!0,created(e,t,u){e[ct]=cu(u),Mt(e,"change",()=>{const n=e._modelValue,r=Gu(e),i=e.checked,s=e[ct];if($(n)){const o=ni(n,r),c=o!==-1;if(i&&!c)s(n.concat(r));else if(!i&&c){const l=[...n];l.splice(o,1),s(l)}}else if(Du(n)){const o=new Set(n);i?o.add(r):o.delete(r),s(o)}else s(m0(e,i))})},mounted:Ic,beforeUpdate(e,t,u){e[ct]=cu(u),Ic(e,t,u)}};function Ic(e,{value:t,oldValue:u},n){e._modelValue=t;let r;if($(t))r=ni(t,n.props.value)>-1;else if(Du(t))r=t.has(n.props.value);else{if(t===u)return;r=su(t,m0(e,!0))}e.checked!==r&&(e.checked=r)}const ho={created(e,{value:t},u){e.checked=su(t,u.props.value),e[ct]=cu(u),Mt(e,"change",()=>{e[ct](Gu(e))})},beforeUpdate(e,{value:t,oldValue:u},n){e[ct]=cu(n),t!==u&&(e.checked=su(t,n.props.value))}},p0={deep:!0,created(e,{value:t,modifiers:{number:u}},n){const r=Du(t);Mt(e,"change",()=>{const i=Array.prototype.filter.call(e.options,s=>s.selected).map(s=>u?Sr(Gu(s)):Gu(s));e[ct](e.multiple?r?new Set(i):i:i[0]),e._assigning=!0,fi(()=>{e._assigning=!1})}),e[ct]=cu(n)},mounted(e,{value:t}){Oc(e,t)},beforeUpdate(e,t,u){e[ct]=cu(u)},updated(e,{value:t}){e._assigning||Oc(e,t)}};function Oc(e,t){const u=e.multiple,n=$(t);if(!(u&&!n&&!Du(t))){for(let r=0,i=e.options.length;r<i;r++){const s=e.options[r],o=Gu(s);if(u)if(n){const c=typeof o;c==="string"||c==="number"?s.selected=t.some(l=>String(l)===String(o)):s.selected=ni(t,o)>-1}else s.selected=t.has(o);else if(su(Gu(s),t)){e.selectedIndex!==r&&(e.selectedIndex=r);return}}!u&&e.selectedIndex!==-1&&(e.selectedIndex=-1)}}function Gu(e){return"_value"in e?e._value:e.value}function m0(e,t){const u=t?"_trueValue":"_falseValue";return u in e?e[u]:t}const b0={created(e,t,u){fr(e,t,u,null,"created")},mounted(e,t,u){fr(e,t,u,null,"mounted")},beforeUpdate(e,t,u,n){fr(e,t,u,n,"beforeUpdate")},updated(e,t,u,n){fr(e,t,u,n,"updated")}};function g0(e,t){switch(e){case"SELECT":return p0;case"TEXTAREA":return Hr;default:switch(t){case"checkbox":return fo;case"radio":return ho;default:return Hr}}}function fr(e,t,u,n,r){const s=g0(e.tagName,u.props&&u.props.type)[r];s&&s(e,t,u,n)}function Zp(){Hr.getSSRProps=({value:e})=>({value:e}),ho.getSSRProps=({value:e},t)=>{if(t.props&&su(t.props.value,e))return{checked:!0}},fo.getSSRProps=({value:e},t)=>{if($(e)){if(t.props&&ni(e,t.props.value)>-1)return{checked:!0}}else if(Du(e)){if(t.props&&e.has(t.props.value))return{checked:!0}}else if(e)return{checked:!0}},b0.getSSRProps=(e,t)=>{if(typeof t.type!="string")return;const u=g0(t.type.toUpperCase(),t.props&&t.props.type);if(u.getSSRProps)return u.getSSRProps(e,t)}}const Jp=["ctrl","shift","alt","meta"],Yp={stop:e=>e.stopPropagation(),prevent:e=>e.preventDefault(),self:e=>e.target!==e.currentTarget,ctrl:e=>!e.ctrlKey,shift:e=>!e.shiftKey,alt:e=>!e.altKey,meta:e=>!e.metaKey,left:e=>"button"in e&&e.button!==0,middle:e=>"button"in e&&e.button!==1,right:e=>"button"in e&&e.button!==2,exact:(e,t)=>Jp.some(u=>e[`${u}Key`]&&!t.includes(u))},Xp=(e,t)=>{const u=e._withMods||(e._withMods={}),n=t.join(".");return u[n]||(u[n]=(r,...i)=>{for(let s=0;s<t.length;s++){const o=Yp[t[s]];if(o&&o(r,t))return}return e(r,...i)})},Qp={esc:"escape",space:" ",up:"arrow-up",left:"arrow-left",right:"arrow-right",down:"arrow-down",delete:"backspace"},e2=(e,t)=>{const u=e._withKeys||(e._withKeys={}),n=t.join(".");return u[n]||(u[n]=r=>{if(!("key"in r))return;const i=qe(r.key);if(t.some(s=>s===i||Qp[s]===i))return e(r)})},_0=te({patchProp:Mp},gp);let An,Nc=!1;function y0(){return An||(An=Fa(_0))}function x0(){return An=Nc?An:Ia(_0),Nc=!0,An}const E0=(...e)=>{y0().render(...e)},t2=(...e)=>{x0().hydrate(...e)},Ur=(...e)=>{const t=y0().createApp(...e),{mount:u}=t;return t.mount=n=>{const r=A0(n);if(!r)return;const i=t._component;!W(i)&&!i.render&&!i.template&&(i.template=r.innerHTML),r.nodeType===1&&(r.textContent="");const s=u(r,!1,k0(r));return r instanceof Element&&(r.removeAttribute("v-cloak"),r.setAttribute("data-v-app","")),s},t},C0=(...e)=>{const t=x0().createApp(...e),{mount:u}=t;return t.mount=n=>{const r=A0(n);if(r)return u(r,!0,k0(r))},t};function k0(e){if(e instanceof SVGElement)return"svg";if(typeof MathMLElement=="function"&&e instanceof MathMLElement)return"mathml"}function A0(e){return Z(e)?document.querySelector(e):e}let Rc=!1;const u2=()=>{Rc||(Rc=!0,Zp(),kp())},n2=Object.freeze(Object.defineProperty({__proto__:null,BaseTransition:ra,BaseTransitionPropsValidators:Ys,Comment:ye,DeprecationTypes:pp,EffectScope:Vs,ErrorCodes:lh,ErrorTypeStrings:op,Fragment:Ee,KeepAlive:qh,ReactiveEffect:vn,Static:ru,Suspense:V1,Teleport:kh,Text:Pt,TrackOpTypes:Xd,Transition:yp,TransitionGroup:qp,TriggerOpTypes:Qd,VueElement:Ei,assertNumber:ch,callWithAsyncErrorHandling:lt,callWithErrorHandling:wu,camelize:de,capitalize:Tu,cloneVNode:vt,compatUtils:hp,computed:lo,createApp:Ur,createBlock:Rr,createCommentVNode:Z1,createElementBlock:z1,createElementVNode:so,createHydrationRenderer:Ia,createPropsRestProxy:f1,createRenderer:Fa,createSSRApp:C0,createSlots:Jh,createStaticVNode:G1,createTextVNode:oo,createVNode:fe,customRef:jl,defineAsyncComponent:Hh,defineComponent:Kn,defineCustomElement:a0,defineEmits:t1,defineExpose:u1,defineModel:i1,defineOptions:n1,defineProps:e1,defineSSRCustomElement:Lp,defineSlots:r1,devtools:cp,effect:xd,effectScope:gd,getCurrentInstance:ft,getCurrentScope:Sl,getCurrentWatcher:eh,getTransitionRawChildren:pi,guardReactiveProps:Ka,h:ao,handleError:Fu,hasInjectionContext:x1,hydrate:t2,hydrateOnIdle:Mh,hydrateOnInteraction:Vh,hydrateOnMediaQuery:Bh,hydrateOnVisible:Lh,initCustomFormatter:rp,initDirectivesForSSR:u2,inject:Cn,isMemoSame:t0,isProxy:li,isReactive:uu,isReadonly:ou,isRef:Ce,isRuntimeOnly:Q1,isShallow:ut,isVNode:Vt,markRaw:Ul,mergeDefaults:l1,mergeModels:a1,mergeProps:Wa,nextTick:fi,normalizeClass:nn,normalizeProps:td,normalizeStyle:un,onActivated:ca,onBeforeMount:fa,onBeforeUnmount:gi,onBeforeUpdate:Qs,onDeactivated:la,onErrorCaptured:ma,onMounted:Gn,onRenderTracked:pa,onRenderTriggered:ha,onScopeDispose:_d,onServerPrefetch:da,onUnmounted:_i,onUpdated:bi,onWatcherCleanup:Kl,openBlock:Mn,popScopeId:yh,provide:xa,proxyRefs:zs,pushScopeId:_h,queuePostFlushCb:wn,reactive:oi,readonly:qs,ref:yn,registerRuntimeCompiler:Ya,render:E0,renderList:Zh,renderSlot:Yh,resolveComponent:Kh,resolveDirective:Gh,resolveDynamicComponent:Wh,resolveFilter:dp,resolveTransitionHooks:zu,setBlockTracking:ps,setDevtoolsHook:lp,setTransitionHooks:Bt,shallowReactive:Hl,shallowReadonly:$d,shallowRef:js,ssrContextKey:Ma,ssrUtils:fp,stop:Ed,toDisplayString:kl,toHandlerKey:Bu,toHandlers:Xh,toRaw:Q,toRef:Zd,toRefs:Kd,toValue:qd,transformVNodeArgs:K1,triggerRef:Ud,unref:ai,useAttrs:c1,useCssModule:$p,useCssVars:Ap,useHost:f0,useId:Sh,useModel:O1,useSSRContext:Pa,useShadowRoot:Vp,useSlots:o1,useTemplateRef:vh,useTransitionState:Js,vModelCheckbox:fo,vModelDynamic:b0,vModelRadio:ho,vModelSelect:p0,vModelText:Hr,vShow:c0,version:ys,warn:sp,watch:Hu,watchEffect:w1,watchPostEffect:F1,watchSyncEffect:La,withAsyncContext:d1,withCtx:Zs,withDefaults:s1,withDirectives:Eh,withKeys:e2,withMemo:ip,withModifiers:Xp,withScopeId:xh},Symbol.toStringTag,{value:"Module"}));/**
+* @vue/compiler-core v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/const Pn=Symbol(""),Sn=Symbol(""),po=Symbol(""),qr=Symbol(""),S0=Symbol(""),Su=Symbol(""),v0=Symbol(""),D0=Symbol(""),mo=Symbol(""),bo=Symbol(""),Yn=Symbol(""),go=Symbol(""),T0=Symbol(""),_o=Symbol(""),yo=Symbol(""),xo=Symbol(""),Eo=Symbol(""),Co=Symbol(""),ko=Symbol(""),w0=Symbol(""),F0=Symbol(""),Ci=Symbol(""),jr=Symbol(""),Ao=Symbol(""),So=Symbol(""),Ln=Symbol(""),Xn=Symbol(""),vo=Symbol(""),ks=Symbol(""),r2=Symbol(""),As=Symbol(""),zr=Symbol(""),i2=Symbol(""),s2=Symbol(""),Do=Symbol(""),o2=Symbol(""),c2=Symbol(""),To=Symbol(""),I0=Symbol(""),Zu={[Pn]:"Fragment",[Sn]:"Teleport",[po]:"Suspense",[qr]:"KeepAlive",[S0]:"BaseTransition",[Su]:"openBlock",[v0]:"createBlock",[D0]:"createElementBlock",[mo]:"createVNode",[bo]:"createElementVNode",[Yn]:"createCommentVNode",[go]:"createTextVNode",[T0]:"createStaticVNode",[_o]:"resolveComponent",[yo]:"resolveDynamicComponent",[xo]:"resolveDirective",[Eo]:"resolveFilter",[Co]:"withDirectives",[ko]:"renderList",[w0]:"renderSlot",[F0]:"createSlots",[Ci]:"toDisplayString",[jr]:"mergeProps",[Ao]:"normalizeClass",[So]:"normalizeStyle",[Ln]:"normalizeProps",[Xn]:"guardReactiveProps",[vo]:"toHandlers",[ks]:"camelize",[r2]:"capitalize",[As]:"toHandlerKey",[zr]:"setBlockTracking",[i2]:"pushScopeId",[s2]:"popScopeId",[Do]:"withCtx",[o2]:"unref",[c2]:"isRef",[To]:"withMemo",[I0]:"isMemoSame"};function l2(e){Object.getOwnPropertySymbols(e).forEach(t=>{Zu[t]=e[t]})}const rt={start:{line:1,column:1,offset:0},end:{line:1,column:1,offset:0},source:""};function a2(e,t=""){return{type:0,source:t,children:e,helpers:new Set,components:[],directives:[],hoists:[],imports:[],cached:[],temps:0,codegenNode:void 0,loc:rt}}function Bn(e,t,u,n,r,i,s,o=!1,c=!1,l=!1,a=rt){return e&&(o?(e.helper(Su),e.helper(Xu(e.inSSR,l))):e.helper(Yu(e.inSSR,l)),s&&e.helper(Co)),{type:13,tag:t,props:u,children:n,patchFlag:r,dynamicProps:i,directives:s,isBlock:o,disableTracking:c,isComponent:l,loc:a}}function Eu(e,t=rt){return{type:17,loc:t,elements:e}}function ot(e,t=rt){return{type:15,loc:t,properties:e}}function xe(e,t){return{type:16,loc:rt,key:Z(e)?X(e,!0):e,value:t}}function X(e,t=!1,u=rt,n=0){return{type:4,loc:u,content:e,isStatic:t,constType:t?3:n}}function gt(e,t=rt){return{type:8,loc:t,children:e}}function Ae(e,t=[],u=rt){return{type:14,loc:u,callee:e,arguments:t}}function Ju(e,t=void 0,u=!1,n=!1,r=rt){return{type:18,params:e,returns:t,newline:u,isSlot:n,loc:r}}function Ss(e,t,u,n=!0){return{type:19,test:e,consequent:t,alternate:u,newline:n,loc:rt}}function f2(e,t,u=!1,n=!1){return{type:20,index:e,value:t,needPauseTracking:u,inVOnce:n,needArraySpread:!1,loc:rt}}function d2(e){return{type:21,body:e,loc:rt}}function Yu(e,t){return e||t?mo:bo}function Xu(e,t){return e||t?v0:D0}function wo(e,{helper:t,removeHelper:u,inSSR:n}){e.isBlock||(e.isBlock=!0,u(Yu(n,e.isComponent)),t(Su),t(Xu(n,e.isComponent)))}const Mc=new Uint8Array([123,123]),Pc=new Uint8Array([125,125]);function Lc(e){return e>=97&&e<=122||e>=65&&e<=90}function et(e){return e===32||e===10||e===9||e===12||e===13}function jt(e){return e===47||e===62||et(e)}function Kr(e){const t=new Uint8Array(e.length);for(let u=0;u<e.length;u++)t[u]=e.charCodeAt(u);return t}const Oe={Cdata:new Uint8Array([67,68,65,84,65,91]),CdataEnd:new Uint8Array([93,93,62]),CommentEnd:new Uint8Array([45,45,62]),ScriptEnd:new Uint8Array([60,47,115,99,114,105,112,116]),StyleEnd:new Uint8Array([60,47,115,116,121,108,101]),TitleEnd:new Uint8Array([60,47,116,105,116,108,101]),TextareaEnd:new Uint8Array([60,47,116,101,120,116,97,114,101,97])};class h2{constructor(t,u){this.stack=t,this.cbs=u,this.state=1,this.buffer="",this.sectionStart=0,this.index=0,this.entityStart=0,this.baseState=1,this.inRCDATA=!1,this.inXML=!1,this.inVPre=!1,this.newlines=[],this.mode=0,this.delimiterOpen=Mc,this.delimiterClose=Pc,this.delimiterIndex=-1,this.currentSequence=void 0,this.sequenceIndex=0}get inSFCRoot(){return this.mode===2&&this.stack.length===0}reset(){this.state=1,this.mode=0,this.buffer="",this.sectionStart=0,this.index=0,this.baseState=1,this.inRCDATA=!1,this.currentSequence=void 0,this.newlines.length=0,this.delimiterOpen=Mc,this.delimiterClose=Pc}getPos(t){let u=1,n=t+1;for(let r=this.newlines.length-1;r>=0;r--){const i=this.newlines[r];if(t>i){u=r+2,n=t-i;break}}return{column:n,line:u,offset:t}}peek(){return this.buffer.charCodeAt(this.index+1)}stateText(t){t===60?(this.index>this.sectionStart&&this.cbs.ontext(this.sectionStart,this.index),this.state=5,this.sectionStart=this.index):!this.inVPre&&t===this.delimiterOpen[0]&&(this.state=2,this.delimiterIndex=0,this.stateInterpolationOpen(t))}stateInterpolationOpen(t){if(t===this.delimiterOpen[this.delimiterIndex])if(this.delimiterIndex===this.delimiterOpen.length-1){const u=this.index+1-this.delimiterOpen.length;u>this.sectionStart&&this.cbs.ontext(this.sectionStart,u),this.state=3,this.sectionStart=u}else this.delimiterIndex++;else this.inRCDATA?(this.state=32,this.stateInRCDATA(t)):(this.state=1,this.stateText(t))}stateInterpolation(t){t===this.delimiterClose[0]&&(this.state=4,this.delimiterIndex=0,this.stateInterpolationClose(t))}stateInterpolationClose(t){t===this.delimiterClose[this.delimiterIndex]?this.delimiterIndex===this.delimiterClose.length-1?(this.cbs.oninterpolation(this.sectionStart,this.index+1),this.inRCDATA?this.state=32:this.state=1,this.sectionStart=this.index+1):this.delimiterIndex++:(this.state=3,this.stateInterpolation(t))}stateSpecialStartSequence(t){const u=this.sequenceIndex===this.currentSequence.length;if(!(u?jt(t):(t|32)===this.currentSequence[this.sequenceIndex]))this.inRCDATA=!1;else if(!u){this.sequenceIndex++;return}this.sequenceIndex=0,this.state=6,this.stateInTagName(t)}stateInRCDATA(t){if(this.sequenceIndex===this.currentSequence.length){if(t===62||et(t)){const u=this.index-this.currentSequence.length;if(this.sectionStart<u){const n=this.index;this.index=u,this.cbs.ontext(this.sectionStart,u),this.index=n}this.sectionStart=u+2,this.stateInClosingTagName(t),this.inRCDATA=!1;return}this.sequenceIndex=0}(t|32)===this.currentSequence[this.sequenceIndex]?this.sequenceIndex+=1:this.sequenceIndex===0?this.currentSequence===Oe.TitleEnd||this.currentSequence===Oe.TextareaEnd&&!this.inSFCRoot?!this.inVPre&&t===this.delimiterOpen[0]&&(this.state=2,this.delimiterIndex=0,this.stateInterpolationOpen(t)):this.fastForwardTo(60)&&(this.sequenceIndex=1):this.sequenceIndex=+(t===60)}stateCDATASequence(t){t===Oe.Cdata[this.sequenceIndex]?++this.sequenceIndex===Oe.Cdata.length&&(this.state=28,this.currentSequence=Oe.CdataEnd,this.sequenceIndex=0,this.sectionStart=this.index+1):(this.sequenceIndex=0,this.state=23,this.stateInDeclaration(t))}fastForwardTo(t){for(;++this.index<this.buffer.length;){const u=this.buffer.charCodeAt(this.index);if(u===10&&this.newlines.push(this.index),u===t)return!0}return this.index=this.buffer.length-1,!1}stateInCommentLike(t){t===this.currentSequence[this.sequenceIndex]?++this.sequenceIndex===this.currentSequence.length&&(this.currentSequence===Oe.CdataEnd?this.cbs.oncdata(this.sectionStart,this.index-2):this.cbs.oncomment(this.sectionStart,this.index-2),this.sequenceIndex=0,this.sectionStart=this.index+1,this.state=1):this.sequenceIndex===0?this.fastForwardTo(this.currentSequence[0])&&(this.sequenceIndex=1):t!==this.currentSequence[this.sequenceIndex-1]&&(this.sequenceIndex=0)}startSpecial(t,u){this.enterRCDATA(t,u),this.state=31}enterRCDATA(t,u){this.inRCDATA=!0,this.currentSequence=t,this.sequenceIndex=u}stateBeforeTagName(t){t===33?(this.state=22,this.sectionStart=this.index+1):t===63?(this.state=24,this.sectionStart=this.index+1):Lc(t)?(this.sectionStart=this.index,this.mode===0?this.state=6:this.inSFCRoot?this.state=34:this.inXML?this.state=6:t===116?this.state=30:this.state=t===115?29:6):t===47?this.state=8:(this.state=1,this.stateText(t))}stateInTagName(t){jt(t)&&this.handleTagName(t)}stateInSFCRootTagName(t){if(jt(t)){const u=this.buffer.slice(this.sectionStart,this.index);u!=="template"&&this.enterRCDATA(Kr("</"+u),0),this.handleTagName(t)}}handleTagName(t){this.cbs.onopentagname(this.sectionStart,this.index),this.sectionStart=-1,this.state=11,this.stateBeforeAttrName(t)}stateBeforeClosingTagName(t){et(t)||(t===62?(this.state=1,this.sectionStart=this.index+1):(this.state=Lc(t)?9:27,this.sectionStart=this.index))}stateInClosingTagName(t){(t===62||et(t))&&(this.cbs.onclosetag(this.sectionStart,this.index),this.sectionStart=-1,this.state=10,this.stateAfterClosingTagName(t))}stateAfterClosingTagName(t){t===62&&(this.state=1,this.sectionStart=this.index+1)}stateBeforeAttrName(t){t===62?(this.cbs.onopentagend(this.index),this.inRCDATA?this.state=32:this.state=1,this.sectionStart=this.index+1):t===47?this.state=7:t===60&&this.peek()===47?(this.cbs.onopentagend(this.index),this.state=5,this.sectionStart=this.index):et(t)||this.handleAttrStart(t)}handleAttrStart(t){t===118&&this.peek()===45?(this.state=13,this.sectionStart=this.index):t===46||t===58||t===64||t===35?(this.cbs.ondirname(this.index,this.index+1),this.state=14,this.sectionStart=this.index+1):(this.state=12,this.sectionStart=this.index)}stateInSelfClosingTag(t){t===62?(this.cbs.onselfclosingtag(this.index),this.state=1,this.sectionStart=this.index+1,this.inRCDATA=!1):et(t)||(this.state=11,this.stateBeforeAttrName(t))}stateInAttrName(t){(t===61||jt(t))&&(this.cbs.onattribname(this.sectionStart,this.index),this.handleAttrNameEnd(t))}stateInDirName(t){t===61||jt(t)?(this.cbs.ondirname(this.sectionStart,this.index),this.handleAttrNameEnd(t)):t===58?(this.cbs.ondirname(this.sectionStart,this.index),this.state=14,this.sectionStart=this.index+1):t===46&&(this.cbs.ondirname(this.sectionStart,this.index),this.state=16,this.sectionStart=this.index+1)}stateInDirArg(t){t===61||jt(t)?(this.cbs.ondirarg(this.sectionStart,this.index),this.handleAttrNameEnd(t)):t===91?this.state=15:t===46&&(this.cbs.ondirarg(this.sectionStart,this.index),this.state=16,this.sectionStart=this.index+1)}stateInDynamicDirArg(t){t===93?this.state=14:(t===61||jt(t))&&(this.cbs.ondirarg(this.sectionStart,this.index+1),this.handleAttrNameEnd(t))}stateInDirModifier(t){t===61||jt(t)?(this.cbs.ondirmodifier(this.sectionStart,this.index),this.handleAttrNameEnd(t)):t===46&&(this.cbs.ondirmodifier(this.sectionStart,this.index),this.sectionStart=this.index+1)}handleAttrNameEnd(t){this.sectionStart=this.index,this.state=17,this.cbs.onattribnameend(this.index),this.stateAfterAttrName(t)}stateAfterAttrName(t){t===61?this.state=18:t===47||t===62?(this.cbs.onattribend(0,this.sectionStart),this.sectionStart=-1,this.state=11,this.stateBeforeAttrName(t)):et(t)||(this.cbs.onattribend(0,this.sectionStart),this.handleAttrStart(t))}stateBeforeAttrValue(t){t===34?(this.state=19,this.sectionStart=this.index+1):t===39?(this.state=20,this.sectionStart=this.index+1):et(t)||(this.sectionStart=this.index,this.state=21,this.stateInAttrValueNoQuotes(t))}handleInAttrValue(t,u){(t===u||this.fastForwardTo(u))&&(this.cbs.onattribdata(this.sectionStart,this.index),this.sectionStart=-1,this.cbs.onattribend(u===34?3:2,this.index+1),this.state=11)}stateInAttrValueDoubleQuotes(t){this.handleInAttrValue(t,34)}stateInAttrValueSingleQuotes(t){this.handleInAttrValue(t,39)}stateInAttrValueNoQuotes(t){et(t)||t===62?(this.cbs.onattribdata(this.sectionStart,this.index),this.sectionStart=-1,this.cbs.onattribend(1,this.index),this.state=11,this.stateBeforeAttrName(t)):(t===39||t===60||t===61||t===96)&&this.cbs.onerr(18,this.index)}stateBeforeDeclaration(t){t===91?(this.state=26,this.sequenceIndex=0):this.state=t===45?25:23}stateInDeclaration(t){(t===62||this.fastForwardTo(62))&&(this.state=1,this.sectionStart=this.index+1)}stateInProcessingInstruction(t){(t===62||this.fastForwardTo(62))&&(this.cbs.onprocessinginstruction(this.sectionStart,this.index),this.state=1,this.sectionStart=this.index+1)}stateBeforeComment(t){t===45?(this.state=28,this.currentSequence=Oe.CommentEnd,this.sequenceIndex=2,this.sectionStart=this.index+1):this.state=23}stateInSpecialComment(t){(t===62||this.fastForwardTo(62))&&(this.cbs.oncomment(this.sectionStart,this.index),this.state=1,this.sectionStart=this.index+1)}stateBeforeSpecialS(t){t===Oe.ScriptEnd[3]?this.startSpecial(Oe.ScriptEnd,4):t===Oe.StyleEnd[3]?this.startSpecial(Oe.StyleEnd,4):(this.state=6,this.stateInTagName(t))}stateBeforeSpecialT(t){t===Oe.TitleEnd[3]?this.startSpecial(Oe.TitleEnd,4):t===Oe.TextareaEnd[3]?this.startSpecial(Oe.TextareaEnd,4):(this.state=6,this.stateInTagName(t))}startEntity(){}stateInEntity(){}parse(t){for(this.buffer=t;this.index<this.buffer.length;){const u=this.buffer.charCodeAt(this.index);switch(u===10&&this.newlines.push(this.index),this.state){case 1:{this.stateText(u);break}case 2:{this.stateInterpolationOpen(u);break}case 3:{this.stateInterpolation(u);break}case 4:{this.stateInterpolationClose(u);break}case 31:{this.stateSpecialStartSequence(u);break}case 32:{this.stateInRCDATA(u);break}case 26:{this.stateCDATASequence(u);break}case 19:{this.stateInAttrValueDoubleQuotes(u);break}case 12:{this.stateInAttrName(u);break}case 13:{this.stateInDirName(u);break}case 14:{this.stateInDirArg(u);break}case 15:{this.stateInDynamicDirArg(u);break}case 16:{this.stateInDirModifier(u);break}case 28:{this.stateInCommentLike(u);break}case 27:{this.stateInSpecialComment(u);break}case 11:{this.stateBeforeAttrName(u);break}case 6:{this.stateInTagName(u);break}case 34:{this.stateInSFCRootTagName(u);break}case 9:{this.stateInClosingTagName(u);break}case 5:{this.stateBeforeTagName(u);break}case 17:{this.stateAfterAttrName(u);break}case 20:{this.stateInAttrValueSingleQuotes(u);break}case 18:{this.stateBeforeAttrValue(u);break}case 8:{this.stateBeforeClosingTagName(u);break}case 10:{this.stateAfterClosingTagName(u);break}case 29:{this.stateBeforeSpecialS(u);break}case 30:{this.stateBeforeSpecialT(u);break}case 21:{this.stateInAttrValueNoQuotes(u);break}case 7:{this.stateInSelfClosingTag(u);break}case 23:{this.stateInDeclaration(u);break}case 22:{this.stateBeforeDeclaration(u);break}case 25:{this.stateBeforeComment(u);break}case 24:{this.stateInProcessingInstruction(u);break}case 33:{this.stateInEntity();break}}this.index++}this.cleanup(),this.finish()}cleanup(){this.sectionStart!==this.index&&(this.state===1||this.state===32&&this.sequenceIndex===0?(this.cbs.ontext(this.sectionStart,this.index),this.sectionStart=this.index):(this.state===19||this.state===20||this.state===21)&&(this.cbs.onattribdata(this.sectionStart,this.index),this.sectionStart=this.index))}finish(){this.handleTrailingData(),this.cbs.onend()}handleTrailingData(){const t=this.buffer.length;this.sectionStart>=t||(this.state===28?this.currentSequence===Oe.CdataEnd?this.cbs.oncdata(this.sectionStart,t):this.cbs.oncomment(this.sectionStart,t):this.state===6||this.state===11||this.state===18||this.state===17||this.state===12||this.state===13||this.state===14||this.state===15||this.state===16||this.state===20||this.state===19||this.state===21||this.state===9||this.cbs.ontext(this.sectionStart,t))}emitCodePoint(t,u){}}function Bc(e,{compatConfig:t}){const u=t&&t[e];return e==="MODE"?u||3:u}function Cu(e,t){const u=Bc("MODE",t),n=Bc(e,t);return u===3?n===!0:n!==!1}function Vn(e,t,u,...n){return Cu(e,t)}function Fo(e){throw e}function O0(e){}function me(e,t,u,n){const r=`https://vuejs.org/error-reference/#compiler-${e}`,i=new SyntaxError(String(r));return i.code=e,i.loc=t,i}const Ge=e=>e.type===4&&e.isStatic;function N0(e){switch(e){case"Teleport":case"teleport":return Sn;case"Suspense":case"suspense":return po;case"KeepAlive":case"keep-alive":return qr;case"BaseTransition":case"base-transition":return S0}}const p2=/^\d|[^\$\w\xA0-\uFFFF]/,Io=e=>!p2.test(e),m2=/[A-Za-z_$\xA0-\uFFFF]/,b2=/[\.\?\w$\xA0-\uFFFF]/,g2=/\s+[.[]\s*|\s*[.[]\s+/g,R0=e=>e.type===4?e.content:e.loc.source,_2=e=>{const t=R0(e).trim().replace(g2,o=>o.trim());let u=0,n=[],r=0,i=0,s=null;for(let o=0;o<t.length;o++){const c=t.charAt(o);switch(u){case 0:if(c==="[")n.push(u),u=1,r++;else if(c==="(")n.push(u),u=2,i++;else if(!(o===0?m2:b2).test(c))return!1;break;case 1:c==="'"||c==='"'||c==="`"?(n.push(u),u=3,s=c):c==="["?r++:c==="]"&&(--r||(u=n.pop()));break;case 2:if(c==="'"||c==='"'||c==="`")n.push(u),u=3,s=c;else if(c==="(")i++;else if(c===")"){if(o===t.length-1)return!1;--i||(u=n.pop())}break;case 3:c===s&&(u=n.pop(),s=null);break}}return!r&&!i},M0=_2,y2=/^\s*(async\s*)?(\([^)]*?\)|[\w$_]+)\s*(:[^=]+)?=>|^\s*(async\s+)?function(?:\s+[\w$]+)?\s*\(/,x2=e=>y2.test(R0(e)),E2=x2;function st(e,t,u=!1){for(let n=0;n<e.props.length;n++){const r=e.props[n];if(r.type===7&&(u||r.exp)&&(Z(t)?r.name===t:t.test(r.name)))return r}}function ki(e,t,u=!1,n=!1){for(let r=0;r<e.props.length;r++){const i=e.props[r];if(i.type===6){if(u)continue;if(i.name===t&&(i.value||n))return i}else if(i.name==="bind"&&(i.exp||n)&&mu(i.arg,t))return i}}function mu(e,t){return!!(e&&Ge(e)&&e.content===t)}function C2(e){return e.props.some(t=>t.type===7&&t.name==="bind"&&(!t.arg||t.arg.type!==4||!t.arg.isStatic))}function Ki(e){return e.type===5||e.type===2}function k2(e){return e.type===7&&e.name==="slot"}function Wr(e){return e.type===1&&e.tagType===3}function Gr(e){return e.type===1&&e.tagType===2}const A2=new Set([Ln,Xn]);function P0(e,t=[]){if(e&&!Z(e)&&e.type===14){const u=e.callee;if(!Z(u)&&A2.has(u))return P0(e.arguments[0],t.concat(e))}return[e,t]}function Zr(e,t,u){let n,r=e.type===13?e.props:e.arguments[2],i=[],s;if(r&&!Z(r)&&r.type===14){const o=P0(r);r=o[0],i=o[1],s=i[i.length-1]}if(r==null||Z(r))n=ot([t]);else if(r.type===14){const o=r.arguments[0];!Z(o)&&o.type===15?Vc(t,o)||o.properties.unshift(t):r.callee===vo?n=Ae(u.helper(jr),[ot([t]),r]):r.arguments.unshift(ot([t])),!n&&(n=r)}else r.type===15?(Vc(t,r)||r.properties.unshift(t),n=r):(n=Ae(u.helper(jr),[ot([t]),r]),s&&s.callee===Xn&&(s=i[i.length-2]));e.type===13?s?s.arguments[0]=n:e.props=n:s?s.arguments[0]=n:e.arguments[2]=n}function Vc(e,t){let u=!1;if(e.key.type===4){const n=e.key.content;u=t.properties.some(r=>r.key.type===4&&r.key.content===n)}return u}function $n(e,t){return`_${t}_${e.replace(/[^\w]/g,(u,n)=>u==="-"?"_":e.charCodeAt(n).toString())}`}function S2(e){return e.type===14&&e.callee===To?e.arguments[1].returns:e}const v2=/([\s\S]*?)\s+(?:in|of)\s+(\S[\s\S]*)/,L0={parseMode:"base",ns:0,delimiters:["{{","}}"],getNamespace:()=>0,isVoidTag:dn,isPreTag:dn,isIgnoreNewlineTag:dn,isCustomElement:dn,onError:Fo,onWarn:O0,comments:!1,prefixIdentifiers:!1};let ie=L0,Hn=null,Lt="",Ne=null,ne=null,ze="",It=-1,hu=-1,Oo=0,Yt=!1,vs=null;const pe=[],ge=new h2(pe,{onerr:Ft,ontext(e,t){dr(Fe(e,t),e,t)},ontextentity(e,t,u){dr(e,t,u)},oninterpolation(e,t){if(Yt)return dr(Fe(e,t),e,t);let u=e+ge.delimiterOpen.length,n=t-ge.delimiterClose.length;for(;et(Lt.charCodeAt(u));)u++;for(;et(Lt.charCodeAt(n-1));)n--;let r=Fe(u,n);r.includes("&")&&(r=ie.decodeEntities(r,!1)),Ds({type:5,content:Cr(r,!1,_e(u,n)),loc:_e(e,t)})},onopentagname(e,t){const u=Fe(e,t);Ne={type:1,tag:u,ns:ie.getNamespace(u,pe[0],ie.ns),tagType:0,props:[],children:[],loc:_e(e-1,t),codegenNode:void 0}},onopentagend(e){Hc(e)},onclosetag(e,t){const u=Fe(e,t);if(!ie.isVoidTag(u)){let n=!1;for(let r=0;r<pe.length;r++)if(pe[r].tag.toLowerCase()===u.toLowerCase()){n=!0,r>0&&Ft(24,pe[0].loc.start.offset);for(let s=0;s<=r;s++){const o=pe.shift();Er(o,t,s<r)}break}n||Ft(23,B0(e,60))}},onselfclosingtag(e){const t=Ne.tag;Ne.isSelfClosing=!0,Hc(e),pe[0]&&pe[0].tag===t&&Er(pe.shift(),e)},onattribname(e,t){ne={type:6,name:Fe(e,t),nameLoc:_e(e,t),value:void 0,loc:_e(e)}},ondirname(e,t){const u=Fe(e,t),n=u==="."||u===":"?"bind":u==="@"?"on":u==="#"?"slot":u.slice(2);if(!Yt&&n===""&&Ft(26,e),Yt||n==="")ne={type:6,name:u,nameLoc:_e(e,t),value:void 0,loc:_e(e)};else if(ne={type:7,name:n,rawName:u,exp:void 0,arg:void 0,modifiers:u==="."?[X("prop")]:[],loc:_e(e)},n==="pre"){Yt=ge.inVPre=!0,vs=Ne;const r=Ne.props;for(let i=0;i<r.length;i++)r[i].type===7&&(r[i]=L2(r[i]))}},ondirarg(e,t){if(e===t)return;const u=Fe(e,t);if(Yt)ne.name+=u,bu(ne.nameLoc,t);else{const n=u[0]!=="[";ne.arg=Cr(n?u:u.slice(1,-1),n,_e(e,t),n?3:0)}},ondirmodifier(e,t){const u=Fe(e,t);if(Yt)ne.name+="."+u,bu(ne.nameLoc,t);else if(ne.name==="slot"){const n=ne.arg;n&&(n.content+="."+u,bu(n.loc,t))}else{const n=X(u,!0,_e(e,t));ne.modifiers.push(n)}},onattribdata(e,t){ze+=Fe(e,t),It<0&&(It=e),hu=t},onattribentity(e,t,u){ze+=e,It<0&&(It=t),hu=u},onattribnameend(e){const t=ne.loc.start.offset,u=Fe(t,e);ne.type===7&&(ne.rawName=u),Ne.props.some(n=>(n.type===7?n.rawName:n.name)===u)&&Ft(2,t)},onattribend(e,t){if(Ne&&ne){if(bu(ne.loc,t),e!==0)if(ze.includes("&")&&(ze=ie.decodeEntities(ze,!0)),ne.type===6)ne.name==="class"&&(ze=$0(ze).trim()),e===1&&!ze&&Ft(13,t),ne.value={type:2,content:ze,loc:e===1?_e(It,hu):_e(It-1,hu+1)},ge.inSFCRoot&&Ne.tag==="template"&&ne.name==="lang"&&ze&&ze!=="html"&&ge.enterRCDATA(Kr("</template"),0);else{let u=0;ne.exp=Cr(ze,!1,_e(It,hu),0,u),ne.name==="for"&&(ne.forParseResult=T2(ne.exp));let n=-1;ne.name==="bind"&&(n=ne.modifiers.findIndex(r=>r.content==="sync"))>-1&&Vn("COMPILER_V_BIND_SYNC",ie,ne.loc,ne.rawName)&&(ne.name="model",ne.modifiers.splice(n,1))}(ne.type!==7||ne.name!=="pre")&&Ne.props.push(ne)}ze="",It=hu=-1},oncomment(e,t){ie.comments&&Ds({type:3,content:Fe(e,t),loc:_e(e-4,t+3)})},onend(){const e=Lt.length;for(let t=0;t<pe.length;t++)Er(pe[t],e-1),Ft(24,pe[t].loc.start.offset)},oncdata(e,t){pe[0].ns!==0?dr(Fe(e,t),e,t):Ft(1,e-9)},onprocessinginstruction(e){(pe[0]?pe[0].ns:ie.ns)===0&&Ft(21,e-1)}}),$c=/,([^,\}\]]*)(?:,([^,\}\]]*))?$/,D2=/^\(|\)$/g;function T2(e){const t=e.loc,u=e.content,n=u.match(v2);if(!n)return;const[,r,i]=n,s=(f,p,d=!1)=>{const h=t.start.offset+p,_=h+f.length;return Cr(f,!1,_e(h,_),0,d?1:0)},o={source:s(i.trim(),u.indexOf(i,r.length)),value:void 0,key:void 0,index:void 0,finalized:!1};let c=r.trim().replace(D2,"").trim();const l=r.indexOf(c),a=c.match($c);if(a){c=c.replace($c,"").trim();const f=a[1].trim();let p;if(f&&(p=u.indexOf(f,l+c.length),o.key=s(f,p,!0)),a[2]){const d=a[2].trim();d&&(o.index=s(d,u.indexOf(d,o.key?p+f.length:l+c.length),!0))}}return c&&(o.value=s(c,l,!0)),o}function Fe(e,t){return Lt.slice(e,t)}function Hc(e){ge.inSFCRoot&&(Ne.innerLoc=_e(e+1,e+1)),Ds(Ne);const{tag:t,ns:u}=Ne;u===0&&ie.isPreTag(t)&&Oo++,ie.isVoidTag(t)?Er(Ne,e):(pe.unshift(Ne),(u===1||u===2)&&(ge.inXML=!0)),Ne=null}function dr(e,t,u){{const i=pe[0]&&pe[0].tag;i!=="script"&&i!=="style"&&e.includes("&")&&(e=ie.decodeEntities(e,!1))}const n=pe[0]||Hn,r=n.children[n.children.length-1];r&&r.type===2?(r.content+=e,bu(r.loc,u)):n.children.push({type:2,content:e,loc:_e(t,u)})}function Er(e,t,u=!1){u?bu(e.loc,B0(t,60)):bu(e.loc,w2(t,62)+1),ge.inSFCRoot&&(e.children.length?e.innerLoc.end=te({},e.children[e.children.length-1].loc.end):e.innerLoc.end=te({},e.innerLoc.start),e.innerLoc.source=Fe(e.innerLoc.start.offset,e.innerLoc.end.offset));const{tag:n,ns:r,children:i}=e;if(Yt||(n==="slot"?e.tagType=2:Uc(e)?e.tagType=3:I2(e)&&(e.tagType=1)),ge.inRCDATA||(e.children=V0(i)),r===0&&ie.isIgnoreNewlineTag(n)){const s=i[0];s&&s.type===2&&(s.content=s.content.replace(/^\r?\n/,""))}r===0&&ie.isPreTag(n)&&Oo--,vs===e&&(Yt=ge.inVPre=!1,vs=null),ge.inXML&&(pe[0]?pe[0].ns:ie.ns)===0&&(ge.inXML=!1);{const s=e.props;if(!ge.inSFCRoot&&Cu("COMPILER_NATIVE_TEMPLATE",ie)&&e.tag==="template"&&!Uc(e)){const c=pe[0]||Hn,l=c.children.indexOf(e);c.children.splice(l,1,...e.children)}const o=s.find(c=>c.type===6&&c.name==="inline-template");o&&Vn("COMPILER_INLINE_TEMPLATE",ie,o.loc)&&e.children.length&&(o.value={type:2,content:Fe(e.children[0].loc.start.offset,e.children[e.children.length-1].loc.end.offset),loc:o.loc})}}function w2(e,t){let u=e;for(;Lt.charCodeAt(u)!==t&&u<Lt.length-1;)u++;return u}function B0(e,t){let u=e;for(;Lt.charCodeAt(u)!==t&&u>=0;)u--;return u}const F2=new Set(["if","else","else-if","for","slot"]);function Uc({tag:e,props:t}){if(e==="template"){for(let u=0;u<t.length;u++)if(t[u].type===7&&F2.has(t[u].name))return!0}return!1}function I2({tag:e,props:t}){if(ie.isCustomElement(e))return!1;if(e==="component"||O2(e.charCodeAt(0))||N0(e)||ie.isBuiltInComponent&&ie.isBuiltInComponent(e)||ie.isNativeTag&&!ie.isNativeTag(e))return!0;for(let u=0;u<t.length;u++){const n=t[u];if(n.type===6){if(n.name==="is"&&n.value){if(n.value.content.startsWith("vue:"))return!0;if(Vn("COMPILER_IS_ON_ELEMENT",ie,n.loc))return!0}}else if(n.name==="bind"&&mu(n.arg,"is")&&Vn("COMPILER_IS_ON_ELEMENT",ie,n.loc))return!0}return!1}function O2(e){return e>64&&e<91}const N2=/\r\n/g;function V0(e,t){const u=ie.whitespace!=="preserve";let n=!1;for(let r=0;r<e.length;r++){const i=e[r];if(i.type===2)if(Oo)i.content=i.content.replace(N2,`
+`);else if(R2(i.content)){const s=e[r-1]&&e[r-1].type,o=e[r+1]&&e[r+1].type;!s||!o||u&&(s===3&&(o===3||o===1)||s===1&&(o===3||o===1&&M2(i.content)))?(n=!0,e[r]=null):i.content=" "}else u&&(i.content=$0(i.content))}return n?e.filter(Boolean):e}function R2(e){for(let t=0;t<e.length;t++)if(!et(e.charCodeAt(t)))return!1;return!0}function M2(e){for(let t=0;t<e.length;t++){const u=e.charCodeAt(t);if(u===10||u===13)return!0}return!1}function $0(e){let t="",u=!1;for(let n=0;n<e.length;n++)et(e.charCodeAt(n))?u||(t+=" ",u=!0):(t+=e[n],u=!1);return t}function Ds(e){(pe[0]||Hn).children.push(e)}function _e(e,t){return{start:ge.getPos(e),end:t==null?t:ge.getPos(t),source:t==null?t:Fe(e,t)}}function P2(e){return _e(e.start.offset,e.end.offset)}function bu(e,t){e.end=ge.getPos(t),e.source=Fe(e.start.offset,t)}function L2(e){const t={type:6,name:e.rawName,nameLoc:_e(e.loc.start.offset,e.loc.start.offset+e.rawName.length),value:void 0,loc:e.loc};if(e.exp){const u=e.exp.loc;u.end.offset<e.loc.end.offset&&(u.start.offset--,u.start.column--,u.end.offset++,u.end.column++),t.value={type:2,content:e.exp.content,loc:u}}return t}function Cr(e,t=!1,u,n=0,r=0){return X(e,t,u,n)}function Ft(e,t,u){ie.onError(me(e,_e(t,t)))}function B2(){ge.reset(),Ne=null,ne=null,ze="",It=-1,hu=-1,pe.length=0}function V2(e,t){if(B2(),Lt=e,ie=te({},L0),t){let r;for(r in t)t[r]!=null&&(ie[r]=t[r])}ge.mode=ie.parseMode==="html"?1:ie.parseMode==="sfc"?2:0,ge.inXML=ie.ns===1||ie.ns===2;const u=t&&t.delimiters;u&&(ge.delimiterOpen=Kr(u[0]),ge.delimiterClose=Kr(u[1]));const n=Hn=a2([],e);return ge.parse(Lt),n.loc=_e(0,e.length),n.children=V0(n.children),Hn=null,n}function $2(e,t){kr(e,void 0,t,H0(e,e.children[0]))}function H0(e,t){const{children:u}=e;return u.length===1&&t.type===1&&!Gr(t)}function kr(e,t,u,n=!1,r=!1){const{children:i}=e,s=[];for(let a=0;a<i.length;a++){const f=i[a];if(f.type===1&&f.tagType===0){const p=n?0:tt(f,u);if(p>0){if(p>=2){f.codegenNode.patchFlag=-1,s.push(f);continue}}else{const d=f.codegenNode;if(d.type===13){const h=d.patchFlag;if((h===void 0||h===512||h===1)&&q0(f,u)>=2){const _=j0(f);_&&(d.props=u.hoist(_))}d.dynamicProps&&(d.dynamicProps=u.hoist(d.dynamicProps))}}}else if(f.type===12&&(n?0:tt(f,u))>=2){s.push(f);continue}if(f.type===1){const p=f.tagType===1;p&&u.scopes.vSlot++,kr(f,e,u,!1,r),p&&u.scopes.vSlot--}else if(f.type===11)kr(f,e,u,f.children.length===1,!0);else if(f.type===9)for(let p=0;p<f.branches.length;p++)kr(f.branches[p],e,u,f.branches[p].children.length===1,r)}let o=!1;if(s.length===i.length&&e.type===1){if(e.tagType===0&&e.codegenNode&&e.codegenNode.type===13&&$(e.codegenNode.children))e.codegenNode.children=c(Eu(e.codegenNode.children)),o=!0;else if(e.tagType===1&&e.codegenNode&&e.codegenNode.type===13&&e.codegenNode.children&&!$(e.codegenNode.children)&&e.codegenNode.children.type===15){const a=l(e.codegenNode,"default");a&&(a.returns=c(Eu(a.returns)),o=!0)}else if(e.tagType===3&&t&&t.type===1&&t.tagType===1&&t.codegenNode&&t.codegenNode.type===13&&t.codegenNode.children&&!$(t.codegenNode.children)&&t.codegenNode.children.type===15){const a=st(e,"slot",!0),f=a&&a.arg&&l(t.codegenNode,a.arg);f&&(f.returns=c(Eu(f.returns)),o=!0)}}if(!o)for(const a of s)a.codegenNode=u.cache(a.codegenNode);function c(a){const f=u.cache(a);return r&&u.hmr&&(f.needArraySpread=!0),f}function l(a,f){if(a.children&&!$(a.children)&&a.children.type===15){const p=a.children.properties.find(d=>d.key===f||d.key.content===f);return p&&p.value}}s.length&&u.transformHoist&&u.transformHoist(i,u,e)}function tt(e,t){const{constantCache:u}=t;switch(e.type){case 1:if(e.tagType!==0)return 0;const n=u.get(e);if(n!==void 0)return n;const r=e.codegenNode;if(r.type!==13||r.isBlock&&e.tag!=="svg"&&e.tag!=="foreignObject"&&e.tag!=="math")return 0;if(r.patchFlag===void 0){let s=3;const o=q0(e,t);if(o===0)return u.set(e,0),0;o<s&&(s=o);for(let c=0;c<e.children.length;c++){const l=tt(e.children[c],t);if(l===0)return u.set(e,0),0;l<s&&(s=l)}if(s>1)for(let c=0;c<e.props.length;c++){const l=e.props[c];if(l.type===7&&l.name==="bind"&&l.exp){const a=tt(l.exp,t);if(a===0)return u.set(e,0),0;a<s&&(s=a)}}if(r.isBlock){for(let c=0;c<e.props.length;c++)if(e.props[c].type===7)return u.set(e,0),0;t.removeHelper(Su),t.removeHelper(Xu(t.inSSR,r.isComponent)),r.isBlock=!1,t.helper(Yu(t.inSSR,r.isComponent))}return u.set(e,s),s}else return u.set(e,0),0;case 2:case 3:return 3;case 9:case 11:case 10:return 0;case 5:case 12:return tt(e.content,t);case 4:return e.constType;case 8:let i=3;for(let s=0;s<e.children.length;s++){const o=e.children[s];if(Z(o)||Ze(o))continue;const c=tt(o,t);if(c===0)return 0;c<i&&(i=c)}return i;case 20:return 2;default:return 0}}const H2=new Set([Ao,So,Ln,Xn]);function U0(e,t){if(e.type===14&&!Z(e.callee)&&H2.has(e.callee)){const u=e.arguments[0];if(u.type===4)return tt(u,t);if(u.type===14)return U0(u,t)}return 0}function q0(e,t){let u=3;const n=j0(e);if(n&&n.type===15){const{properties:r}=n;for(let i=0;i<r.length;i++){const{key:s,value:o}=r[i],c=tt(s,t);if(c===0)return c;c<u&&(u=c);let l;if(o.type===4?l=tt(o,t):o.type===14?l=U0(o,t):l=0,l===0)return l;l<u&&(u=l)}}return u}function j0(e){const t=e.codegenNode;if(t.type===13)return t.props}function U2(e,{filename:t="",prefixIdentifiers:u=!1,hoistStatic:n=!1,hmr:r=!1,cacheHandlers:i=!1,nodeTransforms:s=[],directiveTransforms:o={},transformHoist:c=null,isBuiltInComponent:l=we,isCustomElement:a=we,expressionPlugins:f=[],scopeId:p=null,slotted:d=!0,ssr:h=!1,inSSR:_=!1,ssrCssVars:v="",bindingMetadata:A=ee,inline:E=!1,isTS:m=!1,onError:g=Fo,onWarn:x=O0,compatConfig:k}){const I=t.replace(/\?.*$/,"").match(/([^/\\]+)\.\w+$/),w={filename:t,selfName:I&&Tu(de(I[1])),prefixIdentifiers:u,hoistStatic:n,hmr:r,cacheHandlers:i,nodeTransforms:s,directiveTransforms:o,transformHoist:c,isBuiltInComponent:l,isCustomElement:a,expressionPlugins:f,scopeId:p,slotted:d,ssr:h,inSSR:_,ssrCssVars:v,bindingMetadata:A,inline:E,isTS:m,onError:g,onWarn:x,compatConfig:k,root:e,helpers:new Map,components:new Set,directives:new Set,hoists:[],imports:[],cached:[],constantCache:new WeakMap,temps:0,identifiers:Object.create(null),scopes:{vFor:0,vSlot:0,vPre:0,vOnce:0},parent:null,grandParent:null,currentNode:e,childIndex:0,inVOnce:!1,helper(C){const S=w.helpers.get(C)||0;return w.helpers.set(C,S+1),C},removeHelper(C){const S=w.helpers.get(C);if(S){const R=S-1;R?w.helpers.set(C,R):w.helpers.delete(C)}},helperString(C){return`_${Zu[w.helper(C)]}`},replaceNode(C){w.parent.children[w.childIndex]=w.currentNode=C},removeNode(C){const S=w.parent.children,R=C?S.indexOf(C):w.currentNode?w.childIndex:-1;!C||C===w.currentNode?(w.currentNode=null,w.onNodeRemoved()):w.childIndex>R&&(w.childIndex--,w.onNodeRemoved()),w.parent.children.splice(R,1)},onNodeRemoved:we,addIdentifiers(C){},removeIdentifiers(C){},hoist(C){Z(C)&&(C=X(C)),w.hoists.push(C);const S=X(`_hoisted_${w.hoists.length}`,!1,C.loc,2);return S.hoisted=C,S},cache(C,S=!1,R=!1){const T=f2(w.cached.length,C,S,R);return w.cached.push(T),T}};return w.filters=new Set,w}function q2(e,t){const u=U2(e,t);Ai(e,u),t.hoistStatic&&$2(e,u),t.ssr||j2(e,u),e.helpers=new Set([...u.helpers.keys()]),e.components=[...u.components],e.directives=[...u.directives],e.imports=u.imports,e.hoists=u.hoists,e.temps=u.temps,e.cached=u.cached,e.transformed=!0,e.filters=[...u.filters]}function j2(e,t){const{helper:u}=t,{children:n}=e;if(n.length===1){const r=n[0];if(H0(e,r)&&r.codegenNode){const i=r.codegenNode;i.type===13&&wo(i,t),e.codegenNode=i}else e.codegenNode=r}else if(n.length>1){let r=64;e.codegenNode=Bn(t,u(Pn),void 0,e.children,r,void 0,void 0,!0,void 0,!1)}}function z2(e,t){let u=0;const n=()=>{u--};for(;u<e.children.length;u++){const r=e.children[u];Z(r)||(t.grandParent=t.parent,t.parent=e,t.childIndex=u,t.onNodeRemoved=n,Ai(r,t))}}function Ai(e,t){t.currentNode=e;const{nodeTransforms:u}=t,n=[];for(let i=0;i<u.length;i++){const s=u[i](e,t);if(s&&($(s)?n.push(...s):n.push(s)),t.currentNode)e=t.currentNode;else return}switch(e.type){case 3:t.ssr||t.helper(Yn);break;case 5:t.ssr||t.helper(Ci);break;case 9:for(let i=0;i<e.branches.length;i++)Ai(e.branches[i],t);break;case 10:case 11:case 1:case 0:z2(e,t);break}t.currentNode=e;let r=n.length;for(;r--;)n[r]()}function z0(e,t){const u=Z(e)?n=>n===e:n=>e.test(n);return(n,r)=>{if(n.type===1){const{props:i}=n;if(n.tagType===3&&i.some(k2))return;const s=[];for(let o=0;o<i.length;o++){const c=i[o];if(c.type===7&&u(c.name)){i.splice(o,1),o--;const l=t(n,c,r);l&&s.push(l)}}return s}}}const Si="/*@__PURE__*/",K0=e=>`${Zu[e]}: _${Zu[e]}`;function K2(e,{mode:t="function",prefixIdentifiers:u=t==="module",sourceMap:n=!1,filename:r="template.vue.html",scopeId:i=null,optimizeImports:s=!1,runtimeGlobalName:o="Vue",runtimeModuleName:c="vue",ssrRuntimeModuleName:l="vue/server-renderer",ssr:a=!1,isTS:f=!1,inSSR:p=!1}){const d={mode:t,prefixIdentifiers:u,sourceMap:n,filename:r,scopeId:i,optimizeImports:s,runtimeGlobalName:o,runtimeModuleName:c,ssrRuntimeModuleName:l,ssr:a,isTS:f,inSSR:p,source:e.source,code:"",column:1,line:1,offset:0,indentLevel:0,pure:!1,map:void 0,helper(_){return`_${Zu[_]}`},push(_,v=-2,A){d.code+=_},indent(){h(++d.indentLevel)},deindent(_=!1){_?--d.indentLevel:h(--d.indentLevel)},newline(){h(d.indentLevel)}};function h(_){d.push(`
+`+"  ".repeat(_),0)}return d}function W2(e,t={}){const u=K2(e,t);t.onContextCreated&&t.onContextCreated(u);const{mode:n,push:r,prefixIdentifiers:i,indent:s,deindent:o,newline:c,scopeId:l,ssr:a}=u,f=Array.from(e.helpers),p=f.length>0,d=!i&&n!=="module";G2(e,u);const _=a?"ssrRender":"render",A=(a?["_ctx","_push","_parent","_attrs"]:["_ctx","_cache"]).join(", ");if(r(`function ${_}(${A}) {`),s(),d&&(r("with (_ctx) {"),s(),p&&(r(`const { ${f.map(K0).join(", ")} } = _Vue
+`,-1),c())),e.components.length&&(Wi(e.components,"component",u),(e.directives.length||e.temps>0)&&c()),e.directives.length&&(Wi(e.directives,"directive",u),e.temps>0&&c()),e.filters&&e.filters.length&&(c(),Wi(e.filters,"filter",u),c()),e.temps>0){r("let ");for(let E=0;E<e.temps;E++)r(`${E>0?", ":""}_temp${E}`)}return(e.components.length||e.directives.length||e.temps)&&(r(`
+`,0),c()),a||r("return "),e.codegenNode?Le(e.codegenNode,u):r("null"),d&&(o(),r("}")),o(),r("}"),{ast:e,code:u.code,preamble:"",map:u.map?u.map.toJSON():void 0}}function G2(e,t){const{ssr:u,prefixIdentifiers:n,push:r,newline:i,runtimeModuleName:s,runtimeGlobalName:o,ssrRuntimeModuleName:c}=t,l=o,a=Array.from(e.helpers);if(a.length>0&&(r(`const _Vue = ${l}
+`,-1),e.hoists.length)){const f=[mo,bo,Yn,go,T0].filter(p=>a.includes(p)).map(K0).join(", ");r(`const { ${f} } = _Vue
+`,-1)}Z2(e.hoists,t),i(),r("return ")}function Wi(e,t,{helper:u,push:n,newline:r,isTS:i}){const s=u(t==="filter"?Eo:t==="component"?_o:xo);for(let o=0;o<e.length;o++){let c=e[o];const l=c.endsWith("__self");l&&(c=c.slice(0,-6)),n(`const ${$n(c,t)} = ${s}(${JSON.stringify(c)}${l?", true":""})${i?"!":""}`),o<e.length-1&&r()}}function Z2(e,t){if(!e.length)return;t.pure=!0;const{push:u,newline:n}=t;n();for(let r=0;r<e.length;r++){const i=e[r];i&&(u(`const _hoisted_${r+1} = `),Le(i,t),n())}t.pure=!1}function No(e,t){const u=e.length>3||!1;t.push("["),u&&t.indent(),Qn(e,t,u),u&&t.deindent(),t.push("]")}function Qn(e,t,u=!1,n=!0){const{push:r,newline:i}=t;for(let s=0;s<e.length;s++){const o=e[s];Z(o)?r(o,-3):$(o)?No(o,t):Le(o,t),s<e.length-1&&(u?(n&&r(","),i()):n&&r(", "))}}function Le(e,t){if(Z(e)){t.push(e,-3);return}if(Ze(e)){t.push(t.helper(e));return}switch(e.type){case 1:case 9:case 11:Le(e.codegenNode,t);break;case 2:J2(e,t);break;case 4:W0(e,t);break;case 5:Y2(e,t);break;case 12:Le(e.codegenNode,t);break;case 8:G0(e,t);break;case 3:Q2(e,t);break;case 13:em(e,t);break;case 14:um(e,t);break;case 15:nm(e,t);break;case 17:rm(e,t);break;case 18:im(e,t);break;case 19:sm(e,t);break;case 20:om(e,t);break;case 21:Qn(e.body,t,!0,!1);break}}function J2(e,t){t.push(JSON.stringify(e.content),-3,e)}function W0(e,t){const{content:u,isStatic:n}=e;t.push(n?JSON.stringify(u):u,-3,e)}function Y2(e,t){const{push:u,helper:n,pure:r}=t;r&&u(Si),u(`${n(Ci)}(`),Le(e.content,t),u(")")}function G0(e,t){for(let u=0;u<e.children.length;u++){const n=e.children[u];Z(n)?t.push(n,-3):Le(n,t)}}function X2(e,t){const{push:u}=t;if(e.type===8)u("["),G0(e,t),u("]");else if(e.isStatic){const n=Io(e.content)?e.content:JSON.stringify(e.content);u(n,-2,e)}else u(`[${e.content}]`,-3,e)}function Q2(e,t){const{push:u,helper:n,pure:r}=t;r&&u(Si),u(`${n(Yn)}(${JSON.stringify(e.content)})`,-3,e)}function em(e,t){const{push:u,helper:n,pure:r}=t,{tag:i,props:s,children:o,patchFlag:c,dynamicProps:l,directives:a,isBlock:f,disableTracking:p,isComponent:d}=e;let h;c&&(h=String(c)),a&&u(n(Co)+"("),f&&u(`(${n(Su)}(${p?"true":""}), `),r&&u(Si);const _=f?Xu(t.inSSR,d):Yu(t.inSSR,d);u(n(_)+"(",-2,e),Qn(tm([i,s,o,h,l]),t),u(")"),f&&u(")"),a&&(u(", "),Le(a,t),u(")"))}function tm(e){let t=e.length;for(;t--&&e[t]==null;);return e.slice(0,t+1).map(u=>u||"null")}function um(e,t){const{push:u,helper:n,pure:r}=t,i=Z(e.callee)?e.callee:n(e.callee);r&&u(Si),u(i+"(",-2,e),Qn(e.arguments,t),u(")")}function nm(e,t){const{push:u,indent:n,deindent:r,newline:i}=t,{properties:s}=e;if(!s.length){u("{}",-2,e);return}const o=s.length>1||!1;u(o?"{":"{ "),o&&n();for(let c=0;c<s.length;c++){const{key:l,value:a}=s[c];X2(l,t),u(": "),Le(a,t),c<s.length-1&&(u(","),i())}o&&r(),u(o?"}":" }")}function rm(e,t){No(e.elements,t)}function im(e,t){const{push:u,indent:n,deindent:r}=t,{params:i,returns:s,body:o,newline:c,isSlot:l}=e;l&&u(`_${Zu[Do]}(`),u("(",-2,e),$(i)?Qn(i,t):i&&Le(i,t),u(") => "),(c||o)&&(u("{"),n()),s?(c&&u("return "),$(s)?No(s,t):Le(s,t)):o&&Le(o,t),(c||o)&&(r(),u("}")),l&&(e.isNonScopedSlot&&u(", undefined, true"),u(")"))}function sm(e,t){const{test:u,consequent:n,alternate:r,newline:i}=e,{push:s,indent:o,deindent:c,newline:l}=t;if(u.type===4){const f=!Io(u.content);f&&s("("),W0(u,t),f&&s(")")}else s("("),Le(u,t),s(")");i&&o(),t.indentLevel++,i||s(" "),s("? "),Le(n,t),t.indentLevel--,i&&l(),i||s(" "),s(": ");const a=r.type===19;a||t.indentLevel++,Le(r,t),a||t.indentLevel--,i&&c(!0)}function om(e,t){const{push:u,helper:n,indent:r,deindent:i,newline:s}=t,{needPauseTracking:o,needArraySpread:c}=e;c&&u("[...("),u(`_cache[${e.index}] || (`),o&&(r(),u(`${n(zr)}(-1`),e.inVOnce&&u(", true"),u("),"),s(),u("(")),u(`_cache[${e.index}] = `),Le(e.value,t),o&&(u(`).cacheIndex = ${e.index},`),s(),u(`${n(zr)}(1),`),s(),u(`_cache[${e.index}]`),i()),u(")"),c&&u(")]")}new RegExp("\\b"+"arguments,await,break,case,catch,class,const,continue,debugger,default,delete,do,else,export,extends,finally,for,function,if,import,let,new,return,super,switch,throw,try,var,void,while,with,yield".split(",").join("\\b|\\b")+"\\b");const cm=z0(/^(if|else|else-if)$/,(e,t,u)=>lm(e,t,u,(n,r,i)=>{const s=u.parent.children;let o=s.indexOf(n),c=0;for(;o-->=0;){const l=s[o];l&&l.type===9&&(c+=l.branches.length)}return()=>{if(i)n.codegenNode=jc(r,c,u);else{const l=am(n.codegenNode);l.alternate=jc(r,c+n.branches.length-1,u)}}}));function lm(e,t,u,n){if(t.name!=="else"&&(!t.exp||!t.exp.content.trim())){const r=t.exp?t.exp.loc:e.loc;u.onError(me(28,t.loc)),t.exp=X("true",!1,r)}if(t.name==="if"){const r=qc(e,t),i={type:9,loc:P2(e.loc),branches:[r]};if(u.replaceNode(i),n)return n(i,r,!0)}else{const r=u.parent.children;let i=r.indexOf(e);for(;i-->=-1;){const s=r[i];if(s&&s.type===3){u.removeNode(s);continue}if(s&&s.type===2&&!s.content.trim().length){u.removeNode(s);continue}if(s&&s.type===9){t.name==="else-if"&&s.branches[s.branches.length-1].condition===void 0&&u.onError(me(30,e.loc)),u.removeNode();const o=qc(e,t);s.branches.push(o);const c=n&&n(s,o,!1);Ai(o,u),c&&c(),u.currentNode=null}else u.onError(me(30,e.loc));break}}}function qc(e,t){const u=e.tagType===3;return{type:10,loc:e.loc,condition:t.name==="else"?void 0:t.exp,children:u&&!st(e,"for")?e.children:[e],userKey:ki(e,"key"),isTemplateIf:u}}function jc(e,t,u){return e.condition?Ss(e.condition,zc(e,t,u),Ae(u.helper(Yn),['""',"true"])):zc(e,t,u)}function zc(e,t,u){const{helper:n}=u,r=xe("key",X(`${t}`,!1,rt,2)),{children:i}=e,s=i[0];if(i.length!==1||s.type!==1)if(i.length===1&&s.type===11){const c=s.codegenNode;return Zr(c,r,u),c}else return Bn(u,n(Pn),ot([r]),i,64,void 0,void 0,!0,!1,!1,e.loc);else{const c=s.codegenNode,l=S2(c);return l.type===13&&wo(l,u),Zr(l,r,u),c}}function am(e){for(;;)if(e.type===19)if(e.alternate.type===19)e=e.alternate;else return e;else e.type===20&&(e=e.value)}const fm=(e,t,u)=>{const{modifiers:n,loc:r}=e,i=e.arg;let{exp:s}=e;if(s&&s.type===4&&!s.content.trim()&&(s=void 0),!s){if(i.type!==4||!i.isStatic)return u.onError(me(52,i.loc)),{props:[xe(i,X("",!0,r))]};Z0(e),s=e.exp}return i.type!==4?(i.children.unshift("("),i.children.push(') || ""')):i.isStatic||(i.content=`${i.content} || ""`),n.some(o=>o.content==="camel")&&(i.type===4?i.isStatic?i.content=de(i.content):i.content=`${u.helperString(ks)}(${i.content})`:(i.children.unshift(`${u.helperString(ks)}(`),i.children.push(")"))),u.inSSR||(n.some(o=>o.content==="prop")&&Kc(i,"."),n.some(o=>o.content==="attr")&&Kc(i,"^")),{props:[xe(i,s)]}},Z0=(e,t)=>{const u=e.arg,n=de(u.content);e.exp=X(n,!1,u.loc)},Kc=(e,t)=>{e.type===4?e.isStatic?e.content=t+e.content:e.content=`\`${t}\${${e.content}}\``:(e.children.unshift(`'${t}' + (`),e.children.push(")"))},dm=z0("for",(e,t,u)=>{const{helper:n,removeHelper:r}=u;return hm(e,t,u,i=>{const s=Ae(n(ko),[i.source]),o=Wr(e),c=st(e,"memo"),l=ki(e,"key",!1,!0);l&&l.type===7&&!l.exp&&Z0(l);let f=l&&(l.type===6?l.value?X(l.value.content,!0):void 0:l.exp);const p=l&&f?xe("key",f):null,d=i.source.type===4&&i.source.constType>0,h=d?64:l?128:256;return i.codegenNode=Bn(u,n(Pn),void 0,s,h,void 0,void 0,!0,!d,!1,e.loc),()=>{let _;const{children:v}=i,A=v.length!==1||v[0].type!==1,E=Gr(e)?e:o&&e.children.length===1&&Gr(e.children[0])?e.children[0]:null;if(E?(_=E.codegenNode,o&&p&&Zr(_,p,u)):A?_=Bn(u,n(Pn),p?ot([p]):void 0,e.children,64,void 0,void 0,!0,void 0,!1):(_=v[0].codegenNode,o&&p&&Zr(_,p,u),_.isBlock!==!d&&(_.isBlock?(r(Su),r(Xu(u.inSSR,_.isComponent))):r(Yu(u.inSSR,_.isComponent))),_.isBlock=!d,_.isBlock?(n(Su),n(Xu(u.inSSR,_.isComponent))):n(Yu(u.inSSR,_.isComponent))),c){const m=Ju(Ts(i.parseResult,[X("_cached")]));m.body=d2([gt(["const _memo = (",c.exp,")"]),gt(["if (_cached",...f?[" && _cached.key === ",f]:[],` && ${u.helperString(I0)}(_cached, _memo)) return _cached`]),gt(["const _item = ",_]),X("_item.memo = _memo"),X("return _item")]),s.arguments.push(m,X("_cache"),X(String(u.cached.length))),u.cached.push(null)}else s.arguments.push(Ju(Ts(i.parseResult),_,!0))}})});function hm(e,t,u,n){if(!t.exp){u.onError(me(31,t.loc));return}const r=t.forParseResult;if(!r){u.onError(me(32,t.loc));return}J0(r);const{addIdentifiers:i,removeIdentifiers:s,scopes:o}=u,{source:c,value:l,key:a,index:f}=r,p={type:11,loc:t.loc,source:c,valueAlias:l,keyAlias:a,objectIndexAlias:f,parseResult:r,children:Wr(e)?e.children:[e]};u.replaceNode(p),o.vFor++;const d=n&&n(p);return()=>{o.vFor--,d&&d()}}function J0(e,t){e.finalized||(e.finalized=!0)}function Ts({value:e,key:t,index:u},n=[]){return pm([e,t,u,...n])}function pm(e){let t=e.length;for(;t--&&!e[t];);return e.slice(0,t+1).map((u,n)=>u||X("_".repeat(n+1),!1))}const Wc=X("undefined",!1),mm=(e,t)=>{if(e.type===1&&(e.tagType===1||e.tagType===3)){const u=st(e,"slot");if(u)return u.exp,t.scopes.vSlot++,()=>{t.scopes.vSlot--}}},bm=(e,t,u,n)=>Ju(e,u,!1,!0,u.length?u[0].loc:n);function gm(e,t,u=bm){t.helper(Do);const{children:n,loc:r}=e,i=[],s=[];let o=t.scopes.vSlot>0||t.scopes.vFor>0;const c=st(e,"slot",!0);if(c){const{arg:v,exp:A}=c;v&&!Ge(v)&&(o=!0),i.push(xe(v||X("default",!0),u(A,void 0,n,r)))}let l=!1,a=!1;const f=[],p=new Set;let d=0;for(let v=0;v<n.length;v++){const A=n[v];let E;if(!Wr(A)||!(E=st(A,"slot",!0))){A.type!==3&&f.push(A);continue}if(c){t.onError(me(37,E.loc));break}l=!0;const{children:m,loc:g}=A,{arg:x=X("default",!0),exp:k,loc:I}=E;let w;Ge(x)?w=x?x.content:"default":o=!0;const C=st(A,"for"),S=u(k,C,m,g);let R,T;if(R=st(A,"if"))o=!0,s.push(Ss(R.exp,hr(x,S,d++),Wc));else if(T=st(A,/^else(-if)?$/,!0)){let P=v,q;for(;P--&&(q=n[P],q.type===3););if(q&&Wr(q)&&st(q,/^(else-)?if$/)){let z=s[s.length-1];for(;z.alternate.type===19;)z=z.alternate;z.alternate=T.exp?Ss(T.exp,hr(x,S,d++),Wc):hr(x,S,d++)}else t.onError(me(30,T.loc))}else if(C){o=!0;const P=C.forParseResult;P?(J0(P),s.push(Ae(t.helper(ko),[P.source,Ju(Ts(P),hr(x,S),!0)]))):t.onError(me(32,C.loc))}else{if(w){if(p.has(w)){t.onError(me(38,I));continue}p.add(w),w==="default"&&(a=!0)}i.push(xe(x,S))}}if(!c){const v=(A,E)=>{const m=u(A,void 0,E,r);return t.compatConfig&&(m.isNonScopedSlot=!0),xe("default",m)};l?f.length&&f.some(A=>Y0(A))&&(a?t.onError(me(39,f[0].loc)):i.push(v(void 0,f))):i.push(v(void 0,n))}const h=o?2:Ar(e.children)?3:1;let _=ot(i.concat(xe("_",X(h+"",!1))),r);return s.length&&(_=Ae(t.helper(F0),[_,Eu(s)])),{slots:_,hasDynamicSlots:o}}function hr(e,t,u){const n=[xe("name",e),xe("fn",t)];return u!=null&&n.push(xe("key",X(String(u),!0))),ot(n)}function Ar(e){for(let t=0;t<e.length;t++){const u=e[t];switch(u.type){case 1:if(u.tagType===2||Ar(u.children))return!0;break;case 9:if(Ar(u.branches))return!0;break;case 10:case 11:if(Ar(u.children))return!0;break}}return!1}function Y0(e){return e.type!==2&&e.type!==12?!0:e.type===2?!!e.content.trim():Y0(e.content)}const X0=new WeakMap,_m=(e,t)=>function(){if(e=t.currentNode,!(e.type===1&&(e.tagType===0||e.tagType===1)))return;const{tag:n,props:r}=e,i=e.tagType===1;let s=i?ym(e,t):`"${n}"`;const o=oe(s)&&s.callee===yo;let c,l,a=0,f,p,d,h=o||s===Sn||s===po||!i&&(n==="svg"||n==="foreignObject"||n==="math");if(r.length>0){const _=Q0(e,t,void 0,i,o);c=_.props,a=_.patchFlag,p=_.dynamicPropNames;const v=_.directives;d=v&&v.length?Eu(v.map(A=>Em(A,t))):void 0,_.shouldUseBlock&&(h=!0)}if(e.children.length>0)if(s===qr&&(h=!0,a|=1024),i&&s!==Sn&&s!==qr){const{slots:v,hasDynamicSlots:A}=gm(e,t);l=v,A&&(a|=1024)}else if(e.children.length===1&&s!==Sn){const v=e.children[0],A=v.type,E=A===5||A===8;E&&tt(v,t)===0&&(a|=1),E||A===2?l=v:l=e.children}else l=e.children;p&&p.length&&(f=Cm(p)),e.codegenNode=Bn(t,s,c,l,a===0?void 0:a,f,d,!!h,!1,i,e.loc)};function ym(e,t,u=!1){let{tag:n}=e;const r=ws(n),i=ki(e,"is",!1,!0);if(i)if(r||Cu("COMPILER_IS_ON_ELEMENT",t)){let o;if(i.type===6?o=i.value&&X(i.value.content,!0):(o=i.exp,o||(o=X("is",!1,i.arg.loc))),o)return Ae(t.helper(yo),[o])}else i.type===6&&i.value.content.startsWith("vue:")&&(n=i.value.content.slice(4));const s=N0(n)||t.isBuiltInComponent(n);return s?(u||t.helper(s),s):(t.helper(_o),t.components.add(n),$n(n,"component"))}function Q0(e,t,u=e.props,n,r,i=!1){const{tag:s,loc:o,children:c}=e;let l=[];const a=[],f=[],p=c.length>0;let d=!1,h=0,_=!1,v=!1,A=!1,E=!1,m=!1,g=!1;const x=[],k=S=>{l.length&&(a.push(ot(Gc(l),o)),l=[]),S&&a.push(S)},I=()=>{t.scopes.vFor>0&&l.push(xe(X("ref_for",!0),X("true")))},w=({key:S,value:R})=>{if(Ge(S)){const T=S.content,P=vu(T);if(P&&(!n||r)&&T.toLowerCase()!=="onclick"&&T!=="onUpdate:modelValue"&&!tu(T)&&(E=!0),P&&tu(T)&&(g=!0),P&&R.type===14&&(R=R.arguments[0]),R.type===20||(R.type===4||R.type===8)&&tt(R,t)>0)return;T==="ref"?_=!0:T==="class"?v=!0:T==="style"?A=!0:T!=="key"&&!x.includes(T)&&x.push(T),n&&(T==="class"||T==="style")&&!x.includes(T)&&x.push(T)}else m=!0};for(let S=0;S<u.length;S++){const R=u[S];if(R.type===6){const{loc:T,name:P,nameLoc:q,value:z}=R;let H=!0;if(P==="ref"&&(_=!0,I()),P==="is"&&(ws(s)||z&&z.content.startsWith("vue:")||Cu("COMPILER_IS_ON_ELEMENT",t)))continue;l.push(xe(X(P,!0,q),X(z?z.content:"",H,z?z.loc:T)))}else{const{name:T,arg:P,exp:q,loc:z,modifiers:H}=R,J=T==="bind",j=T==="on";if(T==="slot"){n||t.onError(me(40,z));continue}if(T==="once"||T==="memo"||T==="is"||J&&mu(P,"is")&&(ws(s)||Cu("COMPILER_IS_ON_ELEMENT",t))||j&&i)continue;if((J&&mu(P,"key")||j&&p&&mu(P,"vue:before-update"))&&(d=!0),J&&mu(P,"ref")&&I(),!P&&(J||j)){if(m=!0,q)if(J){if(I(),k(),Cu("COMPILER_V_BIND_OBJECT_ORDER",t)){a.unshift(q);continue}a.push(q)}else k({type:14,loc:z,callee:t.helper(vo),arguments:n?[q]:[q,"true"]});else t.onError(me(J?34:35,z));continue}J&&H.some(je=>je.content==="prop")&&(h|=32);const be=t.directiveTransforms[T];if(be){const{props:je,needRuntime:dt}=be(R,e,t);!i&&je.forEach(w),j&&P&&!Ge(P)?k(ot(je,o)):l.push(...je),dt&&(f.push(R),Ze(dt)&&X0.set(R,dt))}else zf(T)||(f.push(R),p&&(d=!0))}}let C;if(a.length?(k(),a.length>1?C=Ae(t.helper(jr),a,o):C=a[0]):l.length&&(C=ot(Gc(l),o)),m?h|=16:(v&&!n&&(h|=2),A&&!n&&(h|=4),x.length&&(h|=8),E&&(h|=32)),!d&&(h===0||h===32)&&(_||g||f.length>0)&&(h|=512),!t.inSSR&&C)switch(C.type){case 15:let S=-1,R=-1,T=!1;for(let z=0;z<C.properties.length;z++){const H=C.properties[z].key;Ge(H)?H.content==="class"?S=z:H.content==="style"&&(R=z):H.isHandlerKey||(T=!0)}const P=C.properties[S],q=C.properties[R];T?C=Ae(t.helper(Ln),[C]):(P&&!Ge(P.value)&&(P.value=Ae(t.helper(Ao),[P.value])),q&&(A||q.value.type===4&&q.value.content.trim()[0]==="["||q.value.type===17)&&(q.value=Ae(t.helper(So),[q.value])));break;case 14:break;default:C=Ae(t.helper(Ln),[Ae(t.helper(Xn),[C])]);break}return{props:C,directives:f,patchFlag:h,dynamicPropNames:x,shouldUseBlock:d}}function Gc(e){const t=new Map,u=[];for(let n=0;n<e.length;n++){const r=e[n];if(r.key.type===8||!r.key.isStatic){u.push(r);continue}const i=r.key.content,s=t.get(i);s?(i==="style"||i==="class"||vu(i))&&xm(s,r):(t.set(i,r),u.push(r))}return u}function xm(e,t){e.value.type===17?e.value.elements.push(t.value):e.value=Eu([e.value,t.value],e.loc)}function Em(e,t){const u=[],n=X0.get(e);n?u.push(t.helperString(n)):(t.helper(xo),t.directives.add(e.name),u.push($n(e.name,"directive")));const{loc:r}=e;if(e.exp&&u.push(e.exp),e.arg&&(e.exp||u.push("void 0"),u.push(e.arg)),Object.keys(e.modifiers).length){e.arg||(e.exp||u.push("void 0"),u.push("void 0"));const i=X("true",!1,r);u.push(ot(e.modifiers.map(s=>xe(s,i)),r))}return Eu(u,e.loc)}function Cm(e){let t="[";for(let u=0,n=e.length;u<n;u++)t+=JSON.stringify(e[u]),u<n-1&&(t+=", ");return t+"]"}function ws(e){return e==="component"||e==="Component"}const km=(e,t)=>{if(Gr(e)){const{children:u,loc:n}=e,{slotName:r,slotProps:i}=Am(e,t),s=[t.prefixIdentifiers?"_ctx.$slots":"$slots",r,"{}","undefined","true"];let o=2;i&&(s[2]=i,o=3),u.length&&(s[3]=Ju([],u,!1,!1,n),o=4),t.scopeId&&!t.slotted&&(o=5),s.splice(o),e.codegenNode=Ae(t.helper(w0),s,n)}};function Am(e,t){let u='"default"',n;const r=[];for(let i=0;i<e.props.length;i++){const s=e.props[i];if(s.type===6)s.value&&(s.name==="name"?u=JSON.stringify(s.value.content):(s.name=de(s.name),r.push(s)));else if(s.name==="bind"&&mu(s.arg,"name")){if(s.exp)u=s.exp;else if(s.arg&&s.arg.type===4){const o=de(s.arg.content);u=s.exp=X(o,!1,s.arg.loc)}}else s.name==="bind"&&s.arg&&Ge(s.arg)&&(s.arg.content=de(s.arg.content)),r.push(s)}if(r.length>0){const{props:i,directives:s}=Q0(e,t,r,!1,!1);n=i,s.length&&t.onError(me(36,s[0].loc))}return{slotName:u,slotProps:n}}const ef=(e,t,u,n)=>{const{loc:r,modifiers:i,arg:s}=e;!e.exp&&!i.length&&u.onError(me(35,r));let o;if(s.type===4)if(s.isStatic){let f=s.content;f.startsWith("vue:")&&(f=`vnode-${f.slice(4)}`);const p=t.tagType!==0||f.startsWith("vnode")||!/[A-Z]/.test(f)?Bu(de(f)):`on:${f}`;o=X(p,!0,s.loc)}else o=gt([`${u.helperString(As)}(`,s,")"]);else o=s,o.children.unshift(`${u.helperString(As)}(`),o.children.push(")");let c=e.exp;c&&!c.content.trim()&&(c=void 0);let l=u.cacheHandlers&&!c&&!u.inVOnce;if(c){const f=M0(c),p=!(f||E2(c)),d=c.content.includes(";");(p||l&&f)&&(c=gt([`${p?"$event":"(...args)"} => ${d?"{":"("}`,c,d?"}":")"]))}let a={props:[xe(o,c||X("() => {}",!1,r))]};return n&&(a=n(a)),l&&(a.props[0].value=u.cache(a.props[0].value)),a.props.forEach(f=>f.key.isHandlerKey=!0),a},Sm=(e,t)=>{if(e.type===0||e.type===1||e.type===11||e.type===10)return()=>{const u=e.children;let n,r=!1;for(let i=0;i<u.length;i++){const s=u[i];if(Ki(s)){r=!0;for(let o=i+1;o<u.length;o++){const c=u[o];if(Ki(c))n||(n=u[i]=gt([s],s.loc)),n.children.push(" + ",c),u.splice(o,1),o--;else{n=void 0;break}}}}if(!(!r||u.length===1&&(e.type===0||e.type===1&&e.tagType===0&&!e.props.find(i=>i.type===7&&!t.directiveTransforms[i.name])&&e.tag!=="template")))for(let i=0;i<u.length;i++){const s=u[i];if(Ki(s)||s.type===8){const o=[];(s.type!==2||s.content!==" ")&&o.push(s),!t.ssr&&tt(s,t)===0&&o.push("1"),u[i]={type:12,content:s,loc:s.loc,codegenNode:Ae(t.helper(go),o)}}}}},Zc=new WeakSet,vm=(e,t)=>{if(e.type===1&&st(e,"once",!0))return Zc.has(e)||t.inVOnce||t.inSSR?void 0:(Zc.add(e),t.inVOnce=!0,t.helper(zr),()=>{t.inVOnce=!1;const u=t.currentNode;u.codegenNode&&(u.codegenNode=t.cache(u.codegenNode,!0,!0))})},tf=(e,t,u)=>{const{exp:n,arg:r}=e;if(!n)return u.onError(me(41,e.loc)),pr();const i=n.loc.source.trim(),s=n.type===4?n.content:i,o=u.bindingMetadata[i];if(o==="props"||o==="props-aliased")return u.onError(me(44,n.loc)),pr();if(!s.trim()||!M0(n)&&!!1)return u.onError(me(42,n.loc)),pr();const l=r||X("modelValue",!0),a=r?Ge(r)?`onUpdate:${de(r.content)}`:gt(['"onUpdate:" + ',r]):"onUpdate:modelValue";let f;const p=u.isTS?"($event: any)":"$event";f=gt([`${p} => ((`,n,") = $event)"]);const d=[xe(l,e.exp),xe(a,f)];if(e.modifiers.length&&t.tagType===1){const h=e.modifiers.map(v=>v.content).map(v=>(Io(v)?v:JSON.stringify(v))+": true").join(", "),_=r?Ge(r)?`${r.content}Modifiers`:gt([r,' + "Modifiers"']):"modelModifiers";d.push(xe(_,X(`{ ${h} }`,!1,e.loc,2)))}return pr(d)};function pr(e=[]){return{props:e}}const Dm=/[\w).+\-_$\]]/,Tm=(e,t)=>{Cu("COMPILER_FILTERS",t)&&(e.type===5?Jr(e.content,t):e.type===1&&e.props.forEach(u=>{u.type===7&&u.name!=="for"&&u.exp&&Jr(u.exp,t)}))};function Jr(e,t){if(e.type===4)Jc(e,t);else for(let u=0;u<e.children.length;u++){const n=e.children[u];typeof n=="object"&&(n.type===4?Jc(n,t):n.type===8?Jr(e,t):n.type===5&&Jr(n.content,t))}}function Jc(e,t){const u=e.content;let n=!1,r=!1,i=!1,s=!1,o=0,c=0,l=0,a=0,f,p,d,h,_=[];for(d=0;d<u.length;d++)if(p=f,f=u.charCodeAt(d),n)f===39&&p!==92&&(n=!1);else if(r)f===34&&p!==92&&(r=!1);else if(i)f===96&&p!==92&&(i=!1);else if(s)f===47&&p!==92&&(s=!1);else if(f===124&&u.charCodeAt(d+1)!==124&&u.charCodeAt(d-1)!==124&&!o&&!c&&!l)h===void 0?(a=d+1,h=u.slice(0,d).trim()):v();else{switch(f){case 34:r=!0;break;case 39:n=!0;break;case 96:i=!0;break;case 40:l++;break;case 41:l--;break;case 91:c++;break;case 93:c--;break;case 123:o++;break;case 125:o--;break}if(f===47){let A=d-1,E;for(;A>=0&&(E=u.charAt(A),E===" ");A--);(!E||!Dm.test(E))&&(s=!0)}}h===void 0?h=u.slice(0,d).trim():a!==0&&v();function v(){_.push(u.slice(a,d).trim()),a=d+1}if(_.length){for(d=0;d<_.length;d++)h=wm(h,_[d],t);e.content=h,e.ast=void 0}}function wm(e,t,u){u.helper(Eo);const n=t.indexOf("(");if(n<0)return u.filters.add(t),`${$n(t,"filter")}(${e})`;{const r=t.slice(0,n),i=t.slice(n+1);return u.filters.add(r),`${$n(r,"filter")}(${e}${i!==")"?","+i:i}`}}const Yc=new WeakSet,Fm=(e,t)=>{if(e.type===1){const u=st(e,"memo");return!u||Yc.has(e)?void 0:(Yc.add(e),()=>{const n=e.codegenNode||t.currentNode.codegenNode;n&&n.type===13&&(e.tagType!==1&&wo(n,t),e.codegenNode=Ae(t.helper(To),[u.exp,Ju(void 0,n),"_cache",String(t.cached.length)]),t.cached.push(null))})}};function Im(e){return[[vm,cm,Fm,dm,Tm,km,_m,mm,Sm],{on:ef,bind:fm,model:tf}]}function Om(e,t={}){const u=t.onError||Fo,n=t.mode==="module";t.prefixIdentifiers===!0?u(me(47)):n&&u(me(48));const r=!1;t.cacheHandlers&&u(me(49)),t.scopeId&&!n&&u(me(50));const i=te({},t,{prefixIdentifiers:r}),s=Z(e)?V2(e,i):e,[o,c]=Im();return q2(s,te({},i,{nodeTransforms:[...o,...t.nodeTransforms||[]],directiveTransforms:te({},c,t.directiveTransforms||{})})),W2(s,i)}const Nm=()=>({props:[]});/**
+* @vue/compiler-dom v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/const uf=Symbol(""),nf=Symbol(""),rf=Symbol(""),sf=Symbol(""),Fs=Symbol(""),of=Symbol(""),cf=Symbol(""),lf=Symbol(""),af=Symbol(""),ff=Symbol("");l2({[uf]:"vModelRadio",[nf]:"vModelCheckbox",[rf]:"vModelText",[sf]:"vModelSelect",[Fs]:"vModelDynamic",[of]:"withModifiers",[cf]:"withKeys",[lf]:"vShow",[af]:"Transition",[ff]:"TransitionGroup"});let Nu;function Rm(e,t=!1){return Nu||(Nu=document.createElement("div")),t?(Nu.innerHTML=`<div foo="${e.replace(/"/g,"&quot;")}">`,Nu.children[0].getAttribute("foo")):(Nu.innerHTML=e,Nu.textContent)}const Mm={parseMode:"html",isVoidTag:ld,isNativeTag:e=>sd(e)||od(e)||cd(e),isPreTag:e=>e==="pre",isIgnoreNewlineTag:e=>e==="pre"||e==="textarea",decodeEntities:Rm,isBuiltInComponent:e=>{if(e==="Transition"||e==="transition")return af;if(e==="TransitionGroup"||e==="transition-group")return ff},getNamespace(e,t,u){let n=t?t.ns:u;if(t&&n===2)if(t.tag==="annotation-xml"){if(e==="svg")return 1;t.props.some(r=>r.type===6&&r.name==="encoding"&&r.value!=null&&(r.value.content==="text/html"||r.value.content==="application/xhtml+xml"))&&(n=0)}else/^m(?:[ions]|text)$/.test(t.tag)&&e!=="mglyph"&&e!=="malignmark"&&(n=0);else t&&n===1&&(t.tag==="foreignObject"||t.tag==="desc"||t.tag==="title")&&(n=0);if(n===0){if(e==="svg")return 1;if(e==="math")return 2}return n}},Pm=e=>{e.type===1&&e.props.forEach((t,u)=>{t.type===6&&t.name==="style"&&t.value&&(e.props[u]={type:7,name:"bind",arg:X("style",!0,t.loc),exp:Lm(t.value.content,t.loc),modifiers:[],loc:t.loc})})},Lm=(e,t)=>{const u=xl(e);return X(JSON.stringify(u),!1,t,3)};function iu(e,t){return me(e,t)}const Bm=(e,t,u)=>{const{exp:n,loc:r}=e;return n||u.onError(iu(53,r)),t.children.length&&(u.onError(iu(54,r)),t.children.length=0),{props:[xe(X("innerHTML",!0,r),n||X("",!0))]}},Vm=(e,t,u)=>{const{exp:n,loc:r}=e;return n||u.onError(iu(55,r)),t.children.length&&(u.onError(iu(56,r)),t.children.length=0),{props:[xe(X("textContent",!0),n?tt(n,u)>0?n:Ae(u.helperString(Ci),[n],r):X("",!0))]}},$m=(e,t,u)=>{const n=tf(e,t,u);if(!n.props.length||t.tagType===1)return n;e.arg&&u.onError(iu(58,e.arg.loc));const{tag:r}=t,i=u.isCustomElement(r);if(r==="input"||r==="textarea"||r==="select"||i){let s=rf,o=!1;if(r==="input"||i){const c=ki(t,"type");if(c){if(c.type===7)s=Fs;else if(c.value)switch(c.value.content){case"radio":s=uf;break;case"checkbox":s=nf;break;case"file":o=!0,u.onError(iu(59,e.loc));break}}else C2(t)&&(s=Fs)}else r==="select"&&(s=sf);o||(n.needRuntime=u.helper(s))}else u.onError(iu(57,e.loc));return n.props=n.props.filter(s=>!(s.key.type===4&&s.key.content==="modelValue")),n},Hm=Be("passive,once,capture"),Um=Be("stop,prevent,self,ctrl,shift,alt,meta,exact,middle"),qm=Be("left,right"),df=Be("onkeyup,onkeydown,onkeypress"),jm=(e,t,u,n)=>{const r=[],i=[],s=[];for(let o=0;o<t.length;o++){const c=t[o].content;c==="native"&&Vn("COMPILER_V_ON_NATIVE",u)||Hm(c)?s.push(c):qm(c)?Ge(e)?df(e.content.toLowerCase())?r.push(c):i.push(c):(r.push(c),i.push(c)):Um(c)?i.push(c):r.push(c)}return{keyModifiers:r,nonKeyModifiers:i,eventOptionModifiers:s}},Xc=(e,t)=>Ge(e)&&e.content.toLowerCase()==="onclick"?X(t,!0):e.type!==4?gt(["(",e,`) === "onClick" ? "${t}" : (`,e,")"]):e,zm=(e,t,u)=>ef(e,t,u,n=>{const{modifiers:r}=e;if(!r.length)return n;let{key:i,value:s}=n.props[0];const{keyModifiers:o,nonKeyModifiers:c,eventOptionModifiers:l}=jm(i,r,u,e.loc);if(c.includes("right")&&(i=Xc(i,"onContextmenu")),c.includes("middle")&&(i=Xc(i,"onMouseup")),c.length&&(s=Ae(u.helper(of),[s,JSON.stringify(c)])),o.length&&(!Ge(i)||df(i.content.toLowerCase()))&&(s=Ae(u.helper(cf),[s,JSON.stringify(o)])),l.length){const a=l.map(Tu).join("");i=Ge(i)?X(`${i.content}${a}`,!0):gt(["(",i,`) + "${a}"`])}return{props:[xe(i,s)]}}),Km=(e,t,u)=>{const{exp:n,loc:r}=e;return n||u.onError(iu(61,r)),{props:[],needRuntime:u.helper(lf)}},Wm=(e,t)=>{e.type===1&&e.tagType===0&&(e.tag==="script"||e.tag==="style")&&t.removeNode()},Gm=[Pm],Zm={cloak:Nm,html:Bm,text:Vm,model:$m,on:zm,show:Km};function Jm(e,t={}){return Om(e,te({},Mm,t,{nodeTransforms:[Wm,...Gm,...t.nodeTransforms||[]],directiveTransforms:te({},Zm,t.directiveTransforms||{}),transformHoist:null}))}/**
+* vue v3.5.13
+* (c) 2018-present Yuxi (Evan) You and Vue contributors
+* @license MIT
+**/const Qc=Object.create(null);function Ym(e,t){if(!Z(e))if(e.nodeType)e=e.innerHTML;else return we;const u=Gf(e,t),n=Qc[u];if(n)return n;if(e[0]==="#"){const o=document.querySelector(e);e=o?o.innerHTML:""}const r=te({hoistStatic:!0,onError:void 0,onWarn:we},t);!r.isCustomElement&&typeof customElements<"u"&&(r.isCustomElement=o=>!!customElements.get(o));const{code:i}=Jm(e,r),s=new Function("Vue",i)(n2);return s._rc=!0,Qc[u]=s}Ya(Ym);const Xm={stream:!0,temperature:.2};let Qm=null;class el extends Error{constructor(t,u,n){super(t),this.name=u}}async function*eb(e,t={},u={}){var p;let n=u.controller;const r=((p=u.api_url)==null?void 0:p.replace(/\/+$/,""))||"";n||(n=new AbortController);const i={...Xm,...t,prompt:e},s=await fetch(`${r}${u.endpoint||"/completion"}`,{method:"POST",body:JSON.stringify(i),headers:{Connection:"keep-alive","Content-Type":"application/json",Accept:"text/event-stream",...t.api_key?{Authorization:`Bearer ${t.api_key}`}:{}},signal:n.signal});if(s.status!==200)try{const d=await s.json();if(d&&d.error&&d.error.message)throw new el(d.error.message,"ServerError")}catch(d){throw new el(d.message,"ServerError")}const c=s.body.getReader(),l=new TextDecoder;let a="",f="";try{let d=!0;for(;d;){const h=await c.read();if(h.done)break;const _=f+l.decode(h.value),v=_.endsWith(`
+`);let A=_.split(`
+`);v?f="":f=A.pop();const E=/^(\S+):\s(.*)$/gm;for(const m of A){const g=E.exec(m);if(g){if(h[g[1]]=g[2],h.data==="[DONE]"){d=!1;break}if(h.data&&(h.data=JSON.parse(h.data),a+=h.data.content,yield h,h.data.stop)){h.data.generation_settings&&(Qm=h.data.generation_settings),d=!1;break}if(h.error)try{if(h.error=JSON.parse(h.error),h.error.message.includes("slot unavailable"))throw new Error("slot unavailable");console.error(`llama.cpp error [${h.error.code} - ${h.error.type}]: ${h.error.message}`)}catch{console.error(`llama.cpp error ${h.error}`)}}}}}catch(d){throw d.name!=="AbortError"&&console.error("llama error: ",d),d}finally{n.abort()}return a}const tl={};function tb(e){let t=tl[e];if(t)return t;t=tl[e]=[];for(let u=0;u<128;u++){const n=String.fromCharCode(u);t.push(n)}for(let u=0;u<e.length;u++){const n=e.charCodeAt(u);t[n]="%"+("0"+n.toString(16).toUpperCase()).slice(-2)}return t}function Qu(e,t){typeof t!="string"&&(t=Qu.defaultChars);const u=tb(t);return e.replace(/(%[a-f0-9]{2})+/gi,function(n){let r="";for(let i=0,s=n.length;i<s;i+=3){const o=parseInt(n.slice(i+1,i+3),16);if(o<128){r+=u[o];continue}if((o&224)===192&&i+3<s){const c=parseInt(n.slice(i+4,i+6),16);if((c&192)===128){const l=o<<6&1984|c&63;l<128?r+="��":r+=String.fromCharCode(l),i+=3;continue}}if((o&240)===224&&i+6<s){const c=parseInt(n.slice(i+4,i+6),16),l=parseInt(n.slice(i+7,i+9),16);if((c&192)===128&&(l&192)===128){const a=o<<12&61440|c<<6&4032|l&63;a<2048||a>=55296&&a<=57343?r+="���":r+=String.fromCharCode(a),i+=6;continue}}if((o&248)===240&&i+9<s){const c=parseInt(n.slice(i+4,i+6),16),l=parseInt(n.slice(i+7,i+9),16),a=parseInt(n.slice(i+10,i+12),16);if((c&192)===128&&(l&192)===128&&(a&192)===128){let f=o<<18&1835008|c<<12&258048|l<<6&4032|a&63;f<65536||f>1114111?r+="����":(f-=65536,r+=String.fromCharCode(55296+(f>>10),56320+(f&1023))),i+=9;continue}}r+="�"}return r})}Qu.defaultChars=";/?:@&=+$,#";Qu.componentChars="";const ul={};function ub(e){let t=ul[e];if(t)return t;t=ul[e]=[];for(let u=0;u<128;u++){const n=String.fromCharCode(u);/^[0-9a-z]$/i.test(n)?t.push(n):t.push("%"+("0"+u.toString(16).toUpperCase()).slice(-2))}for(let u=0;u<e.length;u++)t[e.charCodeAt(u)]=e[u];return t}function er(e,t,u){typeof t!="string"&&(u=t,t=er.defaultChars),typeof u>"u"&&(u=!0);const n=ub(t);let r="";for(let i=0,s=e.length;i<s;i++){const o=e.charCodeAt(i);if(u&&o===37&&i+2<s&&/^[0-9a-f]{2}$/i.test(e.slice(i+1,i+3))){r+=e.slice(i,i+3),i+=2;continue}if(o<128){r+=n[o];continue}if(o>=55296&&o<=57343){if(o>=55296&&o<=56319&&i+1<s){const c=e.charCodeAt(i+1);if(c>=56320&&c<=57343){r+=encodeURIComponent(e[i]+e[i+1]),i++;continue}}r+="%EF%BF%BD";continue}r+=encodeURIComponent(e[i])}return r}er.defaultChars=";/?:@&=+$,-_.!~*'()#";er.componentChars="-_.!~*'()";function Ro(e){let t="";return t+=e.protocol||"",t+=e.slashes?"//":"",t+=e.auth?e.auth+"@":"",e.hostname&&e.hostname.indexOf(":")!==-1?t+="["+e.hostname+"]":t+=e.hostname||"",t+=e.port?":"+e.port:"",t+=e.pathname||"",t+=e.search||"",t+=e.hash||"",t}function Yr(){this.protocol=null,this.slashes=null,this.auth=null,this.port=null,this.hostname=null,this.hash=null,this.search=null,this.pathname=null}const nb=/^([a-z0-9.+-]+:)/i,rb=/:[0-9]*$/,ib=/^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/,sb=["<",">",'"',"`"," ","\r",`
+`,"	"],ob=["{","}","|","\\","^","`"].concat(sb),cb=["'"].concat(ob),nl=["%","/","?",";","#"].concat(cb),rl=["/","?","#"],lb=255,il=/^[+a-z0-9A-Z_-]{0,63}$/,ab=/^([+a-z0-9A-Z_-]{0,63})(.*)$/,sl={javascript:!0,"javascript:":!0},ol={http:!0,https:!0,ftp:!0,gopher:!0,file:!0,"http:":!0,"https:":!0,"ftp:":!0,"gopher:":!0,"file:":!0};function Mo(e,t){if(e&&e instanceof Yr)return e;const u=new Yr;return u.parse(e,t),u}Yr.prototype.parse=function(e,t){let u,n,r,i=e;if(i=i.trim(),!t&&e.split("#").length===1){const l=ib.exec(i);if(l)return this.pathname=l[1],l[2]&&(this.search=l[2]),this}let s=nb.exec(i);if(s&&(s=s[0],u=s.toLowerCase(),this.protocol=s,i=i.substr(s.length)),(t||s||i.match(/^\/\/[^@\/]+@[^@\/]+/))&&(r=i.substr(0,2)==="//",r&&!(s&&sl[s])&&(i=i.substr(2),this.slashes=!0)),!sl[s]&&(r||s&&!ol[s])){let l=-1;for(let h=0;h<rl.length;h++)n=i.indexOf(rl[h]),n!==-1&&(l===-1||n<l)&&(l=n);let a,f;l===-1?f=i.lastIndexOf("@"):f=i.lastIndexOf("@",l),f!==-1&&(a=i.slice(0,f),i=i.slice(f+1),this.auth=a),l=-1;for(let h=0;h<nl.length;h++)n=i.indexOf(nl[h]),n!==-1&&(l===-1||n<l)&&(l=n);l===-1&&(l=i.length),i[l-1]===":"&&l--;const p=i.slice(0,l);i=i.slice(l),this.parseHost(p),this.hostname=this.hostname||"";const d=this.hostname[0]==="["&&this.hostname[this.hostname.length-1]==="]";if(!d){const h=this.hostname.split(/\./);for(let _=0,v=h.length;_<v;_++){const A=h[_];if(A&&!A.match(il)){let E="";for(let m=0,g=A.length;m<g;m++)A.charCodeAt(m)>127?E+="x":E+=A[m];if(!E.match(il)){const m=h.slice(0,_),g=h.slice(_+1),x=A.match(ab);x&&(m.push(x[1]),g.unshift(x[2])),g.length&&(i=g.join(".")+i),this.hostname=m.join(".");break}}}}this.hostname.length>lb&&(this.hostname=""),d&&(this.hostname=this.hostname.substr(1,this.hostname.length-2))}const o=i.indexOf("#");o!==-1&&(this.hash=i.substr(o),i=i.slice(0,o));const c=i.indexOf("?");return c!==-1&&(this.search=i.substr(c),i=i.slice(0,c)),i&&(this.pathname=i),ol[u]&&this.hostname&&!this.pathname&&(this.pathname=""),this};Yr.prototype.parseHost=function(e){let t=rb.exec(e);t&&(t=t[0],t!==":"&&(this.port=t.substr(1)),e=e.substr(0,e.length-t.length)),e&&(this.hostname=e)};const fb=Object.freeze(Object.defineProperty({__proto__:null,decode:Qu,encode:er,format:Ro,parse:Mo},Symbol.toStringTag,{value:"Module"})),hf=/[\0-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/,pf=/[\0-\x1F\x7F-\x9F]/,db=/[\xAD\u0600-\u0605\u061C\u06DD\u070F\u0890\u0891\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD80D[\uDC30-\uDC3F]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F]/,Po=/[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061D-\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1B7D\u1B7E\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u2E52-\u2E5D\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDEAD\uDF55-\uDF59\uDF86-\uDF89]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5A\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDEB9\uDF3C-\uDF3E]|\uD806[\uDC3B\uDD44-\uDD46\uDDE2\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2\uDF00-\uDF09]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8\uDF43-\uDF4F\uDFFF]|\uD809[\uDC70-\uDC74]|\uD80B[\uDFF1\uDFF2]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A\uDFE2]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]/,mf=/[\$\+<->\^`\|~\xA2-\xA6\xA8\xA9\xAC\xAE-\xB1\xB4\xB8\xD7\xF7\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u0384\u0385\u03F6\u0482\u058D-\u058F\u0606-\u0608\u060B\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u07FE\u07FF\u0888\u09F2\u09F3\u09FA\u09FB\u0AF1\u0B70\u0BF3-\u0BFA\u0C7F\u0D4F\u0D79\u0E3F\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u166D\u17DB\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u1FBD\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF\u1FFD\u1FFE\u2044\u2052\u207A-\u207C\u208A-\u208C\u20A0-\u20C0\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u2140-\u2144\u214A-\u214D\u214F\u218A\u218B\u2190-\u2307\u230C-\u2328\u232B-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u2767\u2794-\u27C4\u27C7-\u27E5\u27F0-\u2982\u2999-\u29D7\u29DC-\u29FB\u29FE-\u2B73\u2B76-\u2B95\u2B97-\u2BFF\u2CE5-\u2CEA\u2E50\u2E51\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFF\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u309B\u309C\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u31EF\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA700-\uA716\uA720\uA721\uA789\uA78A\uA828-\uA82B\uA836-\uA839\uAA77-\uAA79\uAB5B\uAB6A\uAB6B\uFB29\uFBB2-\uFBC2\uFD40-\uFD4F\uFDCF\uFDFC-\uFDFF\uFE62\uFE64-\uFE66\uFE69\uFF04\uFF0B\uFF1C-\uFF1E\uFF3E\uFF40\uFF5C\uFF5E\uFFE0-\uFFE6\uFFE8-\uFFEE\uFFFC\uFFFD]|\uD800[\uDD37-\uDD3F\uDD79-\uDD89\uDD8C-\uDD8E\uDD90-\uDD9C\uDDA0\uDDD0-\uDDFC]|\uD802[\uDC77\uDC78\uDEC8]|\uD805\uDF3F|\uD807[\uDFD5-\uDFF1]|\uD81A[\uDF3C-\uDF3F\uDF45]|\uD82F\uDC9C|\uD833[\uDF50-\uDFC3]|\uD834[\uDC00-\uDCF5\uDD00-\uDD26\uDD29-\uDD64\uDD6A-\uDD6C\uDD83\uDD84\uDD8C-\uDDA9\uDDAE-\uDDEA\uDE00-\uDE41\uDE45\uDF00-\uDF56]|\uD835[\uDEC1\uDEDB\uDEFB\uDF15\uDF35\uDF4F\uDF6F\uDF89\uDFA9\uDFC3]|\uD836[\uDC00-\uDDFF\uDE37-\uDE3A\uDE6D-\uDE74\uDE76-\uDE83\uDE85\uDE86]|\uD838[\uDD4F\uDEFF]|\uD83B[\uDCAC\uDCB0\uDD2E\uDEF0\uDEF1]|\uD83C[\uDC00-\uDC2B\uDC30-\uDC93\uDCA0-\uDCAE\uDCB1-\uDCBF\uDCC1-\uDCCF\uDCD1-\uDCF5\uDD0D-\uDDAD\uDDE6-\uDE02\uDE10-\uDE3B\uDE40-\uDE48\uDE50\uDE51\uDE60-\uDE65\uDF00-\uDFFF]|\uD83D[\uDC00-\uDED7\uDEDC-\uDEEC\uDEF0-\uDEFC\uDF00-\uDF76\uDF7B-\uDFD9\uDFE0-\uDFEB\uDFF0]|\uD83E[\uDC00-\uDC0B\uDC10-\uDC47\uDC50-\uDC59\uDC60-\uDC87\uDC90-\uDCAD\uDCB0\uDCB1\uDD00-\uDE53\uDE60-\uDE6D\uDE70-\uDE7C\uDE80-\uDE88\uDE90-\uDEBD\uDEBF-\uDEC5\uDECE-\uDEDB\uDEE0-\uDEE8\uDEF0-\uDEF8\uDF00-\uDF92\uDF94-\uDFCA]/,bf=/[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/,hb=Object.freeze(Object.defineProperty({__proto__:null,Any:hf,Cc:pf,Cf:db,P:Po,S:mf,Z:bf},Symbol.toStringTag,{value:"Module"})),pb=new Uint16Array('ᵁ<Õıʊҝջאٵ۞ޢߖࠏ੊ઑඡ๭༉༦჊ረዡᐕᒝᓃᓟᔥ\0\0\0\0\0\0ᕫᛍᦍᰒᷝ὾⁠↰⊍⏀⏻⑂⠤⤒ⴈ⹈⿎〖㊺㘹㞬㣾㨨㩱㫠㬮ࠀEMabcfglmnoprstu\\bfms¦³¹ÈÏlig耻Æ䃆P耻&䀦cute耻Á䃁reve;䄂Āiyx}rc耻Â䃂;䐐r;쀀𝔄rave耻À䃀pha;䎑acr;䄀d;橓Āgp¡on;䄄f;쀀𝔸plyFunction;恡ing耻Å䃅Ācs¾Ãr;쀀𝒜ign;扔ilde耻Ã䃃ml耻Ä䃄ЀaceforsuåûþėĜĢħĪĀcrêòkslash;或Ŷöø;櫧ed;挆y;䐑ƀcrtąċĔause;戵noullis;愬a;䎒r;쀀𝔅pf;쀀𝔹eve;䋘còēmpeq;扎܀HOacdefhilorsuōőŖƀƞƢƵƷƺǜȕɳɸɾcy;䐧PY耻©䂩ƀcpyŝŢźute;䄆Ā;iŧŨ拒talDifferentialD;慅leys;愭ȀaeioƉƎƔƘron;䄌dil耻Ç䃇rc;䄈nint;戰ot;䄊ĀdnƧƭilla;䂸terDot;䂷òſi;䎧rcleȀDMPTǇǋǑǖot;抙inus;抖lus;投imes;抗oĀcsǢǸkwiseContourIntegral;戲eCurlyĀDQȃȏoubleQuote;思uote;怙ȀlnpuȞȨɇɕonĀ;eȥȦ户;橴ƀgitȯȶȺruent;扡nt;戯ourIntegral;戮ĀfrɌɎ;愂oduct;成nterClockwiseContourIntegral;戳oss;樯cr;쀀𝒞pĀ;Cʄʅ拓ap;才րDJSZacefiosʠʬʰʴʸˋ˗ˡ˦̳ҍĀ;oŹʥtrahd;椑cy;䐂cy;䐅cy;䐏ƀgrsʿ˄ˇger;怡r;憡hv;櫤Āayː˕ron;䄎;䐔lĀ;t˝˞戇a;䎔r;쀀𝔇Āaf˫̧Ācm˰̢riticalȀADGT̖̜̀̆cute;䂴oŴ̋̍;䋙bleAcute;䋝rave;䁠ilde;䋜ond;拄ferentialD;慆Ѱ̽\0\0\0͔͂\0Ѕf;쀀𝔻ƀ;DE͈͉͍䂨ot;惜qual;扐blèCDLRUVͣͲ΂ϏϢϸontourIntegraìȹoɴ͹\0\0ͻ»͉nArrow;懓Āeo·ΤftƀARTΐΖΡrrow;懐ightArrow;懔eåˊngĀLRΫτeftĀARγιrrow;柸ightArrow;柺ightArrow;柹ightĀATϘϞrrow;懒ee;抨pɁϩ\0\0ϯrrow;懑ownArrow;懕erticalBar;戥ǹABLRTaВЪаўѿͼrrowƀ;BUНОТ憓ar;椓pArrow;懵reve;䌑eft˒к\0ц\0ѐightVector;楐eeVector;楞ectorĀ;Bљњ憽ar;楖ightǔѧ\0ѱeeVector;楟ectorĀ;BѺѻ懁ar;楗eeĀ;A҆҇护rrow;憧ĀctҒҗr;쀀𝒟rok;䄐ࠀNTacdfglmopqstuxҽӀӄӋӞӢӧӮӵԡԯԶՒ՝ՠեG;䅊H耻Ð䃐cute耻É䃉ƀaiyӒӗӜron;䄚rc耻Ê䃊;䐭ot;䄖r;쀀𝔈rave耻È䃈ement;戈ĀapӺӾcr;䄒tyɓԆ\0\0ԒmallSquare;旻erySmallSquare;斫ĀgpԦԪon;䄘f;쀀𝔼silon;䎕uĀaiԼՉlĀ;TՂՃ橵ilde;扂librium;懌Āci՗՚r;愰m;橳a;䎗ml耻Ë䃋Āipժկsts;戃onentialE;慇ʀcfiosօֈ֍ֲ׌y;䐤r;쀀𝔉lledɓ֗\0\0֣mallSquare;旼erySmallSquare;斪Ͱֺ\0ֿ\0\0ׄf;쀀𝔽All;戀riertrf;愱cò׋؀JTabcdfgorstר׬ׯ׺؀ؒؖ؛؝أ٬ٲcy;䐃耻>䀾mmaĀ;d׷׸䎓;䏜reve;䄞ƀeiy؇،ؐdil;䄢rc;䄜;䐓ot;䄠r;쀀𝔊;拙pf;쀀𝔾eater̀EFGLSTصلَٖٛ٦qualĀ;Lؾؿ扥ess;招ullEqual;执reater;檢ess;扷lantEqual;橾ilde;扳cr;쀀𝒢;扫ЀAacfiosuڅڋږڛڞڪھۊRDcy;䐪Āctڐڔek;䋇;䁞irc;䄤r;愌lbertSpace;愋ǰگ\0ڲf;愍izontalLine;攀Āctۃۅòکrok;䄦mpńېۘownHumðįqual;扏܀EJOacdfgmnostuۺ۾܃܇܎ܚܞܡܨ݄ݸދޏޕcy;䐕lig;䄲cy;䐁cute耻Í䃍Āiyܓܘrc耻Î䃎;䐘ot;䄰r;愑rave耻Ì䃌ƀ;apܠܯܿĀcgܴܷr;䄪inaryI;慈lieóϝǴ݉\0ݢĀ;eݍݎ戬Āgrݓݘral;戫section;拂isibleĀCTݬݲomma;恣imes;恢ƀgptݿރވon;䄮f;쀀𝕀a;䎙cr;愐ilde;䄨ǫޚ\0ޞcy;䐆l耻Ï䃏ʀcfosuެ޷޼߂ߐĀiyޱ޵rc;䄴;䐙r;쀀𝔍pf;쀀𝕁ǣ߇\0ߌr;쀀𝒥rcy;䐈kcy;䐄΀HJacfosߤߨ߽߬߱ࠂࠈcy;䐥cy;䐌ppa;䎚Āey߶߻dil;䄶;䐚r;쀀𝔎pf;쀀𝕂cr;쀀𝒦րJTaceflmostࠥࠩࠬࡐࡣ঳সে্਷ੇcy;䐉耻<䀼ʀcmnpr࠷࠼ࡁࡄࡍute;䄹bda;䎛g;柪lacetrf;愒r;憞ƀaeyࡗ࡜ࡡron;䄽dil;䄻;䐛Āfsࡨ॰tԀACDFRTUVarࡾࢩࢱࣦ࣠ࣼयज़ΐ४Ānrࢃ࢏gleBracket;柨rowƀ;BR࢙࢚࢞憐ar;懤ightArrow;懆eiling;挈oǵࢷ\0ࣃbleBracket;柦nǔࣈ\0࣒eeVector;楡ectorĀ;Bࣛࣜ懃ar;楙loor;挊ightĀAV࣯ࣵrrow;憔ector;楎Āerँगeƀ;AVउऊऐ抣rrow;憤ector;楚iangleƀ;BEतथऩ抲ar;槏qual;抴pƀDTVषूौownVector;楑eeVector;楠ectorĀ;Bॖॗ憿ar;楘ectorĀ;B॥०憼ar;楒ightáΜs̀EFGLSTॾঋকঝঢভqualGreater;拚ullEqual;扦reater;扶ess;檡lantEqual;橽ilde;扲r;쀀𝔏Ā;eঽা拘ftarrow;懚idot;䄿ƀnpw৔ਖਛgȀLRlr৞৷ਂਐeftĀAR০৬rrow;柵ightArrow;柷ightArrow;柶eftĀarγਊightáοightáϊf;쀀𝕃erĀLRਢਬeftArrow;憙ightArrow;憘ƀchtਾੀੂòࡌ;憰rok;䅁;扪Ѐacefiosuਗ਼੝੠੷੼અઋ઎p;椅y;䐜Ādl੥੯iumSpace;恟lintrf;愳r;쀀𝔐nusPlus;戓pf;쀀𝕄cò੶;䎜ҀJacefostuણધભીଔଙඑ඗ඞcy;䐊cute;䅃ƀaey઴હાron;䅇dil;䅅;䐝ƀgswે૰଎ativeƀMTV૓૟૨ediumSpace;怋hiĀcn૦૘ë૙eryThiî૙tedĀGL૸ଆreaterGreateòٳessLesóੈLine;䀊r;쀀𝔑ȀBnptଢନଷ଺reak;恠BreakingSpace;䂠f;愕ڀ;CDEGHLNPRSTV୕ୖ୪୼஡௫ఄ౞಄ದ೘ൡඅ櫬Āou୛୤ngruent;扢pCap;扭oubleVerticalBar;戦ƀlqxஃஊ஛ement;戉ualĀ;Tஒஓ扠ilde;쀀≂̸ists;戄reater΀;EFGLSTஶஷ஽௉௓௘௥扯qual;扱ullEqual;쀀≧̸reater;쀀≫̸ess;批lantEqual;쀀⩾̸ilde;扵umpń௲௽ownHump;쀀≎̸qual;쀀≏̸eĀfsఊధtTriangleƀ;BEచఛడ拪ar;쀀⧏̸qual;括s̀;EGLSTవశ఼ౄోౘ扮qual;扰reater;扸ess;쀀≪̸lantEqual;쀀⩽̸ilde;扴estedĀGL౨౹reaterGreater;쀀⪢̸essLess;쀀⪡̸recedesƀ;ESಒಓಛ技qual;쀀⪯̸lantEqual;拠ĀeiಫಹverseElement;戌ghtTriangleƀ;BEೋೌ೒拫ar;쀀⧐̸qual;拭ĀquೝഌuareSuĀbp೨೹setĀ;E೰ೳ쀀⊏̸qual;拢ersetĀ;Eഃആ쀀⊐̸qual;拣ƀbcpഓതൎsetĀ;Eഛഞ쀀⊂⃒qual;抈ceedsȀ;ESTലള഻െ抁qual;쀀⪰̸lantEqual;拡ilde;쀀≿̸ersetĀ;E൘൛쀀⊃⃒qual;抉ildeȀ;EFT൮൯൵ൿ扁qual;扄ullEqual;扇ilde;扉erticalBar;戤cr;쀀𝒩ilde耻Ñ䃑;䎝܀Eacdfgmoprstuvලෂ෉෕ෛ෠෧෼ขภยา฿ไlig;䅒cute耻Ó䃓Āiy෎ීrc耻Ô䃔;䐞blac;䅐r;쀀𝔒rave耻Ò䃒ƀaei෮ෲ෶cr;䅌ga;䎩cron;䎟pf;쀀𝕆enCurlyĀDQฎบoubleQuote;怜uote;怘;橔Āclวฬr;쀀𝒪ash耻Ø䃘iŬื฼de耻Õ䃕es;樷ml耻Ö䃖erĀBP๋๠Āar๐๓r;怾acĀek๚๜;揞et;掴arenthesis;揜Ҁacfhilors๿ງຊຏຒດຝະ໼rtialD;戂y;䐟r;쀀𝔓i;䎦;䎠usMinus;䂱Āipຢອncareplanåڝf;愙Ȁ;eio຺ູ໠໤檻cedesȀ;EST່້໏໚扺qual;檯lantEqual;扼ilde;找me;怳Ādp໩໮uct;戏ortionĀ;aȥ໹l;戝Āci༁༆r;쀀𝒫;䎨ȀUfos༑༖༛༟OT耻"䀢r;쀀𝔔pf;愚cr;쀀𝒬؀BEacefhiorsu༾གྷཇའཱིྦྷྪྭ႖ႩႴႾarr;椐G耻®䂮ƀcnrཎནབute;䅔g;柫rĀ;tཛྷཝ憠l;椖ƀaeyཧཬཱron;䅘dil;䅖;䐠Ā;vླྀཹ愜erseĀEUྂྙĀlq྇ྎement;戋uilibrium;懋pEquilibrium;楯r»ཹo;䎡ghtЀACDFTUVa࿁࿫࿳ဢဨၛႇϘĀnr࿆࿒gleBracket;柩rowƀ;BL࿜࿝࿡憒ar;懥eftArrow;懄eiling;按oǵ࿹\0စbleBracket;柧nǔည\0နeeVector;楝ectorĀ;Bဝသ懂ar;楕loor;挋Āerိ၃eƀ;AVဵံြ抢rrow;憦ector;楛iangleƀ;BEၐၑၕ抳ar;槐qual;抵pƀDTVၣၮၸownVector;楏eeVector;楜ectorĀ;Bႂႃ憾ar;楔ectorĀ;B႑႒懀ar;楓Āpuႛ႞f;愝ndImplies;楰ightarrow;懛ĀchႹႼr;愛;憱leDelayed;槴ڀHOacfhimoqstuფჱჷჽᄙᄞᅑᅖᅡᅧᆵᆻᆿĀCcჩხHcy;䐩y;䐨FTcy;䐬cute;䅚ʀ;aeiyᄈᄉᄎᄓᄗ檼ron;䅠dil;䅞rc;䅜;䐡r;쀀𝔖ortȀDLRUᄪᄴᄾᅉownArrow»ОeftArrow»࢚ightArrow»࿝pArrow;憑gma;䎣allCircle;战pf;쀀𝕊ɲᅭ\0\0ᅰt;戚areȀ;ISUᅻᅼᆉᆯ斡ntersection;抓uĀbpᆏᆞsetĀ;Eᆗᆘ抏qual;抑ersetĀ;Eᆨᆩ抐qual;抒nion;抔cr;쀀𝒮ar;拆ȀbcmpᇈᇛሉላĀ;sᇍᇎ拐etĀ;Eᇍᇕqual;抆ĀchᇠህeedsȀ;ESTᇭᇮᇴᇿ扻qual;檰lantEqual;扽ilde;承Tháྌ;我ƀ;esሒሓሣ拑rsetĀ;Eሜም抃qual;抇et»ሓրHRSacfhiorsሾቄ቉ቕ቞ቱቶኟዂወዑORN耻Þ䃞ADE;愢ĀHc቎ቒcy;䐋y;䐦Ābuቚቜ;䀉;䎤ƀaeyብቪቯron;䅤dil;䅢;䐢r;쀀𝔗Āeiቻ኉ǲኀ\0ኇefore;戴a;䎘Ācn኎ኘkSpace;쀀  Space;怉ldeȀ;EFTካኬኲኼ戼qual;扃ullEqual;扅ilde;扈pf;쀀𝕋ipleDot;惛Āctዖዛr;쀀𝒯rok;䅦ૡዷጎጚጦ\0ጬጱ\0\0\0\0\0ጸጽ፷ᎅ\0᏿ᐄᐊᐐĀcrዻጁute耻Ú䃚rĀ;oጇገ憟cir;楉rǣጓ\0጖y;䐎ve;䅬Āiyጞጣrc耻Û䃛;䐣blac;䅰r;쀀𝔘rave耻Ù䃙acr;䅪Ādiፁ፩erĀBPፈ፝Āarፍፐr;䁟acĀekፗፙ;揟et;掵arenthesis;揝onĀ;P፰፱拃lus;抎Āgp፻፿on;䅲f;쀀𝕌ЀADETadps᎕ᎮᎸᏄϨᏒᏗᏳrrowƀ;BDᅐᎠᎤar;椒ownArrow;懅ownArrow;憕quilibrium;楮eeĀ;AᏋᏌ报rrow;憥ownáϳerĀLRᏞᏨeftArrow;憖ightArrow;憗iĀ;lᏹᏺ䏒on;䎥ing;䅮cr;쀀𝒰ilde;䅨ml耻Ü䃜ҀDbcdefosvᐧᐬᐰᐳᐾᒅᒊᒐᒖash;披ar;櫫y;䐒ashĀ;lᐻᐼ抩;櫦Āerᑃᑅ;拁ƀbtyᑌᑐᑺar;怖Ā;iᑏᑕcalȀBLSTᑡᑥᑪᑴar;戣ine;䁼eparator;杘ilde;所ThinSpace;怊r;쀀𝔙pf;쀀𝕍cr;쀀𝒱dash;抪ʀcefosᒧᒬᒱᒶᒼirc;䅴dge;拀r;쀀𝔚pf;쀀𝕎cr;쀀𝒲Ȁfiosᓋᓐᓒᓘr;쀀𝔛;䎞pf;쀀𝕏cr;쀀𝒳ҀAIUacfosuᓱᓵᓹᓽᔄᔏᔔᔚᔠcy;䐯cy;䐇cy;䐮cute耻Ý䃝Āiyᔉᔍrc;䅶;䐫r;쀀𝔜pf;쀀𝕐cr;쀀𝒴ml;䅸ЀHacdefosᔵᔹᔿᕋᕏᕝᕠᕤcy;䐖cute;䅹Āayᕄᕉron;䅽;䐗ot;䅻ǲᕔ\0ᕛoWidtè૙a;䎖r;愨pf;愤cr;쀀𝒵௡ᖃᖊᖐ\0ᖰᖶᖿ\0\0\0\0ᗆᗛᗫᙟ᙭\0ᚕ᚛ᚲᚹ\0ᚾcute耻á䃡reve;䄃̀;Ediuyᖜᖝᖡᖣᖨᖭ戾;쀀∾̳;房rc耻â䃢te肻´̆;䐰lig耻æ䃦Ā;r²ᖺ;쀀𝔞rave耻à䃠ĀepᗊᗖĀfpᗏᗔsym;愵èᗓha;䎱ĀapᗟcĀclᗤᗧr;䄁g;樿ɤᗰ\0\0ᘊʀ;adsvᗺᗻᗿᘁᘇ戧nd;橕;橜lope;橘;橚΀;elmrszᘘᘙᘛᘞᘿᙏᙙ戠;榤e»ᘙsdĀ;aᘥᘦ戡ѡᘰᘲᘴᘶᘸᘺᘼᘾ;榨;榩;榪;榫;榬;榭;榮;榯tĀ;vᙅᙆ戟bĀ;dᙌᙍ抾;榝Āptᙔᙗh;戢»¹arr;捼Āgpᙣᙧon;䄅f;쀀𝕒΀;Eaeiop዁ᙻᙽᚂᚄᚇᚊ;橰cir;橯;扊d;手s;䀧roxĀ;e዁ᚒñᚃing耻å䃥ƀctyᚡᚦᚨr;쀀𝒶;䀪mpĀ;e዁ᚯñʈilde耻ã䃣ml耻ä䃤Āciᛂᛈoninôɲnt;樑ࠀNabcdefiklnoprsu᛭ᛱᜰ᜼ᝃᝈ᝸᝽០៦ᠹᡐᜍ᤽᥈ᥰot;櫭Ācrᛶ᜞kȀcepsᜀᜅᜍᜓong;扌psilon;䏶rime;怵imĀ;e᜚᜛戽q;拍Ŷᜢᜦee;抽edĀ;gᜬᜭ挅e»ᜭrkĀ;t፜᜷brk;掶Āoyᜁᝁ;䐱quo;怞ʀcmprtᝓ᝛ᝡᝤᝨausĀ;eĊĉptyv;榰séᜌnoõēƀahwᝯ᝱ᝳ;䎲;愶een;扬r;쀀𝔟g΀costuvwឍឝឳេ៕៛៞ƀaiuបពរðݠrc;旯p»፱ƀdptឤឨឭot;樀lus;樁imes;樂ɱឹ\0\0ើcup;樆ar;昅riangleĀdu៍្own;施p;斳plus;樄eåᑄåᒭarow;植ƀako៭ᠦᠵĀcn៲ᠣkƀlst៺֫᠂ozenge;槫riangleȀ;dlr᠒᠓᠘᠝斴own;斾eft;旂ight;斸k;搣Ʊᠫ\0ᠳƲᠯ\0ᠱ;斒;斑4;斓ck;斈ĀeoᠾᡍĀ;qᡃᡆ쀀=⃥uiv;쀀≡⃥t;挐Ȁptwxᡙᡞᡧᡬf;쀀𝕓Ā;tᏋᡣom»Ꮜtie;拈؀DHUVbdhmptuvᢅᢖᢪᢻᣗᣛᣬ᣿ᤅᤊᤐᤡȀLRlrᢎᢐᢒᢔ;敗;敔;敖;敓ʀ;DUduᢡᢢᢤᢦᢨ敐;敦;敩;敤;敧ȀLRlrᢳᢵᢷᢹ;敝;敚;敜;教΀;HLRhlrᣊᣋᣍᣏᣑᣓᣕ救;敬;散;敠;敫;敢;敟ox;槉ȀLRlrᣤᣦᣨᣪ;敕;敒;攐;攌ʀ;DUduڽ᣷᣹᣻᣽;敥;敨;攬;攴inus;抟lus;択imes;抠ȀLRlrᤙᤛᤝ᤟;敛;敘;攘;攔΀;HLRhlrᤰᤱᤳᤵᤷ᤻᤹攂;敪;敡;敞;攼;攤;攜Āevģ᥂bar耻¦䂦Ȁceioᥑᥖᥚᥠr;쀀𝒷mi;恏mĀ;e᜚᜜lƀ;bhᥨᥩᥫ䁜;槅sub;柈Ŭᥴ᥾lĀ;e᥹᥺怢t»᥺pƀ;Eeįᦅᦇ;檮Ā;qۜۛೡᦧ\0᧨ᨑᨕᨲ\0ᨷᩐ\0\0᪴\0\0᫁\0\0ᬡᬮ᭍᭒\0᯽\0ᰌƀcpr᦭ᦲ᧝ute;䄇̀;abcdsᦿᧀᧄ᧊᧕᧙戩nd;橄rcup;橉Āau᧏᧒p;橋p;橇ot;橀;쀀∩︀Āeo᧢᧥t;恁îړȀaeiu᧰᧻ᨁᨅǰ᧵\0᧸s;橍on;䄍dil耻ç䃧rc;䄉psĀ;sᨌᨍ橌m;橐ot;䄋ƀdmnᨛᨠᨦil肻¸ƭptyv;榲t脀¢;eᨭᨮ䂢räƲr;쀀𝔠ƀceiᨽᩀᩍy;䑇ckĀ;mᩇᩈ朓ark»ᩈ;䏇r΀;Ecefms᩟᩠ᩢᩫ᪤᪪᪮旋;槃ƀ;elᩩᩪᩭ䋆q;扗eɡᩴ\0\0᪈rrowĀlr᩼᪁eft;憺ight;憻ʀRSacd᪒᪔᪖᪚᪟»ཇ;擈st;抛irc;抚ash;抝nint;樐id;櫯cir;槂ubsĀ;u᪻᪼晣it»᪼ˬ᫇᫔᫺\0ᬊonĀ;eᫍᫎ䀺Ā;qÇÆɭ᫙\0\0᫢aĀ;t᫞᫟䀬;䁀ƀ;fl᫨᫩᫫戁îᅠeĀmx᫱᫶ent»᫩eóɍǧ᫾\0ᬇĀ;dኻᬂot;橭nôɆƀfryᬐᬔᬗ;쀀𝕔oäɔ脀©;sŕᬝr;愗Āaoᬥᬩrr;憵ss;朗Ācuᬲᬷr;쀀𝒸Ābpᬼ᭄Ā;eᭁᭂ櫏;櫑Ā;eᭉᭊ櫐;櫒dot;拯΀delprvw᭠᭬᭷ᮂᮬᯔ᯹arrĀlr᭨᭪;椸;椵ɰ᭲\0\0᭵r;拞c;拟arrĀ;p᭿ᮀ憶;椽̀;bcdosᮏᮐᮖᮡᮥᮨ截rcap;橈Āauᮛᮞp;橆p;橊ot;抍r;橅;쀀∪︀Ȁalrv᮵ᮿᯞᯣrrĀ;mᮼᮽ憷;椼yƀevwᯇᯔᯘqɰᯎ\0\0ᯒreã᭳uã᭵ee;拎edge;拏en耻¤䂤earrowĀlrᯮ᯳eft»ᮀight»ᮽeäᯝĀciᰁᰇoninôǷnt;戱lcty;挭ঀAHabcdefhijlorstuwz᰸᰻᰿ᱝᱩᱵᲊᲞᲬᲷ᳻᳿ᴍᵻᶑᶫᶻ᷆᷍rò΁ar;楥Ȁglrs᱈ᱍ᱒᱔ger;怠eth;愸òᄳhĀ;vᱚᱛ怐»ऊūᱡᱧarow;椏aã̕Āayᱮᱳron;䄏;䐴ƀ;ao̲ᱼᲄĀgrʿᲁr;懊tseq;橷ƀglmᲑᲔᲘ耻°䂰ta;䎴ptyv;榱ĀirᲣᲨsht;楿;쀀𝔡arĀlrᲳᲵ»ࣜ»သʀaegsv᳂͸᳖᳜᳠mƀ;oș᳊᳔ndĀ;ș᳑uit;晦amma;䏝in;拲ƀ;io᳧᳨᳸䃷de脀÷;o᳧ᳰntimes;拇nø᳷cy;䑒cɯᴆ\0\0ᴊrn;挞op;挍ʀlptuwᴘᴝᴢᵉᵕlar;䀤f;쀀𝕕ʀ;emps̋ᴭᴷᴽᵂqĀ;d͒ᴳot;扑inus;戸lus;戔quare;抡blebarwedgåúnƀadhᄮᵝᵧownarrowóᲃarpoonĀlrᵲᵶefôᲴighôᲶŢᵿᶅkaro÷གɯᶊ\0\0ᶎrn;挟op;挌ƀcotᶘᶣᶦĀryᶝᶡ;쀀𝒹;䑕l;槶rok;䄑Ādrᶰᶴot;拱iĀ;fᶺ᠖斿Āah᷀᷃ròЩaòྦangle;榦Āci᷒ᷕy;䑟grarr;柿ऀDacdefglmnopqrstuxḁḉḙḸոḼṉṡṾấắẽỡἪἷὄ὎὚ĀDoḆᴴoôᲉĀcsḎḔute耻é䃩ter;橮ȀaioyḢḧḱḶron;䄛rĀ;cḭḮ扖耻ê䃪lon;払;䑍ot;䄗ĀDrṁṅot;扒;쀀𝔢ƀ;rsṐṑṗ檚ave耻è䃨Ā;dṜṝ檖ot;檘Ȁ;ilsṪṫṲṴ檙nters;揧;愓Ā;dṹṺ檕ot;檗ƀapsẅẉẗcr;䄓tyƀ;svẒẓẕ戅et»ẓpĀ1;ẝẤĳạả;怄;怅怃ĀgsẪẬ;䅋p;怂ĀgpẴẸon;䄙f;쀀𝕖ƀalsỄỎỒrĀ;sỊị拕l;槣us;橱iƀ;lvỚớở䎵on»ớ;䏵ȀcsuvỪỳἋἣĀioữḱrc»Ḯɩỹ\0\0ỻíՈantĀglἂἆtr»ṝess»Ṻƀaeiἒ἖Ἒls;䀽st;扟vĀ;DȵἠD;橸parsl;槥ĀDaἯἳot;打rr;楱ƀcdiἾὁỸr;愯oô͒ĀahὉὋ;䎷耻ð䃰Āmrὓὗl耻ë䃫o;悬ƀcipὡὤὧl;䀡sôծĀeoὬὴctatioîՙnentialåչৡᾒ\0ᾞ\0ᾡᾧ\0\0ῆῌ\0ΐ\0ῦῪ \0 ⁚llingdotseñṄy;䑄male;晀ƀilrᾭᾳ῁lig;耀ﬃɩᾹ\0\0᾽g;耀ﬀig;耀ﬄ;쀀𝔣lig;耀ﬁlig;쀀fjƀaltῙ῜ῡt;晭ig;耀ﬂns;斱of;䆒ǰ΅\0ῳf;쀀𝕗ĀakֿῷĀ;vῼ´拔;櫙artint;樍Āao‌⁕Ācs‑⁒α‚‰‸⁅⁈\0⁐β•‥‧‪‬\0‮耻½䂽;慓耻¼䂼;慕;慙;慛Ƴ‴\0‶;慔;慖ʴ‾⁁\0\0⁃耻¾䂾;慗;慜5;慘ƶ⁌\0⁎;慚;慝8;慞l;恄wn;挢cr;쀀𝒻ࢀEabcdefgijlnorstv₂₉₟₥₰₴⃰⃵⃺⃿℃ℒℸ̗ℾ⅒↞Ā;lٍ₇;檌ƀcmpₐₕ₝ute;䇵maĀ;dₜ᳚䎳;檆reve;䄟Āiy₪₮rc;䄝;䐳ot;䄡Ȁ;lqsؾق₽⃉ƀ;qsؾٌ⃄lanô٥Ȁ;cdl٥⃒⃥⃕c;檩otĀ;o⃜⃝檀Ā;l⃢⃣檂;檄Ā;e⃪⃭쀀⋛︀s;檔r;쀀𝔤Ā;gٳ؛mel;愷cy;䑓Ȁ;Eajٚℌℎℐ;檒;檥;檤ȀEaesℛℝ℩ℴ;扩pĀ;p℣ℤ檊rox»ℤĀ;q℮ℯ檈Ā;q℮ℛim;拧pf;쀀𝕘Āci⅃ⅆr;愊mƀ;el٫ⅎ⅐;檎;檐茀>;cdlqr׮ⅠⅪⅮⅳⅹĀciⅥⅧ;檧r;橺ot;拗Par;榕uest;橼ʀadelsↄⅪ←ٖ↛ǰ↉\0↎proø₞r;楸qĀlqؿ↖lesó₈ií٫Āen↣↭rtneqq;쀀≩︀Å↪ԀAabcefkosy⇄⇇⇱⇵⇺∘∝∯≨≽ròΠȀilmr⇐⇔⇗⇛rsðᒄf»․ilôکĀdr⇠⇤cy;䑊ƀ;cwࣴ⇫⇯ir;楈;憭ar;意irc;䄥ƀalr∁∎∓rtsĀ;u∉∊晥it»∊lip;怦con;抹r;쀀𝔥sĀew∣∩arow;椥arow;椦ʀamopr∺∾≃≞≣rr;懿tht;戻kĀlr≉≓eftarrow;憩ightarrow;憪f;쀀𝕙bar;怕ƀclt≯≴≸r;쀀𝒽asè⇴rok;䄧Ābp⊂⊇ull;恃hen»ᱛૡ⊣\0⊪\0⊸⋅⋎\0⋕⋳\0\0⋸⌢⍧⍢⍿\0⎆⎪⎴cute耻í䃭ƀ;iyݱ⊰⊵rc耻î䃮;䐸Ācx⊼⊿y;䐵cl耻¡䂡ĀfrΟ⋉;쀀𝔦rave耻ì䃬Ȁ;inoܾ⋝⋩⋮Āin⋢⋦nt;樌t;戭fin;槜ta;愩lig;䄳ƀaop⋾⌚⌝ƀcgt⌅⌈⌗r;䄫ƀelpܟ⌏⌓inåގarôܠh;䄱f;抷ed;䆵ʀ;cfotӴ⌬⌱⌽⍁are;愅inĀ;t⌸⌹戞ie;槝doô⌙ʀ;celpݗ⍌⍐⍛⍡al;抺Āgr⍕⍙eróᕣã⍍arhk;樗rod;樼Ȁcgpt⍯⍲⍶⍻y;䑑on;䄯f;쀀𝕚a;䎹uest耻¿䂿Āci⎊⎏r;쀀𝒾nʀ;EdsvӴ⎛⎝⎡ӳ;拹ot;拵Ā;v⎦⎧拴;拳Ā;iݷ⎮lde;䄩ǫ⎸\0⎼cy;䑖l耻ï䃯̀cfmosu⏌⏗⏜⏡⏧⏵Āiy⏑⏕rc;䄵;䐹r;쀀𝔧ath;䈷pf;쀀𝕛ǣ⏬\0⏱r;쀀𝒿rcy;䑘kcy;䑔Ѐacfghjos␋␖␢␧␭␱␵␻ppaĀ;v␓␔䎺;䏰Āey␛␠dil;䄷;䐺r;쀀𝔨reen;䄸cy;䑅cy;䑜pf;쀀𝕜cr;쀀𝓀஀ABEHabcdefghjlmnoprstuv⑰⒁⒆⒍⒑┎┽╚▀♎♞♥♹♽⚚⚲⛘❝❨➋⟀⠁⠒ƀart⑷⑺⑼rò৆òΕail;椛arr;椎Ā;gঔ⒋;檋ar;楢ॣ⒥\0⒪\0⒱\0\0\0\0\0⒵Ⓔ\0ⓆⓈⓍ\0⓹ute;䄺mptyv;榴raîࡌbda;䎻gƀ;dlࢎⓁⓃ;榑åࢎ;檅uo耻«䂫rЀ;bfhlpst࢙ⓞⓦⓩ⓫⓮⓱⓵Ā;f࢝ⓣs;椟s;椝ë≒p;憫l;椹im;楳l;憢ƀ;ae⓿─┄檫il;椙Ā;s┉┊檭;쀀⪭︀ƀabr┕┙┝rr;椌rk;杲Āak┢┬cĀek┨┪;䁻;䁛Āes┱┳;榋lĀdu┹┻;榏;榍Ȁaeuy╆╋╖╘ron;䄾Ādi═╔il;䄼ìࢰâ┩;䐻Ȁcqrs╣╦╭╽a;椶uoĀ;rนᝆĀdu╲╷har;楧shar;楋h;憲ʀ;fgqs▋▌উ◳◿扤tʀahlrt▘▤▷◂◨rrowĀ;t࢙□aé⓶arpoonĀdu▯▴own»њp»०eftarrows;懇ightƀahs◍◖◞rrowĀ;sࣴࢧarpoonó྘quigarro÷⇰hreetimes;拋ƀ;qs▋ও◺lanôবʀ;cdgsব☊☍☝☨c;檨otĀ;o☔☕橿Ā;r☚☛檁;檃Ā;e☢☥쀀⋚︀s;檓ʀadegs☳☹☽♉♋pproøⓆot;拖qĀgq♃♅ôউgtò⒌ôছiíলƀilr♕࣡♚sht;楼;쀀𝔩Ā;Eজ♣;檑š♩♶rĀdu▲♮Ā;l॥♳;楪lk;斄cy;䑙ʀ;achtੈ⚈⚋⚑⚖rò◁orneòᴈard;楫ri;旺Āio⚟⚤dot;䅀ustĀ;a⚬⚭掰che»⚭ȀEaes⚻⚽⛉⛔;扨pĀ;p⛃⛄檉rox»⛄Ā;q⛎⛏檇Ā;q⛎⚻im;拦Ѐabnoptwz⛩⛴⛷✚✯❁❇❐Ānr⛮⛱g;柬r;懽rëࣁgƀlmr⛿✍✔eftĀar০✇ightá৲apsto;柼ightá৽parrowĀlr✥✩efô⓭ight;憬ƀafl✶✹✽r;榅;쀀𝕝us;樭imes;樴š❋❏st;戗áፎƀ;ef❗❘᠀旊nge»❘arĀ;l❤❥䀨t;榓ʀachmt❳❶❼➅➇ròࢨorneòᶌarĀ;d྘➃;業;怎ri;抿̀achiqt➘➝ੀ➢➮➻quo;怹r;쀀𝓁mƀ;egল➪➬;檍;檏Ābu┪➳oĀ;rฟ➹;怚rok;䅂萀<;cdhilqrࠫ⟒☹⟜⟠⟥⟪⟰Āci⟗⟙;檦r;橹reå◲mes;拉arr;楶uest;橻ĀPi⟵⟹ar;榖ƀ;ef⠀भ᠛旃rĀdu⠇⠍shar;楊har;楦Āen⠗⠡rtneqq;쀀≨︀Å⠞܀Dacdefhilnopsu⡀⡅⢂⢎⢓⢠⢥⢨⣚⣢⣤ઃ⣳⤂Dot;戺Ȁclpr⡎⡒⡣⡽r耻¯䂯Āet⡗⡙;時Ā;e⡞⡟朠se»⡟Ā;sျ⡨toȀ;dluျ⡳⡷⡻owîҌefôएðᏑker;斮Āoy⢇⢌mma;権;䐼ash;怔asuredangle»ᘦr;쀀𝔪o;愧ƀcdn⢯⢴⣉ro耻µ䂵Ȁ;acdᑤ⢽⣀⣄sôᚧir;櫰ot肻·Ƶusƀ;bd⣒ᤃ⣓戒Ā;uᴼ⣘;横ţ⣞⣡p;櫛ò−ðઁĀdp⣩⣮els;抧f;쀀𝕞Āct⣸⣽r;쀀𝓂pos»ᖝƀ;lm⤉⤊⤍䎼timap;抸ఀGLRVabcdefghijlmoprstuvw⥂⥓⥾⦉⦘⧚⧩⨕⨚⩘⩝⪃⪕⪤⪨⬄⬇⭄⭿⮮ⰴⱧⱼ⳩Āgt⥇⥋;쀀⋙̸Ā;v⥐௏쀀≫⃒ƀelt⥚⥲⥶ftĀar⥡⥧rrow;懍ightarrow;懎;쀀⋘̸Ā;v⥻ే쀀≪⃒ightarrow;懏ĀDd⦎⦓ash;抯ash;抮ʀbcnpt⦣⦧⦬⦱⧌la»˞ute;䅄g;쀀∠⃒ʀ;Eiop඄⦼⧀⧅⧈;쀀⩰̸d;쀀≋̸s;䅉roø඄urĀ;a⧓⧔普lĀ;s⧓ସǳ⧟\0⧣p肻 ଷmpĀ;e௹ఀʀaeouy⧴⧾⨃⨐⨓ǰ⧹\0⧻;橃on;䅈dil;䅆ngĀ;dൾ⨊ot;쀀⩭̸p;橂;䐽ash;怓΀;Aadqsxஒ⨩⨭⨻⩁⩅⩐rr;懗rĀhr⨳⨶k;椤Ā;oᏲᏰot;쀀≐̸uiöୣĀei⩊⩎ar;椨í஘istĀ;s஠டr;쀀𝔫ȀEest௅⩦⩹⩼ƀ;qs஼⩭௡ƀ;qs஼௅⩴lanô௢ií௪Ā;rஶ⪁»ஷƀAap⪊⪍⪑rò⥱rr;憮ar;櫲ƀ;svྍ⪜ྌĀ;d⪡⪢拼;拺cy;䑚΀AEadest⪷⪺⪾⫂⫅⫶⫹rò⥦;쀀≦̸rr;憚r;急Ȁ;fqs఻⫎⫣⫯tĀar⫔⫙rro÷⫁ightarro÷⪐ƀ;qs఻⪺⫪lanôౕĀ;sౕ⫴»శiíౝĀ;rవ⫾iĀ;eచథiäඐĀpt⬌⬑f;쀀𝕟膀¬;in⬙⬚⬶䂬nȀ;Edvஉ⬤⬨⬮;쀀⋹̸ot;쀀⋵̸ǡஉ⬳⬵;拷;拶iĀ;vಸ⬼ǡಸ⭁⭃;拾;拽ƀaor⭋⭣⭩rȀ;ast୻⭕⭚⭟lleì୻l;쀀⫽⃥;쀀∂̸lint;樔ƀ;ceಒ⭰⭳uåಥĀ;cಘ⭸Ā;eಒ⭽ñಘȀAait⮈⮋⮝⮧rò⦈rrƀ;cw⮔⮕⮙憛;쀀⤳̸;쀀↝̸ghtarrow»⮕riĀ;eೋೖ΀chimpqu⮽⯍⯙⬄୸⯤⯯Ȁ;cerല⯆ഷ⯉uå൅;쀀𝓃ortɭ⬅\0\0⯖ará⭖mĀ;e൮⯟Ā;q൴൳suĀbp⯫⯭å೸åഋƀbcp⯶ⰑⰙȀ;Ees⯿ⰀഢⰄ抄;쀀⫅̸etĀ;eഛⰋqĀ;qണⰀcĀ;eലⰗñസȀ;EesⰢⰣൟⰧ抅;쀀⫆̸etĀ;e൘ⰮqĀ;qൠⰣȀgilrⰽⰿⱅⱇìௗlde耻ñ䃱çృiangleĀlrⱒⱜeftĀ;eచⱚñదightĀ;eೋⱥñ೗Ā;mⱬⱭ䎽ƀ;esⱴⱵⱹ䀣ro;愖p;怇ҀDHadgilrsⲏⲔⲙⲞⲣⲰⲶⳓⳣash;抭arr;椄p;쀀≍⃒ash;抬ĀetⲨⲬ;쀀≥⃒;쀀>⃒nfin;槞ƀAetⲽⳁⳅrr;椂;쀀≤⃒Ā;rⳊⳍ쀀<⃒ie;쀀⊴⃒ĀAtⳘⳜrr;椃rie;쀀⊵⃒im;쀀∼⃒ƀAan⳰⳴ⴂrr;懖rĀhr⳺⳽k;椣Ā;oᏧᏥear;椧ቓ᪕\0\0\0\0\0\0\0\0\0\0\0\0\0ⴭ\0ⴸⵈⵠⵥ⵲ⶄᬇ\0\0ⶍⶫ\0ⷈⷎ\0ⷜ⸙⸫⸾⹃Ācsⴱ᪗ute耻ó䃳ĀiyⴼⵅrĀ;c᪞ⵂ耻ô䃴;䐾ʀabios᪠ⵒⵗǈⵚlac;䅑v;樸old;榼lig;䅓Ācr⵩⵭ir;榿;쀀𝔬ͯ⵹\0\0⵼\0ⶂn;䋛ave耻ò䃲;槁Ābmⶈ෴ar;榵Ȁacitⶕ⶘ⶥⶨrò᪀Āir⶝ⶠr;榾oss;榻nå๒;槀ƀaeiⶱⶵⶹcr;䅍ga;䏉ƀcdnⷀⷅǍron;䎿;榶pf;쀀𝕠ƀaelⷔ⷗ǒr;榷rp;榹΀;adiosvⷪⷫⷮ⸈⸍⸐⸖戨rò᪆Ȁ;efmⷷⷸ⸂⸅橝rĀ;oⷾⷿ愴f»ⷿ耻ª䂪耻º䂺gof;抶r;橖lope;橗;橛ƀclo⸟⸡⸧ò⸁ash耻ø䃸l;折iŬⸯ⸴de耻õ䃵esĀ;aǛ⸺s;樶ml耻ö䃶bar;挽ૡ⹞\0⹽\0⺀⺝\0⺢⺹\0\0⻋ຜ\0⼓\0\0⼫⾼\0⿈rȀ;astЃ⹧⹲຅脀¶;l⹭⹮䂶leìЃɩ⹸\0\0⹻m;櫳;櫽y;䐿rʀcimpt⺋⺏⺓ᡥ⺗nt;䀥od;䀮il;怰enk;怱r;쀀𝔭ƀimo⺨⺰⺴Ā;v⺭⺮䏆;䏕maô੶ne;明ƀ;tv⺿⻀⻈䏀chfork»´;䏖Āau⻏⻟nĀck⻕⻝kĀ;h⇴⻛;愎ö⇴sҀ;abcdemst⻳⻴ᤈ⻹⻽⼄⼆⼊⼎䀫cir;樣ir;樢Āouᵀ⼂;樥;橲n肻±ຝim;樦wo;樧ƀipu⼙⼠⼥ntint;樕f;쀀𝕡nd耻£䂣Ԁ;Eaceinosu່⼿⽁⽄⽇⾁⾉⾒⽾⾶;檳p;檷uå໙Ā;c໎⽌̀;acens່⽙⽟⽦⽨⽾pproø⽃urlyeñ໙ñ໎ƀaes⽯⽶⽺pprox;檹qq;檵im;拨iíໟmeĀ;s⾈ຮ怲ƀEas⽸⾐⽺ð⽵ƀdfp໬⾙⾯ƀals⾠⾥⾪lar;挮ine;挒urf;挓Ā;t໻⾴ï໻rel;抰Āci⿀⿅r;쀀𝓅;䏈ncsp;怈̀fiopsu⿚⋢⿟⿥⿫⿱r;쀀𝔮pf;쀀𝕢rime;恗cr;쀀𝓆ƀaeo⿸〉〓tĀei⿾々rnionóڰnt;樖stĀ;e【】䀿ñἙô༔઀ABHabcdefhilmnoprstux぀けさすムㄎㄫㅇㅢㅲㆎ㈆㈕㈤㈩㉘㉮㉲㊐㊰㊷ƀartぇおがròႳòϝail;検aròᱥar;楤΀cdenqrtとふへみわゔヌĀeuねぱ;쀀∽̱te;䅕iãᅮmptyv;榳gȀ;del࿑らるろ;榒;榥å࿑uo耻»䂻rր;abcfhlpstw࿜ガクシスゼゾダッデナp;極Ā;f࿠ゴs;椠;椳s;椞ë≝ð✮l;楅im;楴l;憣;憝Āaiパフil;椚oĀ;nホボ戶aló༞ƀabrョリヮrò៥rk;杳ĀakンヽcĀekヹ・;䁽;䁝Āes㄂㄄;榌lĀduㄊㄌ;榎;榐Ȁaeuyㄗㄜㄧㄩron;䅙Ādiㄡㄥil;䅗ì࿲âヺ;䑀Ȁclqsㄴㄷㄽㅄa;椷dhar;楩uoĀ;rȎȍh;憳ƀacgㅎㅟངlȀ;ipsླྀㅘㅛႜnåႻarôྩt;断ƀilrㅩဣㅮsht;楽;쀀𝔯ĀaoㅷㆆrĀduㅽㅿ»ѻĀ;l႑ㆄ;楬Ā;vㆋㆌ䏁;䏱ƀgns㆕ㇹㇼht̀ahlrstㆤㆰ㇂㇘㇤㇮rrowĀ;t࿜ㆭaéトarpoonĀduㆻㆿowîㅾp»႒eftĀah㇊㇐rrowó࿪arpoonóՑightarrows;應quigarro÷ニhreetimes;拌g;䋚ingdotseñἲƀahm㈍㈐㈓rò࿪aòՑ;怏oustĀ;a㈞㈟掱che»㈟mid;櫮Ȁabpt㈲㈽㉀㉒Ānr㈷㈺g;柭r;懾rëဃƀafl㉇㉊㉎r;榆;쀀𝕣us;樮imes;樵Āap㉝㉧rĀ;g㉣㉤䀩t;榔olint;樒arò㇣Ȁachq㉻㊀Ⴜ㊅quo;怺r;쀀𝓇Ābu・㊊oĀ;rȔȓƀhir㊗㊛㊠reåㇸmes;拊iȀ;efl㊪ၙᠡ㊫方tri;槎luhar;楨;愞ൡ㋕㋛㋟㌬㌸㍱\0㍺㎤\0\0㏬㏰\0㐨㑈㑚㒭㒱㓊㓱\0㘖\0\0㘳cute;䅛quï➺Ԁ;Eaceinpsyᇭ㋳㋵㋿㌂㌋㌏㌟㌦㌩;檴ǰ㋺\0㋼;檸on;䅡uåᇾĀ;dᇳ㌇il;䅟rc;䅝ƀEas㌖㌘㌛;檶p;檺im;择olint;樓iíሄ;䑁otƀ;be㌴ᵇ㌵担;橦΀Aacmstx㍆㍊㍗㍛㍞㍣㍭rr;懘rĀhr㍐㍒ë∨Ā;oਸ਼਴t耻§䂧i;䀻war;椩mĀin㍩ðnuóñt;朶rĀ;o㍶⁕쀀𝔰Ȁacoy㎂㎆㎑㎠rp;景Āhy㎋㎏cy;䑉;䑈rtɭ㎙\0\0㎜iäᑤaraì⹯耻­䂭Āgm㎨㎴maƀ;fv㎱㎲㎲䏃;䏂Ѐ;deglnprካ㏅㏉㏎㏖㏞㏡㏦ot;橪Ā;q኱ኰĀ;E㏓㏔檞;檠Ā;E㏛㏜檝;檟e;扆lus;樤arr;楲aròᄽȀaeit㏸㐈㐏㐗Āls㏽㐄lsetmé㍪hp;樳parsl;槤Ādlᑣ㐔e;挣Ā;e㐜㐝檪Ā;s㐢㐣檬;쀀⪬︀ƀflp㐮㐳㑂tcy;䑌Ā;b㐸㐹䀯Ā;a㐾㐿槄r;挿f;쀀𝕤aĀdr㑍ЂesĀ;u㑔㑕晠it»㑕ƀcsu㑠㑹㒟Āau㑥㑯pĀ;sᆈ㑫;쀀⊓︀pĀ;sᆴ㑵;쀀⊔︀uĀbp㑿㒏ƀ;esᆗᆜ㒆etĀ;eᆗ㒍ñᆝƀ;esᆨᆭ㒖etĀ;eᆨ㒝ñᆮƀ;afᅻ㒦ְrť㒫ֱ»ᅼaròᅈȀcemt㒹㒾㓂㓅r;쀀𝓈tmîñiì㐕aræᆾĀar㓎㓕rĀ;f㓔ឿ昆Āan㓚㓭ightĀep㓣㓪psiloîỠhé⺯s»⡒ʀbcmnp㓻㕞ሉ㖋㖎Ҁ;Edemnprs㔎㔏㔑㔕㔞㔣㔬㔱㔶抂;櫅ot;檽Ā;dᇚ㔚ot;櫃ult;櫁ĀEe㔨㔪;櫋;把lus;檿arr;楹ƀeiu㔽㕒㕕tƀ;en㔎㕅㕋qĀ;qᇚ㔏eqĀ;q㔫㔨m;櫇Ābp㕚㕜;櫕;櫓c̀;acensᇭ㕬㕲㕹㕻㌦pproø㋺urlyeñᇾñᇳƀaes㖂㖈㌛pproø㌚qñ㌗g;晪ڀ123;Edehlmnps㖩㖬㖯ሜ㖲㖴㗀㗉㗕㗚㗟㗨㗭耻¹䂹耻²䂲耻³䂳;櫆Āos㖹㖼t;檾ub;櫘Ā;dሢ㗅ot;櫄sĀou㗏㗒l;柉b;櫗arr;楻ult;櫂ĀEe㗤㗦;櫌;抋lus;櫀ƀeiu㗴㘉㘌tƀ;enሜ㗼㘂qĀ;qሢ㖲eqĀ;q㗧㗤m;櫈Ābp㘑㘓;櫔;櫖ƀAan㘜㘠㘭rr;懙rĀhr㘦㘨ë∮Ā;oਫ਩war;椪lig耻ß䃟௡㙑㙝㙠ዎ㙳㙹\0㙾㛂\0\0\0\0\0㛛㜃\0㜉㝬\0\0\0㞇ɲ㙖\0\0㙛get;挖;䏄rë๟ƀaey㙦㙫㙰ron;䅥dil;䅣;䑂lrec;挕r;쀀𝔱Ȁeiko㚆㚝㚵㚼ǲ㚋\0㚑eĀ4fኄኁaƀ;sv㚘㚙㚛䎸ym;䏑Ācn㚢㚲kĀas㚨㚮pproø዁im»ኬsðኞĀas㚺㚮ð዁rn耻þ䃾Ǭ̟㛆⋧es膀×;bd㛏㛐㛘䃗Ā;aᤏ㛕r;樱;樰ƀeps㛡㛣㜀á⩍Ȁ;bcf҆㛬㛰㛴ot;挶ir;櫱Ā;o㛹㛼쀀𝕥rk;櫚á㍢rime;怴ƀaip㜏㜒㝤dåቈ΀adempst㜡㝍㝀㝑㝗㝜㝟ngleʀ;dlqr㜰㜱㜶㝀㝂斵own»ᶻeftĀ;e⠀㜾ñम;扜ightĀ;e㊪㝋ñၚot;旬inus;樺lus;樹b;槍ime;樻ezium;揢ƀcht㝲㝽㞁Āry㝷㝻;쀀𝓉;䑆cy;䑛rok;䅧Āio㞋㞎xô᝷headĀlr㞗㞠eftarro÷ࡏightarrow»ཝऀAHabcdfghlmoprstuw㟐㟓㟗㟤㟰㟼㠎㠜㠣㠴㡑㡝㡫㢩㣌㣒㣪㣶ròϭar;楣Ācr㟜㟢ute耻ú䃺òᅐrǣ㟪\0㟭y;䑞ve;䅭Āiy㟵㟺rc耻û䃻;䑃ƀabh㠃㠆㠋ròᎭlac;䅱aòᏃĀir㠓㠘sht;楾;쀀𝔲rave耻ù䃹š㠧㠱rĀlr㠬㠮»ॗ»ႃlk;斀Āct㠹㡍ɯ㠿\0\0㡊rnĀ;e㡅㡆挜r»㡆op;挏ri;旸Āal㡖㡚cr;䅫肻¨͉Āgp㡢㡦on;䅳f;쀀𝕦̀adhlsuᅋ㡸㡽፲㢑㢠ownáᎳarpoonĀlr㢈㢌efô㠭ighô㠯iƀ;hl㢙㢚㢜䏅»ᏺon»㢚parrows;懈ƀcit㢰㣄㣈ɯ㢶\0\0㣁rnĀ;e㢼㢽挝r»㢽op;挎ng;䅯ri;旹cr;쀀𝓊ƀdir㣙㣝㣢ot;拰lde;䅩iĀ;f㜰㣨»᠓Āam㣯㣲rò㢨l耻ü䃼angle;榧ހABDacdeflnoprsz㤜㤟㤩㤭㦵㦸㦽㧟㧤㧨㧳㧹㧽㨁㨠ròϷarĀ;v㤦㤧櫨;櫩asèϡĀnr㤲㤷grt;榜΀eknprst㓣㥆㥋㥒㥝㥤㦖appá␕othinçẖƀhir㓫⻈㥙opô⾵Ā;hᎷ㥢ïㆍĀiu㥩㥭gmá㎳Ābp㥲㦄setneqĀ;q㥽㦀쀀⊊︀;쀀⫋︀setneqĀ;q㦏㦒쀀⊋︀;쀀⫌︀Āhr㦛㦟etá㚜iangleĀlr㦪㦯eft»थight»ၑy;䐲ash»ံƀelr㧄㧒㧗ƀ;beⷪ㧋㧏ar;抻q;扚lip;拮Ābt㧜ᑨaòᑩr;쀀𝔳tré㦮suĀbp㧯㧱»ജ»൙pf;쀀𝕧roð໻tré㦴Ācu㨆㨋r;쀀𝓋Ābp㨐㨘nĀEe㦀㨖»㥾nĀEe㦒㨞»㦐igzag;榚΀cefoprs㨶㨻㩖㩛㩔㩡㩪irc;䅵Ādi㩀㩑Ābg㩅㩉ar;機eĀ;qᗺ㩏;扙erp;愘r;쀀𝔴pf;쀀𝕨Ā;eᑹ㩦atèᑹcr;쀀𝓌ૣណ㪇\0㪋\0㪐㪛\0\0㪝㪨㪫㪯\0\0㫃㫎\0㫘ៜ៟tré៑r;쀀𝔵ĀAa㪔㪗ròσrò৶;䎾ĀAa㪡㪤ròθrò৫að✓is;拻ƀdptឤ㪵㪾Āfl㪺ឩ;쀀𝕩imåឲĀAa㫇㫊ròώròਁĀcq㫒ីr;쀀𝓍Āpt៖㫜ré។Ѐacefiosu㫰㫽㬈㬌㬑㬕㬛㬡cĀuy㫶㫻te耻ý䃽;䑏Āiy㬂㬆rc;䅷;䑋n耻¥䂥r;쀀𝔶cy;䑗pf;쀀𝕪cr;쀀𝓎Ācm㬦㬩y;䑎l耻ÿ䃿Ԁacdefhiosw㭂㭈㭔㭘㭤㭩㭭㭴㭺㮀cute;䅺Āay㭍㭒ron;䅾;䐷ot;䅼Āet㭝㭡træᕟa;䎶r;쀀𝔷cy;䐶grarr;懝pf;쀀𝕫cr;쀀𝓏Ājn㮅㮇;怍j;怌'.split("").map(e=>e.charCodeAt(0))),mb=new Uint16Array("Ȁaglq	\x1Bɭ\0\0p;䀦os;䀧t;䀾t;䀼uot;䀢".split("").map(e=>e.charCodeAt(0)));var Gi;const bb=new Map([[0,65533],[128,8364],[130,8218],[131,402],[132,8222],[133,8230],[134,8224],[135,8225],[136,710],[137,8240],[138,352],[139,8249],[140,338],[142,381],[145,8216],[146,8217],[147,8220],[148,8221],[149,8226],[150,8211],[151,8212],[152,732],[153,8482],[154,353],[155,8250],[156,339],[158,382],[159,376]]),gb=(Gi=String.fromCodePoint)!==null&&Gi!==void 0?Gi:function(e){let t="";return e>65535&&(e-=65536,t+=String.fromCharCode(e>>>10&1023|55296),e=56320|e&1023),t+=String.fromCharCode(e),t};function _b(e){var t;return e>=55296&&e<=57343||e>1114111?65533:(t=bb.get(e))!==null&&t!==void 0?t:e}var Te;(function(e){e[e.NUM=35]="NUM",e[e.SEMI=59]="SEMI",e[e.EQUALS=61]="EQUALS",e[e.ZERO=48]="ZERO",e[e.NINE=57]="NINE",e[e.LOWER_A=97]="LOWER_A",e[e.LOWER_F=102]="LOWER_F",e[e.LOWER_X=120]="LOWER_X",e[e.LOWER_Z=122]="LOWER_Z",e[e.UPPER_A=65]="UPPER_A",e[e.UPPER_F=70]="UPPER_F",e[e.UPPER_Z=90]="UPPER_Z"})(Te||(Te={}));const yb=32;var eu;(function(e){e[e.VALUE_LENGTH=49152]="VALUE_LENGTH",e[e.BRANCH_LENGTH=16256]="BRANCH_LENGTH",e[e.JUMP_TABLE=127]="JUMP_TABLE"})(eu||(eu={}));function Is(e){return e>=Te.ZERO&&e<=Te.NINE}function xb(e){return e>=Te.UPPER_A&&e<=Te.UPPER_F||e>=Te.LOWER_A&&e<=Te.LOWER_F}function Eb(e){return e>=Te.UPPER_A&&e<=Te.UPPER_Z||e>=Te.LOWER_A&&e<=Te.LOWER_Z||Is(e)}function Cb(e){return e===Te.EQUALS||Eb(e)}var De;(function(e){e[e.EntityStart=0]="EntityStart",e[e.NumericStart=1]="NumericStart",e[e.NumericDecimal=2]="NumericDecimal",e[e.NumericHex=3]="NumericHex",e[e.NamedEntity=4]="NamedEntity"})(De||(De={}));var Qt;(function(e){e[e.Legacy=0]="Legacy",e[e.Strict=1]="Strict",e[e.Attribute=2]="Attribute"})(Qt||(Qt={}));class kb{constructor(t,u,n){this.decodeTree=t,this.emitCodePoint=u,this.errors=n,this.state=De.EntityStart,this.consumed=1,this.result=0,this.treeIndex=0,this.excess=1,this.decodeMode=Qt.Strict}startEntity(t){this.decodeMode=t,this.state=De.EntityStart,this.result=0,this.treeIndex=0,this.excess=1,this.consumed=1}write(t,u){switch(this.state){case De.EntityStart:return t.charCodeAt(u)===Te.NUM?(this.state=De.NumericStart,this.consumed+=1,this.stateNumericStart(t,u+1)):(this.state=De.NamedEntity,this.stateNamedEntity(t,u));case De.NumericStart:return this.stateNumericStart(t,u);case De.NumericDecimal:return this.stateNumericDecimal(t,u);case De.NumericHex:return this.stateNumericHex(t,u);case De.NamedEntity:return this.stateNamedEntity(t,u)}}stateNumericStart(t,u){return u>=t.length?-1:(t.charCodeAt(u)|yb)===Te.LOWER_X?(this.state=De.NumericHex,this.consumed+=1,this.stateNumericHex(t,u+1)):(this.state=De.NumericDecimal,this.stateNumericDecimal(t,u))}addToNumericResult(t,u,n,r){if(u!==n){const i=n-u;this.result=this.result*Math.pow(r,i)+parseInt(t.substr(u,i),r),this.consumed+=i}}stateNumericHex(t,u){const n=u;for(;u<t.length;){const r=t.charCodeAt(u);if(Is(r)||xb(r))u+=1;else return this.addToNumericResult(t,n,u,16),this.emitNumericEntity(r,3)}return this.addToNumericResult(t,n,u,16),-1}stateNumericDecimal(t,u){const n=u;for(;u<t.length;){const r=t.charCodeAt(u);if(Is(r))u+=1;else return this.addToNumericResult(t,n,u,10),this.emitNumericEntity(r,2)}return this.addToNumericResult(t,n,u,10),-1}emitNumericEntity(t,u){var n;if(this.consumed<=u)return(n=this.errors)===null||n===void 0||n.absenceOfDigitsInNumericCharacterReference(this.consumed),0;if(t===Te.SEMI)this.consumed+=1;else if(this.decodeMode===Qt.Strict)return 0;return this.emitCodePoint(_b(this.result),this.consumed),this.errors&&(t!==Te.SEMI&&this.errors.missingSemicolonAfterCharacterReference(),this.errors.validateNumericCharacterReference(this.result)),this.consumed}stateNamedEntity(t,u){const{decodeTree:n}=this;let r=n[this.treeIndex],i=(r&eu.VALUE_LENGTH)>>14;for(;u<t.length;u++,this.excess++){const s=t.charCodeAt(u);if(this.treeIndex=Ab(n,r,this.treeIndex+Math.max(1,i),s),this.treeIndex<0)return this.result===0||this.decodeMode===Qt.Attribute&&(i===0||Cb(s))?0:this.emitNotTerminatedNamedEntity();if(r=n[this.treeIndex],i=(r&eu.VALUE_LENGTH)>>14,i!==0){if(s===Te.SEMI)return this.emitNamedEntityData(this.treeIndex,i,this.consumed+this.excess);this.decodeMode!==Qt.Strict&&(this.result=this.treeIndex,this.consumed+=this.excess,this.excess=0)}}return-1}emitNotTerminatedNamedEntity(){var t;const{result:u,decodeTree:n}=this,r=(n[u]&eu.VALUE_LENGTH)>>14;return this.emitNamedEntityData(u,r,this.consumed),(t=this.errors)===null||t===void 0||t.missingSemicolonAfterCharacterReference(),this.consumed}emitNamedEntityData(t,u,n){const{decodeTree:r}=this;return this.emitCodePoint(u===1?r[t]&~eu.VALUE_LENGTH:r[t+1],n),u===3&&this.emitCodePoint(r[t+2],n),n}end(){var t;switch(this.state){case De.NamedEntity:return this.result!==0&&(this.decodeMode!==Qt.Attribute||this.result===this.treeIndex)?this.emitNotTerminatedNamedEntity():0;case De.NumericDecimal:return this.emitNumericEntity(0,2);case De.NumericHex:return this.emitNumericEntity(0,3);case De.NumericStart:return(t=this.errors)===null||t===void 0||t.absenceOfDigitsInNumericCharacterReference(this.consumed),0;case De.EntityStart:return 0}}}function gf(e){let t="";const u=new kb(e,n=>t+=gb(n));return function(r,i){let s=0,o=0;for(;(o=r.indexOf("&",o))>=0;){t+=r.slice(s,o),u.startEntity(i);const l=u.write(r,o+1);if(l<0){s=o+u.end();break}s=o+l,o=l===0?s+1:s}const c=t+r.slice(s);return t="",c}}function Ab(e,t,u,n){const r=(t&eu.BRANCH_LENGTH)>>7,i=t&eu.JUMP_TABLE;if(r===0)return i!==0&&n===i?u:-1;if(i){const c=n-i;return c<0||c>=r?-1:e[u+c]-1}let s=u,o=s+r-1;for(;s<=o;){const c=s+o>>>1,l=e[c];if(l<n)s=c+1;else if(l>n)o=c-1;else return e[c+r]}return-1}const Sb=gf(pb);gf(mb);function _f(e,t=Qt.Legacy){return Sb(e,t)}function vb(e){return Object.prototype.toString.call(e)}function Lo(e){return vb(e)==="[object String]"}const Db=Object.prototype.hasOwnProperty;function Tb(e,t){return Db.call(e,t)}function vi(e){return Array.prototype.slice.call(arguments,1).forEach(function(u){if(u){if(typeof u!="object")throw new TypeError(u+"must be object");Object.keys(u).forEach(function(n){e[n]=u[n]})}}),e}function yf(e,t,u){return[].concat(e.slice(0,t),u,e.slice(t+1))}function Bo(e){return!(e>=55296&&e<=57343||e>=64976&&e<=65007||(e&65535)===65535||(e&65535)===65534||e>=0&&e<=8||e===11||e>=14&&e<=31||e>=127&&e<=159||e>1114111)}function Xr(e){if(e>65535){e-=65536;const t=55296+(e>>10),u=56320+(e&1023);return String.fromCharCode(t,u)}return String.fromCharCode(e)}const xf=/\\([!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])/g,wb=/&([a-z#][a-z0-9]{1,31});/gi,Fb=new RegExp(xf.source+"|"+wb.source,"gi"),Ib=/^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i;function Ob(e,t){if(t.charCodeAt(0)===35&&Ib.test(t)){const n=t[1].toLowerCase()==="x"?parseInt(t.slice(2),16):parseInt(t.slice(1),10);return Bo(n)?Xr(n):e}const u=_f(e);return u!==e?u:e}function Nb(e){return e.indexOf("\\")<0?e:e.replace(xf,"$1")}function en(e){return e.indexOf("\\")<0&&e.indexOf("&")<0?e:e.replace(Fb,function(t,u,n){return u||Ob(t,n)})}const Rb=/[&<>"]/,Mb=/[&<>"]/g,Pb={"&":"&amp;","<":"&lt;",">":"&gt;",'"':"&quot;"};function Lb(e){return Pb[e]}function lu(e){return Rb.test(e)?e.replace(Mb,Lb):e}const Bb=/[.?*+^$[\]\\(){}|-]/g;function Vb(e){return e.replace(Bb,"\\$&")}function ae(e){switch(e){case 9:case 32:return!0}return!1}function Un(e){if(e>=8192&&e<=8202)return!0;switch(e){case 9:case 10:case 11:case 12:case 13:case 32:case 160:case 5760:case 8239:case 8287:case 12288:return!0}return!1}function qn(e){return Po.test(e)||mf.test(e)}function jn(e){switch(e){case 33:case 34:case 35:case 36:case 37:case 38:case 39:case 40:case 41:case 42:case 43:case 44:case 45:case 46:case 47:case 58:case 59:case 60:case 61:case 62:case 63:case 64:case 91:case 92:case 93:case 94:case 95:case 96:case 123:case 124:case 125:case 126:return!0;default:return!1}}function Di(e){return e=e.trim().replace(/\s+/g," "),"ẞ".toLowerCase()==="Ṿ"&&(e=e.replace(/ẞ/g,"ß")),e.toLowerCase().toUpperCase()}const $b={mdurl:fb,ucmicro:hb},Hb=Object.freeze(Object.defineProperty({__proto__:null,arrayReplaceAt:yf,assign:vi,escapeHtml:lu,escapeRE:Vb,fromCodePoint:Xr,has:Tb,isMdAsciiPunct:jn,isPunctChar:qn,isSpace:ae,isString:Lo,isValidEntityCode:Bo,isWhiteSpace:Un,lib:$b,normalizeReference:Di,unescapeAll:en,unescapeMd:Nb},Symbol.toStringTag,{value:"Module"}));function Ub(e,t,u){let n,r,i,s;const o=e.posMax,c=e.pos;for(e.pos=t+1,n=1;e.pos<o;){if(i=e.src.charCodeAt(e.pos),i===93&&(n--,n===0)){r=!0;break}if(s=e.pos,e.md.inline.skipToken(e),i===91){if(s===e.pos-1)n++;else if(u)return e.pos=c,-1}}let l=-1;return r&&(l=e.pos),e.pos=c,l}function qb(e,t,u){let n,r=t;const i={ok:!1,pos:0,str:""};if(e.charCodeAt(r)===60){for(r++;r<u;){if(n=e.charCodeAt(r),n===10||n===60)return i;if(n===62)return i.pos=r+1,i.str=en(e.slice(t+1,r)),i.ok=!0,i;if(n===92&&r+1<u){r+=2;continue}r++}return i}let s=0;for(;r<u&&(n=e.charCodeAt(r),!(n===32||n<32||n===127));){if(n===92&&r+1<u){if(e.charCodeAt(r+1)===32)break;r+=2;continue}if(n===40&&(s++,s>32))return i;if(n===41){if(s===0)break;s--}r++}return t===r||s!==0||(i.str=en(e.slice(t,r)),i.pos=r,i.ok=!0),i}function jb(e,t,u,n){let r,i=t;const s={ok:!1,can_continue:!1,pos:0,str:"",marker:0};if(n)s.str=n.str,s.marker=n.marker;else{if(i>=u)return s;let o=e.charCodeAt(i);if(o!==34&&o!==39&&o!==40)return s;t++,i++,o===40&&(o=41),s.marker=o}for(;i<u;){if(r=e.charCodeAt(i),r===s.marker)return s.pos=i+1,s.str+=en(e.slice(t,i)),s.ok=!0,s;if(r===40&&s.marker===41)return s;r===92&&i+1<u&&i++,i++}return s.can_continue=!0,s.str+=en(e.slice(t,i)),s}const zb=Object.freeze(Object.defineProperty({__proto__:null,parseLinkDestination:qb,parseLinkLabel:Ub,parseLinkTitle:jb},Symbol.toStringTag,{value:"Module"})),Dt={};Dt.code_inline=function(e,t,u,n,r){const i=e[t];return"<code"+r.renderAttrs(i)+">"+lu(i.content)+"</code>"};Dt.code_block=function(e,t,u,n,r){const i=e[t];return"<pre"+r.renderAttrs(i)+"><code>"+lu(e[t].content)+`</code></pre>
+`};Dt.fence=function(e,t,u,n,r){const i=e[t],s=i.info?en(i.info).trim():"";let o="",c="";if(s){const a=s.split(/(\s+)/g);o=a[0],c=a.slice(2).join("")}let l;if(u.highlight?l=u.highlight(i.content,o,c)||lu(i.content):l=lu(i.content),l.indexOf("<pre")===0)return l+`
+`;if(s){const a=i.attrIndex("class"),f=i.attrs?i.attrs.slice():[];a<0?f.push(["class",u.langPrefix+o]):(f[a]=f[a].slice(),f[a][1]+=" "+u.langPrefix+o);const p={attrs:f};return`<pre><code${r.renderAttrs(p)}>${l}</code></pre>
+`}return`<pre><code${r.renderAttrs(i)}>${l}</code></pre>
+`};Dt.image=function(e,t,u,n,r){const i=e[t];return i.attrs[i.attrIndex("alt")][1]=r.renderInlineAsText(i.children,u,n),r.renderToken(e,t,u)};Dt.hardbreak=function(e,t,u){return u.xhtmlOut?`<br />
+`:`<br>
+`};Dt.softbreak=function(e,t,u){return u.breaks?u.xhtmlOut?`<br />
+`:`<br>
+`:`
+`};Dt.text=function(e,t){return lu(e[t].content)};Dt.html_block=function(e,t){return e[t].content};Dt.html_inline=function(e,t){return e[t].content};function rn(){this.rules=vi({},Dt)}rn.prototype.renderAttrs=function(t){let u,n,r;if(!t.attrs)return"";for(r="",u=0,n=t.attrs.length;u<n;u++)r+=" "+lu(t.attrs[u][0])+'="'+lu(t.attrs[u][1])+'"';return r};rn.prototype.renderToken=function(t,u,n){const r=t[u];let i="";if(r.hidden)return"";r.block&&r.nesting!==-1&&u&&t[u-1].hidden&&(i+=`
+`),i+=(r.nesting===-1?"</":"<")+r.tag,i+=this.renderAttrs(r),r.nesting===0&&n.xhtmlOut&&(i+=" /");let s=!1;if(r.block&&(s=!0,r.nesting===1&&u+1<t.length)){const o=t[u+1];(o.type==="inline"||o.hidden||o.nesting===-1&&o.tag===r.tag)&&(s=!1)}return i+=s?`>
+`:">",i};rn.prototype.renderInline=function(e,t,u){let n="";const r=this.rules;for(let i=0,s=e.length;i<s;i++){const o=e[i].type;typeof r[o]<"u"?n+=r[o](e,i,t,u,this):n+=this.renderToken(e,i,t)}return n};rn.prototype.renderInlineAsText=function(e,t,u){let n="";for(let r=0,i=e.length;r<i;r++)switch(e[r].type){case"text":n+=e[r].content;break;case"image":n+=this.renderInlineAsText(e[r].children,t,u);break;case"html_inline":case"html_block":n+=e[r].content;break;case"softbreak":case"hardbreak":n+=`
+`;break}return n};rn.prototype.render=function(e,t,u){let n="";const r=this.rules;for(let i=0,s=e.length;i<s;i++){const o=e[i].type;o==="inline"?n+=this.renderInline(e[i].children,t,u):typeof r[o]<"u"?n+=r[o](e,i,t,u,this):n+=this.renderToken(e,i,t,u)}return n};function Je(){this.__rules__=[],this.__cache__=null}Je.prototype.__find__=function(e){for(let t=0;t<this.__rules__.length;t++)if(this.__rules__[t].name===e)return t;return-1};Je.prototype.__compile__=function(){const e=this,t=[""];e.__rules__.forEach(function(u){u.enabled&&u.alt.forEach(function(n){t.indexOf(n)<0&&t.push(n)})}),e.__cache__={},t.forEach(function(u){e.__cache__[u]=[],e.__rules__.forEach(function(n){n.enabled&&(u&&n.alt.indexOf(u)<0||e.__cache__[u].push(n.fn))})})};Je.prototype.at=function(e,t,u){const n=this.__find__(e),r=u||{};if(n===-1)throw new Error("Parser rule not found: "+e);this.__rules__[n].fn=t,this.__rules__[n].alt=r.alt||[],this.__cache__=null};Je.prototype.before=function(e,t,u,n){const r=this.__find__(e),i=n||{};if(r===-1)throw new Error("Parser rule not found: "+e);this.__rules__.splice(r,0,{name:t,enabled:!0,fn:u,alt:i.alt||[]}),this.__cache__=null};Je.prototype.after=function(e,t,u,n){const r=this.__find__(e),i=n||{};if(r===-1)throw new Error("Parser rule not found: "+e);this.__rules__.splice(r+1,0,{name:t,enabled:!0,fn:u,alt:i.alt||[]}),this.__cache__=null};Je.prototype.push=function(e,t,u){const n=u||{};this.__rules__.push({name:e,enabled:!0,fn:t,alt:n.alt||[]}),this.__cache__=null};Je.prototype.enable=function(e,t){Array.isArray(e)||(e=[e]);const u=[];return e.forEach(function(n){const r=this.__find__(n);if(r<0){if(t)return;throw new Error("Rules manager: invalid rule name "+n)}this.__rules__[r].enabled=!0,u.push(n)},this),this.__cache__=null,u};Je.prototype.enableOnly=function(e,t){Array.isArray(e)||(e=[e]),this.__rules__.forEach(function(u){u.enabled=!1}),this.enable(e,t)};Je.prototype.disable=function(e,t){Array.isArray(e)||(e=[e]);const u=[];return e.forEach(function(n){const r=this.__find__(n);if(r<0){if(t)return;throw new Error("Rules manager: invalid rule name "+n)}this.__rules__[r].enabled=!1,u.push(n)},this),this.__cache__=null,u};Je.prototype.getRules=function(e){return this.__cache__===null&&this.__compile__(),this.__cache__[e]||[]};function _t(e,t,u){this.type=e,this.tag=t,this.attrs=null,this.map=null,this.nesting=u,this.level=0,this.children=null,this.content="",this.markup="",this.info="",this.meta=null,this.block=!1,this.hidden=!1}_t.prototype.attrIndex=function(t){if(!this.attrs)return-1;const u=this.attrs;for(let n=0,r=u.length;n<r;n++)if(u[n][0]===t)return n;return-1};_t.prototype.attrPush=function(t){this.attrs?this.attrs.push(t):this.attrs=[t]};_t.prototype.attrSet=function(t,u){const n=this.attrIndex(t),r=[t,u];n<0?this.attrPush(r):this.attrs[n]=r};_t.prototype.attrGet=function(t){const u=this.attrIndex(t);let n=null;return u>=0&&(n=this.attrs[u][1]),n};_t.prototype.attrJoin=function(t,u){const n=this.attrIndex(t);n<0?this.attrPush([t,u]):this.attrs[n][1]=this.attrs[n][1]+" "+u};function Ef(e,t,u){this.src=e,this.env=u,this.tokens=[],this.inlineMode=!1,this.md=t}Ef.prototype.Token=_t;const Kb=/\r\n?|\n/g,Wb=/\0/g;function Gb(e){let t;t=e.src.replace(Kb,`
+`),t=t.replace(Wb,"�"),e.src=t}function Zb(e){let t;e.inlineMode?(t=new e.Token("inline","",0),t.content=e.src,t.map=[0,1],t.children=[],e.tokens.push(t)):e.md.block.parse(e.src,e.md,e.env,e.tokens)}function Jb(e){const t=e.tokens;for(let u=0,n=t.length;u<n;u++){const r=t[u];r.type==="inline"&&e.md.inline.parse(r.content,e.md,e.env,r.children)}}function Yb(e){return/^<a[>\s]/i.test(e)}function Xb(e){return/^<\/a\s*>/i.test(e)}function Qb(e){const t=e.tokens;if(e.md.options.linkify)for(let u=0,n=t.length;u<n;u++){if(t[u].type!=="inline"||!e.md.linkify.pretest(t[u].content))continue;let r=t[u].children,i=0;for(let s=r.length-1;s>=0;s--){const o=r[s];if(o.type==="link_close"){for(s--;r[s].level!==o.level&&r[s].type!=="link_open";)s--;continue}if(o.type==="html_inline"&&(Yb(o.content)&&i>0&&i--,Xb(o.content)&&i++),!(i>0)&&o.type==="text"&&e.md.linkify.test(o.content)){const c=o.content;let l=e.md.linkify.match(c);const a=[];let f=o.level,p=0;l.length>0&&l[0].index===0&&s>0&&r[s-1].type==="text_special"&&(l=l.slice(1));for(let d=0;d<l.length;d++){const h=l[d].url,_=e.md.normalizeLink(h);if(!e.md.validateLink(_))continue;let v=l[d].text;l[d].schema?l[d].schema==="mailto:"&&!/^mailto:/i.test(v)?v=e.md.normalizeLinkText("mailto:"+v).replace(/^mailto:/,""):v=e.md.normalizeLinkText(v):v=e.md.normalizeLinkText("http://"+v).replace(/^http:\/\//,"");const A=l[d].index;if(A>p){const x=new e.Token("text","",0);x.content=c.slice(p,A),x.level=f,a.push(x)}const E=new e.Token("link_open","a",1);E.attrs=[["href",_]],E.level=f++,E.markup="linkify",E.info="auto",a.push(E);const m=new e.Token("text","",0);m.content=v,m.level=f,a.push(m);const g=new e.Token("link_close","a",-1);g.level=--f,g.markup="linkify",g.info="auto",a.push(g),p=l[d].lastIndex}if(p<c.length){const d=new e.Token("text","",0);d.content=c.slice(p),d.level=f,a.push(d)}t[u].children=r=yf(r,s,a)}}}}const Cf=/\+-|\.\.|\?\?\?\?|!!!!|,,|--/,eg=/\((c|tm|r)\)/i,tg=/\((c|tm|r)\)/ig,ug={c:"©",r:"®",tm:"™"};function ng(e,t){return ug[t.toLowerCase()]}function rg(e){let t=0;for(let u=e.length-1;u>=0;u--){const n=e[u];n.type==="text"&&!t&&(n.content=n.content.replace(tg,ng)),n.type==="link_open"&&n.info==="auto"&&t--,n.type==="link_close"&&n.info==="auto"&&t++}}function ig(e){let t=0;for(let u=e.length-1;u>=0;u--){const n=e[u];n.type==="text"&&!t&&Cf.test(n.content)&&(n.content=n.content.replace(/\+-/g,"±").replace(/\.{2,}/g,"…").replace(/([?!])…/g,"$1..").replace(/([?!]){4,}/g,"$1$1$1").replace(/,{2,}/g,",").replace(/(^|[^-])---(?=[^-]|$)/mg,"$1—").replace(/(^|\s)--(?=\s|$)/mg,"$1–").replace(/(^|[^-\s])--(?=[^-\s]|$)/mg,"$1–")),n.type==="link_open"&&n.info==="auto"&&t--,n.type==="link_close"&&n.info==="auto"&&t++}}function sg(e){let t;if(e.md.options.typographer)for(t=e.tokens.length-1;t>=0;t--)e.tokens[t].type==="inline"&&(eg.test(e.tokens[t].content)&&rg(e.tokens[t].children),Cf.test(e.tokens[t].content)&&ig(e.tokens[t].children))}const og=/['"]/,cl=/['"]/g,ll="’";function mr(e,t,u){return e.slice(0,t)+u+e.slice(t+1)}function cg(e,t){let u;const n=[];for(let r=0;r<e.length;r++){const i=e[r],s=e[r].level;for(u=n.length-1;u>=0&&!(n[u].level<=s);u--);if(n.length=u+1,i.type!=="text")continue;let o=i.content,c=0,l=o.length;e:for(;c<l;){cl.lastIndex=c;const a=cl.exec(o);if(!a)break;let f=!0,p=!0;c=a.index+1;const d=a[0]==="'";let h=32;if(a.index-1>=0)h=o.charCodeAt(a.index-1);else for(u=r-1;u>=0&&!(e[u].type==="softbreak"||e[u].type==="hardbreak");u--)if(e[u].content){h=e[u].content.charCodeAt(e[u].content.length-1);break}let _=32;if(c<l)_=o.charCodeAt(c);else for(u=r+1;u<e.length&&!(e[u].type==="softbreak"||e[u].type==="hardbreak");u++)if(e[u].content){_=e[u].content.charCodeAt(0);break}const v=jn(h)||qn(String.fromCharCode(h)),A=jn(_)||qn(String.fromCharCode(_)),E=Un(h),m=Un(_);if(m?f=!1:A&&(E||v||(f=!1)),E?p=!1:v&&(m||A||(p=!1)),_===34&&a[0]==='"'&&h>=48&&h<=57&&(p=f=!1),f&&p&&(f=v,p=A),!f&&!p){d&&(i.content=mr(i.content,a.index,ll));continue}if(p)for(u=n.length-1;u>=0;u--){let g=n[u];if(n[u].level<s)break;if(g.single===d&&n[u].level===s){g=n[u];let x,k;d?(x=t.md.options.quotes[2],k=t.md.options.quotes[3]):(x=t.md.options.quotes[0],k=t.md.options.quotes[1]),i.content=mr(i.content,a.index,k),e[g.token].content=mr(e[g.token].content,g.pos,x),c+=k.length-1,g.token===r&&(c+=x.length-1),o=i.content,l=o.length,n.length=u;continue e}}f?n.push({token:r,pos:a.index,single:d,level:s}):p&&d&&(i.content=mr(i.content,a.index,ll))}}}function lg(e){if(e.md.options.typographer)for(let t=e.tokens.length-1;t>=0;t--)e.tokens[t].type!=="inline"||!og.test(e.tokens[t].content)||cg(e.tokens[t].children,e)}function ag(e){let t,u;const n=e.tokens,r=n.length;for(let i=0;i<r;i++){if(n[i].type!=="inline")continue;const s=n[i].children,o=s.length;for(t=0;t<o;t++)s[t].type==="text_special"&&(s[t].type="text");for(t=u=0;t<o;t++)s[t].type==="text"&&t+1<o&&s[t+1].type==="text"?s[t+1].content=s[t].content+s[t+1].content:(t!==u&&(s[u]=s[t]),u++);t!==u&&(s.length=u)}}const Zi=[["normalize",Gb],["block",Zb],["inline",Jb],["linkify",Qb],["replacements",sg],["smartquotes",lg],["text_join",ag]];function Vo(){this.ruler=new Je;for(let e=0;e<Zi.length;e++)this.ruler.push(Zi[e][0],Zi[e][1])}Vo.prototype.process=function(e){const t=this.ruler.getRules("");for(let u=0,n=t.length;u<n;u++)t[u](e)};Vo.prototype.State=Ef;function Tt(e,t,u,n){this.src=e,this.md=t,this.env=u,this.tokens=n,this.bMarks=[],this.eMarks=[],this.tShift=[],this.sCount=[],this.bsCount=[],this.blkIndent=0,this.line=0,this.lineMax=0,this.tight=!1,this.ddIndent=-1,this.listIndent=-1,this.parentType="root",this.level=0;const r=this.src;for(let i=0,s=0,o=0,c=0,l=r.length,a=!1;s<l;s++){const f=r.charCodeAt(s);if(!a)if(ae(f)){o++,f===9?c+=4-c%4:c++;continue}else a=!0;(f===10||s===l-1)&&(f!==10&&s++,this.bMarks.push(i),this.eMarks.push(s),this.tShift.push(o),this.sCount.push(c),this.bsCount.push(0),a=!1,o=0,c=0,i=s+1)}this.bMarks.push(r.length),this.eMarks.push(r.length),this.tShift.push(0),this.sCount.push(0),this.bsCount.push(0),this.lineMax=this.bMarks.length-1}Tt.prototype.push=function(e,t,u){const n=new _t(e,t,u);return n.block=!0,u<0&&this.level--,n.level=this.level,u>0&&this.level++,this.tokens.push(n),n};Tt.prototype.isEmpty=function(t){return this.bMarks[t]+this.tShift[t]>=this.eMarks[t]};Tt.prototype.skipEmptyLines=function(t){for(let u=this.lineMax;t<u&&!(this.bMarks[t]+this.tShift[t]<this.eMarks[t]);t++);return t};Tt.prototype.skipSpaces=function(t){for(let u=this.src.length;t<u;t++){const n=this.src.charCodeAt(t);if(!ae(n))break}return t};Tt.prototype.skipSpacesBack=function(t,u){if(t<=u)return t;for(;t>u;)if(!ae(this.src.charCodeAt(--t)))return t+1;return t};Tt.prototype.skipChars=function(t,u){for(let n=this.src.length;t<n&&this.src.charCodeAt(t)===u;t++);return t};Tt.prototype.skipCharsBack=function(t,u,n){if(t<=n)return t;for(;t>n;)if(u!==this.src.charCodeAt(--t))return t+1;return t};Tt.prototype.getLines=function(t,u,n,r){if(t>=u)return"";const i=new Array(u-t);for(let s=0,o=t;o<u;o++,s++){let c=0;const l=this.bMarks[o];let a=l,f;for(o+1<u||r?f=this.eMarks[o]+1:f=this.eMarks[o];a<f&&c<n;){const p=this.src.charCodeAt(a);if(ae(p))p===9?c+=4-(c+this.bsCount[o])%4:c++;else if(a-l<this.tShift[o])c++;else break;a++}c>n?i[s]=new Array(c-n+1).join(" ")+this.src.slice(a,f):i[s]=this.src.slice(a,f)}return i.join("")};Tt.prototype.Token=_t;const fg=65536;function Ji(e,t){const u=e.bMarks[t]+e.tShift[t],n=e.eMarks[t];return e.src.slice(u,n)}function al(e){const t=[],u=e.length;let n=0,r=e.charCodeAt(n),i=!1,s=0,o="";for(;n<u;)r===124&&(i?(o+=e.substring(s,n-1),s=n):(t.push(o+e.substring(s,n)),o="",s=n+1)),i=r===92,n++,r=e.charCodeAt(n);return t.push(o+e.substring(s)),t}function dg(e,t,u,n){if(t+2>u)return!1;let r=t+1;if(e.sCount[r]<e.blkIndent||e.sCount[r]-e.blkIndent>=4)return!1;let i=e.bMarks[r]+e.tShift[r];if(i>=e.eMarks[r])return!1;const s=e.src.charCodeAt(i++);if(s!==124&&s!==45&&s!==58||i>=e.eMarks[r])return!1;const o=e.src.charCodeAt(i++);if(o!==124&&o!==45&&o!==58&&!ae(o)||s===45&&ae(o))return!1;for(;i<e.eMarks[r];){const g=e.src.charCodeAt(i);if(g!==124&&g!==45&&g!==58&&!ae(g))return!1;i++}let c=Ji(e,t+1),l=c.split("|");const a=[];for(let g=0;g<l.length;g++){const x=l[g].trim();if(!x){if(g===0||g===l.length-1)continue;return!1}if(!/^:?-+:?$/.test(x))return!1;x.charCodeAt(x.length-1)===58?a.push(x.charCodeAt(0)===58?"center":"right"):x.charCodeAt(0)===58?a.push("left"):a.push("")}if(c=Ji(e,t).trim(),c.indexOf("|")===-1||e.sCount[t]-e.blkIndent>=4)return!1;l=al(c),l.length&&l[0]===""&&l.shift(),l.length&&l[l.length-1]===""&&l.pop();const f=l.length;if(f===0||f!==a.length)return!1;if(n)return!0;const p=e.parentType;e.parentType="table";const d=e.md.block.ruler.getRules("blockquote"),h=e.push("table_open","table",1),_=[t,0];h.map=_;const v=e.push("thead_open","thead",1);v.map=[t,t+1];const A=e.push("tr_open","tr",1);A.map=[t,t+1];for(let g=0;g<l.length;g++){const x=e.push("th_open","th",1);a[g]&&(x.attrs=[["style","text-align:"+a[g]]]);const k=e.push("inline","",0);k.content=l[g].trim(),k.children=[],e.push("th_close","th",-1)}e.push("tr_close","tr",-1),e.push("thead_close","thead",-1);let E,m=0;for(r=t+2;r<u&&!(e.sCount[r]<e.blkIndent);r++){let g=!1;for(let k=0,I=d.length;k<I;k++)if(d[k](e,r,u,!0)){g=!0;break}if(g||(c=Ji(e,r).trim(),!c)||e.sCount[r]-e.blkIndent>=4||(l=al(c),l.length&&l[0]===""&&l.shift(),l.length&&l[l.length-1]===""&&l.pop(),m+=f-l.length,m>fg))break;if(r===t+2){const k=e.push("tbody_open","tbody",1);k.map=E=[t+2,0]}const x=e.push("tr_open","tr",1);x.map=[r,r+1];for(let k=0;k<f;k++){const I=e.push("td_open","td",1);a[k]&&(I.attrs=[["style","text-align:"+a[k]]]);const w=e.push("inline","",0);w.content=l[k]?l[k].trim():"",w.children=[],e.push("td_close","td",-1)}e.push("tr_close","tr",-1)}return E&&(e.push("tbody_close","tbody",-1),E[1]=r),e.push("table_close","table",-1),_[1]=r,e.parentType=p,e.line=r,!0}function hg(e,t,u){if(e.sCount[t]-e.blkIndent<4)return!1;let n=t+1,r=n;for(;n<u;){if(e.isEmpty(n)){n++;continue}if(e.sCount[n]-e.blkIndent>=4){n++,r=n;continue}break}e.line=r;const i=e.push("code_block","code",0);return i.content=e.getLines(t,r,4+e.blkIndent,!1)+`
+`,i.map=[t,e.line],!0}function pg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4||r+3>i)return!1;const s=e.src.charCodeAt(r);if(s!==126&&s!==96)return!1;let o=r;r=e.skipChars(r,s);let c=r-o;if(c<3)return!1;const l=e.src.slice(o,r),a=e.src.slice(r,i);if(s===96&&a.indexOf(String.fromCharCode(s))>=0)return!1;if(n)return!0;let f=t,p=!1;for(;f++,!(f>=u||(r=o=e.bMarks[f]+e.tShift[f],i=e.eMarks[f],r<i&&e.sCount[f]<e.blkIndent));)if(e.src.charCodeAt(r)===s&&!(e.sCount[f]-e.blkIndent>=4)&&(r=e.skipChars(r,s),!(r-o<c)&&(r=e.skipSpaces(r),!(r<i)))){p=!0;break}c=e.sCount[t],e.line=f+(p?1:0);const d=e.push("fence","code",0);return d.info=a,d.content=e.getLines(t+1,f,c,!0),d.markup=l,d.map=[t,e.line],!0}function mg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];const s=e.lineMax;if(e.sCount[t]-e.blkIndent>=4||e.src.charCodeAt(r)!==62)return!1;if(n)return!0;const o=[],c=[],l=[],a=[],f=e.md.block.ruler.getRules("blockquote"),p=e.parentType;e.parentType="blockquote";let d=!1,h;for(h=t;h<u;h++){const m=e.sCount[h]<e.blkIndent;if(r=e.bMarks[h]+e.tShift[h],i=e.eMarks[h],r>=i)break;if(e.src.charCodeAt(r++)===62&&!m){let x=e.sCount[h]+1,k,I;e.src.charCodeAt(r)===32?(r++,x++,I=!1,k=!0):e.src.charCodeAt(r)===9?(k=!0,(e.bsCount[h]+x)%4===3?(r++,x++,I=!1):I=!0):k=!1;let w=x;for(o.push(e.bMarks[h]),e.bMarks[h]=r;r<i;){const C=e.src.charCodeAt(r);if(ae(C))C===9?w+=4-(w+e.bsCount[h]+(I?1:0))%4:w++;else break;r++}d=r>=i,c.push(e.bsCount[h]),e.bsCount[h]=e.sCount[h]+1+(k?1:0),l.push(e.sCount[h]),e.sCount[h]=w-x,a.push(e.tShift[h]),e.tShift[h]=r-e.bMarks[h];continue}if(d)break;let g=!1;for(let x=0,k=f.length;x<k;x++)if(f[x](e,h,u,!0)){g=!0;break}if(g){e.lineMax=h,e.blkIndent!==0&&(o.push(e.bMarks[h]),c.push(e.bsCount[h]),a.push(e.tShift[h]),l.push(e.sCount[h]),e.sCount[h]-=e.blkIndent);break}o.push(e.bMarks[h]),c.push(e.bsCount[h]),a.push(e.tShift[h]),l.push(e.sCount[h]),e.sCount[h]=-1}const _=e.blkIndent;e.blkIndent=0;const v=e.push("blockquote_open","blockquote",1);v.markup=">";const A=[t,0];v.map=A,e.md.block.tokenize(e,t,h);const E=e.push("blockquote_close","blockquote",-1);E.markup=">",e.lineMax=s,e.parentType=p,A[1]=e.line;for(let m=0;m<a.length;m++)e.bMarks[m+t]=o[m],e.tShift[m+t]=a[m],e.sCount[m+t]=l[m],e.bsCount[m+t]=c[m];return e.blkIndent=_,!0}function bg(e,t,u,n){const r=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4)return!1;let i=e.bMarks[t]+e.tShift[t];const s=e.src.charCodeAt(i++);if(s!==42&&s!==45&&s!==95)return!1;let o=1;for(;i<r;){const l=e.src.charCodeAt(i++);if(l!==s&&!ae(l))return!1;l===s&&o++}if(o<3)return!1;if(n)return!0;e.line=t+1;const c=e.push("hr","hr",0);return c.map=[t,e.line],c.markup=Array(o+1).join(String.fromCharCode(s)),!0}function fl(e,t){const u=e.eMarks[t];let n=e.bMarks[t]+e.tShift[t];const r=e.src.charCodeAt(n++);if(r!==42&&r!==45&&r!==43)return-1;if(n<u){const i=e.src.charCodeAt(n);if(!ae(i))return-1}return n}function dl(e,t){const u=e.bMarks[t]+e.tShift[t],n=e.eMarks[t];let r=u;if(r+1>=n)return-1;let i=e.src.charCodeAt(r++);if(i<48||i>57)return-1;for(;;){if(r>=n)return-1;if(i=e.src.charCodeAt(r++),i>=48&&i<=57){if(r-u>=10)return-1;continue}if(i===41||i===46)break;return-1}return r<n&&(i=e.src.charCodeAt(r),!ae(i))?-1:r}function gg(e,t){const u=e.level+2;for(let n=t+2,r=e.tokens.length-2;n<r;n++)e.tokens[n].level===u&&e.tokens[n].type==="paragraph_open"&&(e.tokens[n+2].hidden=!0,e.tokens[n].hidden=!0,n+=2)}function _g(e,t,u,n){let r,i,s,o,c=t,l=!0;if(e.sCount[c]-e.blkIndent>=4||e.listIndent>=0&&e.sCount[c]-e.listIndent>=4&&e.sCount[c]<e.blkIndent)return!1;let a=!1;n&&e.parentType==="paragraph"&&e.sCount[c]>=e.blkIndent&&(a=!0);let f,p,d;if((d=dl(e,c))>=0){if(f=!0,s=e.bMarks[c]+e.tShift[c],p=Number(e.src.slice(s,d-1)),a&&p!==1)return!1}else if((d=fl(e,c))>=0)f=!1;else return!1;if(a&&e.skipSpaces(d)>=e.eMarks[c])return!1;if(n)return!0;const h=e.src.charCodeAt(d-1),_=e.tokens.length;f?(o=e.push("ordered_list_open","ol",1),p!==1&&(o.attrs=[["start",p]])):o=e.push("bullet_list_open","ul",1);const v=[c,0];o.map=v,o.markup=String.fromCharCode(h);let A=!1;const E=e.md.block.ruler.getRules("list"),m=e.parentType;for(e.parentType="list";c<u;){i=d,r=e.eMarks[c];const g=e.sCount[c]+d-(e.bMarks[c]+e.tShift[c]);let x=g;for(;i<r;){const z=e.src.charCodeAt(i);if(z===9)x+=4-(x+e.bsCount[c])%4;else if(z===32)x++;else break;i++}const k=i;let I;k>=r?I=1:I=x-g,I>4&&(I=1);const w=g+I;o=e.push("list_item_open","li",1),o.markup=String.fromCharCode(h);const C=[c,0];o.map=C,f&&(o.info=e.src.slice(s,d-1));const S=e.tight,R=e.tShift[c],T=e.sCount[c],P=e.listIndent;if(e.listIndent=e.blkIndent,e.blkIndent=w,e.tight=!0,e.tShift[c]=k-e.bMarks[c],e.sCount[c]=x,k>=r&&e.isEmpty(c+1)?e.line=Math.min(e.line+2,u):e.md.block.tokenize(e,c,u,!0),(!e.tight||A)&&(l=!1),A=e.line-c>1&&e.isEmpty(e.line-1),e.blkIndent=e.listIndent,e.listIndent=P,e.tShift[c]=R,e.sCount[c]=T,e.tight=S,o=e.push("list_item_close","li",-1),o.markup=String.fromCharCode(h),c=e.line,C[1]=c,c>=u||e.sCount[c]<e.blkIndent||e.sCount[c]-e.blkIndent>=4)break;let q=!1;for(let z=0,H=E.length;z<H;z++)if(E[z](e,c,u,!0)){q=!0;break}if(q)break;if(f){if(d=dl(e,c),d<0)break;s=e.bMarks[c]+e.tShift[c]}else if(d=fl(e,c),d<0)break;if(h!==e.src.charCodeAt(d-1))break}return f?o=e.push("ordered_list_close","ol",-1):o=e.push("bullet_list_close","ul",-1),o.markup=String.fromCharCode(h),v[1]=c,e.line=c,e.parentType=m,l&&gg(e,_),!0}function yg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t],s=t+1;if(e.sCount[t]-e.blkIndent>=4||e.src.charCodeAt(r)!==91)return!1;function o(E){const m=e.lineMax;if(E>=m||e.isEmpty(E))return null;let g=!1;if(e.sCount[E]-e.blkIndent>3&&(g=!0),e.sCount[E]<0&&(g=!0),!g){const I=e.md.block.ruler.getRules("reference"),w=e.parentType;e.parentType="reference";let C=!1;for(let S=0,R=I.length;S<R;S++)if(I[S](e,E,m,!0)){C=!0;break}if(e.parentType=w,C)return null}const x=e.bMarks[E]+e.tShift[E],k=e.eMarks[E];return e.src.slice(x,k+1)}let c=e.src.slice(r,i+1);i=c.length;let l=-1;for(r=1;r<i;r++){const E=c.charCodeAt(r);if(E===91)return!1;if(E===93){l=r;break}else if(E===10){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}else if(E===92&&(r++,r<i&&c.charCodeAt(r)===10)){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}}if(l<0||c.charCodeAt(l+1)!==58)return!1;for(r=l+2;r<i;r++){const E=c.charCodeAt(r);if(E===10){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}else if(!ae(E))break}const a=e.md.helpers.parseLinkDestination(c,r,i);if(!a.ok)return!1;const f=e.md.normalizeLink(a.str);if(!e.md.validateLink(f))return!1;r=a.pos;const p=r,d=s,h=r;for(;r<i;r++){const E=c.charCodeAt(r);if(E===10){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}else if(!ae(E))break}let _=e.md.helpers.parseLinkTitle(c,r,i);for(;_.can_continue;){const E=o(s);if(E===null)break;c+=E,r=i,i=c.length,s++,_=e.md.helpers.parseLinkTitle(c,r,i,_)}let v;for(r<i&&h!==r&&_.ok?(v=_.str,r=_.pos):(v="",r=p,s=d);r<i;){const E=c.charCodeAt(r);if(!ae(E))break;r++}if(r<i&&c.charCodeAt(r)!==10&&v)for(v="",r=p,s=d;r<i;){const E=c.charCodeAt(r);if(!ae(E))break;r++}if(r<i&&c.charCodeAt(r)!==10)return!1;const A=Di(c.slice(1,l));return A?(n||(typeof e.env.references>"u"&&(e.env.references={}),typeof e.env.references[A]>"u"&&(e.env.references[A]={title:v,href:f}),e.line=s),!0):!1}const xg=["address","article","aside","base","basefont","blockquote","body","caption","center","col","colgroup","dd","details","dialog","dir","div","dl","dt","fieldset","figcaption","figure","footer","form","frame","frameset","h1","h2","h3","h4","h5","h6","head","header","hr","html","iframe","legend","li","link","main","menu","menuitem","nav","noframes","ol","optgroup","option","p","param","search","section","summary","table","tbody","td","tfoot","th","thead","title","tr","track","ul"],Eg="[a-zA-Z_:][a-zA-Z0-9:._-]*",Cg="[^\"'=<>`\\x00-\\x20]+",kg="'[^']*'",Ag='"[^"]*"',Sg="(?:"+Cg+"|"+kg+"|"+Ag+")",vg="(?:\\s+"+Eg+"(?:\\s*=\\s*"+Sg+")?)",kf="<[A-Za-z][A-Za-z0-9\\-]*"+vg+"*\\s*\\/?>",Af="<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>",Dg="\x3C!---?>|\x3C!--(?:[^-]|-[^-]|--[^>])*-->",Tg="<[?][\\s\\S]*?[?]>",wg="<![A-Za-z][^>]*>",Fg="<!\\[CDATA\\[[\\s\\S]*?\\]\\]>",Ig=new RegExp("^(?:"+kf+"|"+Af+"|"+Dg+"|"+Tg+"|"+wg+"|"+Fg+")"),Og=new RegExp("^(?:"+kf+"|"+Af+")"),Ru=[[/^<(script|pre|style|textarea)(?=(\s|>|$))/i,/<\/(script|pre|style|textarea)>/i,!0],[/^\x3C!--/,/-->/,!0],[/^<\?/,/\?>/,!0],[/^<![A-Z]/,/>/,!0],[/^<!\[CDATA\[/,/\]\]>/,!0],[new RegExp("^</?("+xg.join("|")+")(?=(\\s|/?>|$))","i"),/^$/,!0],[new RegExp(Og.source+"\\s*$"),/^$/,!1]];function Ng(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4||!e.md.options.html||e.src.charCodeAt(r)!==60)return!1;let s=e.src.slice(r,i),o=0;for(;o<Ru.length&&!Ru[o][0].test(s);o++);if(o===Ru.length)return!1;if(n)return Ru[o][2];let c=t+1;if(!Ru[o][1].test(s)){for(;c<u&&!(e.sCount[c]<e.blkIndent);c++)if(r=e.bMarks[c]+e.tShift[c],i=e.eMarks[c],s=e.src.slice(r,i),Ru[o][1].test(s)){s.length!==0&&c++;break}}e.line=c;const l=e.push("html_block","",0);return l.map=[t,c],l.content=e.getLines(t,c,e.blkIndent,!0),!0}function Rg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4)return!1;let s=e.src.charCodeAt(r);if(s!==35||r>=i)return!1;let o=1;for(s=e.src.charCodeAt(++r);s===35&&r<i&&o<=6;)o++,s=e.src.charCodeAt(++r);if(o>6||r<i&&!ae(s))return!1;if(n)return!0;i=e.skipSpacesBack(i,r);const c=e.skipCharsBack(i,35,r);c>r&&ae(e.src.charCodeAt(c-1))&&(i=c),e.line=t+1;const l=e.push("heading_open","h"+String(o),1);l.markup="########".slice(0,o),l.map=[t,e.line];const a=e.push("inline","",0);a.content=e.src.slice(r,i).trim(),a.map=[t,e.line],a.children=[];const f=e.push("heading_close","h"+String(o),-1);return f.markup="########".slice(0,o),!0}function Mg(e,t,u){const n=e.md.block.ruler.getRules("paragraph");if(e.sCount[t]-e.blkIndent>=4)return!1;const r=e.parentType;e.parentType="paragraph";let i=0,s,o=t+1;for(;o<u&&!e.isEmpty(o);o++){if(e.sCount[o]-e.blkIndent>3)continue;if(e.sCount[o]>=e.blkIndent){let d=e.bMarks[o]+e.tShift[o];const h=e.eMarks[o];if(d<h&&(s=e.src.charCodeAt(d),(s===45||s===61)&&(d=e.skipChars(d,s),d=e.skipSpaces(d),d>=h))){i=s===61?1:2;break}}if(e.sCount[o]<0)continue;let p=!1;for(let d=0,h=n.length;d<h;d++)if(n[d](e,o,u,!0)){p=!0;break}if(p)break}if(!i)return!1;const c=e.getLines(t,o,e.blkIndent,!1).trim();e.line=o+1;const l=e.push("heading_open","h"+String(i),1);l.markup=String.fromCharCode(s),l.map=[t,e.line];const a=e.push("inline","",0);a.content=c,a.map=[t,e.line-1],a.children=[];const f=e.push("heading_close","h"+String(i),-1);return f.markup=String.fromCharCode(s),e.parentType=r,!0}function Pg(e,t,u){const n=e.md.block.ruler.getRules("paragraph"),r=e.parentType;let i=t+1;for(e.parentType="paragraph";i<u&&!e.isEmpty(i);i++){if(e.sCount[i]-e.blkIndent>3||e.sCount[i]<0)continue;let l=!1;for(let a=0,f=n.length;a<f;a++)if(n[a](e,i,u,!0)){l=!0;break}if(l)break}const s=e.getLines(t,i,e.blkIndent,!1).trim();e.line=i;const o=e.push("paragraph_open","p",1);o.map=[t,e.line];const c=e.push("inline","",0);return c.content=s,c.map=[t,e.line],c.children=[],e.push("paragraph_close","p",-1),e.parentType=r,!0}const br=[["table",dg,["paragraph","reference"]],["code",hg],["fence",pg,["paragraph","reference","blockquote","list"]],["blockquote",mg,["paragraph","reference","blockquote","list"]],["hr",bg,["paragraph","reference","blockquote","list"]],["list",_g,["paragraph","reference","blockquote"]],["reference",yg],["html_block",Ng,["paragraph","reference","blockquote"]],["heading",Rg,["paragraph","reference","blockquote"]],["lheading",Mg],["paragraph",Pg]];function Ti(){this.ruler=new Je;for(let e=0;e<br.length;e++)this.ruler.push(br[e][0],br[e][1],{alt:(br[e][2]||[]).slice()})}Ti.prototype.tokenize=function(e,t,u){const n=this.ruler.getRules(""),r=n.length,i=e.md.options.maxNesting;let s=t,o=!1;for(;s<u&&(e.line=s=e.skipEmptyLines(s),!(s>=u||e.sCount[s]<e.blkIndent));){if(e.level>=i){e.line=u;break}const c=e.line;let l=!1;for(let a=0;a<r;a++)if(l=n[a](e,s,u,!1),l){if(c>=e.line)throw new Error("block rule didn't increment state.line");break}if(!l)throw new Error("none of the block rules matched");e.tight=!o,e.isEmpty(e.line-1)&&(o=!0),s=e.line,s<u&&e.isEmpty(s)&&(o=!0,s++,e.line=s)}};Ti.prototype.parse=function(e,t,u,n){if(!e)return;const r=new this.State(e,t,u,n);this.tokenize(r,r.line,r.lineMax)};Ti.prototype.State=Tt;function tr(e,t,u,n){this.src=e,this.env=u,this.md=t,this.tokens=n,this.tokens_meta=Array(n.length),this.pos=0,this.posMax=this.src.length,this.level=0,this.pending="",this.pendingLevel=0,this.cache={},this.delimiters=[],this._prev_delimiters=[],this.backticks={},this.backticksScanned=!1,this.linkLevel=0}tr.prototype.pushPending=function(){const e=new _t("text","",0);return e.content=this.pending,e.level=this.pendingLevel,this.tokens.push(e),this.pending="",e};tr.prototype.push=function(e,t,u){this.pending&&this.pushPending();const n=new _t(e,t,u);let r=null;return u<0&&(this.level--,this.delimiters=this._prev_delimiters.pop()),n.level=this.level,u>0&&(this.level++,this._prev_delimiters.push(this.delimiters),this.delimiters=[],r={delimiters:this.delimiters}),this.pendingLevel=this.level,this.tokens.push(n),this.tokens_meta.push(r),n};tr.prototype.scanDelims=function(e,t){const u=this.posMax,n=this.src.charCodeAt(e),r=e>0?this.src.charCodeAt(e-1):32;let i=e;for(;i<u&&this.src.charCodeAt(i)===n;)i++;const s=i-e,o=i<u?this.src.charCodeAt(i):32,c=jn(r)||qn(String.fromCharCode(r)),l=jn(o)||qn(String.fromCharCode(o)),a=Un(r),f=Un(o),p=!f&&(!l||a||c),d=!a&&(!c||f||l);return{can_open:p&&(t||!d||c),can_close:d&&(t||!p||l),length:s}};tr.prototype.Token=_t;function Lg(e){switch(e){case 10:case 33:case 35:case 36:case 37:case 38:case 42:case 43:case 45:case 58:case 60:case 61:case 62:case 64:case 91:case 92:case 93:case 94:case 95:case 96:case 123:case 125:case 126:return!0;default:return!1}}function Bg(e,t){let u=e.pos;for(;u<e.posMax&&!Lg(e.src.charCodeAt(u));)u++;return u===e.pos?!1:(t||(e.pending+=e.src.slice(e.pos,u)),e.pos=u,!0)}const Vg=/(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$/i;function $g(e,t){if(!e.md.options.linkify||e.linkLevel>0)return!1;const u=e.pos,n=e.posMax;if(u+3>n||e.src.charCodeAt(u)!==58||e.src.charCodeAt(u+1)!==47||e.src.charCodeAt(u+2)!==47)return!1;const r=e.pending.match(Vg);if(!r)return!1;const i=r[1],s=e.md.linkify.matchAtStart(e.src.slice(u-i.length));if(!s)return!1;let o=s.url;if(o.length<=i.length)return!1;o=o.replace(/\*+$/,"");const c=e.md.normalizeLink(o);if(!e.md.validateLink(c))return!1;if(!t){e.pending=e.pending.slice(0,-i.length);const l=e.push("link_open","a",1);l.attrs=[["href",c]],l.markup="linkify",l.info="auto";const a=e.push("text","",0);a.content=e.md.normalizeLinkText(o);const f=e.push("link_close","a",-1);f.markup="linkify",f.info="auto"}return e.pos+=o.length-i.length,!0}function Hg(e,t){let u=e.pos;if(e.src.charCodeAt(u)!==10)return!1;const n=e.pending.length-1,r=e.posMax;if(!t)if(n>=0&&e.pending.charCodeAt(n)===32)if(n>=1&&e.pending.charCodeAt(n-1)===32){let i=n-1;for(;i>=1&&e.pending.charCodeAt(i-1)===32;)i--;e.pending=e.pending.slice(0,i),e.push("hardbreak","br",0)}else e.pending=e.pending.slice(0,-1),e.push("softbreak","br",0);else e.push("softbreak","br",0);for(u++;u<r&&ae(e.src.charCodeAt(u));)u++;return e.pos=u,!0}const $o=[];for(let e=0;e<256;e++)$o.push(0);"\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-".split("").forEach(function(e){$o[e.charCodeAt(0)]=1});function Ug(e,t){let u=e.pos;const n=e.posMax;if(e.src.charCodeAt(u)!==92||(u++,u>=n))return!1;let r=e.src.charCodeAt(u);if(r===10){for(t||e.push("hardbreak","br",0),u++;u<n&&(r=e.src.charCodeAt(u),!!ae(r));)u++;return e.pos=u,!0}let i=e.src[u];if(r>=55296&&r<=56319&&u+1<n){const o=e.src.charCodeAt(u+1);o>=56320&&o<=57343&&(i+=e.src[u+1],u++)}const s="\\"+i;if(!t){const o=e.push("text_special","",0);r<256&&$o[r]!==0?o.content=i:o.content=s,o.markup=s,o.info="escape"}return e.pos=u+1,!0}function qg(e,t){let u=e.pos;if(e.src.charCodeAt(u)!==96)return!1;const r=u;u++;const i=e.posMax;for(;u<i&&e.src.charCodeAt(u)===96;)u++;const s=e.src.slice(r,u),o=s.length;if(e.backticksScanned&&(e.backticks[o]||0)<=r)return t||(e.pending+=s),e.pos+=o,!0;let c=u,l;for(;(l=e.src.indexOf("`",c))!==-1;){for(c=l+1;c<i&&e.src.charCodeAt(c)===96;)c++;const a=c-l;if(a===o){if(!t){const f=e.push("code_inline","code",0);f.markup=s,f.content=e.src.slice(u,l).replace(/\n/g," ").replace(/^ (.+) $/,"$1")}return e.pos=c,!0}e.backticks[a]=l}return e.backticksScanned=!0,t||(e.pending+=s),e.pos+=o,!0}function jg(e,t){const u=e.pos,n=e.src.charCodeAt(u);if(t||n!==126)return!1;const r=e.scanDelims(e.pos,!0);let i=r.length;const s=String.fromCharCode(n);if(i<2)return!1;let o;i%2&&(o=e.push("text","",0),o.content=s,i--);for(let c=0;c<i;c+=2)o=e.push("text","",0),o.content=s+s,e.delimiters.push({marker:n,length:0,token:e.tokens.length-1,end:-1,open:r.can_open,close:r.can_close});return e.pos+=r.length,!0}function hl(e,t){let u;const n=[],r=t.length;for(let i=0;i<r;i++){const s=t[i];if(s.marker!==126||s.end===-1)continue;const o=t[s.end];u=e.tokens[s.token],u.type="s_open",u.tag="s",u.nesting=1,u.markup="~~",u.content="",u=e.tokens[o.token],u.type="s_close",u.tag="s",u.nesting=-1,u.markup="~~",u.content="",e.tokens[o.token-1].type==="text"&&e.tokens[o.token-1].content==="~"&&n.push(o.token-1)}for(;n.length;){const i=n.pop();let s=i+1;for(;s<e.tokens.length&&e.tokens[s].type==="s_close";)s++;s--,i!==s&&(u=e.tokens[s],e.tokens[s]=e.tokens[i],e.tokens[i]=u)}}function zg(e){const t=e.tokens_meta,u=e.tokens_meta.length;hl(e,e.delimiters);for(let n=0;n<u;n++)t[n]&&t[n].delimiters&&hl(e,t[n].delimiters)}const Sf={tokenize:jg,postProcess:zg};function Kg(e,t){const u=e.pos,n=e.src.charCodeAt(u);if(t||n!==95&&n!==42)return!1;const r=e.scanDelims(e.pos,n===42);for(let i=0;i<r.length;i++){const s=e.push("text","",0);s.content=String.fromCharCode(n),e.delimiters.push({marker:n,length:r.length,token:e.tokens.length-1,end:-1,open:r.can_open,close:r.can_close})}return e.pos+=r.length,!0}function pl(e,t){const u=t.length;for(let n=u-1;n>=0;n--){const r=t[n];if(r.marker!==95&&r.marker!==42||r.end===-1)continue;const i=t[r.end],s=n>0&&t[n-1].end===r.end+1&&t[n-1].marker===r.marker&&t[n-1].token===r.token-1&&t[r.end+1].token===i.token+1,o=String.fromCharCode(r.marker),c=e.tokens[r.token];c.type=s?"strong_open":"em_open",c.tag=s?"strong":"em",c.nesting=1,c.markup=s?o+o:o,c.content="";const l=e.tokens[i.token];l.type=s?"strong_close":"em_close",l.tag=s?"strong":"em",l.nesting=-1,l.markup=s?o+o:o,l.content="",s&&(e.tokens[t[n-1].token].content="",e.tokens[t[r.end+1].token].content="",n--)}}function Wg(e){const t=e.tokens_meta,u=e.tokens_meta.length;pl(e,e.delimiters);for(let n=0;n<u;n++)t[n]&&t[n].delimiters&&pl(e,t[n].delimiters)}const vf={tokenize:Kg,postProcess:Wg};function Gg(e,t){let u,n,r,i,s="",o="",c=e.pos,l=!0;if(e.src.charCodeAt(e.pos)!==91)return!1;const a=e.pos,f=e.posMax,p=e.pos+1,d=e.md.helpers.parseLinkLabel(e,e.pos,!0);if(d<0)return!1;let h=d+1;if(h<f&&e.src.charCodeAt(h)===40){for(l=!1,h++;h<f&&(u=e.src.charCodeAt(h),!(!ae(u)&&u!==10));h++);if(h>=f)return!1;if(c=h,r=e.md.helpers.parseLinkDestination(e.src,h,e.posMax),r.ok){for(s=e.md.normalizeLink(r.str),e.md.validateLink(s)?h=r.pos:s="",c=h;h<f&&(u=e.src.charCodeAt(h),!(!ae(u)&&u!==10));h++);if(r=e.md.helpers.parseLinkTitle(e.src,h,e.posMax),h<f&&c!==h&&r.ok)for(o=r.str,h=r.pos;h<f&&(u=e.src.charCodeAt(h),!(!ae(u)&&u!==10));h++);}(h>=f||e.src.charCodeAt(h)!==41)&&(l=!0),h++}if(l){if(typeof e.env.references>"u")return!1;if(h<f&&e.src.charCodeAt(h)===91?(c=h+1,h=e.md.helpers.parseLinkLabel(e,h),h>=0?n=e.src.slice(c,h++):h=d+1):h=d+1,n||(n=e.src.slice(p,d)),i=e.env.references[Di(n)],!i)return e.pos=a,!1;s=i.href,o=i.title}if(!t){e.pos=p,e.posMax=d;const _=e.push("link_open","a",1),v=[["href",s]];_.attrs=v,o&&v.push(["title",o]),e.linkLevel++,e.md.inline.tokenize(e),e.linkLevel--,e.push("link_close","a",-1)}return e.pos=h,e.posMax=f,!0}function Zg(e,t){let u,n,r,i,s,o,c,l,a="";const f=e.pos,p=e.posMax;if(e.src.charCodeAt(e.pos)!==33||e.src.charCodeAt(e.pos+1)!==91)return!1;const d=e.pos+2,h=e.md.helpers.parseLinkLabel(e,e.pos+1,!1);if(h<0)return!1;if(i=h+1,i<p&&e.src.charCodeAt(i)===40){for(i++;i<p&&(u=e.src.charCodeAt(i),!(!ae(u)&&u!==10));i++);if(i>=p)return!1;for(l=i,o=e.md.helpers.parseLinkDestination(e.src,i,e.posMax),o.ok&&(a=e.md.normalizeLink(o.str),e.md.validateLink(a)?i=o.pos:a=""),l=i;i<p&&(u=e.src.charCodeAt(i),!(!ae(u)&&u!==10));i++);if(o=e.md.helpers.parseLinkTitle(e.src,i,e.posMax),i<p&&l!==i&&o.ok)for(c=o.str,i=o.pos;i<p&&(u=e.src.charCodeAt(i),!(!ae(u)&&u!==10));i++);else c="";if(i>=p||e.src.charCodeAt(i)!==41)return e.pos=f,!1;i++}else{if(typeof e.env.references>"u")return!1;if(i<p&&e.src.charCodeAt(i)===91?(l=i+1,i=e.md.helpers.parseLinkLabel(e,i),i>=0?r=e.src.slice(l,i++):i=h+1):i=h+1,r||(r=e.src.slice(d,h)),s=e.env.references[Di(r)],!s)return e.pos=f,!1;a=s.href,c=s.title}if(!t){n=e.src.slice(d,h);const _=[];e.md.inline.parse(n,e.md,e.env,_);const v=e.push("image","img",0),A=[["src",a],["alt",""]];v.attrs=A,v.children=_,v.content=n,c&&A.push(["title",c])}return e.pos=i,e.posMax=p,!0}const Jg=/^([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$/,Yg=/^([a-zA-Z][a-zA-Z0-9+.-]{1,31}):([^<>\x00-\x20]*)$/;function Xg(e,t){let u=e.pos;if(e.src.charCodeAt(u)!==60)return!1;const n=e.pos,r=e.posMax;for(;;){if(++u>=r)return!1;const s=e.src.charCodeAt(u);if(s===60)return!1;if(s===62)break}const i=e.src.slice(n+1,u);if(Yg.test(i)){const s=e.md.normalizeLink(i);if(!e.md.validateLink(s))return!1;if(!t){const o=e.push("link_open","a",1);o.attrs=[["href",s]],o.markup="autolink",o.info="auto";const c=e.push("text","",0);c.content=e.md.normalizeLinkText(i);const l=e.push("link_close","a",-1);l.markup="autolink",l.info="auto"}return e.pos+=i.length+2,!0}if(Jg.test(i)){const s=e.md.normalizeLink("mailto:"+i);if(!e.md.validateLink(s))return!1;if(!t){const o=e.push("link_open","a",1);o.attrs=[["href",s]],o.markup="autolink",o.info="auto";const c=e.push("text","",0);c.content=e.md.normalizeLinkText(i);const l=e.push("link_close","a",-1);l.markup="autolink",l.info="auto"}return e.pos+=i.length+2,!0}return!1}function Qg(e){return/^<a[>\s]/i.test(e)}function e3(e){return/^<\/a\s*>/i.test(e)}function t3(e){const t=e|32;return t>=97&&t<=122}function u3(e,t){if(!e.md.options.html)return!1;const u=e.posMax,n=e.pos;if(e.src.charCodeAt(n)!==60||n+2>=u)return!1;const r=e.src.charCodeAt(n+1);if(r!==33&&r!==63&&r!==47&&!t3(r))return!1;const i=e.src.slice(n).match(Ig);if(!i)return!1;if(!t){const s=e.push("html_inline","",0);s.content=i[0],Qg(s.content)&&e.linkLevel++,e3(s.content)&&e.linkLevel--}return e.pos+=i[0].length,!0}const n3=/^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));/i,r3=/^&([a-z][a-z0-9]{1,31});/i;function i3(e,t){const u=e.pos,n=e.posMax;if(e.src.charCodeAt(u)!==38||u+1>=n)return!1;if(e.src.charCodeAt(u+1)===35){const i=e.src.slice(u).match(n3);if(i){if(!t){const s=i[1][0].toLowerCase()==="x"?parseInt(i[1].slice(1),16):parseInt(i[1],10),o=e.push("text_special","",0);o.content=Bo(s)?Xr(s):Xr(65533),o.markup=i[0],o.info="entity"}return e.pos+=i[0].length,!0}}else{const i=e.src.slice(u).match(r3);if(i){const s=_f(i[0]);if(s!==i[0]){if(!t){const o=e.push("text_special","",0);o.content=s,o.markup=i[0],o.info="entity"}return e.pos+=i[0].length,!0}}}return!1}function ml(e){const t={},u=e.length;if(!u)return;let n=0,r=-2;const i=[];for(let s=0;s<u;s++){const o=e[s];if(i.push(0),(e[n].marker!==o.marker||r!==o.token-1)&&(n=s),r=o.token,o.length=o.length||0,!o.close)continue;t.hasOwnProperty(o.marker)||(t[o.marker]=[-1,-1,-1,-1,-1,-1]);const c=t[o.marker][(o.open?3:0)+o.length%3];let l=n-i[n]-1,a=l;for(;l>c;l-=i[l]+1){const f=e[l];if(f.marker===o.marker&&f.open&&f.end<0){let p=!1;if((f.close||o.open)&&(f.length+o.length)%3===0&&(f.length%3!==0||o.length%3!==0)&&(p=!0),!p){const d=l>0&&!e[l-1].open?i[l-1]+1:0;i[s]=s-l+d,i[l]=d,o.open=!1,f.end=s,f.close=!1,a=-1,r=-2;break}}}a!==-1&&(t[o.marker][(o.open?3:0)+(o.length||0)%3]=a)}}function s3(e){const t=e.tokens_meta,u=e.tokens_meta.length;ml(e.delimiters);for(let n=0;n<u;n++)t[n]&&t[n].delimiters&&ml(t[n].delimiters)}function o3(e){let t,u,n=0;const r=e.tokens,i=e.tokens.length;for(t=u=0;t<i;t++)r[t].nesting<0&&n--,r[t].level=n,r[t].nesting>0&&n++,r[t].type==="text"&&t+1<i&&r[t+1].type==="text"?r[t+1].content=r[t].content+r[t+1].content:(t!==u&&(r[u]=r[t]),u++);t!==u&&(r.length=u)}const Yi=[["text",Bg],["linkify",$g],["newline",Hg],["escape",Ug],["backticks",qg],["strikethrough",Sf.tokenize],["emphasis",vf.tokenize],["link",Gg],["image",Zg],["autolink",Xg],["html_inline",u3],["entity",i3]],Xi=[["balance_pairs",s3],["strikethrough",Sf.postProcess],["emphasis",vf.postProcess],["fragments_join",o3]];function ur(){this.ruler=new Je;for(let e=0;e<Yi.length;e++)this.ruler.push(Yi[e][0],Yi[e][1]);this.ruler2=new Je;for(let e=0;e<Xi.length;e++)this.ruler2.push(Xi[e][0],Xi[e][1])}ur.prototype.skipToken=function(e){const t=e.pos,u=this.ruler.getRules(""),n=u.length,r=e.md.options.maxNesting,i=e.cache;if(typeof i[t]<"u"){e.pos=i[t];return}let s=!1;if(e.level<r){for(let o=0;o<n;o++)if(e.level++,s=u[o](e,!0),e.level--,s){if(t>=e.pos)throw new Error("inline rule didn't increment state.pos");break}}else e.pos=e.posMax;s||e.pos++,i[t]=e.pos};ur.prototype.tokenize=function(e){const t=this.ruler.getRules(""),u=t.length,n=e.posMax,r=e.md.options.maxNesting;for(;e.pos<n;){const i=e.pos;let s=!1;if(e.level<r){for(let o=0;o<u;o++)if(s=t[o](e,!1),s){if(i>=e.pos)throw new Error("inline rule didn't increment state.pos");break}}if(s){if(e.pos>=n)break;continue}e.pending+=e.src[e.pos++]}e.pending&&e.pushPending()};ur.prototype.parse=function(e,t,u,n){const r=new this.State(e,t,u,n);this.tokenize(r);const i=this.ruler2.getRules(""),s=i.length;for(let o=0;o<s;o++)i[o](r)};ur.prototype.State=tr;function c3(e){const t={};e=e||{},t.src_Any=hf.source,t.src_Cc=pf.source,t.src_Z=bf.source,t.src_P=Po.source,t.src_ZPCc=[t.src_Z,t.src_P,t.src_Cc].join("|"),t.src_ZCc=[t.src_Z,t.src_Cc].join("|");const u="[><｜]";return t.src_pseudo_letter="(?:(?!"+u+"|"+t.src_ZPCc+")"+t.src_Any+")",t.src_ip4="(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",t.src_auth="(?:(?:(?!"+t.src_ZCc+"|[@/\\[\\]()]).)+@)?",t.src_port="(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?",t.src_host_terminator="(?=$|"+u+"|"+t.src_ZPCc+")(?!"+(e["---"]?"-(?!--)|":"-|")+"_|:\\d|\\.-|\\.(?!$|"+t.src_ZPCc+"))",t.src_path="(?:[/?#](?:(?!"+t.src_ZCc+"|"+u+`|[()[\\]{}.,"'?!\\-;]).|\\[(?:(?!`+t.src_ZCc+"|\\]).)*\\]|\\((?:(?!"+t.src_ZCc+"|[)]).)*\\)|\\{(?:(?!"+t.src_ZCc+'|[}]).)*\\}|\\"(?:(?!'+t.src_ZCc+`|["]).)+\\"|\\'(?:(?!`+t.src_ZCc+"|[']).)+\\'|\\'(?="+t.src_pseudo_letter+"|[-])|\\.{2,}[a-zA-Z0-9%/&]|\\.(?!"+t.src_ZCc+"|[.]|$)|"+(e["---"]?"\\-(?!--(?:[^-]|$))(?:-*)|":"\\-+|")+",(?!"+t.src_ZCc+"|$)|;(?!"+t.src_ZCc+"|$)|\\!+(?!"+t.src_ZCc+"|[!]|$)|\\?(?!"+t.src_ZCc+"|[?]|$))+|\\/)?",t.src_email_name='[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*',t.src_xn="xn--[a-z0-9\\-]{1,59}",t.src_domain_root="(?:"+t.src_xn+"|"+t.src_pseudo_letter+"{1,63})",t.src_domain="(?:"+t.src_xn+"|(?:"+t.src_pseudo_letter+")|(?:"+t.src_pseudo_letter+"(?:-|"+t.src_pseudo_letter+"){0,61}"+t.src_pseudo_letter+"))",t.src_host="(?:(?:(?:(?:"+t.src_domain+")\\.)*"+t.src_domain+"))",t.tpl_host_fuzzy="(?:"+t.src_ip4+"|(?:(?:(?:"+t.src_domain+")\\.)+(?:%TLDS%)))",t.tpl_host_no_ip_fuzzy="(?:(?:(?:"+t.src_domain+")\\.)+(?:%TLDS%))",t.src_host_strict=t.src_host+t.src_host_terminator,t.tpl_host_fuzzy_strict=t.tpl_host_fuzzy+t.src_host_terminator,t.src_host_port_strict=t.src_host+t.src_port+t.src_host_terminator,t.tpl_host_port_fuzzy_strict=t.tpl_host_fuzzy+t.src_port+t.src_host_terminator,t.tpl_host_port_no_ip_fuzzy_strict=t.tpl_host_no_ip_fuzzy+t.src_port+t.src_host_terminator,t.tpl_host_fuzzy_test="localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:"+t.src_ZPCc+"|>|$))",t.tpl_email_fuzzy="(^|"+u+'|"|\\(|'+t.src_ZCc+")("+t.src_email_name+"@"+t.tpl_host_fuzzy_strict+")",t.tpl_link_fuzzy="(^|(?![.:/\\-_@])(?:[$+<=>^`|｜]|"+t.src_ZPCc+"))((?![$+<=>^`|｜])"+t.tpl_host_port_fuzzy_strict+t.src_path+")",t.tpl_link_no_ip_fuzzy="(^|(?![.:/\\-_@])(?:[$+<=>^`|｜]|"+t.src_ZPCc+"))((?![$+<=>^`|｜])"+t.tpl_host_port_no_ip_fuzzy_strict+t.src_path+")",t}function Os(e){return Array.prototype.slice.call(arguments,1).forEach(function(u){u&&Object.keys(u).forEach(function(n){e[n]=u[n]})}),e}function wi(e){return Object.prototype.toString.call(e)}function l3(e){return wi(e)==="[object String]"}function a3(e){return wi(e)==="[object Object]"}function f3(e){return wi(e)==="[object RegExp]"}function bl(e){return wi(e)==="[object Function]"}function d3(e){return e.replace(/[.?*+^$[\]\\(){}|-]/g,"\\$&")}const Df={fuzzyLink:!0,fuzzyEmail:!0,fuzzyIP:!1};function h3(e){return Object.keys(e||{}).reduce(function(t,u){return t||Df.hasOwnProperty(u)},!1)}const p3={"http:":{validate:function(e,t,u){const n=e.slice(t);return u.re.http||(u.re.http=new RegExp("^\\/\\/"+u.re.src_auth+u.re.src_host_port_strict+u.re.src_path,"i")),u.re.http.test(n)?n.match(u.re.http)[0].length:0}},"https:":"http:","ftp:":"http:","//":{validate:function(e,t,u){const n=e.slice(t);return u.re.no_http||(u.re.no_http=new RegExp("^"+u.re.src_auth+"(?:localhost|(?:(?:"+u.re.src_domain+")\\.)+"+u.re.src_domain_root+")"+u.re.src_port+u.re.src_host_terminator+u.re.src_path,"i")),u.re.no_http.test(n)?t>=3&&e[t-3]===":"||t>=3&&e[t-3]==="/"?0:n.match(u.re.no_http)[0].length:0}},"mailto:":{validate:function(e,t,u){const n=e.slice(t);return u.re.mailto||(u.re.mailto=new RegExp("^"+u.re.src_email_name+"@"+u.re.src_host_strict,"i")),u.re.mailto.test(n)?n.match(u.re.mailto)[0].length:0}}},m3="a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]",b3="biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф".split("|");function g3(e){e.__index__=-1,e.__text_cache__=""}function _3(e){return function(t,u){const n=t.slice(u);return e.test(n)?n.match(e)[0].length:0}}function gl(){return function(e,t){t.normalize(e)}}function Qr(e){const t=e.re=c3(e.__opts__),u=e.__tlds__.slice();e.onCompile(),e.__tlds_replaced__||u.push(m3),u.push(t.src_xn),t.src_tlds=u.join("|");function n(o){return o.replace("%TLDS%",t.src_tlds)}t.email_fuzzy=RegExp(n(t.tpl_email_fuzzy),"i"),t.link_fuzzy=RegExp(n(t.tpl_link_fuzzy),"i"),t.link_no_ip_fuzzy=RegExp(n(t.tpl_link_no_ip_fuzzy),"i"),t.host_fuzzy_test=RegExp(n(t.tpl_host_fuzzy_test),"i");const r=[];e.__compiled__={};function i(o,c){throw new Error('(LinkifyIt) Invalid schema "'+o+'": '+c)}Object.keys(e.__schemas__).forEach(function(o){const c=e.__schemas__[o];if(c===null)return;const l={validate:null,link:null};if(e.__compiled__[o]=l,a3(c)){f3(c.validate)?l.validate=_3(c.validate):bl(c.validate)?l.validate=c.validate:i(o,c),bl(c.normalize)?l.normalize=c.normalize:c.normalize?i(o,c):l.normalize=gl();return}if(l3(c)){r.push(o);return}i(o,c)}),r.forEach(function(o){e.__compiled__[e.__schemas__[o]]&&(e.__compiled__[o].validate=e.__compiled__[e.__schemas__[o]].validate,e.__compiled__[o].normalize=e.__compiled__[e.__schemas__[o]].normalize)}),e.__compiled__[""]={validate:null,normalize:gl()};const s=Object.keys(e.__compiled__).filter(function(o){return o.length>0&&e.__compiled__[o]}).map(d3).join("|");e.re.schema_test=RegExp("(^|(?!_)(?:[><｜]|"+t.src_ZPCc+"))("+s+")","i"),e.re.schema_search=RegExp("(^|(?!_)(?:[><｜]|"+t.src_ZPCc+"))("+s+")","ig"),e.re.schema_at_start=RegExp("^"+e.re.schema_search.source,"i"),e.re.pretest=RegExp("("+e.re.schema_test.source+")|("+e.re.host_fuzzy_test.source+")|@","i"),g3(e)}function y3(e,t){const u=e.__index__,n=e.__last_index__,r=e.__text_cache__.slice(u,n);this.schema=e.__schema__.toLowerCase(),this.index=u+t,this.lastIndex=n+t,this.raw=r,this.text=r,this.url=r}function Ns(e,t){const u=new y3(e,t);return e.__compiled__[u.schema].normalize(u,e),u}function nt(e,t){if(!(this instanceof nt))return new nt(e,t);t||h3(e)&&(t=e,e={}),this.__opts__=Os({},Df,t),this.__index__=-1,this.__last_index__=-1,this.__schema__="",this.__text_cache__="",this.__schemas__=Os({},p3,e),this.__compiled__={},this.__tlds__=b3,this.__tlds_replaced__=!1,this.re={},Qr(this)}nt.prototype.add=function(t,u){return this.__schemas__[t]=u,Qr(this),this};nt.prototype.set=function(t){return this.__opts__=Os(this.__opts__,t),this};nt.prototype.test=function(t){if(this.__text_cache__=t,this.__index__=-1,!t.length)return!1;let u,n,r,i,s,o,c,l,a;if(this.re.schema_test.test(t)){for(c=this.re.schema_search,c.lastIndex=0;(u=c.exec(t))!==null;)if(i=this.testSchemaAt(t,u[2],c.lastIndex),i){this.__schema__=u[2],this.__index__=u.index+u[1].length,this.__last_index__=u.index+u[0].length+i;break}}return this.__opts__.fuzzyLink&&this.__compiled__["http:"]&&(l=t.search(this.re.host_fuzzy_test),l>=0&&(this.__index__<0||l<this.__index__)&&(n=t.match(this.__opts__.fuzzyIP?this.re.link_fuzzy:this.re.link_no_ip_fuzzy))!==null&&(s=n.index+n[1].length,(this.__index__<0||s<this.__index__)&&(this.__schema__="",this.__index__=s,this.__last_index__=n.index+n[0].length))),this.__opts__.fuzzyEmail&&this.__compiled__["mailto:"]&&(a=t.indexOf("@"),a>=0&&(r=t.match(this.re.email_fuzzy))!==null&&(s=r.index+r[1].length,o=r.index+r[0].length,(this.__index__<0||s<this.__index__||s===this.__index__&&o>this.__last_index__)&&(this.__schema__="mailto:",this.__index__=s,this.__last_index__=o))),this.__index__>=0};nt.prototype.pretest=function(t){return this.re.pretest.test(t)};nt.prototype.testSchemaAt=function(t,u,n){return this.__compiled__[u.toLowerCase()]?this.__compiled__[u.toLowerCase()].validate(t,n,this):0};nt.prototype.match=function(t){const u=[];let n=0;this.__index__>=0&&this.__text_cache__===t&&(u.push(Ns(this,n)),n=this.__last_index__);let r=n?t.slice(n):t;for(;this.test(r);)u.push(Ns(this,n)),r=r.slice(this.__last_index__),n+=this.__last_index__;return u.length?u:null};nt.prototype.matchAtStart=function(t){if(this.__text_cache__=t,this.__index__=-1,!t.length)return null;const u=this.re.schema_at_start.exec(t);if(!u)return null;const n=this.testSchemaAt(t,u[2],u[0].length);return n?(this.__schema__=u[2],this.__index__=u.index+u[1].length,this.__last_index__=u.index+u[0].length+n,Ns(this,0)):null};nt.prototype.tlds=function(t,u){return t=Array.isArray(t)?t:[t],u?(this.__tlds__=this.__tlds__.concat(t).sort().filter(function(n,r,i){return n!==i[r-1]}).reverse(),Qr(this),this):(this.__tlds__=t.slice(),this.__tlds_replaced__=!0,Qr(this),this)};nt.prototype.normalize=function(t){t.schema||(t.url="http://"+t.url),t.schema==="mailto:"&&!/^mailto:/i.test(t.url)&&(t.url="mailto:"+t.url)};nt.prototype.onCompile=function(){};const qu=2147483647,At=36,Ho=1,zn=26,x3=38,E3=700,Tf=72,wf=128,Ff="-",C3=/^xn--/,k3=/[^\0-\x7F]/,A3=/[\x2E\u3002\uFF0E\uFF61]/g,S3={overflow:"Overflow: input needs wider integers to process","not-basic":"Illegal input >= 0x80 (not a basic code point)","invalid-input":"Invalid input"},Qi=At-Ho,St=Math.floor,es=String.fromCharCode;function Xt(e){throw new RangeError(S3[e])}function v3(e,t){const u=[];let n=e.length;for(;n--;)u[n]=t(e[n]);return u}function If(e,t){const u=e.split("@");let n="";u.length>1&&(n=u[0]+"@",e=u[1]),e=e.replace(A3,".");const r=e.split("."),i=v3(r,t).join(".");return n+i}function Of(e){const t=[];let u=0;const n=e.length;for(;u<n;){const r=e.charCodeAt(u++);if(r>=55296&&r<=56319&&u<n){const i=e.charCodeAt(u++);(i&64512)==56320?t.push(((r&1023)<<10)+(i&1023)+65536):(t.push(r),u--)}else t.push(r)}return t}const D3=e=>String.fromCodePoint(...e),T3=function(e){return e>=48&&e<58?26+(e-48):e>=65&&e<91?e-65:e>=97&&e<123?e-97:At},_l=function(e,t){return e+22+75*(e<26)-((t!=0)<<5)},Nf=function(e,t,u){let n=0;for(e=u?St(e/E3):e>>1,e+=St(e/t);e>Qi*zn>>1;n+=At)e=St(e/Qi);return St(n+(Qi+1)*e/(e+x3))},Rf=function(e){const t=[],u=e.length;let n=0,r=wf,i=Tf,s=e.lastIndexOf(Ff);s<0&&(s=0);for(let o=0;o<s;++o)e.charCodeAt(o)>=128&&Xt("not-basic"),t.push(e.charCodeAt(o));for(let o=s>0?s+1:0;o<u;){const c=n;for(let a=1,f=At;;f+=At){o>=u&&Xt("invalid-input");const p=T3(e.charCodeAt(o++));p>=At&&Xt("invalid-input"),p>St((qu-n)/a)&&Xt("overflow"),n+=p*a;const d=f<=i?Ho:f>=i+zn?zn:f-i;if(p<d)break;const h=At-d;a>St(qu/h)&&Xt("overflow"),a*=h}const l=t.length+1;i=Nf(n-c,l,c==0),St(n/l)>qu-r&&Xt("overflow"),r+=St(n/l),n%=l,t.splice(n++,0,r)}return String.fromCodePoint(...t)},Mf=function(e){const t=[];e=Of(e);const u=e.length;let n=wf,r=0,i=Tf;for(const c of e)c<128&&t.push(es(c));const s=t.length;let o=s;for(s&&t.push(Ff);o<u;){let c=qu;for(const a of e)a>=n&&a<c&&(c=a);const l=o+1;c-n>St((qu-r)/l)&&Xt("overflow"),r+=(c-n)*l,n=c;for(const a of e)if(a<n&&++r>qu&&Xt("overflow"),a===n){let f=r;for(let p=At;;p+=At){const d=p<=i?Ho:p>=i+zn?zn:p-i;if(f<d)break;const h=f-d,_=At-d;t.push(es(_l(d+h%_,0))),f=St(h/_)}t.push(es(_l(f,0))),i=Nf(r,l,o===s),r=0,++o}++r,++n}return t.join("")},w3=function(e){return If(e,function(t){return C3.test(t)?Rf(t.slice(4).toLowerCase()):t})},F3=function(e){return If(e,function(t){return k3.test(t)?"xn--"+Mf(t):t})},Pf={version:"2.3.1",ucs2:{decode:Of,encode:D3},decode:Rf,encode:Mf,toASCII:F3,toUnicode:w3},I3={options:{html:!1,xhtmlOut:!1,breaks:!1,langPrefix:"language-",linkify:!1,typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:100},components:{core:{},block:{},inline:{}}},O3={options:{html:!1,xhtmlOut:!1,breaks:!1,langPrefix:"language-",linkify:!1,typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{rules:["normalize","block","inline","text_join"]},block:{rules:["paragraph"]},inline:{rules:["text"],rules2:["balance_pairs","fragments_join"]}}},N3={options:{html:!0,xhtmlOut:!0,breaks:!1,langPrefix:"language-",linkify:!1,typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{rules:["normalize","block","inline","text_join"]},block:{rules:["blockquote","code","fence","heading","hr","html_block","lheading","list","reference","paragraph"]},inline:{rules:["autolink","backticks","emphasis","entity","escape","html_inline","image","link","newline","text"],rules2:["balance_pairs","emphasis","fragments_join"]}}},R3={default:I3,zero:O3,commonmark:N3},M3=/^(vbscript|javascript|file|data):/,P3=/^data:image\/(gif|png|jpeg|webp);/;function L3(e){const t=e.trim().toLowerCase();return M3.test(t)?P3.test(t):!0}const Lf=["http:","https:","mailto:"];function B3(e){const t=Mo(e,!0);if(t.hostname&&(!t.protocol||Lf.indexOf(t.protocol)>=0))try{t.hostname=Pf.toASCII(t.hostname)}catch{}return er(Ro(t))}function V3(e){const t=Mo(e,!0);if(t.hostname&&(!t.protocol||Lf.indexOf(t.protocol)>=0))try{t.hostname=Pf.toUnicode(t.hostname)}catch{}return Qu(Ro(t),Qu.defaultChars+"%")}function at(e,t){if(!(this instanceof at))return new at(e,t);t||Lo(e)||(t=e||{},e="default"),this.inline=new ur,this.block=new Ti,this.core=new Vo,this.renderer=new rn,this.linkify=new nt,this.validateLink=L3,this.normalizeLink=B3,this.normalizeLinkText=V3,this.utils=Hb,this.helpers=vi({},zb),this.options={},this.configure(e),t&&this.set(t)}at.prototype.set=function(e){return vi(this.options,e),this};at.prototype.configure=function(e){const t=this;if(Lo(e)){const u=e;if(e=R3[u],!e)throw new Error('Wrong `markdown-it` preset "'+u+'", check name')}if(!e)throw new Error("Wrong `markdown-it` preset, can't be empty");return e.options&&t.set(e.options),e.components&&Object.keys(e.components).forEach(function(u){e.components[u].rules&&t[u].ruler.enableOnly(e.components[u].rules),e.components[u].rules2&&t[u].ruler2.enableOnly(e.components[u].rules2)}),this};at.prototype.enable=function(e,t){let u=[];Array.isArray(e)||(e=[e]),["core","block","inline"].forEach(function(r){u=u.concat(this[r].ruler.enable(e,!0))},this),u=u.concat(this.inline.ruler2.enable(e,!0));const n=e.filter(function(r){return u.indexOf(r)<0});if(n.length&&!t)throw new Error("MarkdownIt. Failed to enable unknown rule(s): "+n);return this};at.prototype.disable=function(e,t){let u=[];Array.isArray(e)||(e=[e]),["core","block","inline"].forEach(function(r){u=u.concat(this[r].ruler.disable(e,!0))},this),u=u.concat(this.inline.ruler2.disable(e,!0));const n=e.filter(function(r){return u.indexOf(r)<0});if(n.length&&!t)throw new Error("MarkdownIt. Failed to disable unknown rule(s): "+n);return this};at.prototype.use=function(e){const t=[this].concat(Array.prototype.slice.call(arguments,1));return e.apply(e,t),this};at.prototype.parse=function(e,t){if(typeof e!="string")throw new Error("Input data should be a String");const u=new this.core.State(e,this,t);return this.core.process(u),u.tokens};at.prototype.render=function(e,t){return t=t||{},this.renderer.render(this.parse(e,t),this.options,t)};at.prototype.parseInline=function(e,t){const u=new this.core.State(e,this,t);return u.inlineMode=!0,this.core.process(u),u.tokens};at.prototype.renderInline=function(e,t){return t=t||{},this.renderer.render(this.parseInline(e,t),this.options,t)};const $3=e=>!!e.toLowerCase,H3=e=>!$3(e)&&!isNaN(e),U3=e=>e.replace(/>/g,"&gt;").replace(/"/g,"&quot;"),Bf=e=>navigator.clipboard.writeText(e),q3=localStorage.getItem("base")||new URL(".",document.baseURI).href.toString(),ei={apiKey:"",systemMessage:"You are a helpful assistant.",samplers:"dkypmxt",temperature:.8,dynatemp_range:0,dynatemp_exponent:1,top_k:40,top_p:.95,min_p:.05,xtc_probability:0,xtc_threshold:.1,typical_p:1,repeat_last_n:64,repeat_penalty:1,presence_penalty:0,frequency_penalty:0,dry_multiplier:0,dry_base:1.75,dry_allowed_length:2,dry_penalty_last_n:-1,max_tokens:-1,custom:""},j3={apiKey:"Set the API Key if you are using --api-key option for the server.",systemMessage:"The starting message that defines how model should behave.",samplers:'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',temperature:"Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.",dynatemp_range:"Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.",dynatemp_exponent:"Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.",top_k:"Keeps only k top tokens.",top_p:"Limits tokens to those that together have a cumulative probability of at least p",min_p:"Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.",xtc_probability:"XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.",xtc_threshold:"XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.",typical_p:"Sorts and limits tokens based on the difference between log-probability and entropy.",repeat_last_n:"Last n tokens to consider for penalizing repetition",repeat_penalty:"Controls the repetition of token sequences in the generated text",presence_penalty:"Limits tokens based on whether they appear in the output or not.",frequency_penalty:"Limits tokens based on how often they appear in the output.",dry_multiplier:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.",dry_base:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.",dry_allowed_length:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.",dry_penalty_last_n:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.",max_tokens:"The maximum number of token per output.",custom:""},z3=Object.entries(ei).filter(e=>H3(e[1])).map(e=>e[0]),K3=["light","dark","cupcake","bumblebee","emerald","corporate","synthwave","retro","cyberpunk","valentine","halloween","garden","forest","aqua","lofi","pastel","fantasy","wireframe","black","luxury","dracula","cmyk","autumn","business","acid","lemonade","night","coffee","winter","dim","nord","sunset"],W3=Kn(e=>{const t=js(new at({breaks:!0})),u=t.value.renderer.rules.fence;t.value.renderer.rules.fence=(r,i,...s)=>{const o=r[i].content,c=u(r,i,...s);return`<div class="relative my-4">
+        <div class="text-right sticky top-4 mb-2 mr-2 h-0">
+          <button class="badge btn-mini" onclick="copyStr(${U3(JSON.stringify(o))})">📋 Copy</button>
+        </div>
+        ${c}
+      </div>`},window.copyStr=Bf;const n=lo(()=>t.value.render(e.source));return()=>ao("div",{innerHTML:n.value})},{props:["source"]}),G3=Kn({template:document.getElementById("settings-modal-short-input").innerHTML,props:{label:{type:String,required:!1},configKey:String,configDefault:Object,configInfo:Object,modelValue:[Object,String,Number]}}),he={getAllConversations(){const e=[];for(const t in localStorage)t.startsWith("conv-")&&e.push(JSON.parse(localStorage.getItem(t)));return e.sort((t,u)=>u.lastModified-t.lastModified),e},getOneConversation(e){return JSON.parse(localStorage.getItem(e)||"null")},appendMsg(e,t){if(t.content===null)return;const u=he.getOneConversation(e)||{id:e,lastModified:Date.now(),messages:[]};u.messages.push(t),u.lastModified=Date.now(),localStorage.setItem(e,JSON.stringify(u))},getNewConvId(){return`conv-${Date.now()}`},remove(e){localStorage.removeItem(e)},filterAndKeepMsgs(e,t){const u=he.getOneConversation(e);u&&(u.messages=u.messages.filter(t),u.lastModified=Date.now(),localStorage.setItem(e,JSON.stringify(u)))},popMsg(e){const t=he.getOneConversation(e);if(!t)return;const u=t.messages.pop();return t.lastModified=Date.now(),t.messages.length===0?he.remove(e):localStorage.setItem(e,JSON.stringify(t)),u},getConfig(){const e=JSON.parse(localStorage.getItem("config")||"{}");return{...ei,...e}},setConfig(e){localStorage.setItem("config",JSON.stringify(e))},getTheme(){return localStorage.getItem("theme")||"auto"},setTheme(e){e==="auto"?localStorage.removeItem("theme"):localStorage.setItem("theme",e)}},fn=e=>{const t=document.getElementById("messages-list"),u=t.scrollHeight-t.scrollTop-t.clientHeight;(!e||u<100)&&setTimeout(()=>t.scrollTo({top:t.scrollHeight}),1)},Vf=Ur({components:{VueMarkdown:W3,SettingsModalShortInput:G3},data(){return{conversations:he.getAllConversations(),messages:[],viewingConvId:he.getNewConvId(),inputMsg:"",isGenerating:!1,pendingMsg:null,stopGeneration:()=>{},selectedTheme:he.getTheme(),config:he.getConfig(),showConfigDialog:!1,editingMsg:null,themes:K3,configDefault:{...ei},configInfo:{...j3}}},computed:{},mounted(){document.getElementById("app").classList.remove("opacity-0");const e=document.getElementById("pending-msg");new ResizeObserver(()=>{this.isGenerating&&fn(!0)}).observe(e)},methods:{hideSidebar(){document.getElementById("toggle-drawer").checked=!1},setSelectedTheme(e){this.selectedTheme=e,he.setTheme(e)},newConversation(){this.isGenerating||(this.viewingConvId=he.getNewConvId(),this.editingMsg=null,this.fetchMessages(),fn(),this.hideSidebar())},setViewingConv(e){this.isGenerating||(this.viewingConvId=e,this.editingMsg=null,this.fetchMessages(),fn(),this.hideSidebar())},deleteConv(e){this.isGenerating||window.confirm("Are you sure to delete this conversation?")&&(he.remove(e),this.viewingConvId===e&&(this.viewingConvId=he.getNewConvId(),this.editingMsg=null),this.fetchConversation(),this.fetchMessages())},downloadConv(e){const t=he.getOneConversation(e);if(!t){alert("Conversation not found.");return}const u=JSON.stringify(t,null,2),n=new Blob([u],{type:"application/json"}),r=URL.createObjectURL(n),i=document.createElement("a");i.href=r,i.download=`conversation_${e}.json`,document.body.appendChild(i),i.click(),document.body.removeChild(i),URL.revokeObjectURL(r)},async sendMessage(){if(!this.inputMsg)return;const e=this.viewingConvId;he.appendMsg(e,{id:Date.now(),role:"user",content:this.inputMsg}),this.fetchConversation(),this.fetchMessages(),this.inputMsg="",this.editingMsg=null,this.generateMessage(e),fn()},async generateMessage(e){if(!this.isGenerating){this.pendingMsg={id:Date.now()+1,role:"assistant",content:null},this.isGenerating=!0,this.editingMsg=null;try{const t=new AbortController;this.stopGeneration=()=>t.abort();const u={messages:[{role:"system",content:this.config.systemMessage},...this.messages],stream:!0,cache_prompt:!0,samplers:this.config.samplers,temperature:this.config.temperature,dynatemp_range:this.config.dynatemp_range,dynatemp_exponent:this.config.dynatemp_exponent,top_k:this.config.top_k,top_p:this.config.top_p,min_p:this.config.min_p,typical_p:this.config.typical_p,xtc_probability:this.config.xtc_probability,xtc_threshold:this.config.xtc_threshold,repeat_last_n:this.config.repeat_last_n,repeat_penalty:this.config.repeat_penalty,presence_penalty:this.config.presence_penalty,frequency_penalty:this.config.frequency_penalty,dry_multiplier:this.config.dry_multiplier,dry_base:this.config.dry_base,dry_allowed_length:this.config.dry_allowed_length,dry_penalty_last_n:this.config.dry_penalty_last_n,max_tokens:this.config.max_tokens,...this.config.custom.length?JSON.parse(this.config.custom):{},...this.config.apiKey?{api_key:this.config.apiKey}:{}},n={controller:t,api_url:q3,endpoint:"/chat/completions"};for await(const r of eb(prompt,u,n)){const i=r.data.stop,s=r.data.choices[0].delta.content,o=this.pendingMsg.content||"";s&&(this.pendingMsg={id:this.pendingMsg.id,role:"assistant",content:o+s})}he.appendMsg(e,this.pendingMsg),this.fetchConversation(),this.fetchMessages(),setTimeout(()=>document.getElementById("msg-input").focus(),1)}catch(t){if(t.name==="AbortError")he.appendMsg(e,this.pendingMsg),this.fetchConversation(),this.fetchMessages();else{console.error(t),alert(t);const u=he.popMsg(e);this.inputMsg=u?u.content:""}}this.pendingMsg=null,this.isGenerating=!1,this.stopGeneration=()=>{},this.fetchMessages(),fn()}},regenerateMsg(e){if(this.isGenerating)return;const t=this.viewingConvId;he.filterAndKeepMsgs(t,u=>u.id<e.id),this.fetchConversation(),this.fetchMessages(),this.generateMessage(t)},copyMsg(e){Bf(e.content)},editUserMsgAndRegenerate(e){if(this.isGenerating)return;const t=this.viewingConvId,u=e.content;this.editingMsg=null,he.filterAndKeepMsgs(t,n=>n.id<e.id),he.appendMsg(t,{id:Date.now(),role:"user",content:u}),this.fetchConversation(),this.fetchMessages(),this.generateMessage(t)},closeAndSaveConfigDialog(){try{this.config.custom.length&&JSON.parse(this.config.custom)}catch{alert("Invalid JSON for custom config. Please either fix it or leave it empty.");return}for(const e of z3){if(isNaN(this.config[e])||this.config[e].toString().trim().length===0){alert(`Invalid number for ${e} (expected an integer or a float)`);return}this.config[e]=parseFloat(this.config[e])}this.showConfigDialog=!1,he.setConfig(this.config)},closeAndDiscardConfigDialog(){this.showConfigDialog=!1,this.config=he.getConfig()},resetConfigDialog(){window.confirm("Are you sure to reset all settings?")&&(this.config={...ei})},fetchConversation(){this.conversations=he.getAllConversations()},fetchMessages(){var e;this.messages=((e=he.getOneConversation(this.viewingConvId))==null?void 0:e.messages)??[]}}});Vf.config.errorHandler=alert;try{Vf.mount("#app")}catch(e){console.error(e),document.getElementById("app").innerHTML=`<div style="margin:2em auto">
+    Failed to start app. Please try clearing localStorage and try again.<br/>
+    <br/>
+    <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
+  </div>`}</script>
+  <style rel="stylesheet" crossorigin>*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}:root,[data-theme]{background-color:var(--fallback-b1,oklch(var(--b1)/1));color:var(--fallback-bc,oklch(var(--bc)/1))}@supports not (color: oklch(0% 0 0)){:root{color-scheme:light;--fallback-p: #491eff;--fallback-pc: #d4dbff;--fallback-s: #ff41c7;--fallback-sc: #fff9fc;--fallback-a: #00cfbd;--fallback-ac: #00100d;--fallback-n: #2b3440;--fallback-nc: #d7dde4;--fallback-b1: #ffffff;--fallback-b2: #e5e6e6;--fallback-b3: #e5e6e6;--fallback-bc: #1f2937;--fallback-in: #00b3f0;--fallback-inc: #000000;--fallback-su: #00ca92;--fallback-suc: #000000;--fallback-wa: #ffc22d;--fallback-wac: #000000;--fallback-er: #ff6f70;--fallback-erc: #000000}@media (prefers-color-scheme: dark){:root{color-scheme:dark;--fallback-p: #7582ff;--fallback-pc: #050617;--fallback-s: #ff71cf;--fallback-sc: #190211;--fallback-a: #00c7b5;--fallback-ac: #000e0c;--fallback-n: #2a323c;--fallback-nc: #a6adbb;--fallback-b1: #1d232a;--fallback-b2: #191e24;--fallback-b3: #15191e;--fallback-bc: #a6adbb;--fallback-in: #00b3f0;--fallback-inc: #000000;--fallback-su: #00ca92;--fallback-suc: #000000;--fallback-wa: #ffc22d;--fallback-wac: #000000;--fallback-er: #ff6f70;--fallback-erc: #000000}}}html{-webkit-tap-highlight-color:transparent}*{scrollbar-color:color-mix(in oklch,currentColor 35%,transparent) transparent}*:hover{scrollbar-color:color-mix(in oklch,currentColor 60%,transparent) transparent}:root{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 89.824% .06192 275.75;--ac: 15.352% .0368 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 49.12% .3096 275.75;--s: 69.71% .329 342.55;--sc: 98.71% .0106 342.55;--a: 76.76% .184 183.61;--n: 32.1785% .02476 255.701624;--nc: 89.4994% .011585 252.096176;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.4169% .00108 197.137559;--bc: 27.8078% .029596 256.847952}@media (prefers-color-scheme: dark){:root{color-scheme:dark;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 13.138% .0392 275.75;--sc: 14.96% .052 342.55;--ac: 14.902% .0334 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 65.69% .196 275.75;--s: 74.8% .26 342.55;--a: 74.51% .167 183.61;--n: 31.3815% .021108 254.139175;--nc: 74.6477% .0216 264.435964;--b1: 25.3267% .015896 252.417568;--b2: 23.2607% .013807 253.100675;--b3: 21.1484% .01165 254.087939;--bc: 74.6477% .0216 264.435964}}[data-theme=light]{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 89.824% .06192 275.75;--ac: 15.352% .0368 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 49.12% .3096 275.75;--s: 69.71% .329 342.55;--sc: 98.71% .0106 342.55;--a: 76.76% .184 183.61;--n: 32.1785% .02476 255.701624;--nc: 89.4994% .011585 252.096176;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.4169% .00108 197.137559;--bc: 27.8078% .029596 256.847952}:root:has(input.theme-controller[value=light]:checked){color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 89.824% .06192 275.75;--ac: 15.352% .0368 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 49.12% .3096 275.75;--s: 69.71% .329 342.55;--sc: 98.71% .0106 342.55;--a: 76.76% .184 183.61;--n: 32.1785% .02476 255.701624;--nc: 89.4994% .011585 252.096176;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.4169% .00108 197.137559;--bc: 27.8078% .029596 256.847952}[data-theme=dark]{color-scheme:dark;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 13.138% .0392 275.75;--sc: 14.96% .052 342.55;--ac: 14.902% .0334 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 65.69% .196 275.75;--s: 74.8% .26 342.55;--a: 74.51% .167 183.61;--n: 31.3815% .021108 254.139175;--nc: 74.6477% .0216 264.435964;--b1: 25.3267% .015896 252.417568;--b2: 23.2607% .013807 253.100675;--b3: 21.1484% .01165 254.087939;--bc: 74.6477% .0216 264.435964}:root:has(input.theme-controller[value=dark]:checked){color-scheme:dark;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 13.138% .0392 275.75;--sc: 14.96% .052 342.55;--ac: 14.902% .0334 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 65.69% .196 275.75;--s: 74.8% .26 342.55;--a: 74.51% .167 183.61;--n: 31.3815% .021108 254.139175;--nc: 74.6477% .0216 264.435964;--b1: 25.3267% .015896 252.417568;--b2: 23.2607% .013807 253.100675;--b3: 21.1484% .01165 254.087939;--bc: 74.6477% .0216 264.435964}[data-theme=cupcake]{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 15.2344% .017892 200.026556;--sc: 15.787% .020249 356.29965;--ac: 15.8762% .029206 78.618794;--nc: 84.7148% .013247 313.189598;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--p: 76.172% .089459 200.026556;--s: 78.9351% .101246 356.29965;--a: 79.3811% .146032 78.618794;--n: 23.5742% .066235 313.189598;--b1: 97.7882% .00418 56.375637;--b2: 93.9822% .007638 61.449292;--b3: 91.5861% .006811 53.440502;--bc: 23.5742% .066235 313.189598;--rounded-btn: 1.9rem;--tab-border: 2px;--tab-radius: .7rem}:root:has(input.theme-controller[value=cupcake]:checked){color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 15.2344% .017892 200.026556;--sc: 15.787% .020249 356.29965;--ac: 15.8762% .029206 78.618794;--nc: 84.7148% .013247 313.189598;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--p: 76.172% .089459 200.026556;--s: 78.9351% .101246 356.29965;--a: 79.3811% .146032 78.618794;--n: 23.5742% .066235 313.189598;--b1: 97.7882% .00418 56.375637;--b2: 93.9822% .007638 61.449292;--b3: 91.5861% .006811 53.440502;--bc: 23.5742% .066235 313.189598;--rounded-btn: 1.9rem;--tab-border: 2px;--tab-radius: .7rem}[data-theme=bumblebee]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--ac: 16.254% .0314 56.52;--nc: 82.55% .015 281.99;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 89.51% .2132 96.61;--pc: 38.92% .046 96.61;--s: 80.39% .194 70.76;--sc: 39.38% .068 70.76;--a: 81.27% .157 56.52;--n: 12.75% .075 281.99;--b1: 100% 0 0}:root:has(input.theme-controller[value=bumblebee]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--ac: 16.254% .0314 56.52;--nc: 82.55% .015 281.99;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 89.51% .2132 96.61;--pc: 38.92% .046 96.61;--s: 80.39% .194 70.76;--sc: 39.38% .068 70.76;--a: 81.27% .157 56.52;--n: 12.75% .075 281.99;--b1: 100% 0 0}[data-theme=emerald]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 76.6626% .135433 153.450024;--pc: 33.3872% .040618 162.240129;--s: 61.3028% .202368 261.294233;--sc: 100% 0 0;--a: 72.7725% .149783 33.200363;--ac: 0% 0 0;--n: 35.5192% .032071 262.988584;--nc: 98.4625% .001706 247.838921;--b1: 100% 0 0;--bc: 35.5192% .032071 262.988584;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}:root:has(input.theme-controller[value=emerald]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 76.6626% .135433 153.450024;--pc: 33.3872% .040618 162.240129;--s: 61.3028% .202368 261.294233;--sc: 100% 0 0;--a: 72.7725% .149783 33.200363;--ac: 0% 0 0;--n: 35.5192% .032071 262.988584;--nc: 98.4625% .001706 247.838921;--b1: 100% 0 0;--bc: 35.5192% .032071 262.988584;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}[data-theme=corporate]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 12.078% .0456 269.1;--sc: 13.0739% .010951 256.688055;--ac: 15.3934% .022799 163.57888;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--border-btn: 1px;--tab-border: 1px;--p: 60.39% .228 269.1;--s: 65.3694% .054756 256.688055;--a: 76.9669% .113994 163.57888;--n: 22.3899% .031305 278.07229;--nc: 95.8796% .008588 247.915135;--b1: 100% 0 0;--bc: 22.3899% .031305 278.07229;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .25rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}:root:has(input.theme-controller[value=corporate]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 12.078% .0456 269.1;--sc: 13.0739% .010951 256.688055;--ac: 15.3934% .022799 163.57888;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--border-btn: 1px;--tab-border: 1px;--p: 60.39% .228 269.1;--s: 65.3694% .054756 256.688055;--a: 76.9669% .113994 163.57888;--n: 22.3899% .031305 278.07229;--nc: 95.8796% .008588 247.915135;--b1: 100% 0 0;--bc: 22.3899% .031305 278.07229;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .25rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}[data-theme=synthwave]{color-scheme:dark;--b2: 20.2941% .076211 287.835609;--b3: 18.7665% .070475 287.835609;--pc: 14.4421% .031903 342.009383;--sc: 15.6543% .02362 227.382405;--ac: 17.608% .0412 93.72;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 72.2105% .159514 342.009383;--s: 78.2714% .118101 227.382405;--a: 88.04% .206 93.72;--n: 25.5554% .103537 286.507967;--nc: 97.9365% .00819 301.358346;--b1: 21.8216% .081948 287.835609;--bc: 97.9365% .00819 301.358346;--in: 76.5197% .12273 231.831603;--inc: 23.5017% .096418 290.329844;--su: 86.0572% .115038 178.624677;--suc: 23.5017% .096418 290.329844;--wa: 85.531% .122117 93.722227;--wac: 23.5017% .096418 290.329844;--er: 73.7005% .121339 32.639257;--erc: 23.5017% .096418 290.329844}:root:has(input.theme-controller[value=synthwave]:checked){color-scheme:dark;--b2: 20.2941% .076211 287.835609;--b3: 18.7665% .070475 287.835609;--pc: 14.4421% .031903 342.009383;--sc: 15.6543% .02362 227.382405;--ac: 17.608% .0412 93.72;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 72.2105% .159514 342.009383;--s: 78.2714% .118101 227.382405;--a: 88.04% .206 93.72;--n: 25.5554% .103537 286.507967;--nc: 97.9365% .00819 301.358346;--b1: 21.8216% .081948 287.835609;--bc: 97.9365% .00819 301.358346;--in: 76.5197% .12273 231.831603;--inc: 23.5017% .096418 290.329844;--su: 86.0572% .115038 178.624677;--suc: 23.5017% .096418 290.329844;--wa: 85.531% .122117 93.722227;--wac: 23.5017% .096418 290.329844;--er: 73.7005% .121339 32.639257;--erc: 23.5017% .096418 290.329844}[data-theme=retro]{color-scheme:light;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 76.8664% .104092 22.664655;--pc: 26.5104% .006243 .522862;--s: 80.7415% .052534 159.094608;--sc: 26.5104% .006243 .522862;--a: 70.3919% .125455 52.953428;--ac: 26.5104% .006243 .522862;--n: 28.4181% .009519 355.534017;--nc: 92.5604% .025113 89.217311;--b1: 91.6374% .034554 90.51575;--b2: 88.2722% .049418 91.774344;--b3: 84.133% .065952 90.856665;--bc: 26.5104% .006243 .522862;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33;--rounded-box: .4rem;--rounded-btn: .4rem;--rounded-badge: .4rem;--tab-radius: .4rem}:root:has(input.theme-controller[value=retro]:checked){color-scheme:light;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 76.8664% .104092 22.664655;--pc: 26.5104% .006243 .522862;--s: 80.7415% .052534 159.094608;--sc: 26.5104% .006243 .522862;--a: 70.3919% .125455 52.953428;--ac: 26.5104% .006243 .522862;--n: 28.4181% .009519 355.534017;--nc: 92.5604% .025113 89.217311;--b1: 91.6374% .034554 90.51575;--b2: 88.2722% .049418 91.774344;--b3: 84.133% .065952 90.856665;--bc: 26.5104% .006243 .522862;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33;--rounded-box: .4rem;--rounded-btn: .4rem;--rounded-badge: .4rem;--tab-radius: .4rem}[data-theme=cyberpunk]{color-scheme:light;--b2: 87.8943% .16647 104.32;--b3: 81.2786% .15394 104.32;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 18.902% .0358 104.32;--pc: 14.844% .0418 6.35;--sc: 16.666% .0368 204.72;--ac: 14.372% .04352 310.43;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p: 74.22% .209 6.35;--s: 83.33% .184 204.72;--a: 71.86% .2176 310.43;--n: 23.04% .065 269.31;--nc: 94.51% .179 104.32;--b1: 94.51% .179 104.32;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--tab-radius: 0}:root:has(input.theme-controller[value=cyberpunk]:checked){color-scheme:light;--b2: 87.8943% .16647 104.32;--b3: 81.2786% .15394 104.32;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 18.902% .0358 104.32;--pc: 14.844% .0418 6.35;--sc: 16.666% .0368 204.72;--ac: 14.372% .04352 310.43;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p: 74.22% .209 6.35;--s: 83.33% .184 204.72;--a: 71.86% .2176 310.43;--n: 23.04% .065 269.31;--nc: 94.51% .179 104.32;--b1: 94.51% .179 104.32;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--tab-radius: 0}[data-theme=valentine]{color-scheme:light;--b2: 88.0567% .024834 337.06289;--b3: 81.4288% .022964 337.06289;--pc: 13.7239% .030755 15.066527;--sc: 14.3942% .029258 293.189609;--ac: 14.2537% .014961 197.828857;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.614% .0414 27.33;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 68.6197% .153774 15.066527;--s: 71.971% .14629 293.189609;--a: 71.2685% .074804 197.828857;--n: 54.6053% .143342 358.004839;--nc: 90.2701% .037202 336.955191;--b1: 94.6846% .026703 337.06289;--bc: 37.3085% .081131 4.606426;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.07% .207 27.33;--rounded-btn: 1.9rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=valentine]:checked){color-scheme:light;--b2: 88.0567% .024834 337.06289;--b3: 81.4288% .022964 337.06289;--pc: 13.7239% .030755 15.066527;--sc: 14.3942% .029258 293.189609;--ac: 14.2537% .014961 197.828857;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.614% .0414 27.33;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 68.6197% .153774 15.066527;--s: 71.971% .14629 293.189609;--a: 71.2685% .074804 197.828857;--n: 54.6053% .143342 358.004839;--nc: 90.2701% .037202 336.955191;--b1: 94.6846% .026703 337.06289;--bc: 37.3085% .081131 4.606426;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.07% .207 27.33;--rounded-btn: 1.9rem;--tab-radius: .7rem}[data-theme=halloween]{color-scheme:dark;--b2: 23.0416% 0 0;--b3: 21.3072% 0 0;--bc: 84.9552% 0 0;--sc: 89.196% .0496 305.03;--nc: 84.8742% .009322 65.681484;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 77.48% .204 60.62;--pc: 19.6935% .004671 196.779412;--s: 45.98% .248 305.03;--a: 64.8% .223 136.073479;--ac: 0% 0 0;--n: 24.371% .046608 65.681484;--b1: 24.7759% 0 0;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33}:root:has(input.theme-controller[value=halloween]:checked){color-scheme:dark;--b2: 23.0416% 0 0;--b3: 21.3072% 0 0;--bc: 84.9552% 0 0;--sc: 89.196% .0496 305.03;--nc: 84.8742% .009322 65.681484;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 77.48% .204 60.62;--pc: 19.6935% .004671 196.779412;--s: 45.98% .248 305.03;--a: 64.8% .223 136.073479;--ac: 0% 0 0;--n: 24.371% .046608 65.681484;--b1: 24.7759% 0 0;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33}[data-theme=garden]{color-scheme:light;--b2: 86.4453% .002011 17.197414;--b3: 79.9386% .00186 17.197414;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--sc: 89.699% .022197 355.095988;--ac: 11.2547% .010859 154.390187;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 62.45% .278 3.83636;--pc: 100% 0 0;--s: 48.4952% .110985 355.095988;--a: 56.2735% .054297 154.390187;--n: 24.1559% .049362 89.070594;--nc: 92.9519% .002163 17.197414;--b1: 92.9519% .002163 17.197414;--bc: 16.9617% .001664 17.32068}:root:has(input.theme-controller[value=garden]:checked){color-scheme:light;--b2: 86.4453% .002011 17.197414;--b3: 79.9386% .00186 17.197414;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--sc: 89.699% .022197 355.095988;--ac: 11.2547% .010859 154.390187;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 62.45% .278 3.83636;--pc: 100% 0 0;--s: 48.4952% .110985 355.095988;--a: 56.2735% .054297 154.390187;--n: 24.1559% .049362 89.070594;--nc: 92.9519% .002163 17.197414;--b1: 92.9519% .002163 17.197414;--bc: 16.9617% .001664 17.32068}[data-theme=forest]{color-scheme:dark;--b2: 17.522% .007709 17.911578;--b3: 16.2032% .007129 17.911578;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 83.7682% .001658 17.911578;--sc: 13.9553% .027077 168.327128;--ac: 14.1257% .02389 185.713193;--nc: 86.1397% .007806 171.364646;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 68.6283% .185567 148.958922;--pc: 0% 0 0;--s: 69.7764% .135385 168.327128;--a: 70.6285% .119451 185.713193;--n: 30.6985% .039032 171.364646;--b1: 18.8409% .00829 17.911578;--rounded-btn: 1.9rem}:root:has(input.theme-controller[value=forest]:checked){color-scheme:dark;--b2: 17.522% .007709 17.911578;--b3: 16.2032% .007129 17.911578;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 83.7682% .001658 17.911578;--sc: 13.9553% .027077 168.327128;--ac: 14.1257% .02389 185.713193;--nc: 86.1397% .007806 171.364646;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 68.6283% .185567 148.958922;--pc: 0% 0 0;--s: 69.7764% .135385 168.327128;--a: 70.6285% .119451 185.713193;--n: 30.6985% .039032 171.364646;--b1: 18.8409% .00829 17.911578;--rounded-btn: 1.9rem}[data-theme=aqua]{color-scheme:dark;--b2: 45.3464% .118611 261.181672;--b3: 41.9333% .109683 261.181672;--bc: 89.7519% .025508 261.181672;--sc: 12.1365% .02175 309.782946;--ac: 18.6854% .020445 94.555431;--nc: 12.2124% .023402 243.760661;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.79% .038 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 85.6617% .14498 198.6458;--pc: 40.1249% .068266 197.603872;--s: 60.6827% .108752 309.782946;--a: 93.4269% .102225 94.555431;--n: 61.0622% .117009 243.760661;--b1: 48.7596% .127539 261.181672;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.95% .19 27.33}:root:has(input.theme-controller[value=aqua]:checked){color-scheme:dark;--b2: 45.3464% .118611 261.181672;--b3: 41.9333% .109683 261.181672;--bc: 89.7519% .025508 261.181672;--sc: 12.1365% .02175 309.782946;--ac: 18.6854% .020445 94.555431;--nc: 12.2124% .023402 243.760661;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.79% .038 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 85.6617% .14498 198.6458;--pc: 40.1249% .068266 197.603872;--s: 60.6827% .108752 309.782946;--a: 93.4269% .102225 94.555431;--n: 61.0622% .117009 243.760661;--b1: 48.7596% .127539 261.181672;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.95% .19 27.33}[data-theme=lofi]{color-scheme:light;--inc: 15.908% .0206 205.9;--suc: 18.026% .0306 164.14;--wac: 17.674% .027 79.94;--erc: 15.732% .03 28.47;--border-btn: 1px;--tab-border: 1px;--p: 15.9066% 0 0;--pc: 100% 0 0;--s: 21.455% .001566 17.278957;--sc: 100% 0 0;--a: 26.8618% 0 0;--ac: 100% 0 0;--n: 0% 0 0;--nc: 100% 0 0;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.268% .001082 17.17934;--bc: 0% 0 0;--in: 79.54% .103 205.9;--su: 90.13% .153 164.14;--wa: 88.37% .135 79.94;--er: 78.66% .15 28.47;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .125rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}:root:has(input.theme-controller[value=lofi]:checked){color-scheme:light;--inc: 15.908% .0206 205.9;--suc: 18.026% .0306 164.14;--wac: 17.674% .027 79.94;--erc: 15.732% .03 28.47;--border-btn: 1px;--tab-border: 1px;--p: 15.9066% 0 0;--pc: 100% 0 0;--s: 21.455% .001566 17.278957;--sc: 100% 0 0;--a: 26.8618% 0 0;--ac: 100% 0 0;--n: 0% 0 0;--nc: 100% 0 0;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.268% .001082 17.17934;--bc: 0% 0 0;--in: 79.54% .103 205.9;--su: 90.13% .153 164.14;--wa: 88.37% .135 79.94;--er: 78.66% .15 28.47;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .125rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}[data-theme=pastel]{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--pc: 16.6166% .006979 316.8737;--sc: 17.6153% .009839 8.688364;--ac: 17.8419% .012056 170.923263;--nc: 14.2681% .014702 228.183906;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 83.0828% .034896 316.8737;--s: 88.0763% .049197 8.688364;--a: 89.2096% .06028 170.923263;--n: 71.3406% .07351 228.183906;--b1: 100% 0 0;--b2: 98.4625% .001706 247.838921;--b3: 87.1681% .009339 258.338227;--rounded-btn: 1.9rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=pastel]:checked){color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--pc: 16.6166% .006979 316.8737;--sc: 17.6153% .009839 8.688364;--ac: 17.8419% .012056 170.923263;--nc: 14.2681% .014702 228.183906;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 83.0828% .034896 316.8737;--s: 88.0763% .049197 8.688364;--a: 89.2096% .06028 170.923263;--n: 71.3406% .07351 228.183906;--b1: 100% 0 0;--b2: 98.4625% .001706 247.838921;--b3: 87.1681% .009339 258.338227;--rounded-btn: 1.9rem;--tab-radius: .7rem}[data-theme=fantasy]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 87.49% .0378 325.02;--sc: 90.784% .0324 241.36;--ac: 15.196% .0408 56.72;--nc: 85.5616% .005919 256.847952;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 37.45% .189 325.02;--s: 53.92% .162 241.36;--a: 75.98% .204 56.72;--n: 27.8078% .029596 256.847952;--b1: 100% 0 0;--bc: 27.8078% .029596 256.847952}:root:has(input.theme-controller[value=fantasy]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 87.49% .0378 325.02;--sc: 90.784% .0324 241.36;--ac: 15.196% .0408 56.72;--nc: 85.5616% .005919 256.847952;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 37.45% .189 325.02;--s: 53.92% .162 241.36;--a: 75.98% .204 56.72;--n: 27.8078% .029596 256.847952;--b1: 100% 0 0;--bc: 27.8078% .029596 256.847952}[data-theme=wireframe]{color-scheme:light;--bc: 20% 0 0;--pc: 15.6521% 0 0;--sc: 15.6521% 0 0;--ac: 15.6521% 0 0;--nc: 18.8014% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 14.1626% .019994 108.702381;--erc: 12.5591% .051537 29.233885;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:Chalkboard,comic sans ms,"sans-serif";--p: 78.2604% 0 0;--s: 78.2604% 0 0;--a: 78.2604% 0 0;--n: 94.007% 0 0;--b1: 100% 0 0;--b2: 94.9119% 0 0;--b3: 89.7547% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 70.8131% .099969 108.702381;--er: 62.7955% .257683 29.233885;--rounded-box: .2rem;--rounded-btn: .2rem;--rounded-badge: .2rem;--tab-radius: .2rem}:root:has(input.theme-controller[value=wireframe]:checked){color-scheme:light;--bc: 20% 0 0;--pc: 15.6521% 0 0;--sc: 15.6521% 0 0;--ac: 15.6521% 0 0;--nc: 18.8014% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 14.1626% .019994 108.702381;--erc: 12.5591% .051537 29.233885;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:Chalkboard,comic sans ms,"sans-serif";--p: 78.2604% 0 0;--s: 78.2604% 0 0;--a: 78.2604% 0 0;--n: 94.007% 0 0;--b1: 100% 0 0;--b2: 94.9119% 0 0;--b3: 89.7547% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 70.8131% .099969 108.702381;--er: 62.7955% .257683 29.233885;--rounded-box: .2rem;--rounded-btn: .2rem;--rounded-badge: .2rem;--tab-radius: .2rem}[data-theme=black]{color-scheme:dark;--pc: 86.736% 0 0;--sc: 86.736% 0 0;--ac: 86.736% 0 0;--nc: 86.736% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 19.3597% .042201 109.769232;--erc: 12.5591% .051537 29.233885;--border-btn: 1px;--tab-border: 1px;--p: 33.6799% 0 0;--s: 33.6799% 0 0;--a: 33.6799% 0 0;--b1: 0% 0 0;--b2: 19.1251% 0 0;--b3: 26.8618% 0 0;--bc: 87.6096% 0 0;--n: 33.6799% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 96.7983% .211006 109.769232;--er: 62.7955% .257683 29.233885;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1;--tab-radius: 0}:root:has(input.theme-controller[value=black]:checked){color-scheme:dark;--pc: 86.736% 0 0;--sc: 86.736% 0 0;--ac: 86.736% 0 0;--nc: 86.736% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 19.3597% .042201 109.769232;--erc: 12.5591% .051537 29.233885;--border-btn: 1px;--tab-border: 1px;--p: 33.6799% 0 0;--s: 33.6799% 0 0;--a: 33.6799% 0 0;--b1: 0% 0 0;--b2: 19.1251% 0 0;--b3: 26.8618% 0 0;--bc: 87.6096% 0 0;--n: 33.6799% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 96.7983% .211006 109.769232;--er: 62.7955% .257683 29.233885;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1;--tab-radius: 0}[data-theme=luxury]{color-scheme:dark;--pc: 20% 0 0;--sc: 85.5163% .012821 261.069149;--ac: 87.3349% .010348 338.82597;--inc: 15.8122% .024356 237.133883;--suc: 15.6239% .038579 132.154381;--wac: 17.2255% .027305 102.89115;--erc: 14.3506% .035271 22.568916;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 100% 0 0;--s: 27.5815% .064106 261.069149;--a: 36.6744% .051741 338.82597;--n: 24.27% .057015 59.825019;--nc: 93.2033% .089631 90.861683;--b1: 14.0765% .004386 285.822869;--b2: 20.2191% .004211 308.22937;--b3: 29.8961% .003818 308.318612;--bc: 75.6879% .123666 76.890484;--in: 79.0612% .121778 237.133883;--su: 78.1197% .192894 132.154381;--wa: 86.1274% .136524 102.89115;--er: 71.7531% .176357 22.568916}:root:has(input.theme-controller[value=luxury]:checked){color-scheme:dark;--pc: 20% 0 0;--sc: 85.5163% .012821 261.069149;--ac: 87.3349% .010348 338.82597;--inc: 15.8122% .024356 237.133883;--suc: 15.6239% .038579 132.154381;--wac: 17.2255% .027305 102.89115;--erc: 14.3506% .035271 22.568916;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 100% 0 0;--s: 27.5815% .064106 261.069149;--a: 36.6744% .051741 338.82597;--n: 24.27% .057015 59.825019;--nc: 93.2033% .089631 90.861683;--b1: 14.0765% .004386 285.822869;--b2: 20.2191% .004211 308.22937;--b3: 29.8961% .003818 308.318612;--bc: 75.6879% .123666 76.890484;--in: 79.0612% .121778 237.133883;--su: 78.1197% .192894 132.154381;--wa: 86.1274% .136524 102.89115;--er: 71.7531% .176357 22.568916}[data-theme=dracula]{color-scheme:dark;--b2: 26.8053% .020556 277.508664;--b3: 24.7877% .019009 277.508664;--pc: 15.0922% .036614 346.812432;--sc: 14.8405% .029709 301.883095;--ac: 16.6785% .024826 66.558491;--nc: 87.8891% .006515 275.524078;--inc: 17.6526% .018676 212.846491;--suc: 17.4199% .043903 148.024881;--wac: 19.1068% .026849 112.757109;--erc: 13.6441% .041266 24.430965;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.4611% .18307 346.812432;--s: 74.2023% .148546 301.883095;--a: 83.3927% .124132 66.558491;--n: 39.4456% .032576 275.524078;--b1: 28.8229% .022103 277.508664;--bc: 97.7477% .007913 106.545019;--in: 88.263% .09338 212.846491;--su: 87.0995% .219516 148.024881;--wa: 95.5338% .134246 112.757109;--er: 68.2204% .206328 24.430965}:root:has(input.theme-controller[value=dracula]:checked){color-scheme:dark;--b2: 26.8053% .020556 277.508664;--b3: 24.7877% .019009 277.508664;--pc: 15.0922% .036614 346.812432;--sc: 14.8405% .029709 301.883095;--ac: 16.6785% .024826 66.558491;--nc: 87.8891% .006515 275.524078;--inc: 17.6526% .018676 212.846491;--suc: 17.4199% .043903 148.024881;--wac: 19.1068% .026849 112.757109;--erc: 13.6441% .041266 24.430965;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.4611% .18307 346.812432;--s: 74.2023% .148546 301.883095;--a: 83.3927% .124132 66.558491;--n: 39.4456% .032576 275.524078;--b1: 28.8229% .022103 277.508664;--bc: 97.7477% .007913 106.545019;--in: 88.263% .09338 212.846491;--su: 87.0995% .219516 148.024881;--wa: 95.5338% .134246 112.757109;--er: 68.2204% .206328 24.430965}[data-theme=cmyk]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--bc: 20% 0 0;--pc: 14.3544% .02666 239.443325;--sc: 12.8953% .040552 359.339283;--ac: 18.8458% .037948 105.306968;--nc: 84.3557% 0 0;--inc: 13.6952% .0189 217.284104;--suc: 89.3898% .032505 321.406278;--wac: 14.2473% .031969 52.023412;--erc: 12.4027% .041677 28.717543;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.7722% .133298 239.443325;--s: 64.4766% .202758 359.339283;--a: 94.2289% .189741 105.306968;--n: 21.7787% 0 0;--b1: 100% 0 0;--in: 68.4759% .094499 217.284104;--su: 46.949% .162524 321.406278;--wa: 71.2364% .159843 52.023412;--er: 62.0133% .208385 28.717543}:root:has(input.theme-controller[value=cmyk]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--bc: 20% 0 0;--pc: 14.3544% .02666 239.443325;--sc: 12.8953% .040552 359.339283;--ac: 18.8458% .037948 105.306968;--nc: 84.3557% 0 0;--inc: 13.6952% .0189 217.284104;--suc: 89.3898% .032505 321.406278;--wac: 14.2473% .031969 52.023412;--erc: 12.4027% .041677 28.717543;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.7722% .133298 239.443325;--s: 64.4766% .202758 359.339283;--a: 94.2289% .189741 105.306968;--n: 21.7787% 0 0;--b1: 100% 0 0;--in: 68.4759% .094499 217.284104;--su: 46.949% .162524 321.406278;--wa: 71.2364% .159843 52.023412;--er: 62.0133% .208385 28.717543}[data-theme=autumn]{color-scheme:light;--b2: 89.1077% 0 0;--b3: 82.4006% 0 0;--bc: 19.1629% 0 0;--pc: 88.1446% .032232 17.530175;--sc: 12.3353% .033821 23.865865;--ac: 14.6851% .018999 60.729616;--nc: 90.8734% .007475 51.902819;--inc: 13.8449% .019596 207.284192;--suc: 12.199% .016032 174.616213;--wac: 14.0163% .032982 56.844303;--erc: 90.614% .0482 24.16;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 40.7232% .16116 17.530175;--s: 61.6763% .169105 23.865865;--a: 73.4253% .094994 60.729616;--n: 54.3672% .037374 51.902819;--b1: 95.8147% 0 0;--in: 69.2245% .097979 207.284192;--su: 60.9951% .080159 174.616213;--wa: 70.0817% .164909 56.844303;--er: 53.07% .241 24.16}:root:has(input.theme-controller[value=autumn]:checked){color-scheme:light;--b2: 89.1077% 0 0;--b3: 82.4006% 0 0;--bc: 19.1629% 0 0;--pc: 88.1446% .032232 17.530175;--sc: 12.3353% .033821 23.865865;--ac: 14.6851% .018999 60.729616;--nc: 90.8734% .007475 51.902819;--inc: 13.8449% .019596 207.284192;--suc: 12.199% .016032 174.616213;--wac: 14.0163% .032982 56.844303;--erc: 90.614% .0482 24.16;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 40.7232% .16116 17.530175;--s: 61.6763% .169105 23.865865;--a: 73.4253% .094994 60.729616;--n: 54.3672% .037374 51.902819;--b1: 95.8147% 0 0;--in: 69.2245% .097979 207.284192;--su: 60.9951% .080159 174.616213;--wa: 70.0817% .164909 56.844303;--er: 53.07% .241 24.16}[data-theme=business]{color-scheme:dark;--b2: 22.6487% 0 0;--b3: 20.944% 0 0;--bc: 84.8707% 0 0;--pc: 88.3407% .019811 251.473931;--sc: 12.8185% .005481 229.389418;--ac: 13.4542% .033545 35.791525;--nc: 85.4882% .00265 253.041249;--inc: 12.5233% .028702 240.033697;--suc: 14.0454% .018919 156.59611;--wac: 15.4965% .023141 81.519177;--erc: 90.3221% .029356 29.674507;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 41.7036% .099057 251.473931;--s: 64.0924% .027405 229.389418;--a: 67.271% .167726 35.791525;--n: 27.441% .01325 253.041249;--b1: 24.3535% 0 0;--in: 62.6163% .143511 240.033697;--su: 70.2268% .094594 156.59611;--wa: 77.4824% .115704 81.519177;--er: 51.6105% .14678 29.674507;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem}:root:has(input.theme-controller[value=business]:checked){color-scheme:dark;--b2: 22.6487% 0 0;--b3: 20.944% 0 0;--bc: 84.8707% 0 0;--pc: 88.3407% .019811 251.473931;--sc: 12.8185% .005481 229.389418;--ac: 13.4542% .033545 35.791525;--nc: 85.4882% .00265 253.041249;--inc: 12.5233% .028702 240.033697;--suc: 14.0454% .018919 156.59611;--wac: 15.4965% .023141 81.519177;--erc: 90.3221% .029356 29.674507;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 41.7036% .099057 251.473931;--s: 64.0924% .027405 229.389418;--a: 67.271% .167726 35.791525;--n: 27.441% .01325 253.041249;--b1: 24.3535% 0 0;--in: 62.6163% .143511 240.033697;--su: 70.2268% .094594 156.59611;--wa: 77.4824% .115704 81.519177;--er: 51.6105% .14678 29.674507;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem}[data-theme=acid]{color-scheme:light;--b2: 91.6146% 0 0;--b3: 84.7189% 0 0;--bc: 19.7021% 0 0;--pc: 14.38% .0714 330.759573;--sc: 14.674% .0448 48.250878;--ac: 18.556% .0528 122.962951;--nc: 84.262% .0256 278.68;--inc: 12.144% .0454 252.05;--suc: 17.144% .0532 158.53;--wac: 18.202% .0424 100.5;--erc: 12.968% .0586 29.349188;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 71.9% .357 330.759573;--s: 73.37% .224 48.250878;--a: 92.78% .264 122.962951;--n: 21.31% .128 278.68;--b1: 98.5104% 0 0;--in: 60.72% .227 252.05;--su: 85.72% .266 158.53;--wa: 91.01% .212 100.5;--er: 64.84% .293 29.349188;--rounded-box: 1.25rem;--rounded-btn: 1rem;--rounded-badge: 1rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=acid]:checked){color-scheme:light;--b2: 91.6146% 0 0;--b3: 84.7189% 0 0;--bc: 19.7021% 0 0;--pc: 14.38% .0714 330.759573;--sc: 14.674% .0448 48.250878;--ac: 18.556% .0528 122.962951;--nc: 84.262% .0256 278.68;--inc: 12.144% .0454 252.05;--suc: 17.144% .0532 158.53;--wac: 18.202% .0424 100.5;--erc: 12.968% .0586 29.349188;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 71.9% .357 330.759573;--s: 73.37% .224 48.250878;--a: 92.78% .264 122.962951;--n: 21.31% .128 278.68;--b1: 98.5104% 0 0;--in: 60.72% .227 252.05;--su: 85.72% .266 158.53;--wa: 91.01% .212 100.5;--er: 64.84% .293 29.349188;--rounded-box: 1.25rem;--rounded-btn: 1rem;--rounded-badge: 1rem;--tab-radius: .7rem}[data-theme=lemonade]{color-scheme:light;--b2: 91.8003% .0186 123.72;--b3: 84.8906% .0172 123.72;--bc: 19.742% .004 123.72;--pc: 11.784% .0398 134.6;--sc: 15.55% .0392 111.09;--ac: 17.078% .0402 100.73;--nc: 86.196% .015 108.6;--inc: 17.238% .0094 224.14;--suc: 17.238% .0094 157.85;--wac: 17.238% .0094 102.15;--erc: 17.238% .0094 25.85;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 58.92% .199 134.6;--s: 77.75% .196 111.09;--a: 85.39% .201 100.73;--n: 30.98% .075 108.6;--b1: 98.71% .02 123.72;--in: 86.19% .047 224.14;--su: 86.19% .047 157.85;--wa: 86.19% .047 102.15;--er: 86.19% .047 25.85}:root:has(input.theme-controller[value=lemonade]:checked){color-scheme:light;--b2: 91.8003% .0186 123.72;--b3: 84.8906% .0172 123.72;--bc: 19.742% .004 123.72;--pc: 11.784% .0398 134.6;--sc: 15.55% .0392 111.09;--ac: 17.078% .0402 100.73;--nc: 86.196% .015 108.6;--inc: 17.238% .0094 224.14;--suc: 17.238% .0094 157.85;--wac: 17.238% .0094 102.15;--erc: 17.238% .0094 25.85;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 58.92% .199 134.6;--s: 77.75% .196 111.09;--a: 85.39% .201 100.73;--n: 30.98% .075 108.6;--b1: 98.71% .02 123.72;--in: 86.19% .047 224.14;--su: 86.19% .047 157.85;--wa: 86.19% .047 102.15;--er: 86.19% .047 25.85}[data-theme=night]{color-scheme:dark;--b2: 19.3144% .037037 265.754874;--b3: 17.8606% .034249 265.754874;--bc: 84.1536% .007965 265.754874;--pc: 15.0703% .027798 232.66148;--sc: 13.6023% .031661 276.934902;--ac: 14.4721% .035244 350.048739;--nc: 85.5899% .00737 260.030984;--suc: 15.6904% .026506 181.911977;--wac: 16.6486% .027912 82.95003;--erc: 14.3572% .034051 13.11834;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.3513% .138989 232.66148;--s: 68.0113% .158303 276.934902;--a: 72.3603% .176218 350.048739;--n: 27.9495% .036848 260.030984;--b1: 20.7682% .039824 265.754874;--in: 68.4553% .148062 237.25135;--inc: 0% 0 0;--su: 78.452% .132529 181.911977;--wa: 83.2428% .139558 82.95003;--er: 71.7858% .170255 13.11834}:root:has(input.theme-controller[value=night]:checked){color-scheme:dark;--b2: 19.3144% .037037 265.754874;--b3: 17.8606% .034249 265.754874;--bc: 84.1536% .007965 265.754874;--pc: 15.0703% .027798 232.66148;--sc: 13.6023% .031661 276.934902;--ac: 14.4721% .035244 350.048739;--nc: 85.5899% .00737 260.030984;--suc: 15.6904% .026506 181.911977;--wac: 16.6486% .027912 82.95003;--erc: 14.3572% .034051 13.11834;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.3513% .138989 232.66148;--s: 68.0113% .158303 276.934902;--a: 72.3603% .176218 350.048739;--n: 27.9495% .036848 260.030984;--b1: 20.7682% .039824 265.754874;--in: 68.4553% .148062 237.25135;--inc: 0% 0 0;--su: 78.452% .132529 181.911977;--wa: 83.2428% .139558 82.95003;--er: 71.7858% .170255 13.11834}[data-theme=coffee]{color-scheme:dark;--b2: 20.1585% .021457 329.708637;--b3: 18.6412% .019842 329.708637;--pc: 14.3993% .024765 62.756393;--sc: 86.893% .00597 199.19444;--ac: 88.5243% .014881 224.389184;--nc: 83.3022% .003149 326.261446;--inc: 15.898% .012774 184.558367;--suc: 14.9445% .014491 131.116276;--wac: 17.6301% .028162 87.722413;--erc: 15.4637% .025644 31.871922;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.9967% .123825 62.756393;--s: 34.465% .029849 199.19444;--a: 42.6213% .074405 224.389184;--n: 16.5109% .015743 326.261446;--b1: 21.6758% .023072 329.708637;--bc: 72.3547% .092794 79.129387;--in: 79.4902% .063869 184.558367;--su: 74.7224% .072456 131.116276;--wa: 88.1503% .140812 87.722413;--er: 77.3187% .12822 31.871922}:root:has(input.theme-controller[value=coffee]:checked){color-scheme:dark;--b2: 20.1585% .021457 329.708637;--b3: 18.6412% .019842 329.708637;--pc: 14.3993% .024765 62.756393;--sc: 86.893% .00597 199.19444;--ac: 88.5243% .014881 224.389184;--nc: 83.3022% .003149 326.261446;--inc: 15.898% .012774 184.558367;--suc: 14.9445% .014491 131.116276;--wac: 17.6301% .028162 87.722413;--erc: 15.4637% .025644 31.871922;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.9967% .123825 62.756393;--s: 34.465% .029849 199.19444;--a: 42.6213% .074405 224.389184;--n: 16.5109% .015743 326.261446;--b1: 21.6758% .023072 329.708637;--bc: 72.3547% .092794 79.129387;--in: 79.4902% .063869 184.558367;--su: 74.7224% .072456 131.116276;--wa: 88.1503% .140812 87.722413;--er: 77.3187% .12822 31.871922}[data-theme=winter]{color-scheme:light;--pc: 91.372% .051 257.57;--sc: 88.5103% .03222 282.339433;--ac: 11.988% .038303 335.171434;--nc: 83.9233% .012704 257.651965;--inc: 17.6255% .017178 214.515264;--suc: 16.0988% .015404 197.823719;--wac: 17.8345% .009167 71.47031;--erc: 14.6185% .022037 20.076293;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 56.86% .255 257.57;--s: 42.5516% .161098 282.339433;--a: 59.9398% .191515 335.171434;--n: 19.6166% .063518 257.651965;--b1: 100% 0 0;--b2: 97.4663% .011947 259.822565;--b3: 93.2686% .016223 262.751375;--bc: 41.8869% .053885 255.824911;--in: 88.1275% .085888 214.515264;--su: 80.4941% .077019 197.823719;--wa: 89.1725% .045833 71.47031;--er: 73.0926% .110185 20.076293}:root:has(input.theme-controller[value=winter]:checked){color-scheme:light;--pc: 91.372% .051 257.57;--sc: 88.5103% .03222 282.339433;--ac: 11.988% .038303 335.171434;--nc: 83.9233% .012704 257.651965;--inc: 17.6255% .017178 214.515264;--suc: 16.0988% .015404 197.823719;--wac: 17.8345% .009167 71.47031;--erc: 14.6185% .022037 20.076293;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 56.86% .255 257.57;--s: 42.5516% .161098 282.339433;--a: 59.9398% .191515 335.171434;--n: 19.6166% .063518 257.651965;--b1: 100% 0 0;--b2: 97.4663% .011947 259.822565;--b3: 93.2686% .016223 262.751375;--bc: 41.8869% .053885 255.824911;--in: 88.1275% .085888 214.515264;--su: 80.4941% .077019 197.823719;--wa: 89.1725% .045833 71.47031;--er: 73.0926% .110185 20.076293}[data-theme=dim]{color-scheme:dark;--pc: 17.2267% .028331 139.549991;--sc: 14.6752% .033181 35.353059;--ac: 14.8459% .026728 311.37924;--inc: 17.2157% .028409 206.182959;--suc: 17.2343% .028437 166.534048;--wac: 17.2327% .028447 94.818679;--erc: 16.4838% .019914 33.756357;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 86.1335% .141656 139.549991;--s: 73.3759% .165904 35.353059;--a: 74.2296% .133641 311.37924;--n: 24.7311% .020483 264.094728;--nc: 82.9011% .031335 222.959324;--b1: 30.8577% .023243 264.149498;--b2: 28.0368% .01983 264.182074;--b3: 26.3469% .018403 262.177739;--bc: 82.9011% .031335 222.959324;--in: 86.0785% .142046 206.182959;--su: 86.1717% .142187 166.534048;--wa: 86.1634% .142236 94.818679;--er: 82.4189% .09957 33.756357}:root:has(input.theme-controller[value=dim]:checked){color-scheme:dark;--pc: 17.2267% .028331 139.549991;--sc: 14.6752% .033181 35.353059;--ac: 14.8459% .026728 311.37924;--inc: 17.2157% .028409 206.182959;--suc: 17.2343% .028437 166.534048;--wac: 17.2327% .028447 94.818679;--erc: 16.4838% .019914 33.756357;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 86.1335% .141656 139.549991;--s: 73.3759% .165904 35.353059;--a: 74.2296% .133641 311.37924;--n: 24.7311% .020483 264.094728;--nc: 82.9011% .031335 222.959324;--b1: 30.8577% .023243 264.149498;--b2: 28.0368% .01983 264.182074;--b3: 26.3469% .018403 262.177739;--bc: 82.9011% .031335 222.959324;--in: 86.0785% .142046 206.182959;--su: 86.1717% .142187 166.534048;--wa: 86.1634% .142236 94.818679;--er: 82.4189% .09957 33.756357}[data-theme=nord]{color-scheme:light;--pc: 11.8872% .015449 254.027774;--sc: 13.9303% .011822 248.687186;--ac: 15.4929% .01245 217.469017;--inc: 13.8414% .012499 332.664922;--suc: 15.3654% .01498 131.063061;--wac: 17.0972% .017847 84.093335;--erc: 12.122% .024119 15.341883;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 59.4359% .077246 254.027774;--s: 69.6516% .059108 248.687186;--a: 77.4643% .062249 217.469017;--n: 45.229% .035214 264.1312;--nc: 89.9258% .016374 262.749256;--b1: 95.1276% .007445 260.731539;--b2: 93.2996% .010389 261.788485;--b3: 89.9258% .016374 262.749256;--bc: 32.4374% .022945 264.182036;--in: 69.2072% .062496 332.664922;--su: 76.827% .074899 131.063061;--wa: 85.4862% .089234 84.093335;--er: 60.61% .120594 15.341883;--rounded-box: .4rem;--rounded-btn: .2rem;--rounded-badge: .4rem;--tab-radius: .2rem}:root:has(input.theme-controller[value=nord]:checked){color-scheme:light;--pc: 11.8872% .015449 254.027774;--sc: 13.9303% .011822 248.687186;--ac: 15.4929% .01245 217.469017;--inc: 13.8414% .012499 332.664922;--suc: 15.3654% .01498 131.063061;--wac: 17.0972% .017847 84.093335;--erc: 12.122% .024119 15.341883;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 59.4359% .077246 254.027774;--s: 69.6516% .059108 248.687186;--a: 77.4643% .062249 217.469017;--n: 45.229% .035214 264.1312;--nc: 89.9258% .016374 262.749256;--b1: 95.1276% .007445 260.731539;--b2: 93.2996% .010389 261.788485;--b3: 89.9258% .016374 262.749256;--bc: 32.4374% .022945 264.182036;--in: 69.2072% .062496 332.664922;--su: 76.827% .074899 131.063061;--wa: 85.4862% .089234 84.093335;--er: 60.61% .120594 15.341883;--rounded-box: .4rem;--rounded-btn: .2rem;--rounded-badge: .4rem;--tab-radius: .2rem}[data-theme=sunset]{color-scheme:dark;--pc: 14.9408% .031656 39.94703;--sc: 14.5075% .035531 2.72034;--ac: 14.2589% .033336 299.844533;--inc: 17.1119% .017054 206.015183;--suc: 17.1122% .017172 144.77874;--wac: 17.1139% .016961 74.427797;--erc: 17.1023% .015778 16.886379;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 74.7039% .158278 39.94703;--s: 72.5375% .177654 2.72034;--a: 71.2947% .166678 299.844533;--n: 26% .019 237.69;--nc: 70% .019 237.69;--b1: 22% .019 237.69;--b2: 20% .019 237.69;--b3: 18% .019 237.69;--bc: 77.3835% .043586 245.096534;--in: 85.5596% .085271 206.015183;--su: 85.5609% .08586 144.77874;--wa: 85.5695% .084806 74.427797;--er: 85.5116% .07889 16.886379;--rounded-box: 1.2rem;--rounded-btn: .8rem;--rounded-badge: .4rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=sunset]:checked){color-scheme:dark;--pc: 14.9408% .031656 39.94703;--sc: 14.5075% .035531 2.72034;--ac: 14.2589% .033336 299.844533;--inc: 17.1119% .017054 206.015183;--suc: 17.1122% .017172 144.77874;--wac: 17.1139% .016961 74.427797;--erc: 17.1023% .015778 16.886379;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 74.7039% .158278 39.94703;--s: 72.5375% .177654 2.72034;--a: 71.2947% .166678 299.844533;--n: 26% .019 237.69;--nc: 70% .019 237.69;--b1: 22% .019 237.69;--b2: 20% .019 237.69;--b3: 18% .019 237.69;--bc: 77.3835% .043586 245.096534;--in: 85.5596% .085271 206.015183;--su: 85.5609% .08586 144.77874;--wa: 85.5695% .084806 74.427797;--er: 85.5116% .07889 16.886379;--rounded-box: 1.2rem;--rounded-btn: .8rem;--rounded-badge: .4rem;--tab-radius: .7rem}.alert{display:grid;width:100%;grid-auto-flow:row;align-content:flex-start;align-items:center;justify-items:center;gap:1rem;text-align:center;border-radius:var(--rounded-box, 1rem);border-width:1px;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));padding:1rem;--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--alert-bg: var(--fallback-b2,oklch(var(--b2)/1));--alert-bg-mix: var(--fallback-b1,oklch(var(--b1)/1));background-color:var(--alert-bg)}@media (min-width: 640px){.alert{grid-auto-flow:column;grid-template-columns:auto minmax(auto,1fr);justify-items:start;text-align:start}}.avatar.placeholder>div{display:flex;align-items:center;justify-content:center}.badge{display:inline-flex;align-items:center;justify-content:center;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;height:1.25rem;font-size:.875rem;line-height:1.25rem;width:-moz-fit-content;width:fit-content;padding-left:.563rem;padding-right:.563rem;border-radius:var(--rounded-badge, 1.9rem);border-width:1px;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}@media (hover:hover){.label a:hover{--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.menu li>*:not(ul,.menu-title,details,.btn):active,.menu li>*:not(ul,.menu-title,details,.btn).active,.menu li>details>summary:active{--tw-bg-opacity: 1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}}.btn{display:inline-flex;height:3rem;min-height:3rem;flex-shrink:0;cursor:pointer;-webkit-user-select:none;-moz-user-select:none;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;border-radius:var(--rounded-btn, .5rem);border-color:transparent;border-color:oklch(var(--btn-color, var(--b2)) / var(--tw-border-opacity));padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1em;gap:.5rem;font-weight:600;text-decoration-line:none;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);border-width:var(--border-btn, 1px);transition-property:color,background-color,border-color,opacity,box-shadow,transform;--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-shadow: 0 1px 2px 0 rgb(0 0 0 / .05);--tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow);outline-color:var(--fallback-bc,oklch(var(--bc)/1));background-color:oklch(var(--btn-color, var(--b2)) / var(--tw-bg-opacity));--tw-bg-opacity: 1;--tw-border-opacity: 1}.btn-disabled,.btn[disabled],.btn:disabled{pointer-events:none}:where(.btn:is(input[type=checkbox])),:where(.btn:is(input[type=radio])){width:auto;-webkit-appearance:none;-moz-appearance:none;appearance:none}.btn:is(input[type=checkbox]):after,.btn:is(input[type=radio]):after{--tw-content: attr(aria-label);content:var(--tw-content)}.chat{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));-moz-column-gap:.75rem;column-gap:.75rem;padding-top:.25rem;padding-bottom:.25rem}.chat-bubble{position:relative;display:block;width:-moz-fit-content;width:fit-content;padding:.5rem 1rem;max-width:90%;border-radius:var(--rounded-box, 1rem);min-height:2.75rem;min-width:2.75rem;--tw-bg-opacity: 1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.chat-bubble:before{position:absolute;bottom:0;height:.75rem;width:.75rem;background-color:inherit;content:"";-webkit-mask-size:contain;mask-size:contain;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-position:center;mask-position:center}.chat-start{place-items:start;grid-template-columns:auto 1fr}.chat-start .chat-header,.chat-start .chat-footer{grid-column-start:2}.chat-start .chat-image{grid-column-start:1}.chat-start .chat-bubble{grid-column-start:2;border-end-start-radius:0px}.chat-start .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e");inset-inline-start:-.749rem}[dir=rtl] .chat-start .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}.chat-end{place-items:end;grid-template-columns:1fr auto}.chat-end .chat-header,.chat-end .chat-footer{grid-column-start:1}.chat-end .chat-image{grid-column-start:2}.chat-end .chat-bubble{grid-column-start:1;border-end-end-radius:0px}.chat-end .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e");inset-inline-start:99.9%}[dir=rtl] .chat-end .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}.checkbox{flex-shrink:0;--chkbg: var(--fallback-bc,oklch(var(--bc)/1));--chkfg: var(--fallback-b1,oklch(var(--b1)/1));height:1.5rem;width:1.5rem;cursor:pointer;-webkit-appearance:none;-moz-appearance:none;appearance:none;border-radius:var(--rounded-btn, .5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity: .2}.collapse:not(td):not(tr):not(colgroup){visibility:visible}.collapse{position:relative;display:grid;overflow:hidden;grid-template-rows:auto 0fr;transition:grid-template-rows .2s;width:100%;border-radius:var(--rounded-box, 1rem)}.collapse-title,.collapse>input[type=checkbox],.collapse>input[type=radio],.collapse-content{grid-column-start:1;grid-row-start:1}.collapse>input[type=checkbox],.collapse>input[type=radio]{-webkit-appearance:none;-moz-appearance:none;appearance:none;opacity:0}.collapse-content{visibility:hidden;grid-column-start:1;grid-row-start:2;min-height:0px;transition:visibility .2s;transition:padding .2s ease-out,background-color .2s ease-out;padding-left:1rem;padding-right:1rem;cursor:unset}.collapse[open],.collapse-open,.collapse:focus:not(.collapse-close){grid-template-rows:auto 1fr}.collapse:not(.collapse-close):has(>input[type=checkbox]:checked),.collapse:not(.collapse-close):has(>input[type=radio]:checked){grid-template-rows:auto 1fr}.collapse[open]>.collapse-content,.collapse-open>.collapse-content,.collapse:focus:not(.collapse-close)>.collapse-content,.collapse:not(.collapse-close)>input[type=checkbox]:checked~.collapse-content,.collapse:not(.collapse-close)>input[type=radio]:checked~.collapse-content{visibility:visible;min-height:-moz-fit-content;min-height:fit-content}.drawer{position:relative;display:grid;grid-auto-columns:max-content auto;width:100%}.drawer-content{grid-column-start:2;grid-row-start:1;min-width:0px}.drawer-side{pointer-events:none;position:fixed;inset-inline-start:0px;top:0;grid-column-start:1;grid-row-start:1;display:grid;width:100%;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(1,minmax(0,1fr));align-items:flex-start;justify-items:start;overflow-x:hidden;overflow-y:hidden;overscroll-behavior:contain;height:100vh;height:100dvh}.drawer-side>.drawer-overlay{position:sticky;top:0;place-self:stretch;cursor:pointer;background-color:transparent;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s}.drawer-side>*{grid-column-start:1;grid-row-start:1}.drawer-side>*:not(.drawer-overlay){transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.3s;will-change:transform;transform:translate(-100%)}[dir=rtl] .drawer-side>*:not(.drawer-overlay){transform:translate(100%)}.drawer-toggle{position:fixed;height:0px;width:0px;-webkit-appearance:none;-moz-appearance:none;appearance:none;opacity:0}.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible;overflow-y:auto}.drawer-toggle:checked~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.drawer-end>.drawer-toggle~.drawer-content{grid-column-start:1}.drawer-end>.drawer-toggle~.drawer-side{grid-column-start:2;justify-items:end}.drawer-end>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(100%)}[dir=rtl] .drawer-end>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(-100%)}.drawer-end>.drawer-toggle:checked~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.dropdown{position:relative;display:inline-block}.dropdown>*:not(summary):focus{outline:2px solid transparent;outline-offset:2px}.dropdown .dropdown-content{position:absolute}.dropdown:is(:not(details)) .dropdown-content{visibility:hidden;opacity:0;transform-origin:top;--tw-scale-x: .95;--tw-scale-y: .95;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s}.dropdown-end .dropdown-content{inset-inline-end:0px}.dropdown-left .dropdown-content{bottom:auto;inset-inline-end:100%;top:0;transform-origin:right}.dropdown-right .dropdown-content{bottom:auto;inset-inline-start:100%;top:0;transform-origin:left}.dropdown-bottom .dropdown-content{bottom:auto;top:100%;transform-origin:top}.dropdown-top .dropdown-content{bottom:100%;top:auto;transform-origin:bottom}.dropdown-end.dropdown-right .dropdown-content,.dropdown-end.dropdown-left .dropdown-content{bottom:0;top:auto}.dropdown.dropdown-open .dropdown-content,.dropdown:not(.dropdown-hover):focus .dropdown-content,.dropdown:focus-within .dropdown-content{visibility:visible;opacity:1}@media (hover: hover){.dropdown.dropdown-hover:hover .dropdown-content{visibility:visible;opacity:1}.btm-nav>*.disabled:hover,.btm-nav>*[disabled]:hover{pointer-events:none;--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}.btn:hover{--tw-border-opacity: 1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@supports (color: color-mix(in oklab,black,black)){.btn:hover{background-color:color-mix(in oklab,oklch(var(--btn-color, var(--b2)) / var(--tw-bg-opacity, 1)) 90%,black);border-color:color-mix(in oklab,oklch(var(--btn-color, var(--b2)) / var(--tw-border-opacity, 1)) 90%,black)}}@supports not (color: oklch(0% 0 0)){.btn:hover{background-color:var(--btn-color, var(--fallback-b2));border-color:var(--btn-color, var(--fallback-b2))}}.btn.glass:hover{--glass-opacity: 25%;--glass-border-opacity: 15%}.btn-ghost:hover{border-color:transparent}@supports (color: oklch(0% 0 0)){.btn-ghost:hover{background-color:var(--fallback-bc,oklch(var(--bc)/.2))}}.btn-outline.btn-primary:hover{--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color: color-mix(in oklab,black,black)){.btn-outline.btn-primary:hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black)}}.btn-disabled:hover,.btn[disabled]:hover,.btn:disabled:hover{--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}@supports (color: color-mix(in oklab,black,black)){.btn:is(input[type=checkbox]:checked):hover,.btn:is(input[type=radio]:checked):hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black)}}.dropdown.dropdown-hover:hover .dropdown-content{--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{cursor:pointer;outline:2px solid transparent;outline-offset:2px}@supports (color: oklch(0% 0 0)){:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}}}.dropdown:is(details) summary::-webkit-details-marker{display:none}.form-control{display:flex;flex-direction:column}.label{display:flex;-webkit-user-select:none;-moz-user-select:none;user-select:none;align-items:center;justify-content:space-between;padding:.5rem .25rem}.input{flex-shrink:1;-webkit-appearance:none;-moz-appearance:none;appearance:none;height:3rem;padding-left:1rem;padding-right:1rem;font-size:1rem;line-height:2;line-height:1.5rem;border-radius:var(--rounded-btn, .5rem);border-width:1px;border-color:transparent;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.input[type=number]::-webkit-inner-spin-button,.input-md[type=number]::-webkit-inner-spin-button{margin-top:-1rem;margin-bottom:-1rem;margin-inline-end:-1rem}.join :where(.join-item){border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:not(:first-child):not(:last-child),.join *:not(:first-child):not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:first-child:not(:last-child),.join *:first-child:not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0}.join .dropdown .join-item:first-child:not(:last-child),.join *:first-child:not(:last-child) .dropdown .join-item{border-start-end-radius:inherit;border-end-end-radius:inherit}.join :where(.join-item:first-child:not(:last-child)),.join :where(*:first-child:not(:last-child) .join-item){border-end-start-radius:inherit;border-start-start-radius:inherit}.join .join-item:last-child:not(:first-child),.join *:last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0}.join :where(.join-item:last-child:not(:first-child)),.join :where(*:last-child:not(:first-child) .join-item){border-start-end-radius:inherit;border-end-end-radius:inherit}@supports selector(:has(*)){:where(.join *:has(.join-item)){border-radius:inherit}}.menu{display:flex;flex-direction:column;flex-wrap:wrap;font-size:.875rem;line-height:1.25rem;padding:.5rem}.menu :where(li ul){position:relative;white-space:nowrap;margin-inline-start:1rem;padding-inline-start:.5rem}.menu :where(li:not(.menu-title)>*:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){display:grid;grid-auto-flow:column;align-content:flex-start;align-items:center;gap:.5rem;grid-auto-columns:minmax(auto,max-content) auto max-content;-webkit-user-select:none;-moz-user-select:none;user-select:none}.menu li.disabled{cursor:not-allowed;-webkit-user-select:none;-moz-user-select:none;user-select:none;color:var(--fallback-bc,oklch(var(--bc)/.3))}.menu :where(li>.menu-dropdown:not(.menu-dropdown-show)){display:none}:where(.menu li){position:relative;display:flex;flex-shrink:0;flex-direction:column;flex-wrap:wrap;align-items:stretch}:where(.menu li) .badge{justify-self:end}.modal{pointer-events:none;position:fixed;top:0;right:0;bottom:0;left:0;margin:0;display:grid;height:100%;max-height:none;width:100%;max-width:none;justify-items:center;padding:0;opacity:0;overscroll-behavior:contain;z-index:999;background-color:transparent;color:inherit;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity,visibility;overflow-y:hidden}:where(.modal){align-items:center}.modal-box{max-height:calc(100vh - 5em);grid-column-start:1;grid-row-start:1;width:91.666667%;max-width:32rem;--tw-scale-x: .9;--tw-scale-y: .9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box, 1rem);border-bottom-left-radius:var(--rounded-box, 1rem);border-top-left-radius:var(--rounded-box, 1rem);border-top-right-radius:var(--rounded-box, 1rem);--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding:1.5rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;box-shadow:#00000040 0 25px 50px -12px;overflow-y:auto;overscroll-behavior:contain}.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open]{pointer-events:auto;visibility:visible;opacity:1}.modal-action{display:flex;margin-top:1.5rem;justify-content:flex-end}:root:has(:is(.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open])){overflow:hidden;scrollbar-gutter:stable}.radio{flex-shrink:0;--chkbg: var(--bc);height:1.5rem;width:1.5rem;cursor:pointer;-webkit-appearance:none;-moz-appearance:none;appearance:none;border-radius:9999px;border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity: .2}.range{height:1.5rem;width:100%;cursor:pointer;-moz-appearance:none;appearance:none;-webkit-appearance:none;--range-shdw: var(--fallback-bc,oklch(var(--bc)/1));overflow:hidden;border-radius:var(--rounded-box, 1rem);background-color:transparent}.range:focus{outline:none}.textarea{min-height:3rem;flex-shrink:1;padding:.5rem 1rem;font-size:.875rem;line-height:1.25rem;line-height:2;border-radius:var(--rounded-btn, .5rem);border-width:1px;border-color:transparent;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.btm-nav>*.disabled,.btm-nav>*[disabled]{pointer-events:none;--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}.btm-nav>* .label{font-size:1rem;line-height:1.5rem}@media (prefers-reduced-motion: no-preference){.btn{animation:button-pop var(--animation-btn, .25s) ease-out}}.btn:active:hover,.btn:active:focus{animation:button-pop 0s ease-out;transform:scale(var(--btn-focus-scale, .97))}@supports not (color: oklch(0% 0 0)){.btn{background-color:var(--btn-color, var(--fallback-b2));border-color:var(--btn-color, var(--fallback-b2))}.btn-primary{--btn-color: var(--fallback-p)}.btn-neutral{--btn-color: var(--fallback-n)}}@supports (color: color-mix(in oklab,black,black)){.btn-active{background-color:color-mix(in oklab,oklch(var(--btn-color, var(--b3)) / var(--tw-bg-opacity, 1)) 90%,black);border-color:color-mix(in oklab,oklch(var(--btn-color, var(--b3)) / var(--tw-border-opacity, 1)) 90%,black)}.btn-outline.btn-primary.btn-active{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black)}.btn-outline.btn-secondary.btn-active{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,black)}.btn-outline.btn-accent.btn-active{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,black)}.btn-outline.btn-success.btn-active{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,black)}.btn-outline.btn-info.btn-active{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,black)}.btn-outline.btn-warning.btn-active{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,black)}.btn-outline.btn-error.btn-active{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,black)}}.btn:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px}.btn-primary{--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}@supports (color: oklch(0% 0 0)){.btn-primary{--btn-color: var(--p)}.btn-neutral{--btn-color: var(--n)}}.btn-neutral{--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));outline-color:var(--fallback-n,oklch(var(--n)/1))}.btn.glass{--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow);outline-color:currentColor}.btn.glass.btn-active{--glass-opacity: 25%;--glass-border-opacity: 15%}.btn-ghost{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow);outline-color:currentColor}.btn-ghost.btn-active{border-color:transparent;background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.btn-link.btn-active{border-color:transparent;background-color:transparent;text-decoration-line:underline}.btn-outline.btn-active{--tw-border-opacity: 1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}.btn-outline.btn-primary{--tw-text-opacity: 1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}.btn-outline.btn-primary.btn-active{--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.btn-outline.btn-secondary.btn-active{--tw-text-opacity: 1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.btn-outline.btn-accent.btn-active{--tw-text-opacity: 1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.btn-outline.btn-success.btn-active{--tw-text-opacity: 1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.btn-outline.btn-info.btn-active{--tw-text-opacity: 1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.btn-outline.btn-warning.btn-active{--tw-text-opacity: 1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.btn-outline.btn-error.btn-active{--tw-text-opacity: 1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.btn.btn-disabled,.btn[disabled],.btn:disabled{--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}.btn:is(input[type=checkbox]:checked),.btn:is(input[type=radio]:checked){--tw-border-opacity: 1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.btn:is(input[type=checkbox]:checked):focus-visible,.btn:is(input[type=radio]:checked):focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}@keyframes button-pop{0%{transform:scale(var(--btn-focus-scale, .98))}40%{transform:scale(1.02)}to{transform:scale(1)}}.checkbox:focus{box-shadow:none}.checkbox:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.checkbox:disabled{border-width:0px;cursor:not-allowed;border-color:transparent;--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.2}.checkbox:checked,.checkbox[aria-checked=true]{background-repeat:no-repeat;animation:checkmark var(--animation-input, .2s) ease-out;background-color:var(--chkbg);background-image:linear-gradient(-45deg,transparent 65%,var(--chkbg) 65.99%),linear-gradient(45deg,transparent 75%,var(--chkbg) 75.99%),linear-gradient(-45deg,var(--chkbg) 40%,transparent 40.99%),linear-gradient(45deg,var(--chkbg) 30%,var(--chkfg) 30.99%,var(--chkfg) 40%,transparent 40.99%),linear-gradient(-45deg,var(--chkfg) 50%,var(--chkbg) 50.99%)}.checkbox:indeterminate{--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-repeat:no-repeat;animation:checkmark var(--animation-input, .2s) ease-out;background-image:linear-gradient(90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(-90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(0deg,var(--chkbg) 43%,var(--chkfg) 43%,var(--chkfg) 57%,var(--chkbg) 57%)}@keyframes checkmark{0%{background-position-y:5px}50%{background-position-y:-2px}to{background-position-y:0}}details.collapse{width:100%}details.collapse summary{position:relative;display:block;outline:2px solid transparent;outline-offset:2px}details.collapse summary::-webkit-details-marker{display:none}.collapse:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse:has(.collapse-title:focus-visible),.collapse:has(>input[type=checkbox]:focus-visible),.collapse:has(>input[type=radio]:focus-visible){outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse-arrow>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;--tw-translate-y: -100%;--tw-rotate: 45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.15s;transition-duration:.2s;top:1.9rem;inset-inline-end:1.4rem;content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.collapse-plus>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.3s;top:.9rem;inset-inline-end:1.4rem;content:"+";pointer-events:none}.collapse:not(.collapse-open):not(.collapse-close)>input[type=checkbox],.collapse:not(.collapse-open):not(.collapse-close)>input[type=radio]:not(:checked),.collapse:not(.collapse-open):not(.collapse-close)>.collapse-title{cursor:pointer}.collapse:focus:not(.collapse-open):not(.collapse-close):not(.collapse[open])>.collapse-title{cursor:unset}.collapse-title{position:relative}:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){z-index:1}.collapse-title,:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){width:100%;padding:1rem;padding-inline-end:3rem;min-height:3.75rem;transition:background-color .2s ease-out}.collapse[open]>:where(.collapse-content),.collapse-open>:where(.collapse-content),.collapse:focus:not(.collapse-close)>:where(.collapse-content),.collapse:not(.collapse-close)>:where(input[type=checkbox]:checked~.collapse-content),.collapse:not(.collapse-close)>:where(input[type=radio]:checked~.collapse-content){padding-bottom:1rem;transition:padding .2s ease-out,background-color .2s ease-out}.collapse[open].collapse-arrow>.collapse-title:after,.collapse-open.collapse-arrow>.collapse-title:after,.collapse-arrow:focus:not(.collapse-close)>.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after{--tw-translate-y: -50%;--tw-rotate: 225deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.collapse[open].collapse-plus>.collapse-title:after,.collapse-open.collapse-plus>.collapse-title:after,.collapse-plus:focus:not(.collapse-close)>.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after{content:"−"}.drawer-toggle:checked~.drawer-side>.drawer-overlay{background-color:#0006}.drawer-toggle:focus-visible~.drawer-content label.drawer-button{outline-style:solid;outline-width:2px;outline-offset:2px}.dropdown.dropdown-open .dropdown-content,.dropdown:focus .dropdown-content,.dropdown:focus-within .dropdown-content{--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.input input{--tw-bg-opacity: 1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));background-color:transparent}.input input:focus{outline:2px solid transparent;outline-offset:2px}.input[list]::-webkit-calendar-picker-indicator{line-height:1em}.input-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:focus,.input:focus-within{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:has(>input[disabled]),.input-disabled,.input:disabled,.input[disabled]{cursor:not-allowed;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.input:has(>input[disabled])::-moz-placeholder,.input-disabled::-moz-placeholder,.input:disabled::-moz-placeholder,.input[disabled]::-moz-placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}.input:has(>input[disabled])::placeholder,.input-disabled::placeholder,.input:disabled::placeholder,.input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}.input:has(>input[disabled])>input[disabled]{cursor:not-allowed}.input::-webkit-date-and-time-value{text-align:inherit}.join>:where(*:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1)}.join-item:focus{isolation:isolate}.loading{pointer-events:none;display:inline-block;aspect-ratio:1 / 1;width:1.5rem;background-color:currentColor;-webkit-mask-size:100%;mask-size:100%;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-position:center;mask-position:center;-webkit-mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E");mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-dots{-webkit-mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E");mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E")}.loading-md{width:1.5rem}:where(.menu li:empty){--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;margin:.5rem 1rem;height:1px}.menu :where(li ul):before{position:absolute;bottom:.75rem;inset-inline-start:0px;top:.75rem;width:1px;--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;content:""}.menu :where(li:not(.menu-title)>*:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){border-radius:var(--rounded-btn, .5rem);padding:.5rem 1rem;text-align:start;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;text-wrap:balance}:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(summary,.active,.btn).focus,:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(summary,.active,.btn):focus,:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn).focus,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn):focus,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):is(summary):not(.active,.btn):focus-visible{cursor:pointer;background-color:var(--fallback-bc,oklch(var(--bc)/.1));--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));outline:2px solid transparent;outline-offset:2px}.menu li>*:not(ul,.menu-title,details,.btn):active,.menu li>*:not(ul,.menu-title,details,.btn).active,.menu li>details>summary:active{--tw-bg-opacity: 1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.menu :where(li>details>summary)::-webkit-details-marker{display:none}.menu :where(li>details>summary):after,.menu :where(li>.menu-dropdown-toggle):after{justify-self:end;display:block;margin-top:-.5rem;height:.5rem;width:.5rem;transform:rotate(45deg);transition-property:transform,margin-top;transition-duration:.3s;transition-timing-function:cubic-bezier(.4,0,.2,1);content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.menu :where(li>details[open]>summary):after,.menu :where(li>.menu-dropdown-toggle.menu-dropdown-show):after{transform:rotate(225deg);margin-top:0}.mockup-browser .mockup-browser-toolbar .input{position:relative;margin-left:auto;margin-right:auto;display:block;height:1.75rem;width:24rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:2rem;direction:ltr}.mockup-browser .mockup-browser-toolbar .input:before{content:"";position:absolute;left:.5rem;top:50%;aspect-ratio:1 / 1;height:.75rem;--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:2px;border-color:currentColor;opacity:.6}.mockup-browser .mockup-browser-toolbar .input:after{content:"";position:absolute;left:1.25rem;top:50%;height:.5rem;--tw-translate-y: 25%;--tw-rotate: -45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:1px;border-color:currentColor;opacity:.6}.modal:not(dialog:not(.modal-open)),.modal::backdrop{background-color:#0006;animation:modal-pop .2s ease-out}.modal-open .modal-box,.modal-toggle:checked+.modal .modal-box,.modal:target .modal-box,.modal[open] .modal-box{--tw-translate-y: 0px;--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.modal-action>:not([hidden])~:not([hidden]){--tw-space-x-reverse: 0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}@keyframes modal-pop{0%{opacity:0}}@keyframes progress-loading{50%{background-position-x:-115%}}.radio:focus{box-shadow:none}.radio:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.radio:checked,.radio[aria-checked=true]{--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-image:none;animation:radiomark var(--animation-input, .2s) ease-out;box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}.radio:disabled{cursor:not-allowed;opacity:.2}@keyframes radiomark{0%{box-shadow:0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset}50%{box-shadow:0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset}to{box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}}.range:focus-visible::-webkit-slider-thumb{--focus-shadow: 0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset, 0 0 0 2rem var(--range-shdw) inset}.range:focus-visible::-moz-range-thumb{--focus-shadow: 0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset, 0 0 0 2rem var(--range-shdw) inset}.range::-webkit-slider-runnable-track{height:.5rem;width:100%;border-radius:var(--rounded-box, 1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-moz-range-track{height:.5rem;width:100%;border-radius:var(--rounded-box, 1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-webkit-slider-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box, 1rem);border-style:none;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));-moz-appearance:none;appearance:none;-webkit-appearance:none;top:50%;color:var(--range-shdw);transform:translateY(-50%);--filler-size: 100rem;--filler-offset: .6rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow, 0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range::-moz-range-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box, 1rem);border-style:none;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));top:50%;color:var(--range-shdw);--filler-size: 100rem;--filler-offset: .5rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow, 0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}@keyframes rating-pop{0%{transform:translateY(-.125em)}40%{transform:translateY(-.125em)}to{transform:translateY(0)}}@keyframes skeleton{0%{background-position:150%}to{background-position:-50%}}.textarea-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea-disabled,.textarea:disabled,.textarea[disabled]{cursor:not-allowed;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.textarea-disabled::-moz-placeholder,.textarea:disabled::-moz-placeholder,.textarea[disabled]::-moz-placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}.textarea-disabled::placeholder,.textarea:disabled::placeholder,.textarea[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}@keyframes toast-pop{0%{transform:scale(.9);opacity:0}to{transform:scale(1);opacity:1}}.glass,.glass.btn-active{border:none;-webkit-backdrop-filter:blur(var(--glass-blur, 40px));backdrop-filter:blur(var(--glass-blur, 40px));background-color:transparent;background-image:linear-gradient(135deg,rgb(255 255 255 / var(--glass-opacity, 30%)),#0000),linear-gradient(var(--glass-reflex-degree, 100deg),rgb(255 255 255 / var(--glass-reflex-opacity, 10%)) 25%,rgb(0 0 0 / 0%) 25%);box-shadow:0 0 0 1px rgb(255 255 255 / var(--glass-border-opacity, 10%)) inset,0 0 0 2px #0000000d;text-shadow:0 1px rgb(0 0 0 / var(--glass-text-shadow-opacity, 5%))}@media (hover: hover){.glass.btn-active{border:none;-webkit-backdrop-filter:blur(var(--glass-blur, 40px));backdrop-filter:blur(var(--glass-blur, 40px));background-color:transparent;background-image:linear-gradient(135deg,rgb(255 255 255 / var(--glass-opacity, 30%)),#0000),linear-gradient(var(--glass-reflex-degree, 100deg),rgb(255 255 255 / var(--glass-reflex-opacity, 10%)) 25%,rgb(0 0 0 / 0%) 25%);box-shadow:0 0 0 1px rgb(255 255 255 / var(--glass-border-opacity, 10%)) inset,0 0 0 2px #0000000d;text-shadow:0 1px rgb(0 0 0 / var(--glass-text-shadow-opacity, 5%))}}.btn-sm{height:2rem;min-height:2rem;padding-left:.75rem;padding-right:.75rem;font-size:.875rem}.btn-block{width:100%}.btn-square:where(.btn-sm){height:2rem;width:2rem;padding:0}.btn-circle:where(.btn-sm){height:2rem;width:2rem;border-radius:9999px;padding:0}.drawer-open>.drawer-toggle{display:none}.drawer-open>.drawer-toggle~.drawer-side{pointer-events:auto;visibility:visible;position:sticky;display:block;width:auto;overscroll-behavior:auto}.drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}[dir=rtl] .drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.drawer-open>.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible}.drawer-open>.drawer-side{overflow-y:auto}html:has(.drawer-toggle:checked){overflow-y:hidden;scrollbar-gutter:stable}.join.join-vertical .join-item:first-child:not(:last-child),.join.join-vertical *:first-child:not(:last-child) .join-item{border-end-start-radius:0;border-end-end-radius:0;border-start-start-radius:inherit;border-start-end-radius:inherit}.join.join-vertical .join-item:last-child:not(:first-child),.join.join-vertical *:last-child:not(:first-child) .join-item{border-start-start-radius:0;border-start-end-radius:0;border-end-start-radius:inherit;border-end-end-radius:inherit}.join.join-horizontal .join-item:first-child:not(:last-child),.join.join-horizontal *:first-child:not(:last-child) .join-item{border-end-end-radius:0;border-start-end-radius:0;border-end-start-radius:inherit;border-start-start-radius:inherit}.join.join-horizontal .join-item:last-child:not(:first-child),.join.join-horizontal *:last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0;border-end-end-radius:inherit;border-start-end-radius:inherit}.drawer-open>.drawer-toggle~.drawer-side>.drawer-overlay{cursor:default;background-color:transparent}.join.join-vertical>:where(*:not(:first-child)):is(.btn){margin-top:calc(var(--border-btn) * -1)}.join.join-horizontal>:where(*:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1);margin-top:0}.modal-top :where(.modal-box){width:100%;max-width:none;--tw-translate-y: -2.5rem;--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box, 1rem);border-bottom-left-radius:var(--rounded-box, 1rem);border-top-left-radius:0;border-top-right-radius:0}.modal-middle :where(.modal-box){width:91.666667%;max-width:32rem;--tw-translate-y: 0px;--tw-scale-x: .9;--tw-scale-y: .9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-top-left-radius:var(--rounded-box, 1rem);border-top-right-radius:var(--rounded-box, 1rem);border-bottom-right-radius:var(--rounded-box, 1rem);border-bottom-left-radius:var(--rounded-box, 1rem)}.modal-bottom :where(.modal-box){width:100%;max-width:none;--tw-translate-y: 2.5rem;--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-top-left-radius:var(--rounded-box, 1rem);border-top-right-radius:var(--rounded-box, 1rem);border-bottom-right-radius:0;border-bottom-left-radius:0}.collapse{visibility:collapse}.relative{position:relative}.sticky{position:sticky}.top-4{top:1rem}.z-10{z-index:10}.z-50{z-index:50}.z-\[1\]{z-index:1}.m-1{margin:.25rem}.mx-4{margin-left:1rem;margin-right:1rem}.mx-auto{margin-left:auto;margin-right:auto}.my-4{margin-top:1rem;margin-bottom:1rem}.mb-2{margin-bottom:.5rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.mr-2{margin-right:.5rem}.mt-2{margin-top:.5rem}.mt-4{margin-top:1rem}.mt-6{margin-top:1.5rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.inline{display:inline}.flex{display:flex}.h-0{height:0px}.h-24{height:6rem}.h-80{height:20rem}.h-\[calc\(90vh-12rem\)\]{height:calc(90vh - 12rem)}.h-screen{height:100vh}.min-h-full{min-height:100%}.w-52{width:13rem}.w-64{width:16rem}.w-\[calc\(90vw-8em\)\]{width:calc(90vw - 8em)}.w-full{width:100%}.w-screen{width:100vw}.max-w-\[calc\(100vw-2em\)\]{max-width:calc(100vw - 2em)}.grow{flex-grow:1}.flex-row{flex-direction:row}.flex-col{flex-direction:column}.items-center{align-items:center}.justify-start{justify-content:flex-start}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-2{gap:.5rem}.overflow-visible{overflow:visible}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.rounded-box{border-radius:var(--rounded-box, 1rem)}.bg-base-100{--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity, 1)))}.bg-base-200{--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity, 1)))}.bg-base-300{--tw-bg-opacity: 1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity, 1)))}.p-2{padding:.5rem}.px-4{padding-left:1rem;padding-right:1rem}.py-4{padding-top:1rem;padding-bottom:1rem}.text-center{text-align:center}.text-right{text-align:right}.text-2xl{font-size:1.5rem;line-height:2rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-normal{font-weight:400}.text-base-content{--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity, 1)))}.text-error{--tw-text-opacity: 1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity, 1)))}.underline{text-decoration-line:underline}.opacity-0{opacity:0}.opacity-40{opacity:.4}.shadow{--tw-shadow: 0 1px 3px 0 rgb(0 0 0 / .1), 0 1px 2px -1px rgb(0 0 0 / .1);--tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-2xl{--tw-shadow: 0 25px 50px -12px rgb(0 0 0 / .25);--tw-shadow-colored: 0 25px 50px -12px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.markdown h1,.markdown h2,.markdown h3,.markdown h4,.markdown h5,.markdown h6,.markdown ul,.markdown ol,.markdown li{all:revert}.markdown pre{white-space:pre-wrap}.markdown pre{border-radius:.5rem}.markdown pre{padding:.5rem}.markdown pre{border:1px solid currentColor}@media (min-width: 768px){.show-on-hover{opacity:0}.group:hover .show-on-hover{opacity:1}}.btn-mini{cursor:pointer}.btn-mini:hover{--tw-shadow: 0 4px 6px -1px rgb(0 0 0 / .1), 0 2px 4px -2px rgb(0 0 0 / .1);--tw-shadow-colored: 0 4px 6px -1px var(--tw-shadow-color), 0 2px 4px -2px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.chat-screen{max-width:900px}.chat-bubble-base-300{--tw-bg-opacity: 1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity, 1)));--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity, 1)))}@media (min-width: 1024px){.lg\:drawer-open>.drawer-toggle{display:none}.lg\:drawer-open>.drawer-toggle~.drawer-side{pointer-events:auto;visibility:visible;position:sticky;display:block;width:auto;overscroll-behavior:auto}.lg\:drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}[dir=rtl] .lg\:drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.lg\:drawer-open>.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible}.lg\:drawer-open>.drawer-side{overflow-y:auto}html:has(.lg\:drawer-open.lg\:drawer-open){overflow-y:auto;scrollbar-gutter:auto}.lg\:drawer-open>.drawer-toggle~.drawer-side>.drawer-overlay{cursor:default;background-color:transparent}.lg\:hidden{display:none}.lg\:h-screen{height:100vh}.lg\:w-96{width:24rem}.lg\:max-w-64{max-width:16rem}}</style>
 </head>
 
 <body>
-  <div id="app" class="flex flex-row opacity-0"> <!-- opacity-0 will be removed on app mounted -->
-    <!-- sidebar -->
-    <div class="flex flex-col bg-black bg-opacity-5 w-64 py-8 px-4 h-screen overflow-y-auto">
-      <h2 class="font-bold mb-4 ml-4">Conversations</h2>
+  <div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
+    <div class="flex flex-row drawer lg:drawer-open">
+      <input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
 
-      <!-- list of conversations -->
-      <div :class="{
-        'btn btn-ghost justify-start': true,
-        'btn-active': messages.length === 0,
-      }" @click="newConversation">
-        + New conversation
-      </div>
-      <div v-for="conv in conversations" :class="{
-        'btn btn-ghost justify-start font-normal': true,
-        'btn-active': conv.id === viewingConvId,
-      }" @click="setViewingConv(conv.id)">
-        <span class="truncate">{{ conv.messages[0].content }}</span>
-      </div>
-      <div class="text-center text-xs opacity-40 mt-auto mx-4">
-        Conversations are saved to browser's localStorage
-      </div>
-    </div>
+      <!-- sidebar -->
+      <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
+        <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
+        <div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
+          <div class="flex flex-row items-center justify-between mb-4 mt-4">
+            <h2 class="font-bold ml-4">Conversations</h2>
 
-    <div class="chat-screen flex flex-col w-screen h-screen px-8 mx-auto">
-      <!-- header -->
-      <div class="flex flex-row items-center">
-        <div class="grow text-2xl font-bold mt-8 mb-6">
-          🦙 llama.cpp - chat
-        </div>
-
-        <!-- action buttons (top right) -->
-        <div class="flex items-center">
-          <button v-if="messages.length > 0" class="btn mr-1" @click="deleteConv(viewingConvId)" :disabled="isGenerating">
-            <!-- delete conversation button -->
-            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-trash" viewBox="0 0 16 16">
-              <path d="M5.5 5.5A.5.5 0 0 1 6 6v6a.5.5 0 0 1-1 0V6a.5.5 0 0 1 .5-.5m2.5 0a.5.5 0 0 1 .5.5v6a.5.5 0 0 1-1 0V6a.5.5 0 0 1 .5-.5m3 .5a.5.5 0 0 0-1 0v6a.5.5 0 0 0 1 0z"/>
-              <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
-            </svg>
-          </button>
-          <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
-              <!-- download conversation button -->
-              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
-                  <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
-                  <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
-            </svg>
-          </button>
-          <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
-            <!-- edit config button -->
-            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
-              <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
-              <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
-            </svg>
-          </button>
-
-          <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
-          <div class="dropdown dropdown-end dropdown-bottom">
-            <div tabindex="0" role="button" class="btn m-1">
-              Theme
-              <svg width="12px" height="12px" class="inline-block h-2 w-2 fill-current opacity-60" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 2048 2048">
-                <path d="M1799 349l242 241-1017 1017L7 590l242-241 775 775 775-775z"></path>
+            <!-- close sidebar button -->
+            <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
+                <path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
               </svg>
-            </div>
-            <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
-              <li>
-                <button
-                  class="btn btn-sm btn-block w-full btn-ghost justify-start"
-                  :class="{ 'btn-active': selectedTheme === 'auto' }"
-                  @click="setSelectedTheme('auto')">
-                  auto
-                </button>
-              </li>
-              <li v-for="theme in themes">
-                <input
-                  type="radio"
-                  name="theme-dropdown"
-                  class="theme-controller btn btn-sm btn-block w-full btn-ghost justify-start"
-                  :aria-label="theme"
-                  :value="theme"
-                  :checked="selectedTheme === theme"
-                  @click="setSelectedTheme(theme)" />
-              </li>
-            </ul>
+            </label>
           </div>
-        </div>
-      </div>
 
-      <!-- chat messages -->
-      <div id="messages-list" class="flex flex-col grow overflow-y-auto">
-        <div class="mt-auto flex justify-center">
-          <!-- placeholder to shift the message to the bottom -->
-           {{ messages.length === 0 ? 'Send a message to start' : '' }}
-        </div>
-        <div v-for="msg in messages" class="group">
+          <!-- list of conversations -->
           <div :class="{
-            'chat': true,
-            'chat-start': msg.role !== 'user',
-            'chat-end': msg.role === 'user',
-          }">
+            'btn btn-ghost justify-start': true,
+            'btn-active': messages.length === 0,
+          }" @click="newConversation">
+            + New conversation
+          </div>
+          <div v-for="conv in conversations" :class="{
+            'btn btn-ghost justify-start font-normal': true,
+            'btn-active': conv.id === viewingConvId,
+          }" @click="setViewingConv(conv.id)">
+            <span class="truncate">{{ conv.messages[0].content }}</span>
+          </div>
+          <div class="text-center text-xs opacity-40 mt-auto mx-4">
+            Conversations are saved to browser's localStorage
+          </div>
+        </div>
+      </div>
+
+      <!-- main view -->
+      <div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
+        <!-- header -->
+        <div class="flex flex-row items-center mt-6 mb-6">
+          <!-- open sidebar button -->
+          <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
+            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
+              <path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
+            </svg>
+          </label>
+
+          <div class="grow text-2xl font-bold ml-2">llama.cpp</div>
+
+          <!-- action buttons (top right) -->
+          <div class="flex items-center">
+            <div v-if="messages.length > 0" class="dropdown dropdown-end">
+              <!-- "more" button -->
+              <button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
+                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
+                  <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
+                </svg>
+              </button>
+              <!-- "more" dropdown menu -->
+              <ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
+                <li @click="downloadConv(viewingConvId)"><a>Download</a></li>
+                <li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
+              </ul>
+            </div>
+            <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
+              <!-- settings button -->
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
+                <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
+                <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
+              </svg>
+            </button>
+
+            <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
+            <div class="dropdown dropdown-end dropdown-bottom">
+              <div tabindex="0" role="button" class="btn m-1">
+                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
+                  <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
+                </svg>
+              </div>
+              <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
+                <li>
+                  <button
+                    class="btn btn-sm btn-block btn-ghost justify-start"
+                    :class="{ 'btn-active': selectedTheme === 'auto' }"
+                    @click="setSelectedTheme('auto')">
+                    auto
+                  </button>
+                </li>
+                <li v-for="theme in themes">
+                  <input
+                    type="radio"
+                    name="theme-dropdown"
+                    class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
+                    :aria-label="theme"
+                    :value="theme"
+                    :checked="selectedTheme === theme"
+                    @click="setSelectedTheme(theme)" />
+                </li>
+              </ul>
+            </div>
+          </div>
+        </div>
+
+        <!-- chat messages -->
+        <div id="messages-list" class="flex flex-col grow overflow-y-auto">
+          <div class="mt-auto flex justify-center">
+            <!-- placeholder to shift the message to the bottom -->
+            {{ messages.length === 0 ? 'Send a message to start' : '' }}
+          </div>
+          <div v-for="msg in messages" class="group">
             <div :class="{
-              'chat-bubble markdown': true,
-              'chat-bubble-base-300': msg.role !== 'user',
+              'chat': true,
+              'chat-start': msg.role !== 'user',
+              'chat-end': msg.role === 'user',
             }">
-              <!-- textarea for editing message -->
-              <template v-if="editingMsg && editingMsg.id === msg.id">
-                <textarea
-                  class="textarea textarea-bordered bg-base-100 text-base-content w-96"
-                  v-model="msg.content"></textarea>
-                <br/>
-                <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
-                <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
-              </template>
-              <!-- render message as markdown -->
-              <vue-markdown v-else :source="msg.content" />
+              <div :class="{
+                'chat-bubble markdown': true,
+                'chat-bubble-base-300': msg.role !== 'user',
+              }">
+                <!-- textarea for editing message -->
+                <template v-if="editingMsg && editingMsg.id === msg.id">
+                  <textarea
+                    class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
+                    v-model="msg.content"></textarea>
+                  <br/>
+                  <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
+                  <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
+                </template>
+                <!-- render message as markdown -->
+                <vue-markdown v-else :source="msg.content" />
+              </div>
+            </div>
+
+            <!-- actions for each message -->
+            <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
+              <!-- user message -->
+              <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
+                ✍️ Edit
+              </button>
+              <!-- assistant message -->
+              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+                🔄 Regenerate
+              </button>
+              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
+                📋 Copy
+              </button>
             </div>
           </div>
 
-          <!-- actions for each message -->
-          <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
-            <!-- user message -->
-            <button v-if="msg.role === 'user'" class="badge btn-minishow-on-hover " @click="editingMsg = msg" :disabled="isGenerating">
-              ✍️ Edit
-            </button>
-            <!-- assistant message -->
-            <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
-              🔄 Regenerate
-            </button>
-            <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
-              📋 Copy
-            </button>
+          <!-- pending (ongoing) assistant message -->
+          <div id="pending-msg" class="chat chat-start">
+            <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
+              <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
+              <vue-markdown v-else :source="pendingMsg.content" />
+            </div>
           </div>
         </div>
 
-        <!-- pending (ongoing) assistant message -->
-        <div id="pending-msg" class="chat chat-start">
-          <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
-            <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
-            <vue-markdown v-else :source="pendingMsg.content" />
-          </div>
+        <!-- chat input -->
+        <div class="flex flex-row items-center mt-8 mb-6">
+          <textarea
+            class="textarea textarea-bordered w-full"
+            placeholder="Type a message (Shift+Enter to add a new line)"
+            v-model="inputMsg"
+            @keydown.enter.exact.prevent="sendMessage"
+            @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
+            :disabled="isGenerating"
+            id="msg-input"
+          ></textarea>
+          <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
+          <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
         </div>
       </div>
 
-      <!-- chat input -->
-      <div class="flex flex-row items-center mt-8 mb-6">
-        <textarea
-          class="textarea textarea-bordered w-full"
-          placeholder="Type a message (Shift+Enter to add a new line)"
-          v-model="inputMsg"
-          @keydown.enter.exact.prevent="sendMessage"
-          @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
-          :disabled="isGenerating"
-          id="msg-input"
-        ></textarea>
-        <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
-        <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
-      </div>
     </div>
 
+
     <!-- modal for editing config -->
     <dialog class="modal" :class="{'modal-open': showConfigDialog}">
       <div class="modal-box">
@@ -211,7 +283,7 @@
             <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
           </label>
           <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
-            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
           </template>
           <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
           <!-- Section: Other sampler settings -->
@@ -222,7 +294,7 @@
               <settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
               <!-- Samplers -->
               <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
-                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
               </template>
             </div>
           </details>
@@ -231,7 +303,7 @@
             <summary class="collapse-title font-bold">Penalties settings</summary>
             <div class="collapse-content">
               <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
-                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
               </template>
             </div>
           </details>
@@ -252,10 +324,11 @@
         <div class="modal-action">
           <button class="btn" @click="resetConfigDialog">Reset to default</button>
           <button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
-          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save and close</button>
+          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
         </div>
       </div>
     </dialog>
+
   </div>
 
   <!-- Template to be used by settings modal -->
@@ -273,458 +346,6 @@
     </label>
   </template>
 
-  <script src="./deps_markdown-it.js"></script>
-  <script type="module">
-    import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
-    import { llama } from './completion.js';
-
-    // utility functions
-    const isString = (x) => !!x.toLowerCase;
-    const isNumeric = (n) => !isString(n) && !isNaN(n);
-    const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
-    const copyStr = (str) => navigator.clipboard.writeText(str);
-
-    // constants
-    const BASE_URL = localStorage.getItem('base') // for debugging
-      || (new URL('.', document.baseURI).href).toString(); // for production
-    const CONFIG_DEFAULT = {
-      // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
-      apiKey: '',
-      systemMessage: 'You are a helpful assistant.',
-      // make sure these default values are in sync with `common.h`
-      samplers: 'dkypmxt',
-      temperature: 0.8,
-      dynatemp_range: 0.0,
-      dynatemp_exponent: 1.0,
-      top_k: 40,
-      top_p: 0.95,
-      min_p: 0.05,
-      xtc_probability: 0.0,
-      xtc_threshold: 0.1,
-      typical_p: 1.0,
-      repeat_last_n: 64,
-      repeat_penalty: 1.0,
-      presence_penalty: 0.0,
-      frequency_penalty: 0.0,
-      dry_multiplier: 0.0,
-      dry_base: 1.75,
-      dry_allowed_length: 2,
-      dry_penalty_last_n: -1,
-      max_tokens: -1,
-      custom: '', // custom json-stringified object
-    };
-    const CONFIG_INFO = {
-      apiKey: 'Set the API Key if you are using --api-key option for the server.',
-      systemMessage: 'The starting message that defines how model should behave.',
-      samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
-      temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
-      dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
-      dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
-      top_k: 'Keeps only k top tokens.',
-      top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
-      min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
-      xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
-      xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
-      typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
-      repeat_last_n: 'Last n tokens to consider for penalizing repetition',
-      repeat_penalty: 'Controls the repetition of token sequences in the generated text',
-      presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
-      frequency_penalty: 'Limits tokens based on how often they appear in the output.',
-      dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
-      dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
-      dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
-      dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
-      max_tokens: 'The maximum number of token per output.',
-      custom: '', // custom json-stringified object
-    };
-    // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
-    const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
-    // list of themes supported by daisyui
-    const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
-
-    // markdown support
-    const VueMarkdown = defineComponent(
-      (props) => {
-        const md = shallowRef(new markdownit({ breaks: true }));
-        const origFenchRenderer = md.value.renderer.rules.fence;
-        md.value.renderer.rules.fence = (tokens, idx, ...args) => {
-          const content = tokens[idx].content;
-          const origRendered = origFenchRenderer(tokens, idx, ...args);
-          return `<div class="relative my-4">
-            <div class="text-right sticky top-4 mb-2 mr-2 h-0">
-              <button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
-            </div>
-            ${origRendered}
-          </div>`;
-        };
-        window.copyStr = copyStr;
-        const content = computed(() => md.value.render(props.source));
-        return () => h("div", { innerHTML: content.value });
-      },
-      { props: ["source"] }
-    );
-
-    // input field to be used by settings modal
-    const SettingsModalShortInput = defineComponent({
-      template: document.getElementById('settings-modal-short-input').innerHTML,
-      props: {
-        label: { type: String, required: false },
-        configKey: String,
-        configDefault: Object,
-        configInfo: Object,
-        modelValue: [Object, String, Number],
-      },
-    });
-
-    // coversations is stored in localStorage
-    // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
-    // convId is a string prefixed with 'conv-'
-    const StorageUtils = {
-      // manage conversations
-      getAllConversations() {
-        const res = [];
-        for (const key in localStorage) {
-          if (key.startsWith('conv-')) {
-            res.push(JSON.parse(localStorage.getItem(key)));
-          }
-        }
-        res.sort((a, b) => b.lastModified - a.lastModified);
-        return res;
-      },
-      // can return null if convId does not exist
-      getOneConversation(convId) {
-        return JSON.parse(localStorage.getItem(convId) || 'null');
-      },
-      // if convId does not exist, create one
-      appendMsg(convId, msg) {
-        if (msg.content === null) return;
-        const conv = StorageUtils.getOneConversation(convId) || {
-          id: convId,
-          lastModified: Date.now(),
-          messages: [],
-        };
-        conv.messages.push(msg);
-        conv.lastModified = Date.now();
-        localStorage.setItem(convId, JSON.stringify(conv));
-      },
-      getNewConvId() {
-        return `conv-${Date.now()}`;
-      },
-      remove(convId) {
-        localStorage.removeItem(convId);
-      },
-      filterAndKeepMsgs(convId, predicate) {
-        const conv = StorageUtils.getOneConversation(convId);
-        if (!conv) return;
-        conv.messages = conv.messages.filter(predicate);
-        conv.lastModified = Date.now();
-        localStorage.setItem(convId, JSON.stringify(conv));
-      },
-      popMsg(convId) {
-        const conv = StorageUtils.getOneConversation(convId);
-        if (!conv) return;
-        const msg = conv.messages.pop();
-        conv.lastModified = Date.now();
-        if (conv.messages.length === 0) {
-          StorageUtils.remove(convId);
-        } else {
-          localStorage.setItem(convId, JSON.stringify(conv));
-        }
-        return msg;
-      },
-
-      // manage config
-      getConfig() {
-        const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
-        // to prevent breaking changes in the future, we always provide default value for missing keys
-        return {
-          ...CONFIG_DEFAULT,
-          ...savedVal,
-        };
-      },
-      setConfig(config) {
-        localStorage.setItem('config', JSON.stringify(config));
-      },
-      getTheme() {
-        return localStorage.getItem('theme') || 'auto';
-      },
-      setTheme(theme) {
-        if (theme === 'auto') {
-          localStorage.removeItem('theme');
-        } else {
-          localStorage.setItem('theme', theme);
-        }
-      },
-    };
-
-    // scroll to bottom of chat messages
-    // if requiresNearBottom is true, only auto-scroll if user is near bottom
-    const chatScrollToBottom = (requiresNearBottom) => {
-      const msgListElem = document.getElementById('messages-list');
-      const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
-      if (!requiresNearBottom || (spaceToBottom < 100)) {
-        setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
-      }
-    };
-
-    const mainApp = createApp({
-      components: {
-        VueMarkdown,
-        SettingsModalShortInput,
-      },
-      data() {
-        return {
-          conversations: StorageUtils.getAllConversations(),
-          messages: [], // { id: number, role: 'user' | 'assistant', content: string }
-          viewingConvId: StorageUtils.getNewConvId(),
-          inputMsg: '',
-          isGenerating: false,
-          pendingMsg: null, // the on-going message from assistant
-          stopGeneration: () => {},
-          selectedTheme: StorageUtils.getTheme(),
-          config: StorageUtils.getConfig(),
-          showConfigDialog: false,
-          editingMsg: null,
-          // const
-          themes: THEMES,
-          configDefault: {...CONFIG_DEFAULT},
-          configInfo: {...CONFIG_INFO},
-        }
-      },
-      computed: {},
-      mounted() {
-        document.getElementById('app').classList.remove('opacity-0'); // show app
-        // scroll to the bottom when the pending message height is updated
-        const pendingMsgElem = document.getElementById('pending-msg');
-        const resizeObserver = new ResizeObserver(() => {
-          if (this.isGenerating) chatScrollToBottom(true);
-        });
-        resizeObserver.observe(pendingMsgElem);
-      },
-      methods: {
-        setSelectedTheme(theme) {
-          this.selectedTheme = theme;
-          StorageUtils.setTheme(theme);
-        },
-        newConversation() {
-          if (this.isGenerating) return;
-          this.viewingConvId = StorageUtils.getNewConvId();
-          this.editingMsg = null;
-          this.fetchMessages();
-          chatScrollToBottom();
-        },
-        setViewingConv(convId) {
-          if (this.isGenerating) return;
-          this.viewingConvId = convId;
-          this.editingMsg = null;
-          this.fetchMessages();
-          chatScrollToBottom();
-        },
-        deleteConv(convId) {
-          if (this.isGenerating) return;
-          if (window.confirm('Are you sure to delete this conversation?')) {
-            StorageUtils.remove(convId);
-            if (this.viewingConvId === convId) {
-              this.viewingConvId = StorageUtils.getNewConvId();
-              this.editingMsg = null;
-            }
-            this.fetchConversation();
-            this.fetchMessages();
-          }
-        },
-        downloadConv(convId) {
-          const conversation = StorageUtils.getOneConversation(convId);
-          if (!conversation) {
-            alert('Conversation not found.');
-            return;
-          }
-          const conversationJson = JSON.stringify(conversation, null, 2);
-          const blob = new Blob([conversationJson], { type: 'application/json' });
-          const url = URL.createObjectURL(blob);
-          const a = document.createElement('a');
-          a.href = url;
-          a.download = `conversation_${convId}.json`;
-          document.body.appendChild(a);
-          a.click();
-          document.body.removeChild(a);
-          URL.revokeObjectURL(url);
-        },
-        async sendMessage() {
-          if (!this.inputMsg) return;
-          const currConvId = this.viewingConvId;
-
-          StorageUtils.appendMsg(currConvId, {
-            id: Date.now(),
-            role: 'user',
-            content: this.inputMsg,
-          });
-          this.fetchConversation();
-          this.fetchMessages();
-          this.inputMsg = '';
-          this.editingMsg = null;
-          this.generateMessage(currConvId);
-          chatScrollToBottom();
-        },
-        async generateMessage(currConvId) {
-          if (this.isGenerating) return;
-          this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
-          this.isGenerating = true;
-          this.editingMsg = null;
-
-          try {
-            const abortController = new AbortController();
-            this.stopGeneration = () => abortController.abort();
-            const params = {
-              messages: [
-                { role: 'system', content: this.config.systemMessage },
-                ...this.messages,
-              ],
-              stream: true,
-              cache_prompt: true,
-              samplers: this.config.samplers,
-              temperature: this.config.temperature,
-              dynatemp_range: this.config.dynatemp_range,
-              dynatemp_exponent: this.config.dynatemp_exponent,
-              top_k: this.config.top_k,
-              top_p: this.config.top_p,
-              min_p: this.config.min_p,
-              typical_p: this.config.typical_p,
-              xtc_probability: this.config.xtc_probability,
-              xtc_threshold: this.config.xtc_threshold,
-              repeat_last_n: this.config.repeat_last_n,
-              repeat_penalty: this.config.repeat_penalty,
-              presence_penalty: this.config.presence_penalty,
-              frequency_penalty: this.config.frequency_penalty,
-              dry_multiplier: this.config.dry_multiplier,
-              dry_base: this.config.dry_base,
-              dry_allowed_length: this.config.dry_allowed_length,
-              dry_penalty_last_n: this.config.dry_penalty_last_n,
-              max_tokens: this.config.max_tokens,
-              ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
-              ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
-            };
-            const config = {
-              controller: abortController,
-              api_url: BASE_URL,
-              endpoint: '/chat/completions',
-            };
-            for await (const chunk of llama(prompt, params, config)) {
-              const stop = chunk.data.stop;
-              const addedContent = chunk.data.choices[0].delta.content;
-              const lastContent = this.pendingMsg.content || '';
-              if (addedContent) {
-                this.pendingMsg = {
-                  id: this.pendingMsg.id,
-                  role: 'assistant',
-                  content: lastContent + addedContent,
-                };
-              }
-            }
-
-            StorageUtils.appendMsg(currConvId, this.pendingMsg);
-            this.fetchConversation();
-            this.fetchMessages();
-            setTimeout(() => document.getElementById('msg-input').focus(), 1);
-          } catch (error) {
-            if (error.name === 'AbortError') {
-              // user stopped the generation via stopGeneration() function
-              StorageUtils.appendMsg(currConvId, this.pendingMsg);
-              this.fetchConversation();
-              this.fetchMessages();
-            } else {
-              console.error(error);
-              alert(error);
-              // pop last user message
-              const lastUserMsg = StorageUtils.popMsg(currConvId);
-              this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
-            }
-          }
-
-          this.pendingMsg = null;
-          this.isGenerating = false;
-          this.stopGeneration = () => {};
-          this.fetchMessages();
-          chatScrollToBottom();
-        },
-
-        // message actions
-        regenerateMsg(msg) {
-          if (this.isGenerating) return;
-          // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
-          const currConvId = this.viewingConvId;
-          StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
-          this.fetchConversation();
-          this.fetchMessages();
-          this.generateMessage(currConvId);
-        },
-        copyMsg(msg) {
-          copyStr(msg.content);
-        },
-        editUserMsgAndRegenerate(msg) {
-          if (this.isGenerating) return;
-          const currConvId = this.viewingConvId;
-          const newContent = msg.content;
-          this.editingMsg = null;
-          StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
-          StorageUtils.appendMsg(currConvId, {
-            id: Date.now(),
-            role: 'user',
-            content: newContent,
-          });
-          this.fetchConversation();
-          this.fetchMessages();
-          this.generateMessage(currConvId);
-        },
-
-        // settings dialog methods
-        closeAndSaveConfigDialog() {
-          try {
-            if (this.config.custom.length) JSON.parse(this.config.custom);
-          } catch (error) {
-            alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
-            return;
-          }
-          for (const key of CONFIG_NUMERIC_KEYS) {
-            if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
-              alert(`Invalid number for ${key} (expected an integer or a float)`);
-              return;
-            }
-            this.config[key] = parseFloat(this.config[key]);
-          }
-          this.showConfigDialog = false;
-          StorageUtils.setConfig(this.config);
-        },
-        closeAndDiscardConfigDialog() {
-          this.showConfigDialog = false;
-          this.config = StorageUtils.getConfig();
-        },
-        resetConfigDialog() {
-          if (window.confirm('Are you sure to reset all settings?')) {
-            this.config = {...CONFIG_DEFAULT};
-          }
-        },
-
-        // sync state functions
-        fetchConversation() {
-          this.conversations = StorageUtils.getAllConversations();
-        },
-        fetchMessages() {
-          this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
-        },
-      },
-    });
-    mainApp.config.errorHandler = alert;
-    try {
-      mainApp.mount('#app');
-    } catch (err) {
-      console.error(err);
-      document.getElementById('app').innerHTML = `<div style="margin:2em auto">
-        Failed to start app. Please try clearing localStorage and try again.<br/>
-        <br/>
-        <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
-      </div>`;
-    }
-  </script>
 </body>
 
 </html>
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2ec13d7d2..9bca3f30e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -16,12 +16,7 @@
 
 // auto generated files (update with ./deps.sh)
 #include "index.html.hpp"
-#include "completion.js.hpp"
 #include "loading.html.hpp"
-#include "deps_daisyui.min.css.hpp"
-#include "deps_markdown-it.js.hpp"
-#include "deps_tailwindcss.js.hpp"
-#include "deps_vue.esm-browser.js.hpp"
 
 #include <atomic>
 #include <condition_variable>
@@ -103,12 +98,6 @@ struct server_task_result {
     bool error;
 };
 
-struct server_static_file {
-    const unsigned char * data;
-    unsigned int size;
-    const char * mime_type;
-};
-
 struct slot_params {
     bool stream       = true;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
@@ -2457,16 +2446,6 @@ int main(int argc, char ** argv) {
     LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     LOG_INF("\n");
 
-    // static files
-    std::map<std::string, server_static_file> static_files = {
-        { "/",                        { index_html,              index_html_len,              "text/html; charset=utf-8" }},
-        { "/completion.js",           { completion_js,           completion_js_len,           "text/javascript; charset=utf-8" }},
-        { "/deps_daisyui.min.css",    { deps_daisyui_min_css,    deps_daisyui_min_css_len,    "text/css; charset=utf-8" }},
-        { "/deps_markdown-it.js",     { deps_markdown_it_js,     deps_markdown_it_js_len,     "text/javascript; charset=utf-8" }},
-        { "/deps_tailwindcss.js",     { deps_tailwindcss_js,     deps_tailwindcss_js_len,     "text/javascript; charset=utf-8" }},
-        { "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }},
-    };
-
     std::unique_ptr<httplib::Server> svr;
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
@@ -2547,7 +2526,7 @@ int main(int argc, char ** argv) {
     // Middlewares
     //
 
-    auto middleware_validate_api_key = [&params, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) {
+    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
         static const std::unordered_set<std::string> public_endpoints = {
             "/health",
             "/models",
@@ -2560,7 +2539,7 @@ int main(int argc, char ** argv) {
         }
 
         // If path is public or is static file, skip validation
-        if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) {
+        if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
             return true;
         }
 
@@ -3317,14 +3296,11 @@ int main(int argc, char ** argv) {
             return 1;
         }
     } else {
-        // using embedded static files
-        for (const auto & it : static_files) {
-            const server_static_file & static_file = it.second;
-            svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) {
-                res.set_content(reinterpret_cast<const char*>(static_file.data), static_file.size, static_file.mime_type);
-                return false;
-            });
-        }
+        // using embedded static index.html
+        svr->Get("/", [](const httplib::Request &, httplib::Response & res) {
+            res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
+            return false;
+        });
     }
 
     // register API routes
diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html
new file mode 100644
index 000000000..c7e18b45e
--- /dev/null
+++ b/examples/server/webui/index.html
@@ -0,0 +1,268 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
+  <title>🦙 llama.cpp - chat</title>
+</head>
+
+<body>
+  <div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
+    <div class="flex flex-row drawer lg:drawer-open">
+      <input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
+
+      <!-- sidebar -->
+      <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
+        <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
+        <div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
+          <div class="flex flex-row items-center justify-between mb-4 mt-4">
+            <h2 class="font-bold ml-4">Conversations</h2>
+
+            <!-- close sidebar button -->
+            <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
+                <path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
+              </svg>
+            </label>
+          </div>
+
+          <!-- list of conversations -->
+          <div :class="{
+            'btn btn-ghost justify-start': true,
+            'btn-active': messages.length === 0,
+          }" @click="newConversation">
+            + New conversation
+          </div>
+          <div v-for="conv in conversations" :class="{
+            'btn btn-ghost justify-start font-normal': true,
+            'btn-active': conv.id === viewingConvId,
+          }" @click="setViewingConv(conv.id)">
+            <span class="truncate">{{ conv.messages[0].content }}</span>
+          </div>
+          <div class="text-center text-xs opacity-40 mt-auto mx-4">
+            Conversations are saved to browser's localStorage
+          </div>
+        </div>
+      </div>
+
+      <!-- main view -->
+      <div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
+        <!-- header -->
+        <div class="flex flex-row items-center mt-6 mb-6">
+          <!-- open sidebar button -->
+          <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
+            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
+              <path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
+            </svg>
+          </label>
+
+          <div class="grow text-2xl font-bold ml-2">llama.cpp</div>
+
+          <!-- action buttons (top right) -->
+          <div class="flex items-center">
+            <div v-if="messages.length > 0" class="dropdown dropdown-end">
+              <!-- "more" button -->
+              <button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
+                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
+                  <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
+                </svg>
+              </button>
+              <!-- "more" dropdown menu -->
+              <ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
+                <li @click="downloadConv(viewingConvId)"><a>Download</a></li>
+                <li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
+              </ul>
+            </div>
+            <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
+              <!-- settings button -->
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
+                <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
+                <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
+              </svg>
+            </button>
+
+            <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
+            <div class="dropdown dropdown-end dropdown-bottom">
+              <div tabindex="0" role="button" class="btn m-1">
+                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
+                  <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
+                </svg>
+              </div>
+              <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
+                <li>
+                  <button
+                    class="btn btn-sm btn-block btn-ghost justify-start"
+                    :class="{ 'btn-active': selectedTheme === 'auto' }"
+                    @click="setSelectedTheme('auto')">
+                    auto
+                  </button>
+                </li>
+                <li v-for="theme in themes">
+                  <input
+                    type="radio"
+                    name="theme-dropdown"
+                    class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
+                    :aria-label="theme"
+                    :value="theme"
+                    :checked="selectedTheme === theme"
+                    @click="setSelectedTheme(theme)" />
+                </li>
+              </ul>
+            </div>
+          </div>
+        </div>
+
+        <!-- chat messages -->
+        <div id="messages-list" class="flex flex-col grow overflow-y-auto">
+          <div class="mt-auto flex justify-center">
+            <!-- placeholder to shift the message to the bottom -->
+            {{ messages.length === 0 ? 'Send a message to start' : '' }}
+          </div>
+          <div v-for="msg in messages" class="group">
+            <div :class="{
+              'chat': true,
+              'chat-start': msg.role !== 'user',
+              'chat-end': msg.role === 'user',
+            }">
+              <div :class="{
+                'chat-bubble markdown': true,
+                'chat-bubble-base-300': msg.role !== 'user',
+              }">
+                <!-- textarea for editing message -->
+                <template v-if="editingMsg && editingMsg.id === msg.id">
+                  <textarea
+                    class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
+                    v-model="msg.content"></textarea>
+                  <br/>
+                  <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
+                  <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
+                </template>
+                <!-- render message as markdown -->
+                <vue-markdown v-else :source="msg.content" />
+              </div>
+            </div>
+
+            <!-- actions for each message -->
+            <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
+              <!-- user message -->
+              <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
+                ✍️ Edit
+              </button>
+              <!-- assistant message -->
+              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+                🔄 Regenerate
+              </button>
+              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
+                📋 Copy
+              </button>
+            </div>
+          </div>
+
+          <!-- pending (ongoing) assistant message -->
+          <div id="pending-msg" class="chat chat-start">
+            <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
+              <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
+              <vue-markdown v-else :source="pendingMsg.content" />
+            </div>
+          </div>
+        </div>
+
+        <!-- chat input -->
+        <div class="flex flex-row items-center mt-8 mb-6">
+          <textarea
+            class="textarea textarea-bordered w-full"
+            placeholder="Type a message (Shift+Enter to add a new line)"
+            v-model="inputMsg"
+            @keydown.enter.exact.prevent="sendMessage"
+            @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
+            :disabled="isGenerating"
+            id="msg-input"
+          ></textarea>
+          <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
+          <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
+        </div>
+      </div>
+
+    </div>
+
+
+    <!-- modal for editing config -->
+    <dialog class="modal" :class="{'modal-open': showConfigDialog}">
+      <div class="modal-box">
+        <h3 class="text-lg font-bold mb-6">Settings</h3>
+        <div class="h-[calc(90vh-12rem)] overflow-y-auto">
+          <p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
+          <settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
+          <label class="form-control mb-2">
+            <div class="label">System Message</div>
+            <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
+          </label>
+          <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
+            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
+          </template>
+          <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
+          <!-- Section: Other sampler settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Other sampler settings</summary>
+            <div class="collapse-content">
+              <!-- Samplers queue -->
+              <settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
+              <!-- Samplers -->
+              <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
+              </template>
+            </div>
+          </details>
+          <!-- Section: Penalties settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Penalties settings</summary>
+            <div class="collapse-content">
+              <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
+              </template>
+            </div>
+          </details>
+          <!-- Section: Advanced config -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Advanced config</summary>
+            <div class="collapse-content">
+              <label class="form-control mb-2">
+                <!-- Custom parameters input -->
+                <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
+                <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
+              </label>
+            </div>
+          </details>
+        </div>
+
+        <!-- action buttons -->
+        <div class="modal-action">
+          <button class="btn" @click="resetConfigDialog">Reset to default</button>
+          <button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
+          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
+        </div>
+      </div>
+    </dialog>
+
+  </div>
+
+  <!-- Template to be used by settings modal -->
+  <template id="settings-modal-short-input">
+    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
+      <!-- Show help message on hovering on the input label -->
+      <div class="dropdown dropdown-hover">
+        <div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
+        <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
+          {{ configInfo[configKey] || '(no help message available)' }}
+        </div>
+      </div>
+      <!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
+      <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
+    </label>
+  </template>
+
+  <script type="module" src="/src/main.js"></script>
+</body>
+
+</html>
diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json
new file mode 100644
index 000000000..6b93090f0
--- /dev/null
+++ b/examples/server/webui/package-lock.json
@@ -0,0 +1,2783 @@
+{
+  "name": "webui",
+  "version": "0.0.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "webui",
+      "version": "0.0.0",
+      "dependencies": {
+        "autoprefixer": "^10.4.20",
+        "daisyui": "^4.12.14",
+        "markdown-it": "^14.1.0",
+        "postcss": "^8.4.49",
+        "tailwindcss": "^3.4.15",
+        "vite-plugin-singlefile": "^2.0.3",
+        "vue": "^3.5.13"
+      },
+      "devDependencies": {
+        "vite": "^5.4.10"
+      }
+    },
+    "node_modules/@alloc/quick-lru": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz",
+      "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/@esbuild/aix-ppc64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
+      "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "aix"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/android-arm": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz",
+      "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/android-arm64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz",
+      "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/android-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz",
+      "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/darwin-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz",
+      "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/freebsd-arm64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz",
+      "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/freebsd-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz",
+      "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-arm": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz",
+      "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-arm64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz",
+      "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-ia32": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz",
+      "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-loong64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz",
+      "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==",
+      "cpu": [
+        "loong64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-mips64el": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz",
+      "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==",
+      "cpu": [
+        "mips64el"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-ppc64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz",
+      "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-riscv64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz",
+      "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-s390x": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz",
+      "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==",
+      "cpu": [
+        "s390x"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz",
+      "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/netbsd-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz",
+      "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "netbsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/openbsd-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz",
+      "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openbsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/sunos-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz",
+      "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "sunos"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/win32-arm64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz",
+      "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/win32-ia32": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz",
+      "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/win32-x64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz",
+      "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@rollup/rollup-android-arm-eabi": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.28.0.tgz",
+      "integrity": "sha512-wLJuPLT6grGZsy34g4N1yRfYeouklTgPhH1gWXCYspenKYD0s3cR99ZevOGw5BexMNywkbV3UkjADisozBmpPQ==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ]
+    },
+    "node_modules/@rollup/rollup-android-arm64": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.28.0.tgz",
+      "integrity": "sha512-eiNkznlo0dLmVG/6wf+Ifi/v78G4d4QxRhuUl+s8EWZpDewgk7PX3ZyECUXU0Zq/Ca+8nU8cQpNC4Xgn2gFNDA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ]
+    },
+    "node_modules/@rollup/rollup-darwin-x64": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.28.0.tgz",
+      "integrity": "sha512-8hxgfReVs7k9Js1uAIhS6zq3I+wKQETInnWQtgzt8JfGx51R1N6DRVy3F4o0lQwumbErRz52YqwjfvuwRxGv1w==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@rollup/rollup-freebsd-arm64": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.28.0.tgz",
+      "integrity": "sha512-lA1zZB3bFx5oxu9fYud4+g1mt+lYXCoch0M0V/xhqLoGatbzVse0wlSQ1UYOWKpuSu3gyN4qEc0Dxf/DII1bhQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ]
+    },
+    "node_modules/@rollup/rollup-freebsd-x64": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.28.0.tgz",
+      "integrity": "sha512-aI2plavbUDjCQB/sRbeUZWX9qp12GfYkYSJOrdYTL/C5D53bsE2/nBPuoiJKoWp5SN78v2Vr8ZPnB+/VbQ2pFA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm-gnueabihf": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.28.0.tgz",
+      "integrity": "sha512-WXveUPKtfqtaNvpf0iOb0M6xC64GzUX/OowbqfiCSXTdi/jLlOmH0Ba94/OkiY2yTGTwteo4/dsHRfh5bDCZ+w==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm-musleabihf": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.28.0.tgz",
+      "integrity": "sha512-yLc3O2NtOQR67lI79zsSc7lk31xjwcaocvdD1twL64PK1yNaIqCeWI9L5B4MFPAVGEVjH5k1oWSGuYX1Wutxpg==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm64-gnu": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.28.0.tgz",
+      "integrity": "sha512-+P9G9hjEpHucHRXqesY+3X9hD2wh0iNnJXX/QhS/J5vTdG6VhNYMxJ2rJkQOxRUd17u5mbMLHM7yWGZdAASfcg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm64-musl": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.28.0.tgz",
+      "integrity": "sha512-1xsm2rCKSTpKzi5/ypT5wfc+4bOGa/9yI/eaOLW0oMs7qpC542APWhl4A37AENGZ6St6GBMWhCCMM6tXgTIplw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-powerpc64le-gnu": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.28.0.tgz",
+      "integrity": "sha512-zgWxMq8neVQeXL+ouSf6S7DoNeo6EPgi1eeqHXVKQxqPy1B2NvTbaOUWPn/7CfMKL7xvhV0/+fq/Z/J69g1WAQ==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-riscv64-gnu": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.28.0.tgz",
+      "integrity": "sha512-VEdVYacLniRxbRJLNtzwGt5vwS0ycYshofI7cWAfj7Vg5asqj+pt+Q6x4n+AONSZW/kVm+5nklde0qs2EUwU2g==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-s390x-gnu": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.28.0.tgz",
+      "integrity": "sha512-LQlP5t2hcDJh8HV8RELD9/xlYtEzJkm/aWGsauvdO2ulfl3QYRjqrKW+mGAIWP5kdNCBheqqqYIGElSRCaXfpw==",
+      "cpu": [
+        "s390x"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-x64-gnu": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.28.0.tgz",
+      "integrity": "sha512-Nl4KIzteVEKE9BdAvYoTkW19pa7LR/RBrT6F1dJCV/3pbjwDcaOq+edkP0LXuJ9kflW/xOK414X78r+K84+msw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-x64-musl": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.28.0.tgz",
+      "integrity": "sha512-eKpJr4vBDOi4goT75MvW+0dXcNUqisK4jvibY9vDdlgLx+yekxSm55StsHbxUsRxSTt3JEQvlr3cGDkzcSP8bw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-win32-arm64-msvc": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.28.0.tgz",
+      "integrity": "sha512-Vi+WR62xWGsE/Oj+mD0FNAPY2MEox3cfyG0zLpotZdehPFXwz6lypkGs5y38Jd/NVSbOD02aVad6q6QYF7i8Bg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
+    "node_modules/@rollup/rollup-win32-ia32-msvc": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.28.0.tgz",
+      "integrity": "sha512-kN/Vpip8emMLn/eOza+4JwqDZBL6MPNpkdaEsgUtW1NYN3DZvZqSQrbKzJcTL6hd8YNmFTn7XGWMwccOcJBL0A==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
+    "node_modules/@rollup/rollup-win32-x64-msvc": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.28.0.tgz",
+      "integrity": "sha512-Bvno2/aZT6usSa7lRDL2+hMjVAGjuqaymF1ApZm31JXzniR/hvr14jpU+/z4X6Gt5BPlzosscyJZGUvguXIqeQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
+    "node_modules/@vue/compiler-dom": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz",
+      "integrity": "sha512-ZOJ46sMOKUjO3e94wPdCzQ6P1Lx/vhp2RSvfaab88Ajexs0AHeV0uasYhi99WPaogmBlRHNRuly8xV75cNTMDA==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/compiler-core": "3.5.13",
+        "@vue/shared": "3.5.13"
+      }
+    },
+    "node_modules/@vue/compiler-dom/node_modules/@babel/helper-string-parser": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz",
+      "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@vue/compiler-dom/node_modules/@babel/helper-validator-identifier": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz",
+      "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@vue/compiler-dom/node_modules/@babel/parser": {
+      "version": "7.26.2",
+      "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.2.tgz",
+      "integrity": "sha512-DWMCZH9WA4Maitz2q21SRKHo9QXZxkDsbNZoVD62gusNtNBBqDg9i7uOhASfTfIGNzW+O+r7+jAlM8dwphcJKQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/types": "^7.26.0"
+      },
+      "bin": {
+        "parser": "bin/babel-parser.js"
+      },
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@vue/compiler-dom/node_modules/@babel/types": {
+      "version": "7.26.0",
+      "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz",
+      "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-string-parser": "^7.25.9",
+        "@babel/helper-validator-identifier": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@vue/compiler-dom/node_modules/@vue/compiler-core": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.13.tgz",
+      "integrity": "sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/parser": "^7.25.3",
+        "@vue/shared": "3.5.13",
+        "entities": "^4.5.0",
+        "estree-walker": "^2.0.2",
+        "source-map-js": "^1.2.0"
+      }
+    },
+    "node_modules/@vue/compiler-dom/node_modules/estree-walker": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz",
+      "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==",
+      "license": "MIT"
+    },
+    "node_modules/@vue/compiler-dom/node_modules/source-map-js": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
+      "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/compiler-sfc/-/compiler-sfc-3.5.13.tgz",
+      "integrity": "sha512-6VdaljMpD82w6c2749Zhf5T9u5uLBWKnVue6XWxprDobftnletJ8+oel7sexFfM3qIxNmVE7LSFGTpv6obNyaQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/parser": "^7.25.3",
+        "@vue/compiler-core": "3.5.13",
+        "@vue/compiler-dom": "3.5.13",
+        "@vue/compiler-ssr": "3.5.13",
+        "@vue/shared": "3.5.13",
+        "estree-walker": "^2.0.2",
+        "magic-string": "^0.30.11",
+        "postcss": "^8.4.48",
+        "source-map-js": "^1.2.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@babel/helper-string-parser": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz",
+      "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@babel/helper-validator-identifier": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz",
+      "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@babel/parser": {
+      "version": "7.26.2",
+      "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.2.tgz",
+      "integrity": "sha512-DWMCZH9WA4Maitz2q21SRKHo9QXZxkDsbNZoVD62gusNtNBBqDg9i7uOhASfTfIGNzW+O+r7+jAlM8dwphcJKQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/types": "^7.26.0"
+      },
+      "bin": {
+        "parser": "bin/babel-parser.js"
+      },
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@babel/types": {
+      "version": "7.26.0",
+      "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz",
+      "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-string-parser": "^7.25.9",
+        "@babel/helper-validator-identifier": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@jridgewell/sourcemap-codec": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
+      "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
+      "license": "MIT"
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@vue/compiler-core": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.13.tgz",
+      "integrity": "sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q==",
+      "license": "MIT",
+      "dependencies": {
+        "@babel/parser": "^7.25.3",
+        "@vue/shared": "3.5.13",
+        "entities": "^4.5.0",
+        "estree-walker": "^2.0.2",
+        "source-map-js": "^1.2.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/@vue/compiler-ssr": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz",
+      "integrity": "sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/compiler-dom": "3.5.13",
+        "@vue/shared": "3.5.13"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/estree-walker": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz",
+      "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==",
+      "license": "MIT"
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/magic-string": {
+      "version": "0.30.14",
+      "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.14.tgz",
+      "integrity": "sha512-5c99P1WKTed11ZC0HMJOj6CDIue6F8ySu+bJL+85q1zBEIY8IklrJ1eiKC2NDRh3Ct3FcvmJPyQHb9erXMTJNw==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/sourcemap-codec": "^1.5.0"
+      }
+    },
+    "node_modules/@vue/compiler-sfc/node_modules/source-map-js": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
+      "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/@vue/runtime-dom": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/runtime-dom/-/runtime-dom-3.5.13.tgz",
+      "integrity": "sha512-dLaj94s93NYLqjLiyFzVs9X6dWhTdAlEAciC3Moq7gzAc13VJUdCnjjRurNM6uTLFATRHexHCTu/Xp3eW6yoog==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/reactivity": "3.5.13",
+        "@vue/runtime-core": "3.5.13",
+        "@vue/shared": "3.5.13",
+        "csstype": "^3.1.3"
+      }
+    },
+    "node_modules/@vue/runtime-dom/node_modules/@vue/reactivity": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/reactivity/-/reactivity-3.5.13.tgz",
+      "integrity": "sha512-NaCwtw8o48B9I6L1zl2p41OHo/2Z4wqYGGIK1Khu5T7yxrn+ATOixn/Udn2m+6kZKB/J7cuT9DbWWhRxqixACg==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/shared": "3.5.13"
+      }
+    },
+    "node_modules/@vue/runtime-dom/node_modules/@vue/runtime-core": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/runtime-core/-/runtime-core-3.5.13.tgz",
+      "integrity": "sha512-Fj4YRQ3Az0WTZw1sFe+QDb0aXCerigEpw418pw1HBUKFtnQHWzwojaukAs2X/c9DQz4MQ4bsXTGlcpGxU/RCIw==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/reactivity": "3.5.13",
+        "@vue/shared": "3.5.13"
+      }
+    },
+    "node_modules/@vue/runtime-dom/node_modules/csstype": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
+      "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
+      "license": "MIT"
+    },
+    "node_modules/@vue/server-renderer": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/server-renderer/-/server-renderer-3.5.13.tgz",
+      "integrity": "sha512-wAi4IRJV/2SAW3htkTlB+dHeRmpTiVIK1OGLWV1yeStVSebSQQOwGwIq0D3ZIoBj2C2qpgz5+vX9iEBkTdk5YA==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/compiler-ssr": "3.5.13",
+        "@vue/shared": "3.5.13"
+      },
+      "peerDependencies": {
+        "vue": "3.5.13"
+      }
+    },
+    "node_modules/@vue/server-renderer/node_modules/@vue/compiler-ssr": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz",
+      "integrity": "sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/compiler-dom": "3.5.13",
+        "@vue/shared": "3.5.13"
+      }
+    },
+    "node_modules/@vue/shared": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@vue/shared/-/shared-3.5.13.tgz",
+      "integrity": "sha512-/hnE/qP5ZoGpol0a5mDi45bOd7t3tjYJBjsgCsivow7D48cJeV5l05RD82lPqi7gRiphZM37rnhW1l6ZoCNNnQ==",
+      "license": "MIT"
+    },
+    "node_modules/arg": {
+      "version": "5.0.2",
+      "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz",
+      "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==",
+      "license": "MIT"
+    },
+    "node_modules/argparse": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
+      "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
+      "license": "Python-2.0"
+    },
+    "node_modules/autoprefixer": {
+      "version": "10.4.20",
+      "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz",
+      "integrity": "sha512-XY25y5xSv/wEoqzDyXXME4AFfkZI0P23z6Fs3YgymDnKJkCGOnkL0iTxCa85UTqaSgfcqyf3UA6+c7wUvx/16g==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/autoprefixer"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "browserslist": "^4.23.3",
+        "caniuse-lite": "^1.0.30001646",
+        "fraction.js": "^4.3.7",
+        "normalize-range": "^0.1.2",
+        "picocolors": "^1.0.1",
+        "postcss-value-parser": "^4.2.0"
+      },
+      "bin": {
+        "autoprefixer": "bin/autoprefixer"
+      },
+      "engines": {
+        "node": "^10 || ^12 || >=14"
+      },
+      "peerDependencies": {
+        "postcss": "^8.1.0"
+      }
+    },
+    "node_modules/browserslist": {
+      "version": "4.24.2",
+      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.2.tgz",
+      "integrity": "sha512-ZIc+Q62revdMcqC6aChtW4jz3My3klmCO1fEmINZY/8J3EpBg5/A/D0AKmBveUh6pgoeycoMkVMko84tuYS+Gg==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/browserslist"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/browserslist"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "caniuse-lite": "^1.0.30001669",
+        "electron-to-chromium": "^1.5.41",
+        "node-releases": "^2.0.18",
+        "update-browserslist-db": "^1.1.1"
+      },
+      "bin": {
+        "browserslist": "cli.js"
+      },
+      "engines": {
+        "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7"
+      }
+    },
+    "node_modules/browserslist/node_modules/electron-to-chromium": {
+      "version": "1.5.67",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.67.tgz",
+      "integrity": "sha512-nz88NNBsD7kQSAGGJyp8hS6xSPtWwqNogA0mjtc2nUYeEf3nURK9qpV18TuBdDmEDgVWotS8Wkzf+V52dSQ/LQ==",
+      "license": "ISC"
+    },
+    "node_modules/browserslist/node_modules/escalade": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
+      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/browserslist/node_modules/node-releases": {
+      "version": "2.0.18",
+      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.18.tgz",
+      "integrity": "sha512-d9VeXT4SJ7ZeOqGX6R5EM022wpL+eWPooLI+5UpWn2jCT1aosUQEhQP214x33Wkwx3JQMvIm+tIoVOdodFS40g==",
+      "license": "MIT"
+    },
+    "node_modules/browserslist/node_modules/update-browserslist-db": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.1.tgz",
+      "integrity": "sha512-R8UzCaa9Az+38REPiJ1tXlImTJXlVfgHZsglwBD/k6nj76ctsH1E3q4doGrukiLQd3sGQYu56r5+lo5r94l29A==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/browserslist"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/browserslist"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "escalade": "^3.2.0",
+        "picocolors": "^1.1.0"
+      },
+      "bin": {
+        "update-browserslist-db": "cli.js"
+      },
+      "peerDependencies": {
+        "browserslist": ">= 4.21.0"
+      }
+    },
+    "node_modules/caniuse-lite": {
+      "version": "1.0.30001684",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz",
+      "integrity": "sha512-G1LRwLIQjBQoyq0ZJGqGIJUXzJ8irpbjHLpVRXDvBEScFJ9b17sgK6vlx0GAJFE21okD7zXl08rRRUfq6HdoEQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/browserslist"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/caniuse-lite"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "CC-BY-4.0"
+    },
+    "node_modules/chokidar": {
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz",
+      "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==",
+      "license": "MIT",
+      "dependencies": {
+        "anymatch": "~3.1.2",
+        "braces": "~3.0.2",
+        "glob-parent": "~5.1.2",
+        "is-binary-path": "~2.1.0",
+        "is-glob": "~4.0.1",
+        "normalize-path": "~3.0.0",
+        "readdirp": "~3.6.0"
+      },
+      "engines": {
+        "node": ">= 8.10.0"
+      },
+      "funding": {
+        "url": "https://paulmillr.com/funding/"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.2"
+      }
+    },
+    "node_modules/chokidar/node_modules/anymatch": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz",
+      "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==",
+      "license": "ISC",
+      "dependencies": {
+        "normalize-path": "^3.0.0",
+        "picomatch": "^2.0.4"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/chokidar/node_modules/binary-extensions": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz",
+      "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/chokidar/node_modules/braces": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
+      "license": "MIT",
+      "dependencies": {
+        "fill-range": "^7.1.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/chokidar/node_modules/fill-range": {
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
+      "license": "MIT",
+      "dependencies": {
+        "to-regex-range": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/chokidar/node_modules/glob-parent": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+      "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/chokidar/node_modules/is-binary-path": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz",
+      "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==",
+      "license": "MIT",
+      "dependencies": {
+        "binary-extensions": "^2.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/chokidar/node_modules/is-number": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
+      "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.12.0"
+      }
+    },
+    "node_modules/chokidar/node_modules/picomatch": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
+      "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/jonschlinkert"
+      }
+    },
+    "node_modules/chokidar/node_modules/readdirp": {
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz",
+      "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==",
+      "license": "MIT",
+      "dependencies": {
+        "picomatch": "^2.2.1"
+      },
+      "engines": {
+        "node": ">=8.10.0"
+      }
+    },
+    "node_modules/chokidar/node_modules/to-regex-range": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
+      "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
+      "license": "MIT",
+      "dependencies": {
+        "is-number": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=8.0"
+      }
+    },
+    "node_modules/css-selector-tokenizer": {
+      "version": "0.8.0",
+      "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz",
+      "integrity": "sha512-Jd6Ig3/pe62/qe5SBPTN8h8LeUg/pT4lLgtavPf7updwwHpvFzxvOQBHYj2LZDMjUnBzgvIUSjRcf6oT5HzHFg==",
+      "license": "MIT",
+      "dependencies": {
+        "cssesc": "^3.0.0",
+        "fastparse": "^1.1.2"
+      }
+    },
+    "node_modules/css-selector-tokenizer/node_modules/cssesc": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz",
+      "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==",
+      "license": "MIT",
+      "bin": {
+        "cssesc": "bin/cssesc"
+      },
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/css-selector-tokenizer/node_modules/fastparse": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.2.tgz",
+      "integrity": "sha512-483XLLxTVIwWK3QTrMGRqUfUpoOs/0hbQrl2oz4J0pAcm3A3bu84wxTFqGqkJzewCLdME38xJLJAxBABfQT8sQ==",
+      "license": "MIT"
+    },
+    "node_modules/culori": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/culori/-/culori-3.3.0.tgz",
+      "integrity": "sha512-pHJg+jbuFsCjz9iclQBqyL3B2HLCBF71BwVNujUYEvCeQMvV97R59MNK3R2+jgJ3a1fcZgI9B3vYgz8lzr/BFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": "^12.20.0 || ^14.13.1 || >=16.0.0"
+      }
+    },
+    "node_modules/daisyui": {
+      "version": "4.12.14",
+      "resolved": "https://registry.npmjs.org/daisyui/-/daisyui-4.12.14.tgz",
+      "integrity": "sha512-hA27cdBasdwd4/iEjn+aidoCrRroDuo3G5W9NDKaVCJI437Mm/3eSL/2u7MkZ0pt8a+TrYF3aT2pFVemTS3how==",
+      "license": "MIT",
+      "dependencies": {
+        "css-selector-tokenizer": "^0.8",
+        "culori": "^3",
+        "picocolors": "^1",
+        "postcss-js": "^4"
+      },
+      "engines": {
+        "node": ">=16.9.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/daisyui"
+      }
+    },
+    "node_modules/didyoumean": {
+      "version": "1.2.2",
+      "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz",
+      "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==",
+      "license": "Apache-2.0"
+    },
+    "node_modules/dlv": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz",
+      "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==",
+      "license": "MIT"
+    },
+    "node_modules/entities": {
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+      "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=0.12"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/entities?sponsor=1"
+      }
+    },
+    "node_modules/esbuild": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz",
+      "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "bin": {
+        "esbuild": "bin/esbuild"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "optionalDependencies": {
+        "@esbuild/aix-ppc64": "0.21.5",
+        "@esbuild/android-arm": "0.21.5",
+        "@esbuild/android-arm64": "0.21.5",
+        "@esbuild/android-x64": "0.21.5",
+        "@esbuild/darwin-arm64": "0.21.5",
+        "@esbuild/darwin-x64": "0.21.5",
+        "@esbuild/freebsd-arm64": "0.21.5",
+        "@esbuild/freebsd-x64": "0.21.5",
+        "@esbuild/linux-arm": "0.21.5",
+        "@esbuild/linux-arm64": "0.21.5",
+        "@esbuild/linux-ia32": "0.21.5",
+        "@esbuild/linux-loong64": "0.21.5",
+        "@esbuild/linux-mips64el": "0.21.5",
+        "@esbuild/linux-ppc64": "0.21.5",
+        "@esbuild/linux-riscv64": "0.21.5",
+        "@esbuild/linux-s390x": "0.21.5",
+        "@esbuild/linux-x64": "0.21.5",
+        "@esbuild/netbsd-x64": "0.21.5",
+        "@esbuild/openbsd-x64": "0.21.5",
+        "@esbuild/sunos-x64": "0.21.5",
+        "@esbuild/win32-arm64": "0.21.5",
+        "@esbuild/win32-ia32": "0.21.5",
+        "@esbuild/win32-x64": "0.21.5"
+      }
+    },
+    "node_modules/esbuild/node_modules/@esbuild/darwin-arm64": {
+      "version": "0.21.5",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz",
+      "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/fast-glob": {
+      "version": "3.3.2",
+      "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz",
+      "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==",
+      "license": "MIT",
+      "dependencies": {
+        "@nodelib/fs.stat": "^2.0.2",
+        "@nodelib/fs.walk": "^1.2.3",
+        "glob-parent": "^5.1.2",
+        "merge2": "^1.3.0",
+        "micromatch": "^4.0.4"
+      },
+      "engines": {
+        "node": ">=8.6.0"
+      }
+    },
+    "node_modules/fast-glob/node_modules/@nodelib/fs.scandir": {
+      "version": "2.1.5",
+      "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
+      "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==",
+      "license": "MIT",
+      "dependencies": {
+        "@nodelib/fs.stat": "2.0.5",
+        "run-parallel": "^1.1.9"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/fast-glob/node_modules/@nodelib/fs.stat": {
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz",
+      "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/fast-glob/node_modules/@nodelib/fs.walk": {
+      "version": "1.2.8",
+      "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz",
+      "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==",
+      "license": "MIT",
+      "dependencies": {
+        "@nodelib/fs.scandir": "2.1.5",
+        "fastq": "^1.6.0"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/fast-glob/node_modules/fastq": {
+      "version": "1.17.1",
+      "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz",
+      "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==",
+      "license": "ISC",
+      "dependencies": {
+        "reusify": "^1.0.4"
+      }
+    },
+    "node_modules/fast-glob/node_modules/glob-parent": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+      "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/fast-glob/node_modules/merge2": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz",
+      "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/fast-glob/node_modules/queue-microtask": {
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
+      "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/fast-glob/node_modules/reusify": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz",
+      "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==",
+      "license": "MIT",
+      "engines": {
+        "iojs": ">=1.0.0",
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/fast-glob/node_modules/run-parallel": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
+      "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "queue-microtask": "^1.2.2"
+      }
+    },
+    "node_modules/fraction.js": {
+      "version": "4.3.7",
+      "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz",
+      "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==",
+      "license": "MIT",
+      "engines": {
+        "node": "*"
+      },
+      "funding": {
+        "type": "patreon",
+        "url": "https://github.com/sponsors/rawify"
+      }
+    },
+    "node_modules/fsevents": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
+      "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
+    "node_modules/glob-parent": {
+      "version": "6.0.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz",
+      "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==",
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.3"
+      },
+      "engines": {
+        "node": ">=10.13.0"
+      }
+    },
+    "node_modules/is-glob": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
+      "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==",
+      "license": "MIT",
+      "dependencies": {
+        "is-extglob": "^2.1.1"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/is-glob/node_modules/is-extglob": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
+      "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/jiti": {
+      "version": "1.21.6",
+      "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.6.tgz",
+      "integrity": "sha512-2yTgeWTWzMWkHu6Jp9NKgePDaYHbntiwvYuuJLbbN9vl7DC9DvXKOB2BC3ZZ92D3cvV/aflH0osDfwpHepQ53w==",
+      "license": "MIT",
+      "bin": {
+        "jiti": "bin/jiti.js"
+      }
+    },
+    "node_modules/lilconfig": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz",
+      "integrity": "sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/linkify-it": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/linkify-it/-/linkify-it-5.0.0.tgz",
+      "integrity": "sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==",
+      "license": "MIT",
+      "dependencies": {
+        "uc.micro": "^2.0.0"
+      }
+    },
+    "node_modules/markdown-it": {
+      "version": "14.1.0",
+      "resolved": "https://registry.npmjs.org/markdown-it/-/markdown-it-14.1.0.tgz",
+      "integrity": "sha512-a54IwgWPaeBCAAsv13YgmALOF1elABB08FxO9i+r4VFk5Vl4pKokRPeX8u5TCgSsPi6ec1otfLjdOpVcgbpshg==",
+      "license": "MIT",
+      "dependencies": {
+        "argparse": "^2.0.1",
+        "entities": "^4.4.0",
+        "linkify-it": "^5.0.0",
+        "mdurl": "^2.0.0",
+        "punycode.js": "^2.3.1",
+        "uc.micro": "^2.1.0"
+      },
+      "bin": {
+        "markdown-it": "bin/markdown-it.mjs"
+      }
+    },
+    "node_modules/mdurl": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/mdurl/-/mdurl-2.0.0.tgz",
+      "integrity": "sha512-Lf+9+2r+Tdp5wXDXC4PcIBjTDtq4UKjCPMQhKIuzpJNW0b96kVqSwW0bT7FhRSfmAiFYgP+SCRvdrDozfh0U5w==",
+      "license": "MIT"
+    },
+    "node_modules/micromatch": {
+      "version": "4.0.8",
+      "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz",
+      "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==",
+      "license": "MIT",
+      "dependencies": {
+        "braces": "^3.0.3",
+        "picomatch": "^2.3.1"
+      },
+      "engines": {
+        "node": ">=8.6"
+      }
+    },
+    "node_modules/micromatch/node_modules/braces": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
+      "license": "MIT",
+      "dependencies": {
+        "fill-range": "^7.1.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/micromatch/node_modules/fill-range": {
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
+      "license": "MIT",
+      "dependencies": {
+        "to-regex-range": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/micromatch/node_modules/is-number": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
+      "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.12.0"
+      }
+    },
+    "node_modules/micromatch/node_modules/picomatch": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
+      "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/jonschlinkert"
+      }
+    },
+    "node_modules/micromatch/node_modules/to-regex-range": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
+      "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
+      "license": "MIT",
+      "dependencies": {
+        "is-number": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=8.0"
+      }
+    },
+    "node_modules/normalize-path": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz",
+      "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/normalize-range": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz",
+      "integrity": "sha512-bdok/XvKII3nUpklnV6P2hxtMNrCboOjAcyBuQnWEhO665FwrSNRxU+AqpsyvO6LgGYPspN+lu5CLtw4jPRKNA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/object-hash": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz",
+      "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/picocolors": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
+      "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
+      "license": "ISC"
+    },
+    "node_modules/postcss": {
+      "version": "8.4.49",
+      "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz",
+      "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/postcss"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "nanoid": "^3.3.7",
+        "picocolors": "^1.1.1",
+        "source-map-js": "^1.2.1"
+      },
+      "engines": {
+        "node": "^10 || ^12 || >=14"
+      }
+    },
+    "node_modules/postcss-import": {
+      "version": "15.1.0",
+      "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz",
+      "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==",
+      "license": "MIT",
+      "dependencies": {
+        "postcss-value-parser": "^4.0.0",
+        "read-cache": "^1.0.0",
+        "resolve": "^1.1.7"
+      },
+      "engines": {
+        "node": ">=14.0.0"
+      },
+      "peerDependencies": {
+        "postcss": "^8.0.0"
+      }
+    },
+    "node_modules/postcss-import/node_modules/pify": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz",
+      "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/postcss-import/node_modules/read-cache": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz",
+      "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==",
+      "license": "MIT",
+      "dependencies": {
+        "pify": "^2.3.0"
+      }
+    },
+    "node_modules/postcss-js": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.0.1.tgz",
+      "integrity": "sha512-dDLF8pEO191hJMtlHFPRa8xsizHaM82MLfNkUHdUtVEV3tgTp5oj+8qbEqYM57SLfc74KSbw//4SeJma2LRVIw==",
+      "license": "MIT",
+      "dependencies": {
+        "camelcase-css": "^2.0.1"
+      },
+      "engines": {
+        "node": "^12 || ^14 || >= 16"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/postcss/"
+      },
+      "peerDependencies": {
+        "postcss": "^8.4.21"
+      }
+    },
+    "node_modules/postcss-js/node_modules/camelcase-css": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz",
+      "integrity": "sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/postcss-load-config": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-4.0.2.tgz",
+      "integrity": "sha512-bSVhyJGL00wMVoPUzAVAnbEoWyqRxkjv64tUl427SKnPrENtq6hJwUojroMz2VB+Q1edmi4IfrAPpami5VVgMQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "lilconfig": "^3.0.0",
+        "yaml": "^2.3.4"
+      },
+      "engines": {
+        "node": ">= 14"
+      },
+      "peerDependencies": {
+        "postcss": ">=8.0.9",
+        "ts-node": ">=9.0.0"
+      },
+      "peerDependenciesMeta": {
+        "postcss": {
+          "optional": true
+        },
+        "ts-node": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/postcss-load-config/node_modules/lilconfig": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz",
+      "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/antonk52"
+      }
+    },
+    "node_modules/postcss-load-config/node_modules/yaml": {
+      "version": "2.6.1",
+      "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz",
+      "integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==",
+      "license": "ISC",
+      "bin": {
+        "yaml": "bin.mjs"
+      },
+      "engines": {
+        "node": ">= 14"
+      }
+    },
+    "node_modules/postcss-nested": {
+      "version": "6.2.0",
+      "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz",
+      "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "postcss-selector-parser": "^6.1.1"
+      },
+      "engines": {
+        "node": ">=12.0"
+      },
+      "peerDependencies": {
+        "postcss": "^8.2.14"
+      }
+    },
+    "node_modules/postcss-selector-parser": {
+      "version": "6.1.2",
+      "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz",
+      "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==",
+      "license": "MIT",
+      "dependencies": {
+        "cssesc": "^3.0.0",
+        "util-deprecate": "^1.0.2"
+      },
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/postcss-selector-parser/node_modules/cssesc": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz",
+      "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==",
+      "license": "MIT",
+      "bin": {
+        "cssesc": "bin/cssesc"
+      },
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/postcss-selector-parser/node_modules/util-deprecate": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
+      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
+      "license": "MIT"
+    },
+    "node_modules/postcss-value-parser": {
+      "version": "4.2.0",
+      "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz",
+      "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==",
+      "license": "MIT"
+    },
+    "node_modules/postcss/node_modules/nanoid": {
+      "version": "3.3.8",
+      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz",
+      "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "bin": {
+        "nanoid": "bin/nanoid.cjs"
+      },
+      "engines": {
+        "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
+      }
+    },
+    "node_modules/postcss/node_modules/source-map-js": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
+      "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/punycode.js": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/punycode.js/-/punycode.js-2.3.1.tgz",
+      "integrity": "sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/resolve": {
+      "version": "1.22.8",
+      "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz",
+      "integrity": "sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==",
+      "license": "MIT",
+      "dependencies": {
+        "is-core-module": "^2.13.0",
+        "path-parse": "^1.0.7",
+        "supports-preserve-symlinks-flag": "^1.0.0"
+      },
+      "bin": {
+        "resolve": "bin/resolve"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/resolve/node_modules/function-bind": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
+      "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/resolve/node_modules/hasown": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
+      "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
+      "license": "MIT",
+      "dependencies": {
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/resolve/node_modules/is-core-module": {
+      "version": "2.15.1",
+      "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.15.1.tgz",
+      "integrity": "sha512-z0vtXSwucUJtANQWldhbtbt7BnL0vxiFjIdDLAatwhDYty2bad6s+rijD6Ri4YuYJubLzIJLUidCh09e1djEVQ==",
+      "license": "MIT",
+      "dependencies": {
+        "hasown": "^2.0.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/resolve/node_modules/path-parse": {
+      "version": "1.0.7",
+      "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz",
+      "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==",
+      "license": "MIT"
+    },
+    "node_modules/resolve/node_modules/supports-preserve-symlinks-flag": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz",
+      "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/rollup": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.28.0.tgz",
+      "integrity": "sha512-G9GOrmgWHBma4YfCcX8PjH0qhXSdH8B4HDE2o4/jaxj93S4DPCIDoLcXz99eWMji4hB29UFCEd7B2gwGJDR9cQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree": "1.0.6"
+      },
+      "bin": {
+        "rollup": "dist/bin/rollup"
+      },
+      "engines": {
+        "node": ">=18.0.0",
+        "npm": ">=8.0.0"
+      },
+      "optionalDependencies": {
+        "@rollup/rollup-android-arm-eabi": "4.28.0",
+        "@rollup/rollup-android-arm64": "4.28.0",
+        "@rollup/rollup-darwin-arm64": "4.28.0",
+        "@rollup/rollup-darwin-x64": "4.28.0",
+        "@rollup/rollup-freebsd-arm64": "4.28.0",
+        "@rollup/rollup-freebsd-x64": "4.28.0",
+        "@rollup/rollup-linux-arm-gnueabihf": "4.28.0",
+        "@rollup/rollup-linux-arm-musleabihf": "4.28.0",
+        "@rollup/rollup-linux-arm64-gnu": "4.28.0",
+        "@rollup/rollup-linux-arm64-musl": "4.28.0",
+        "@rollup/rollup-linux-powerpc64le-gnu": "4.28.0",
+        "@rollup/rollup-linux-riscv64-gnu": "4.28.0",
+        "@rollup/rollup-linux-s390x-gnu": "4.28.0",
+        "@rollup/rollup-linux-x64-gnu": "4.28.0",
+        "@rollup/rollup-linux-x64-musl": "4.28.0",
+        "@rollup/rollup-win32-arm64-msvc": "4.28.0",
+        "@rollup/rollup-win32-ia32-msvc": "4.28.0",
+        "@rollup/rollup-win32-x64-msvc": "4.28.0",
+        "fsevents": "~2.3.2"
+      }
+    },
+    "node_modules/rollup/node_modules/@rollup/rollup-darwin-arm64": {
+      "version": "4.28.0",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.28.0.tgz",
+      "integrity": "sha512-lmKx9yHsppblnLQZOGxdO66gT77bvdBtr/0P+TPOseowE7D9AJoBw8ZDULRasXRWf1Z86/gcOdpBrV6VDUY36Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/rollup/node_modules/@types/estree": {
+      "version": "1.0.6",
+      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz",
+      "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase": {
+      "version": "3.35.0",
+      "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz",
+      "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/gen-mapping": "^0.3.2",
+        "commander": "^4.0.0",
+        "glob": "^10.3.10",
+        "lines-and-columns": "^1.1.6",
+        "mz": "^2.7.0",
+        "pirates": "^4.0.1",
+        "ts-interface-checker": "^0.1.9"
+      },
+      "bin": {
+        "sucrase": "bin/sucrase",
+        "sucrase-node": "bin/sucrase-node"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      }
+    },
+    "node_modules/sucrase/node_modules/@isaacs/cliui": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
+      "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==",
+      "license": "ISC",
+      "dependencies": {
+        "string-width": "^5.1.2",
+        "string-width-cjs": "npm:string-width@^4.2.0",
+        "strip-ansi": "^7.0.1",
+        "strip-ansi-cjs": "npm:strip-ansi@^6.0.1",
+        "wrap-ansi": "^8.1.0",
+        "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/sucrase/node_modules/@jridgewell/gen-mapping": {
+      "version": "0.3.5",
+      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz",
+      "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/set-array": "^1.2.1",
+        "@jridgewell/sourcemap-codec": "^1.4.10",
+        "@jridgewell/trace-mapping": "^0.3.24"
+      },
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/@jridgewell/resolve-uri": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
+      "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/@jridgewell/set-array": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz",
+      "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/@jridgewell/sourcemap-codec": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
+      "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/@jridgewell/trace-mapping": {
+      "version": "0.3.25",
+      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz",
+      "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/resolve-uri": "^3.1.0",
+        "@jridgewell/sourcemap-codec": "^1.4.14"
+      }
+    },
+    "node_modules/sucrase/node_modules/@pkgjs/parseargs": {
+      "version": "0.11.0",
+      "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
+      "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==",
+      "license": "MIT",
+      "optional": true,
+      "engines": {
+        "node": ">=14"
+      }
+    },
+    "node_modules/sucrase/node_modules/ansi-regex": {
+      "version": "6.1.0",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz",
+      "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-regex?sponsor=1"
+      }
+    },
+    "node_modules/sucrase/node_modules/ansi-styles": {
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
+      "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/sucrase/node_modules/any-promise": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz",
+      "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/balanced-match": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
+      "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/brace-expansion": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
+      "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^1.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/color-convert": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+      "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+      "license": "MIT",
+      "dependencies": {
+        "color-name": "~1.1.4"
+      },
+      "engines": {
+        "node": ">=7.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/color-name": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/commander": {
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz",
+      "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/sucrase/node_modules/cross-spawn": {
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
+      "license": "MIT",
+      "dependencies": {
+        "path-key": "^3.1.0",
+        "shebang-command": "^2.0.0",
+        "which": "^2.0.1"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/sucrase/node_modules/eastasianwidth": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz",
+      "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/emoji-regex": {
+      "version": "9.2.2",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
+      "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/foreground-child": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz",
+      "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==",
+      "license": "ISC",
+      "dependencies": {
+        "cross-spawn": "^7.0.0",
+        "signal-exit": "^4.0.1"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/sucrase/node_modules/glob": {
+      "version": "10.4.5",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz",
+      "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==",
+      "license": "ISC",
+      "dependencies": {
+        "foreground-child": "^3.1.0",
+        "jackspeak": "^3.1.2",
+        "minimatch": "^9.0.4",
+        "minipass": "^7.1.2",
+        "package-json-from-dist": "^1.0.0",
+        "path-scurry": "^1.11.1"
+      },
+      "bin": {
+        "glob": "dist/esm/bin.mjs"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/sucrase/node_modules/is-fullwidth-code-point": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
+      "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/isexe": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
+      "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+      "license": "ISC"
+    },
+    "node_modules/sucrase/node_modules/jackspeak": {
+      "version": "3.4.3",
+      "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz",
+      "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==",
+      "license": "BlueOak-1.0.0",
+      "dependencies": {
+        "@isaacs/cliui": "^8.0.2"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      },
+      "optionalDependencies": {
+        "@pkgjs/parseargs": "^0.11.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/lines-and-columns": {
+      "version": "1.2.4",
+      "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
+      "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/lru-cache": {
+      "version": "10.4.3",
+      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz",
+      "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==",
+      "license": "ISC"
+    },
+    "node_modules/sucrase/node_modules/minimatch": {
+      "version": "9.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
+      "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
+      "license": "ISC",
+      "dependencies": {
+        "brace-expansion": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/sucrase/node_modules/minipass": {
+      "version": "7.1.2",
+      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz",
+      "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==",
+      "license": "ISC",
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      }
+    },
+    "node_modules/sucrase/node_modules/mz": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz",
+      "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==",
+      "license": "MIT",
+      "dependencies": {
+        "any-promise": "^1.0.0",
+        "object-assign": "^4.0.1",
+        "thenify-all": "^1.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/object-assign": {
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
+      "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/package-json-from-dist": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz",
+      "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==",
+      "license": "BlueOak-1.0.0"
+    },
+    "node_modules/sucrase/node_modules/path-key": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
+      "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/path-scurry": {
+      "version": "1.11.1",
+      "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz",
+      "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==",
+      "license": "BlueOak-1.0.0",
+      "dependencies": {
+        "lru-cache": "^10.2.0",
+        "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/sucrase/node_modules/pirates": {
+      "version": "4.0.6",
+      "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz",
+      "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/sucrase/node_modules/shebang-command": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
+      "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
+      "license": "MIT",
+      "dependencies": {
+        "shebang-regex": "^3.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/shebang-regex": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
+      "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/signal-exit": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
+      "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==",
+      "license": "ISC",
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/sucrase/node_modules/string-width": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz",
+      "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==",
+      "license": "MIT",
+      "dependencies": {
+        "eastasianwidth": "^0.2.0",
+        "emoji-regex": "^9.2.2",
+        "strip-ansi": "^7.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/sucrase/node_modules/string-width-cjs": {
+      "name": "string-width",
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "license": "MIT",
+      "dependencies": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/string-width-cjs/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/string-width-cjs/node_modules/emoji-regex": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
+      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/string-width-cjs/node_modules/strip-ansi": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/strip-ansi": {
+      "version": "7.1.0",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
+      "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/strip-ansi?sponsor=1"
+      }
+    },
+    "node_modules/sucrase/node_modules/strip-ansi-cjs": {
+      "name": "strip-ansi",
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/strip-ansi-cjs/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/thenify": {
+      "version": "3.3.1",
+      "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz",
+      "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==",
+      "license": "MIT",
+      "dependencies": {
+        "any-promise": "^1.0.0"
+      }
+    },
+    "node_modules/sucrase/node_modules/thenify-all": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz",
+      "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==",
+      "license": "MIT",
+      "dependencies": {
+        "thenify": ">= 3.1.0 < 4"
+      },
+      "engines": {
+        "node": ">=0.8"
+      }
+    },
+    "node_modules/sucrase/node_modules/ts-interface-checker": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz",
+      "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==",
+      "license": "Apache-2.0"
+    },
+    "node_modules/sucrase/node_modules/which": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+      "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+      "license": "ISC",
+      "dependencies": {
+        "isexe": "^2.0.0"
+      },
+      "bin": {
+        "node-which": "bin/node-which"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi": {
+      "version": "8.1.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz",
+      "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^6.1.0",
+        "string-width": "^5.0.1",
+        "strip-ansi": "^7.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi-cjs": {
+      "name": "wrap-ansi",
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
+      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/ansi-styles": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+      "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+      "license": "MIT",
+      "dependencies": {
+        "color-convert": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/emoji-regex": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
+      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+      "license": "MIT"
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/string-width": {
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "license": "MIT",
+      "dependencies": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/strip-ansi": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/tailwindcss": {
+      "version": "3.4.15",
+      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz",
+      "integrity": "sha512-r4MeXnfBmSOuKUWmXe6h2CcyfzJCEk4F0pptO5jlnYSIViUkVmsawj80N5h2lO3gwcmSb4n3PuN+e+GC1Guylw==",
+      "license": "MIT",
+      "dependencies": {
+        "@alloc/quick-lru": "^5.2.0",
+        "arg": "^5.0.2",
+        "chokidar": "^3.6.0",
+        "didyoumean": "^1.2.2",
+        "dlv": "^1.1.3",
+        "fast-glob": "^3.3.2",
+        "glob-parent": "^6.0.2",
+        "is-glob": "^4.0.3",
+        "jiti": "^1.21.6",
+        "lilconfig": "^2.1.0",
+        "micromatch": "^4.0.8",
+        "normalize-path": "^3.0.0",
+        "object-hash": "^3.0.0",
+        "picocolors": "^1.1.1",
+        "postcss": "^8.4.47",
+        "postcss-import": "^15.1.0",
+        "postcss-js": "^4.0.1",
+        "postcss-load-config": "^4.0.2",
+        "postcss-nested": "^6.2.0",
+        "postcss-selector-parser": "^6.1.2",
+        "resolve": "^1.22.8",
+        "sucrase": "^3.35.0"
+      },
+      "bin": {
+        "tailwind": "lib/cli.js",
+        "tailwindcss": "lib/cli.js"
+      },
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/uc.micro": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
+      "integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==",
+      "license": "MIT"
+    },
+    "node_modules/vite": {
+      "version": "5.4.11",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz",
+      "integrity": "sha512-c7jFQRklXua0mTzneGW9QVyxFjUgwcihC4bXEtujIo2ouWCe1Ajt/amn2PCxYnhYfd5k09JX3SB7OYWFKYqj8Q==",
+      "license": "MIT",
+      "dependencies": {
+        "esbuild": "^0.21.3",
+        "postcss": "^8.4.43",
+        "rollup": "^4.20.0"
+      },
+      "bin": {
+        "vite": "bin/vite.js"
+      },
+      "engines": {
+        "node": "^18.0.0 || >=20.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/vitejs/vite?sponsor=1"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.3"
+      },
+      "peerDependencies": {
+        "@types/node": "^18.0.0 || >=20.0.0",
+        "less": "*",
+        "lightningcss": "^1.21.0",
+        "sass": "*",
+        "sass-embedded": "*",
+        "stylus": "*",
+        "sugarss": "*",
+        "terser": "^5.4.0"
+      },
+      "peerDependenciesMeta": {
+        "@types/node": {
+          "optional": true
+        },
+        "less": {
+          "optional": true
+        },
+        "lightningcss": {
+          "optional": true
+        },
+        "sass": {
+          "optional": true
+        },
+        "sass-embedded": {
+          "optional": true
+        },
+        "stylus": {
+          "optional": true
+        },
+        "sugarss": {
+          "optional": true
+        },
+        "terser": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/vite-plugin-singlefile": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/vite-plugin-singlefile/-/vite-plugin-singlefile-2.0.3.tgz",
+      "integrity": "sha512-OEBEwdX8nCGPSdtaB1D7rryYnT+YfPTS8ojL1TDyeUF+bWDCTfRriQqw6T0vl9EbKI/KMg7szN3awst6cLrKkA==",
+      "license": "MIT",
+      "dependencies": {
+        "micromatch": "^4.0.8"
+      },
+      "engines": {
+        "node": ">18.0.0"
+      },
+      "peerDependencies": {
+        "rollup": "^4.24.3",
+        "vite": "^5.4.10"
+      }
+    },
+    "node_modules/vue": {
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.13.tgz",
+      "integrity": "sha512-wmeiSMxkZCSc+PM2w2VRsOYAZC8GdipNFRTsLSfodVqI9mbejKeXEGr8SckuLnrQPGe3oJN5c3K0vpoU9q/wCQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@vue/compiler-dom": "3.5.13",
+        "@vue/compiler-sfc": "3.5.13",
+        "@vue/runtime-dom": "3.5.13",
+        "@vue/server-renderer": "3.5.13",
+        "@vue/shared": "3.5.13"
+      },
+      "peerDependencies": {
+        "typescript": "*"
+      },
+      "peerDependenciesMeta": {
+        "typescript": {
+          "optional": true
+        }
+      }
+    }
+  }
+}
diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json
new file mode 100644
index 000000000..2a45ece14
--- /dev/null
+++ b/examples/server/webui/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "webui",
+  "private": true,
+  "version": "0.0.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "vite build",
+    "preview": "vite preview"
+  },
+  "devDependencies": {
+    "vite": "^5.4.10"
+  },
+  "dependencies": {
+    "autoprefixer": "^10.4.20",
+    "daisyui": "^4.12.14",
+    "markdown-it": "^14.1.0",
+    "postcss": "^8.4.49",
+    "tailwindcss": "^3.4.15",
+    "vite-plugin-singlefile": "^2.0.3",
+    "vue": "^3.5.13"
+  }
+}
diff --git a/examples/server/webui/postcss.config.js b/examples/server/webui/postcss.config.js
new file mode 100644
index 000000000..2e7af2b7f
--- /dev/null
+++ b/examples/server/webui/postcss.config.js
@@ -0,0 +1,6 @@
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}
diff --git a/examples/server/public/completion.js b/examples/server/webui/src/completion.js
similarity index 100%
rename from examples/server/public/completion.js
rename to examples/server/webui/src/completion.js
diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js
new file mode 100644
index 000000000..9b5b12329
--- /dev/null
+++ b/examples/server/webui/src/main.js
@@ -0,0 +1,456 @@
+import './styles.css';
+import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
+import { llama } from './completion.js';
+import MarkdownIt from 'markdown-it';
+
+// utility functions
+const isString = (x) => !!x.toLowerCase;
+const isNumeric = (n) => !isString(n) && !isNaN(n);
+const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
+const copyStr = (str) => navigator.clipboard.writeText(str);
+
+// constants
+const BASE_URL = localStorage.getItem('base') // for debugging
+  || (new URL('.', document.baseURI).href).toString(); // for production
+const CONFIG_DEFAULT = {
+  // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
+  apiKey: '',
+  systemMessage: 'You are a helpful assistant.',
+  // make sure these default values are in sync with `common.h`
+  samplers: 'dkypmxt',
+  temperature: 0.8,
+  dynatemp_range: 0.0,
+  dynatemp_exponent: 1.0,
+  top_k: 40,
+  top_p: 0.95,
+  min_p: 0.05,
+  xtc_probability: 0.0,
+  xtc_threshold: 0.1,
+  typical_p: 1.0,
+  repeat_last_n: 64,
+  repeat_penalty: 1.0,
+  presence_penalty: 0.0,
+  frequency_penalty: 0.0,
+  dry_multiplier: 0.0,
+  dry_base: 1.75,
+  dry_allowed_length: 2,
+  dry_penalty_last_n: -1,
+  max_tokens: -1,
+  custom: '', // custom json-stringified object
+};
+const CONFIG_INFO = {
+  apiKey: 'Set the API Key if you are using --api-key option for the server.',
+  systemMessage: 'The starting message that defines how model should behave.',
+  samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
+  temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
+  dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
+  dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
+  top_k: 'Keeps only k top tokens.',
+  top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
+  min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
+  xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
+  xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
+  typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
+  repeat_last_n: 'Last n tokens to consider for penalizing repetition',
+  repeat_penalty: 'Controls the repetition of token sequences in the generated text',
+  presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
+  frequency_penalty: 'Limits tokens based on how often they appear in the output.',
+  dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
+  dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
+  dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
+  dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
+  max_tokens: 'The maximum number of token per output.',
+  custom: '', // custom json-stringified object
+};
+// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
+const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
+// list of themes supported by daisyui
+const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
+
+// markdown support
+const VueMarkdown = defineComponent(
+  (props) => {
+    const md = shallowRef(new MarkdownIt({ breaks: true }));
+    const origFenchRenderer = md.value.renderer.rules.fence;
+    md.value.renderer.rules.fence = (tokens, idx, ...args) => {
+      const content = tokens[idx].content;
+      const origRendered = origFenchRenderer(tokens, idx, ...args);
+      return `<div class="relative my-4">
+        <div class="text-right sticky top-4 mb-2 mr-2 h-0">
+          <button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
+        </div>
+        ${origRendered}
+      </div>`;
+    };
+    window.copyStr = copyStr;
+    const content = computed(() => md.value.render(props.source));
+    return () => h("div", { innerHTML: content.value });
+  },
+  { props: ["source"] }
+);
+
+// input field to be used by settings modal
+const SettingsModalShortInput = defineComponent({
+  template: document.getElementById('settings-modal-short-input').innerHTML,
+  props: {
+    label: { type: String, required: false },
+    configKey: String,
+    configDefault: Object,
+    configInfo: Object,
+    modelValue: [Object, String, Number],
+  },
+});
+
+// coversations is stored in localStorage
+// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
+// convId is a string prefixed with 'conv-'
+const StorageUtils = {
+  // manage conversations
+  getAllConversations() {
+    const res = [];
+    for (const key in localStorage) {
+      if (key.startsWith('conv-')) {
+        res.push(JSON.parse(localStorage.getItem(key)));
+      }
+    }
+    res.sort((a, b) => b.lastModified - a.lastModified);
+    return res;
+  },
+  // can return null if convId does not exist
+  getOneConversation(convId) {
+    return JSON.parse(localStorage.getItem(convId) || 'null');
+  },
+  // if convId does not exist, create one
+  appendMsg(convId, msg) {
+    if (msg.content === null) return;
+    const conv = StorageUtils.getOneConversation(convId) || {
+      id: convId,
+      lastModified: Date.now(),
+      messages: [],
+    };
+    conv.messages.push(msg);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+  },
+  getNewConvId() {
+    return `conv-${Date.now()}`;
+  },
+  remove(convId) {
+    localStorage.removeItem(convId);
+  },
+  filterAndKeepMsgs(convId, predicate) {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    conv.messages = conv.messages.filter(predicate);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+  },
+  popMsg(convId) {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    const msg = conv.messages.pop();
+    conv.lastModified = Date.now();
+    if (conv.messages.length === 0) {
+      StorageUtils.remove(convId);
+    } else {
+      localStorage.setItem(convId, JSON.stringify(conv));
+    }
+    return msg;
+  },
+
+  // manage config
+  getConfig() {
+    const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
+    // to prevent breaking changes in the future, we always provide default value for missing keys
+    return {
+      ...CONFIG_DEFAULT,
+      ...savedVal,
+    };
+  },
+  setConfig(config) {
+    localStorage.setItem('config', JSON.stringify(config));
+  },
+  getTheme() {
+    return localStorage.getItem('theme') || 'auto';
+  },
+  setTheme(theme) {
+    if (theme === 'auto') {
+      localStorage.removeItem('theme');
+    } else {
+      localStorage.setItem('theme', theme);
+    }
+  },
+};
+
+// scroll to bottom of chat messages
+// if requiresNearBottom is true, only auto-scroll if user is near bottom
+const chatScrollToBottom = (requiresNearBottom) => {
+  const msgListElem = document.getElementById('messages-list');
+  const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
+  if (!requiresNearBottom || (spaceToBottom < 100)) {
+    setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
+  }
+};
+
+const mainApp = createApp({
+  components: {
+    VueMarkdown,
+    SettingsModalShortInput,
+  },
+  data() {
+    return {
+      conversations: StorageUtils.getAllConversations(),
+      messages: [], // { id: number, role: 'user' | 'assistant', content: string }
+      viewingConvId: StorageUtils.getNewConvId(),
+      inputMsg: '',
+      isGenerating: false,
+      pendingMsg: null, // the on-going message from assistant
+      stopGeneration: () => {},
+      selectedTheme: StorageUtils.getTheme(),
+      config: StorageUtils.getConfig(),
+      showConfigDialog: false,
+      editingMsg: null,
+      // const
+      themes: THEMES,
+      configDefault: {...CONFIG_DEFAULT},
+      configInfo: {...CONFIG_INFO},
+    }
+  },
+  computed: {},
+  mounted() {
+    document.getElementById('app').classList.remove('opacity-0'); // show app
+    // scroll to the bottom when the pending message height is updated
+    const pendingMsgElem = document.getElementById('pending-msg');
+    const resizeObserver = new ResizeObserver(() => {
+      if (this.isGenerating) chatScrollToBottom(true);
+    });
+    resizeObserver.observe(pendingMsgElem);
+  },
+  methods: {
+    hideSidebar() {
+      document.getElementById('toggle-drawer').checked = false;
+    },
+    setSelectedTheme(theme) {
+      this.selectedTheme = theme;
+      StorageUtils.setTheme(theme);
+    },
+    newConversation() {
+      if (this.isGenerating) return;
+      this.viewingConvId = StorageUtils.getNewConvId();
+      this.editingMsg = null;
+      this.fetchMessages();
+      chatScrollToBottom();
+      this.hideSidebar();
+    },
+    setViewingConv(convId) {
+      if (this.isGenerating) return;
+      this.viewingConvId = convId;
+      this.editingMsg = null;
+      this.fetchMessages();
+      chatScrollToBottom();
+      this.hideSidebar();
+    },
+    deleteConv(convId) {
+      if (this.isGenerating) return;
+      if (window.confirm('Are you sure to delete this conversation?')) {
+        StorageUtils.remove(convId);
+        if (this.viewingConvId === convId) {
+          this.viewingConvId = StorageUtils.getNewConvId();
+          this.editingMsg = null;
+        }
+        this.fetchConversation();
+        this.fetchMessages();
+      }
+    },
+    downloadConv(convId) {
+      const conversation = StorageUtils.getOneConversation(convId);
+      if (!conversation) {
+        alert('Conversation not found.');
+        return;
+      }
+      const conversationJson = JSON.stringify(conversation, null, 2);
+      const blob = new Blob([conversationJson], { type: 'application/json' });
+      const url = URL.createObjectURL(blob);
+      const a = document.createElement('a');
+      a.href = url;
+      a.download = `conversation_${convId}.json`;
+      document.body.appendChild(a);
+      a.click();
+      document.body.removeChild(a);
+      URL.revokeObjectURL(url);
+    },
+    async sendMessage() {
+      if (!this.inputMsg) return;
+      const currConvId = this.viewingConvId;
+
+      StorageUtils.appendMsg(currConvId, {
+        id: Date.now(),
+        role: 'user',
+        content: this.inputMsg,
+      });
+      this.fetchConversation();
+      this.fetchMessages();
+      this.inputMsg = '';
+      this.editingMsg = null;
+      this.generateMessage(currConvId);
+      chatScrollToBottom();
+    },
+    async generateMessage(currConvId) {
+      if (this.isGenerating) return;
+      this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
+      this.isGenerating = true;
+      this.editingMsg = null;
+
+      try {
+        const abortController = new AbortController();
+        this.stopGeneration = () => abortController.abort();
+        const params = {
+          messages: [
+            { role: 'system', content: this.config.systemMessage },
+            ...this.messages,
+          ],
+          stream: true,
+          cache_prompt: true,
+          samplers: this.config.samplers,
+          temperature: this.config.temperature,
+          dynatemp_range: this.config.dynatemp_range,
+          dynatemp_exponent: this.config.dynatemp_exponent,
+          top_k: this.config.top_k,
+          top_p: this.config.top_p,
+          min_p: this.config.min_p,
+          typical_p: this.config.typical_p,
+          xtc_probability: this.config.xtc_probability,
+          xtc_threshold: this.config.xtc_threshold,
+          repeat_last_n: this.config.repeat_last_n,
+          repeat_penalty: this.config.repeat_penalty,
+          presence_penalty: this.config.presence_penalty,
+          frequency_penalty: this.config.frequency_penalty,
+          dry_multiplier: this.config.dry_multiplier,
+          dry_base: this.config.dry_base,
+          dry_allowed_length: this.config.dry_allowed_length,
+          dry_penalty_last_n: this.config.dry_penalty_last_n,
+          max_tokens: this.config.max_tokens,
+          ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
+          ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
+        };
+        const config = {
+          controller: abortController,
+          api_url: BASE_URL,
+          endpoint: '/chat/completions',
+        };
+        for await (const chunk of llama(prompt, params, config)) {
+          const stop = chunk.data.stop;
+          const addedContent = chunk.data.choices[0].delta.content;
+          const lastContent = this.pendingMsg.content || '';
+          if (addedContent) {
+            this.pendingMsg = {
+              id: this.pendingMsg.id,
+              role: 'assistant',
+              content: lastContent + addedContent,
+            };
+          }
+        }
+
+        StorageUtils.appendMsg(currConvId, this.pendingMsg);
+        this.fetchConversation();
+        this.fetchMessages();
+        setTimeout(() => document.getElementById('msg-input').focus(), 1);
+      } catch (error) {
+        if (error.name === 'AbortError') {
+          // user stopped the generation via stopGeneration() function
+          StorageUtils.appendMsg(currConvId, this.pendingMsg);
+          this.fetchConversation();
+          this.fetchMessages();
+        } else {
+          console.error(error);
+          alert(error);
+          // pop last user message
+          const lastUserMsg = StorageUtils.popMsg(currConvId);
+          this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
+        }
+      }
+
+      this.pendingMsg = null;
+      this.isGenerating = false;
+      this.stopGeneration = () => {};
+      this.fetchMessages();
+      chatScrollToBottom();
+    },
+
+    // message actions
+    regenerateMsg(msg) {
+      if (this.isGenerating) return;
+      // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
+      const currConvId = this.viewingConvId;
+      StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
+      this.fetchConversation();
+      this.fetchMessages();
+      this.generateMessage(currConvId);
+    },
+    copyMsg(msg) {
+      copyStr(msg.content);
+    },
+    editUserMsgAndRegenerate(msg) {
+      if (this.isGenerating) return;
+      const currConvId = this.viewingConvId;
+      const newContent = msg.content;
+      this.editingMsg = null;
+      StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
+      StorageUtils.appendMsg(currConvId, {
+        id: Date.now(),
+        role: 'user',
+        content: newContent,
+      });
+      this.fetchConversation();
+      this.fetchMessages();
+      this.generateMessage(currConvId);
+    },
+
+    // settings dialog methods
+    closeAndSaveConfigDialog() {
+      try {
+        if (this.config.custom.length) JSON.parse(this.config.custom);
+      } catch (error) {
+        alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
+        return;
+      }
+      for (const key of CONFIG_NUMERIC_KEYS) {
+        if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
+          alert(`Invalid number for ${key} (expected an integer or a float)`);
+          return;
+        }
+        this.config[key] = parseFloat(this.config[key]);
+      }
+      this.showConfigDialog = false;
+      StorageUtils.setConfig(this.config);
+    },
+    closeAndDiscardConfigDialog() {
+      this.showConfigDialog = false;
+      this.config = StorageUtils.getConfig();
+    },
+    resetConfigDialog() {
+      if (window.confirm('Are you sure to reset all settings?')) {
+        this.config = {...CONFIG_DEFAULT};
+      }
+    },
+
+    // sync state functions
+    fetchConversation() {
+      this.conversations = StorageUtils.getAllConversations();
+    },
+    fetchMessages() {
+      this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
+    },
+  },
+});
+mainApp.config.errorHandler = alert;
+try {
+  mainApp.mount('#app');
+} catch (err) {
+  console.error(err);
+  document.getElementById('app').innerHTML = `<div style="margin:2em auto">
+    Failed to start app. Please try clearing localStorage and try again.<br/>
+    <br/>
+    <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
+  </div>`;
+}
diff --git a/examples/server/webui/src/styles.css b/examples/server/webui/src/styles.css
new file mode 100644
index 000000000..67d35b99e
--- /dev/null
+++ b/examples/server/webui/src/styles.css
@@ -0,0 +1,26 @@
+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+
+.markdown {
+  h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
+  pre {
+    @apply whitespace-pre-wrap rounded-lg p-2;
+    border: 1px solid currentColor;
+  }
+  /* TODO: fix markdown table */
+}
+
+.show-on-hover {
+  @apply md:opacity-0 md:group-hover:opacity-100;
+}
+.btn-mini {
+  @apply cursor-pointer hover:shadow-md;
+}
+.chat-screen { max-width: 900px; }
+
+.chat-bubble-base-300 {
+  --tw-bg-opacity: 1;
+  --tw-text-opacity: 1;
+  @apply bg-base-300 text-base-content;
+}
diff --git a/examples/server/webui/tailwind.config.js b/examples/server/webui/tailwind.config.js
new file mode 100644
index 000000000..c43066a19
--- /dev/null
+++ b/examples/server/webui/tailwind.config.js
@@ -0,0 +1,16 @@
+/** @type {import('tailwindcss').Config} */
+export default {
+  content: [
+    "./index.html",
+    "./src/**/*.{js,ts,jsx,tsx}",
+  ],
+  theme: {
+    extend: {},
+  },
+  plugins: [
+    require('daisyui'),
+  ],
+  daisyui: {
+    themes: ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'],
+  }
+}
diff --git a/examples/server/webui/vite.config.js b/examples/server/webui/vite.config.js
new file mode 100644
index 000000000..789bf9cbb
--- /dev/null
+++ b/examples/server/webui/vite.config.js
@@ -0,0 +1,36 @@
+
+import { viteSingleFile } from 'vite-plugin-singlefile';
+import path from 'path';
+import fs from 'fs';
+
+const GUIDE_FOR_FRONTEND = `
+<!--
+  This is a single file build of the frontend.
+  It is automatically generated by the build process.
+  Do not edit this file directly.
+  To make changes, refer to the "Web UI" section in the README.
+-->
+`.trim();
+
+export default {
+  plugins: [
+    viteSingleFile(),
+    (function llamaCppPlugin() {
+      let config;
+      return {
+        name: 'llamacpp:build',
+        apply: 'build',
+        async configResolved(_config) {
+          config = _config;
+        },
+        writeBundle() {
+          const outputIndexHtml = path.join(config.build.outDir, 'index.html');
+          const content = fs.readFileSync(outputIndexHtml, 'utf-8');
+
+          const targetOutputFile = path.join(config.build.outDir, '../../public/index.html');
+          fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content);
+        }
+      }
+    })(),
+  ],
+};

From cc98896db858df7aa40d0e16a505883ef196a482 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Tue, 3 Dec 2024 13:29:54 -0600
Subject: [PATCH 386/396] vulkan: optimize and reenable split_k (#10637)

Use vector loads when possible in mul_mat_split_k_reduce. Use split_k
when there aren't enough workgroups to fill the shaders.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 51 +++++++++++++++----
 .../mul_mat_split_k_reduce.comp               | 31 ++++++++---
 2 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index df6a659f4..17e1be105 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -165,6 +165,7 @@ struct vk_device_struct {
     vk_queue transfer_queue;
     bool single_queue;
     uint32_t subgroup_size;
+    uint32_t shader_core_count;
     bool uma;
 
     size_t idx;
@@ -1498,7 +1499,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
@@ -1610,11 +1611,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
         const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
 
         bool maintenance4_support = false;
+        bool sm_builtins = false;
 
         // Check if maintenance4 is supported
         for (const auto& properties : ext_props) {
             if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
                 maintenance4_support = true;
+            } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) {
+                sm_builtins = true;
             }
         }
 
@@ -1622,11 +1626,21 @@ static vk_device ggml_vk_get_device(size_t idx) {
         vk::PhysicalDeviceMaintenance3Properties props3;
         vk::PhysicalDeviceMaintenance4Properties props4;
         vk::PhysicalDeviceSubgroupProperties subgroup_props;
+        vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
         props2.pNext = &props3;
         props3.pNext = &subgroup_props;
+
+        VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&subgroup_props;
+
         if (maintenance4_support) {
-            subgroup_props.pNext = &props4;
+            last_struct->pNext = (VkBaseOutStructure *)&props4;
+            last_struct = (VkBaseOutStructure *)&props4;
         }
+        if (sm_builtins) {
+            last_struct->pNext = (VkBaseOutStructure *)&sm_props;
+            last_struct = (VkBaseOutStructure *)&sm_props;
+        }
+
         device->physical_device.getProperties2(&props2);
         device->properties = props2.properties;
 
@@ -1643,6 +1657,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->vendor_id = device->properties.vendorID;
         device->subgroup_size = subgroup_props.subgroupSize;
         device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
+        if (sm_builtins) {
+            device->shader_core_count = sm_props.shaderSMCount;
+        } else {
+            device->shader_core_count = 0;
+        }
 
         bool fp16_storage = false;
         bool fp16_compute = false;
@@ -2732,15 +2751,25 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
     dst->device->device.resetFences({ dst->device->fence });
 }
 
-static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
+static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
     VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
-    // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
-    //     return 4;
-    // }
 
-    return 1;
+    uint32_t split_k = 1;
+    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
+        // If k is 'large' and the SMs will fill less than halfway, use split_k.
+        uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
+        uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
+        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
+            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+            // Clamp to 2 or 4
+            split_k = std::min(split_k, 4u);
+            if (split_k == 3) {
+                split_k = 2;
+            }
+        }
+    }
 
-    GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k);
+    return split_k;
 }
 
 static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
@@ -2964,10 +2993,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
     const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
 
-    const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
-
     vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
 
+    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+
     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
@@ -2993,7 +3022,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
-        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0;
+        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
                 (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
                 (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
index 825b91031..4c64fd47a 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
@@ -5,7 +5,9 @@
 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer A {float data_a[];};
+layout (binding = 0) readonly buffer A4 {vec4 data_a4[];};
 layout (binding = 1) writeonly buffer D {float data_d[];};
+layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];};
 
 layout (push_constant) uniform parameter {
     uint ne;
@@ -13,17 +15,34 @@ layout (push_constant) uniform parameter {
 } p;
 
 void main() {
-    const uint idx = gl_GlobalInvocationID.x;
+    // Each invocation handles four consecutive components
+    const uint idx = gl_GlobalInvocationID.x * 4;
 
     if (idx >= p.ne) {
         return;
     }
 
-    float result = 0.0f;
+    // Check if all four components are in bounds and aligned,
+    // then use vector loads
+    if (idx + 3 < p.ne && (p.ne % 4) == 0) {
+        vec4 result = vec4(0.0f);
 
-    [[unroll]] for (uint i = 0; i < p.k_num; i++) {
-        result += data_a[i * p.ne + idx];
+        [[unroll]] for (uint i = 0; i < p.k_num; i++) {
+            result += data_a4[(i * p.ne + idx) / 4];
+        }
+
+        data_d4[idx / 4] = result;
+    } else {
+        [[unroll]] for (uint j = 0; j < 4; ++j) {
+            if (idx + j < p.ne) {
+                float result = 0.0f;
+
+                [[unroll]] for (uint i = 0; i < p.k_num; i++) {
+                    result += data_a[i * p.ne + idx + j];
+                }
+
+                data_d[idx + j] = result;
+            }
+        }
     }
-
-    data_d[idx] = result;
 }

From 01e6d9bb71eb71fe1f811f2fdef15753232cd0f2 Mon Sep 17 00:00:00 2001
From: piDack <104877312+piDack@users.noreply.github.com>
Date: Wed, 4 Dec 2024 08:26:37 +0800
Subject: [PATCH 387/396] clip : add sycl support (#10574)

Co-authored-by: piDack <pcdack@hotmail.co>
---
 examples/llava/clip.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 7ba4cea58..d7c94352b 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -12,6 +12,10 @@
 #include "ggml-cuda.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
@@ -1169,6 +1173,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 
+#ifdef GGML_USE_SYCL
+    new_clip->backend = ggml_backend_sycl_init(0);
+    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif
+
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
         LOG_INF("%s: CLIP using CPU backend\n", __func__);

From da6aac91f150a3b0bcc26d3fd50288accb15f179 Mon Sep 17 00:00:00 2001
From: Benson Wong <mostlygeek@gmail.com>
Date: Tue, 3 Dec 2024 16:40:36 -0800
Subject: [PATCH 388/396] Add docs for creating a static build (#10268)
 (#10630)

* Add notes for a static build

* Update docs/build.md

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
---
 docs/build.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/build.md b/docs/build.md
index 97e340ab6..a4964cbd1 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -39,6 +39,11 @@ cmake --build build --config Release
        ```
 
     For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+  ```
+  cmake -B build -DBUILD_SHARED_LIBS=OFF
+  cmake --build build --config Release
+  ```
 
 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
     - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):

From cd2f37b304f8e88b9de8424b31078b97f9cf7c60 Mon Sep 17 00:00:00 2001
From: Frankie Robertson <frankier@users.noreply.github.com>
Date: Wed, 4 Dec 2024 02:41:37 +0200
Subject: [PATCH 389/396] Avoid using __fp16 on ARM with old nvcc (#10616)

---
 ggml/src/ggml-impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 78e3af8f2..00a1546a7 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -310,14 +310,14 @@ void ggml_aligned_free(void * ptr, size_t size);
 // FP16 to FP32 conversion
 
 #if defined(__ARM_NEON)
-    #ifdef _MSC_VER
+    #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
         typedef uint16_t ggml_fp16_internal_t;
     #else
         typedef __fp16 ggml_fp16_internal_t;
     #endif
 #endif
 
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
+#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
     #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
     #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 

From 98036d5670f21e9b9a99d5e3dbb3bf7589f5c4e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?=
 <wangr@smail.nju.edu.cn>
Date: Wed, 4 Dec 2024 09:22:50 +0800
Subject: [PATCH 390/396] fix typo of README.md (#10605)

---
 grammars/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grammars/README.md b/grammars/README.md
index 4e57bca5f..976954091 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -46,7 +46,7 @@ Terminals support the full range of Unicode. Unicode characters can be specified
 
 Character ranges can be negated with `^`:
 ```
-single-line ::= [^\n]+ "\n"`
+single-line ::= [^\n]+ "\n"
 ```
 
 ## Sequences and Alternatives

From 40c6d79fb52f995f47507fedfeaae2ac05d9b35c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= <nicolo.scipione@codeplay.com>
Date: Wed, 4 Dec 2024 02:29:20 +0100
Subject: [PATCH 391/396] SYCL : Move to compile time oneMKL interface backend
 selection for NVIDIA backend (#10584)

* [SYCL] Move to Compile Time backend selection on oneMKL Interface for NVIDIA backend

Move to compile time selection to backend to avoid latency at run time.
Add it to all mkl gemm calls and only for NVIDIA backend.

Signed-off-by: nscipione <nicolo.scipione@codeplay.com>

* Formatting

* Address PR comments to increase readibility

---------

Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
---
 ggml/src/ggml-sycl/CMakeLists.txt  |  3 ++-
 ggml/src/ggml-sycl/dpct/helper.hpp | 43 +++++++++++++++++++++---------
 ggml/src/ggml-sycl/ggml-sycl.cpp   | 13 ++++++---
 ggml/src/ggml-sycl/outprod.cpp     | 16 +++++------
 4 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 83f223fd7..3579a311a 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -68,7 +68,8 @@ else()
         target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
     elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
+        add_compile_definitions(GGML_SYCL_NVIDIA)
+        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
     elseif (GGML_SYCL_TARGET STREQUAL "AMD")
         if (NOT GGML_SYCL_DEVICE_ARCH)
             message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index c2f28bb49..d1b5dd87c 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -1689,9 +1689,14 @@ namespace dpct
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
-            oneapi::mkl::blas::column_major::gemm(
-                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-                data_b, ldb, beta_value, data_c, ldc);
+#ifdef GGML_SYCL_NVIDIA
+            oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
+                                                  a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
+                                                  beta_value, data_c, ldc);
+#else
+            oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
+                                                  beta_value, data_c, ldc);
+#endif
         }
 
         template <typename VecT, class BinaryOperation, class = void>
@@ -1754,14 +1759,22 @@ namespace dpct
             matrix_info->ld_info[2] = ldc;
             matrix_info->groupsize_info = batch_size;
 
+#ifdef GGML_SYCL_NVIDIA
             sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-                q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
-                matrix_info->size_info, matrix_info->size_info + 1,
-                matrix_info->size_info + 2, matrix_info->value_info,
-                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
-                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
+                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
+                matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
+                matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
+                matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
+                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
+                &(matrix_info->groupsize_info));
+#else
+            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+                q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
+                matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
+                matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
                 matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+#endif
 
             q.submit([&](sycl::handler &cgh)
                      {
@@ -1783,10 +1796,16 @@ namespace dpct
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
+#ifdef GGML_SYCL_NVIDIA
             oneapi::mkl::blas::column_major::gemm_batch(
-                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-                stride_a, data_b, ldb, stride_b, beta_value,
-                data_c, ldc, stride_c, batch_size);
+                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, a_trans, b_trans, m, n, k,
+                alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c,
+                batch_size);
+#else
+            oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+                                                        stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc,
+                                                        stride_c, batch_size);
+#endif
         }
 
     } // namespace detail
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 1310981e5..135efb521 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2573,12 +2573,17 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float alpha = 1.0f;
         const float beta = 0.0f;
 #if !GGML_SYCL_DNNL
+#    ifdef GGML_SYCL_NVIDIA
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
-            *stream, oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
-            src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
+            oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
+            ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+#    else
+        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+            *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
             dst_dd_i, ldc)));
+#    endif
 #else
         auto dnnl_stream = ctx.stream_dnnl(stream);
          DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp
index e61cdc2ca..ef9af0b76 100644
--- a/ggml/src/ggml-sycl/outprod.cpp
+++ b/ggml/src/ggml-sycl/outprod.cpp
@@ -40,14 +40,14 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr
 
     try {
         // Perform matrix multiplication using oneMKL GEMM
-        oneapi::mkl::blas::column_major::gemm(*stream,
-            oneapi::mkl::transpose::nontrans, src1_op,
-            ne0, ne1, ne01,
-            alpha,
-            src0_d, ne00,
-            src1_d, ldb,
-            beta,
-            dst_d, ne0);
+#ifdef GGML_SYCL_NVIDIA
+        oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream },
+                                              oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d,
+                                              ne00, src1_d, ldb, beta, dst_d, ne0);
+#else
+        oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha,
+                                              src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
+#endif
     }
     catch (sycl::exception const& exc) {
         std::cerr << exc.what() << std::endl;

From 2759916d86b70e7aceaed4d0b4e7ed126f0f9e51 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 4 Dec 2024 01:28:59 -0600
Subject: [PATCH 392/396] vulkan: Implement "fast divide" (mul+shift) for unary
 ops like copy (#10642)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 45 ++++++++++++++++++-
 .../vulkan-shaders/generic_unary_head.comp    | 27 ++++++++---
 tests/test-backend-ops.cpp                    |  2 +
 3 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 17e1be105..07b45d6b9 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -353,7 +353,45 @@ struct vk_op_unary_push_constants {
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t d_offset;
     float param1; float param2;
+    uint32_t ne0_012mp; uint32_t ne0_012L;
+    uint32_t ne0_01mp;  uint32_t ne0_01L;
+    uint32_t ne0_0mp;   uint32_t ne0_0L;
+    uint32_t ne1_012mp; uint32_t ne1_012L;
+    uint32_t ne1_01mp;  uint32_t ne1_01L;
+    uint32_t ne1_0mp;   uint32_t ne1_0L;
 };
+static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
+
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
+{
+    // compute L = ceil(log2(d));
+    L = 0;
+    while (L < 32 && (uint32_t{1} << L) < d) {
+        L++;
+    }
+
+    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
+}
+
+template <typename T> void init_pushconst_fastdiv(T &p) {
+    static_assert(!std::is_const<T>::value, "unexpected type");
+}
+
+template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
+    // Compute magic values to divide by these six numbers.
+    init_fastdiv_values(p.ne02*p.ne01*p.ne00,  p.ne0_012mp,    p.ne0_012L);
+    init_fastdiv_values(p.ne01*p.ne00,         p.ne0_01mp,     p.ne0_01L);
+    init_fastdiv_values(p.ne00,                p.ne0_0mp,      p.ne0_0L);
+    init_fastdiv_values(p.ne12*p.ne11*p.ne10,  p.ne1_012mp,    p.ne1_012L);
+    init_fastdiv_values(p.ne11*p.ne10,         p.ne1_01mp,     p.ne1_01L);
+    init_fastdiv_values(p.ne10,                p.ne1_0mp,      p.ne1_0L);
+}
 
 struct vk_op_binary_push_constants {
     uint32_t ne;
@@ -2914,13 +2952,14 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         elements = { ne, 1, 1 };
     }
 
-    const vk_op_unary_push_constants pc = {
+    vk_op_unary_push_constants pc = {
         (uint32_t)ne,
         (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
         (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3],                       1                   , (uint32_t)tensor->ne[0]                   , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
         0,
         0.0f, 0.0f,
     };
+    init_pushconst_fastdiv(pc);
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }
@@ -4125,7 +4164,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 }
 
 template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -4165,6 +4204,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     const uint64_t ned3 = dst->ne[3];
     const uint64_t ned = ned0 * ned1;
 
+    init_pushconst_fastdiv(pc);
+
     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
 
     if (pipeline == nullptr) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
index 4e1fa3af3..ab7c9d7eb 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
@@ -8,6 +8,13 @@ layout (push_constant) uniform parameter
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint d_offset;
     float param1; float param2;
+
+    uint ne0_012mp; uint ne0_012L;
+    uint ne0_01mp;  uint ne0_01L;
+    uint ne0_0mp;   uint ne0_0L;
+    uint ne1_012mp; uint ne1_012L;
+    uint ne1_01mp;  uint ne1_01L;
+    uint ne1_0mp;   uint ne1_0L;
 } p;
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -17,22 +24,30 @@ uint get_idx() {
     return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 
+// see init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    // msbs = mulhi(n, mp)
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L;
+}
+
 uint src0_idx(uint idx) {
-    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
     const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = (idx - i03_offset - i02_offset) / p.ne00;
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
     const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
     return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
 }
 
 uint dst_idx(uint idx) {
-    const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
     const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
     const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
     const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
     return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
 }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 87c92dadd..807d271c6 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3862,6 +3862,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
 
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f));

From 8d0cfd554a9ae545ff94d27e04458f537b4e8c0e Mon Sep 17 00:00:00 2001
From: JFLFY2255 <JFLFY2255@163.com>
Date: Wed, 4 Dec 2024 17:42:50 +0800
Subject: [PATCH 393/396] llama: Support MiniCPM-1B (with & w/o longrope)
 (#10559)

---
 convert_hf_to_gguf.py     |  57 ++++++++-----
 gguf-py/gguf/constants.py |   9 +-
 include/llama.h           |   3 +-
 src/llama.cpp             | 175 +++++---------------------------------
 4 files changed, 61 insertions(+), 183 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b931049d1..d8df5cc00 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1831,29 +1831,40 @@ class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
     def set_vocab(self):
-        self._set_vocab_llama_hf()
-
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
-
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
+        self._set_vocab_sentencepiece()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -1863,9 +1874,9 @@ class MiniCPMModel(Model):
 
         # HF models permute some of the tensors, so we need to undo that
         if name.endswith(("q_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         return [(self.map_tensor_name(name), data_torch)]
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 7df23371c..703199fcb 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -896,6 +896,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
@@ -1388,9 +1390,10 @@ class TokenType(IntEnum):
 
 
 class RopeScalingType(Enum):
-    NONE   = 'none'
-    LINEAR = 'linear'
-    YARN   = 'yarn'
+    NONE     = 'none'
+    LINEAR   = 'linear'
+    YARN     = 'yarn'
+    LONGROPE = 'longrope'
 
 
 class PoolingType(IntEnum):
diff --git a/include/llama.h b/include/llama.h
index e85f459fc..168c3fa1f 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -185,7 +185,8 @@ extern "C" {
         LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
         LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
         LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
     };
 
     enum llama_pooling_type {
diff --git a/src/llama.cpp b/src/llama.cpp
index 6a6f4c2a5..00f78639e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1036,6 +1036,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
             { LLM_TENSOR_OUTPUT,          "output" },
             { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
@@ -1683,9 +1685,10 @@ struct LLM_TN {
 //
 
 static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
+    { LLAMA_ROPE_SCALING_TYPE_NONE,       "none"       },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR,     "linear"     },
+    { LLAMA_ROPE_SCALING_TYPE_YARN,       "yarn"       },
+    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
 };
 
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -5580,8 +5583,12 @@ static void llm_load_hparams(
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
                 switch (hparams.n_layer) {
+                    case 52: model.type = e_model::MODEL_1B; break;
                     case 40: model.type = e_model::MODEL_2B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -7065,7 +7072,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
     }
 
-    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7690,7 +7697,13 @@ static bool llm_load_tensors(
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        }
+                        else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        }
 
                         if (n_expert == 0) {
                             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
@@ -13497,153 +13510,6 @@ struct llm_build_context {
         return gf;
     }
 
-    // ref: https://arxiv.org/abs/2203.03466
-    //      https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
-    // based on the original build_llama() function
-    struct ggml_cgraph * build_minicpm() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        const int64_t n_embd = hparams.n_embd;
-        //TODO: if the model varies, these parameters need to be read from the model
-        const int64_t n_embd_base = 256;
-        const float scale_embd  = 12.0f;
-        const float scale_depth = 1.4f;
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
-
-        // scale the input embeddings
-        inpL = ggml_scale(ctx0, inpL, scale_embd);
-        cb(inpL, "inp_scaled", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            // scale_res - scale the hidden states for residual connection
-            const float scale_res = scale_depth/sqrtf(float(n_layer));
-            cur = ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled", -1);
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, lctx, cur,
-                        model.layers[il].ffn_up,   NULL, NULL,
-                        model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            // scale the hidden states for residual connection
-            cur = ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled_ffn", -1);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cur = lctx.cvec.apply_to(ctx0, cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head scaling
-        const float scale_lmhead = float(n_embd_base)/float(n_embd);
-        cur = ggml_scale(ctx0, cur, scale_lmhead);
-        cb(cur, "lmhead_scaling", -1);
-
-        // lm_head
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
     struct ggml_cgraph * build_minicpm3() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -16742,6 +16608,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
             {
@@ -16825,10 +16692,6 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_internlm2();
             } break;
-        case LLM_ARCH_MINICPM:
-            {
-                result = llm.build_minicpm();
-            } break;
         case LLM_ARCH_MINICPM3:
             {
                 result = llm.build_minicpm3();

From 253b7fde910731104670724391bfbcb94d97d0c3 Mon Sep 17 00:00:00 2001
From: ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
Date: Wed, 4 Dec 2024 09:45:48 +0000
Subject: [PATCH 394/396] Fix HF repo commit to clone lora test models (#10649)

---
 tests/test-lora-conversion-inference.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh
index fe90ce0d1..fb308a9ff 100755
--- a/tests/test-lora-conversion-inference.sh
+++ b/tests/test-lora-conversion-inference.sh
@@ -10,11 +10,16 @@ declare -a params=(
 
 MODELS_REPO=lora-tests
 MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
+COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890
 
 # Clone the Hugging Face repository if the directory does not exist
 if [ ! -d "$MODELS_REPO" ]; then
     echo "Cloning the Hugging Face repository..."
     git clone $MODELS_REPO_URL --depth 1
+    cd $MODELS_REPO
+    git fetch --depth=1 origin $COMMIT
+    git reset --hard $COMMIT
+    cd -
 else
     echo "Repository already exists. Skipping clone."
 fi

From 2803540814bf0a4e44d0960ff6afda6bac971c17 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 4 Dec 2024 14:40:44 +0100
Subject: [PATCH 395/396] ggml-cpu : fix HWCAP2_I8MM value (#10646)

---
 ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10c..e4a9ca013 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2425,7 +2425,7 @@ bool ggml_is_numa(void) {
 #endif
 
 #if !defined(HWCAP2_I8MM)
-#define HWCAP2_I8MM 0
+#define HWCAP2_I8MM (1 << 13)
 #endif
 
 static void ggml_init_arm_arch_features(void) {

From 59f4db10883a4f3e855cffbf2c3ab68430e95272 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Wed, 4 Dec 2024 14:45:40 +0100
Subject: [PATCH 396/396] ggml : add predefined list of CPU backend variants to
 build (#10626)

* ggml : add predefined list of CPU backend variants to build

* update CPU dockerfiles
---
 .devops/full.Dockerfile             |  33 +-
 .devops/llama-cli.Dockerfile        |  18 +-
 .devops/llama-server.Dockerfile     |  22 +-
 ggml/CMakeLists.txt                 |  49 ++-
 ggml/src/CMakeLists.txt             |  35 ++
 ggml/src/ggml-backend-reg.cpp       |  32 +-
 ggml/src/ggml-cpu/CMakeLists.txt    | 595 +++++++++++++++-------------
 ggml/src/ggml-cpu/cpu-feats-x86.cpp |  85 ++--
 ggml/src/ggml-cpu/ggml-cpu.c        |   2 +-
 ggml/src/ggml-cpu/ggml-cpu.cpp      |  10 +-
 scripts/build-cpu.sh                |  12 -
 11 files changed, 502 insertions(+), 391 deletions(-)
 delete mode 100755 scripts/build-cpu.sh

diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 2a06f82b7..d93c0be6a 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-ENV LLAMA_CURL=1
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
+FROM ubuntu:$UBUNTU_VERSION as runtime
 
-RUN make -j$(nproc)
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   /app/requirements.txt
+COPY requirements       /app/requirements
+COPY .devops/tools.sh   /app/tools.sh
+
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install -r /app/requirements.txt
+
+COPY --from=build /app/build/bin/ /app/
+COPY --from=build /app/lib/ /app/
+COPY --from=build /app/convert_hf_to_gguf.py /app/
+COPY --from=build /app/gguf-py /app/gguf-py
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT ["/app/.devops/tools.sh"]
+ENTRYPOINT ["/app/tools.sh"]
diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile
index 7f741aa46..be234d55d 100644
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-RUN make -j$(nproc) llama-cli
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
-RUN apt-get update && \
-    apt-get install -y libgomp1
+WORKDIR /app
 
-COPY --from=build /app/llama-cli /llama-cli
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/bin/llama-cli /app/
+COPY --from=build /app/lib/ /app/
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/app/llama-cli" ]
diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile
index 7110dda9e..72ccde2fe 100644
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -9,28 +9,20 @@ WORKDIR /app
 
 COPY . .
 
-
-RUN \
-    # Build multiple versions of the CPU backend
-    scripts/build-cpu.sh avx         -DGGML_AVX=ON -DGGML_AVX2=OFF && \
-    scripts/build-cpu.sh avx2        -DGGML_AVX=ON -DGGML_AVX2=ON && \
-    scripts/build-cpu.sh avx512      -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
-    scripts/build-cpu.sh amx         -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
-    # Build llama-server
-    cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
-    cmake --build build --target llama-server -j $(nproc) && \
-    # Copy the built libraries to /app/lib
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
     mkdir -p /app/lib && \
-    mv libggml-cpu* /app/lib/ && \
     find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
+WORKDIR /app
+
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/build/bin/llama-server /llama-server
-COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-server /app/
+COPY --from=build /app/lib/ /app/
 
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
@@ -38,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 06d371e09..1b3d98967 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -92,30 +92,33 @@ else()
     set(INS_ENB ON)
 endif()
 
-option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
-
-option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
-option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"         OFF)
-option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
-option(GGML_AVX512      "ggml: enable AVX512"           OFF)
-option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
-option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
-option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
-option(GGML_AMX_TILE    "ggml: enable AMX-TILE"         OFF)
-option(GGML_AMX_INT8    "ggml: enable AMX-INT8"         OFF)
-option(GGML_AMX_BF16    "ggml: enable AMX-BF16"         OFF)
-option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
+option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
+option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
+option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
+option(GGML_AVX512           "ggml: enable AVX512F"          OFF)
+option(GGML_AVX512_VBMI      "ggml: enable AVX512-VBMI"      OFF)
+option(GGML_AVX512_VNNI      "ggml: enable AVX512-VNNI"      OFF)
+option(GGML_AVX512_BF16      "ggml: enable AVX512-BF16"      OFF)
 if (NOT MSVC)
-    option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
+    # in MSVC F16C and FMA is implied with AVX2/AVX512
+    option(GGML_FMA          "ggml: enable FMA"              ${INS_ENB})
+    option(GGML_F16C         "ggml: enable F16C"             ${INS_ENB})
+    # MSVC does not seem to support AMX
+    option(GGML_AMX_TILE     "ggml: enable AMX-TILE"         OFF)
+    option(GGML_AMX_INT8     "ggml: enable AMX-INT8"         OFF)
+    option(GGML_AMX_BF16     "ggml: enable AMX-BF16"         OFF)
 endif()
-option(GGML_LASX        "ggml: enable lasx"             ON)
-option(GGML_LSX         "ggml: enable lsx"              ON)
-option(GGML_RVV         "ggml: enable rvv"              ON)
-option(GGML_SVE         "ggml: enable SVE"              OFF)
+option(GGML_LASX             "ggml: enable lasx"             ON)
+option(GGML_LSX              "ggml: enable lsx"              ON)
+option(GGML_RVV              "ggml: enable rvv"              ON)
+option(GGML_SVE              "ggml: enable SVE"              OFF)
+option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
+
 
 if (WIN32)
-    set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
+    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
 endif()
 
 # ggml core
@@ -180,11 +183,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 
-if (GGML_SYCL)
-    set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
-endif()
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 
 set(THREADS_PREFER_PTHREAD_FLAG ON)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 19289f32b..f07533fdb 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -269,7 +269,42 @@ function(ggml_add_backend backend)
     endif()
 endfunction()
 
+function(ggml_add_cpu_backend_variant tag_name)
+    set(GGML_CPU_TAG_NAME ${tag_name})
+    # other: OPENMP LLAMAFILE CPU_HBM
+    foreach (feat NATIVE
+                  AVX AVX2 AVX_VNNI FMA F16C
+                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                  AMX_TILE AMX_INT8 AMX_BF16)
+        set(GGML_${feat} OFF)
+    endforeach()
+
+    foreach (feat ${ARGN})
+        set(GGML_${feat} ON)
+    endforeach()
+
+    ggml_add_cpu_backend_variant_impl(${tag_name})
+endfunction()
+
 ggml_add_backend(CPU)
+
+if (GGML_CPU_ALL_VARIANTS)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    endif()
+    ggml_add_cpu_backend_variant(sandybridge    AVX)
+    ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
+    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    if (NOT MSVC)
+        # MSVC doesn't support AVX-VNNI or AMX
+        ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 FMA AVX_VNNI)
+        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+    endif()
+else ()
+    ggml_add_cpu_backend_variant_impl("")
+endif()
+
 ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 2c4bf11b0..5cb0fb9d1 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -483,6 +483,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
                                 best_score = s;
                                 best_path = entry.path().string();
                             }
+                        } else {
+                            if (!silent) {
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                            }
                         }
                     }
                 }
@@ -505,15 +509,21 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
 }
 
 void ggml_backend_load_all() {
-    ggml_backend_load_best("blas", true);
-    ggml_backend_load_best("cann", true);
-    ggml_backend_load_best("cuda", true);
-    ggml_backend_load_best("hip", true);
-    ggml_backend_load_best("kompute", true);
-    ggml_backend_load_best("metal", true);
-    ggml_backend_load_best("rpc", true);
-    ggml_backend_load_best("sycl", true);
-    ggml_backend_load_best("vulkan", true);
-    ggml_backend_load_best("musa", true);
-    ggml_backend_load_best("cpu", true);
+#ifdef NDEBUG
+    bool silent = true;
+#else
+    bool silent = false;
+#endif
+
+    ggml_backend_load_best("blas", silent);
+    ggml_backend_load_best("cann", silent);
+    ggml_backend_load_best("cuda", silent);
+    ggml_backend_load_best("hip", silent);
+    ggml_backend_load_best("kompute", silent);
+    ggml_backend_load_best("metal", silent);
+    ggml_backend_load_best("rpc", silent);
+    ggml_backend_load_best("sycl", silent);
+    ggml_backend_load_best("vulkan", silent);
+    ggml_backend_load_best("musa", silent);
+    ggml_backend_load_best("cpu", silent);
 }
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 5df63884c..bc326c059 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -1,319 +1,354 @@
-ggml_add_backend_library(ggml-cpu)
-
-list (APPEND GGML_CPU_SOURCES
-    ggml-cpu.c
-    ggml-cpu.cpp
-    ggml-cpu-aarch64.c
-    ggml-cpu-aarch64.h
-    ggml-cpu-quants.c
-    ggml-cpu-quants.h
-    amx/amx.cpp
-    amx/amx.h
-    amx/mmq.cpp
-    amx/mmq.h
-    ggml-cpu-impl.h
-    )
-
-target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
-target_include_directories(ggml-cpu PRIVATE .)
-
-if (APPLE AND GGML_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
-        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
-        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
-
-        target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
+function(ggml_add_cpu_backend_variant_impl tag_name)
+    if (tag_name)
+        set(GGML_CPU_NAME ggml-cpu-${tag_name})
     else()
-        message(WARNING "Accelerate framework not found")
+        set(GGML_CPU_NAME ggml-cpu)
     endif()
-endif()
 
-if (GGML_OPENMP)
-    find_package(OpenMP)
-    if (OpenMP_FOUND)
-        message(STATUS "OpenMP found")
+    ggml_add_backend_library(${GGML_CPU_NAME})
 
-        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
+    list (APPEND GGML_CPU_SOURCES
+        ggml-cpu/ggml-cpu.c
+        ggml-cpu/ggml-cpu.cpp
+        ggml-cpu/ggml-cpu-aarch64.c
+        ggml-cpu/ggml-cpu-aarch64.h
+        ggml-cpu/ggml-cpu-quants.c
+        ggml-cpu/ggml-cpu-quants.h
+        ggml-cpu/amx/amx.cpp
+        ggml-cpu/amx/amx.h
+        ggml-cpu/amx/mmq.cpp
+        ggml-cpu/amx/mmq.h
+        ggml-cpu/ggml-cpu-impl.h
+        )
 
-        target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    else()
-        message(WARNING "OpenMP not found")
+    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
+    target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
+
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)
+        if (ACCELERATE_FRAMEWORK)
+            message(STATUS "Accelerate framework found")
+
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
+
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
+        else()
+            message(WARNING "Accelerate framework not found")
+        endif()
     endif()
-endif()
 
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
+    if (GGML_OPENMP)
+        find_package(OpenMP)
+        if (OpenMP_FOUND)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
 
-    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        else()
+            message(WARNING "OpenMP not found")
+        endif()
+    endif()
 
-    list(APPEND GGML_CPU_SOURCES
-                llamafile/sgemm.cpp
-                llamafile/sgemm.h)
-endif()
+    if (GGML_LLAMAFILE)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
 
-if (GGML_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
+        list(APPEND GGML_CPU_SOURCES
+                    ggml-cpu/llamafile/sgemm.cpp
+                    ggml-cpu/llamafile/sgemm.h)
+    endif()
 
-    message(STATUS "Using memkind for CPU HBM")
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
 
-    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
+        message(STATUS "Using memkind for CPU HBM")
 
-    target_link_libraries(ggml-cpu PUBLIC memkind)
-endif()
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
 
-if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
-    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-    (NOT CMAKE_OSX_ARCHITECTURES      AND
-     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+        target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
+    endif()
 
-    message(STATUS "ARM detected")
+    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+        (NOT CMAKE_OSX_ARCHITECTURES      AND
+        NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
 
-    if (MSVC)
-        list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
-        list(APPEND ARCH_DEFINITIONS __ARM_NEON)
-        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
+        message(STATUS "ARM detected")
 
-        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+        if (MSVC)
+            list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
+            list(APPEND ARCH_DEFINITIONS __ARM_NEON)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
 
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-        if (GGML_COMPILER_SUPPORT_DOTPROD)
-            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
+            set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+            string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
 
-            message(STATUS "ARM feature DOTPROD enabled")
-        endif ()
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+            if (GGML_COMPILER_SUPPORT_DOTPROD)
+                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
 
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-
-        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
-
-            message(STATUS "ARM feature MATMUL_INT8 enabled")
-        endif ()
-
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-            message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
-        endif ()
-
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
-    elseif (APPLE)
-        if (GGML_NATIVE)
-            set(USER_PROVIDED_MARCH FALSE)
-            foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
-                if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
-                    set(USER_PROVIDED_MARCH TRUE)
-                    break()
-                endif()
-            endforeach()
-
-            if (NOT USER_PROVIDED_MARCH)
-                set(MARCH_FLAGS "-march=armv8.2a")
-
-                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-                if (GGML_COMPILER_SUPPORT_DOTPROD)
-                    set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
-                    list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
-
-                    message(STATUS "ARM feature DOTPROD enabled")
-                endif ()
-
-                set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
-
-                set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                set(CMAKE_REQUIRED_FLAGS     "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
-
-                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-                if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-                    set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
-                    list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
-
-                    message(STATUS "ARM feature MATMUL_INT8 enabled")
-                endif ()
-
-                set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-
-                list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
+                message(STATUS "ARM feature DOTPROD enabled")
             endif ()
-        endif ()
-    else()
-        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
-            # Raspberry Pi 1, Zero
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-                # Android armeabi-v7a
-                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
-            else()
-                # Raspberry Pi 2
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+            if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
+
+                message(STATUS "ARM feature MATMUL_INT8 enabled")
+            endif ()
+
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+                message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
+            endif ()
+
+            set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+        elseif (APPLE)
+            if (GGML_NATIVE)
+                set(USER_PROVIDED_MARCH FALSE)
+                foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
+                    if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
+                        set(USER_PROVIDED_MARCH TRUE)
+                        break()
+                    endif()
+                endforeach()
+
+                if (NOT USER_PROVIDED_MARCH)
+                    set(MARCH_FLAGS "-march=armv8.2a")
+
+                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+                    if (GGML_COMPILER_SUPPORT_DOTPROD)
+                        set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
+                        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
+
+                        message(STATUS "ARM feature DOTPROD enabled")
+                    endif ()
+
+                    set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
+
+                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                    set(CMAKE_REQUIRED_FLAGS     "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+
+                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                    if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                        set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
+                        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
+
+                        message(STATUS "ARM feature MATMUL_INT8 enabled")
+                    endif ()
+
+                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+                    list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
+                endif ()
+            endif ()
+        else()
+            check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+            if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+            endif()
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+                # Raspberry Pi 1, Zero
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+            endif()
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+                if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                    # Android armeabi-v7a
+                    list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+                else()
+                    # Raspberry Pi 2
+                    list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+                endif()
+            endif()
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+                # Android arm64-v8a
+                # Raspberry Pi 3, 4, Zero 2 (32-bit)
+                list(APPEND ARCH_FLAGS -mno-unaligned-access)
+            endif()
+            if (GGML_SVE)
+                list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
             endif()
         endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-            # Android arm64-v8a
-            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            list(APPEND ARCH_FLAGS -mno-unaligned-access)
-        endif()
-        if (GGML_SVE)
-            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
-        endif()
-    endif()
-elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
-    message(STATUS "x86 detected")
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (GGML_NATIVE)
-            include(cmake/FindSIMD.cmake)
-        endif ()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
-                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+        if (MSVC)
+            # instruction set detection for MSVC only
+            if (GGML_NATIVE)
+                include(ggml-cpu/cmake/FindSIMD.cmake)
+            endif ()
+            if (GGML_AVX512)
+                list(APPEND ARCH_FLAGS /arch:AVX512)
+                # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
+                # MSVC has no compile-time flags enabling specific
+                # AVX512 extensions, neither it defines the
+                # macros corresponding to the extensions.
+                # Do it manually.
+                list(APPEND ARCH_DEFINITIONS GGML_AVX512)
+                if (GGML_AVX512_VBMI)
+                    list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512vbmi)
+                    endif()
+                endif()
+                if (GGML_AVX512_VNNI)
+                    list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512vnni)
+                    endif()
+                endif()
+                if (GGML_AVX512_BF16)
+                    list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512bf16)
+                    endif()
+                endif()
+                if (GGML_AMX_TILE)
+                    list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
+                endif()
+                if (GGML_AMX_INT8)
+                    list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
+                endif()
+                if (GGML_AMX_BF16)
+                    list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
+                endif()
+            elseif (GGML_AVX2)
+                list(APPEND ARCH_FLAGS /arch:AVX2)
+                list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
+            elseif (GGML_AVX)
+                list(APPEND ARCH_FLAGS /arch:AVX)
+                list(APPEND ARCH_DEFINITIONS GGML_AVX)
+            else ()
+                list(APPEND ARCH_FLAGS /arch:SSE4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            endif()
+            if (GGML_AVX_VNNI)
+                # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
+                #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
+            endif()
+        else ()
+            if (GGML_NATIVE)
+                list(APPEND ARCH_FLAGS -march=native)
+            else ()
+                list(APPEND ARCH_FLAGS -msse4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+                if (GGML_F16C)
+                    list(APPEND ARCH_FLAGS -mf16c)
+                    list(APPEND ARCH_DEFINITIONS GGML_F16C)
+                endif()
+                if (GGML_FMA)
+                    list(APPEND ARCH_FLAGS -mfma)
+                    list(APPEND ARCH_DEFINITIONS GGML_FMA)
+                endif()
+                if (GGML_AVX)
+                    list(APPEND ARCH_FLAGS -mavx)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX)
+                endif()
+                if (GGML_AVX2)
+                    list(APPEND ARCH_FLAGS -mavx2)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX2)
+                endif()
+                if (GGML_AVX_VNNI)
+                    list(APPEND ARCH_FLAGS -mavxvnni)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
+                endif()
+                if (GGML_AVX512)
+                    list(APPEND ARCH_FLAGS -mavx512f)
+                    list(APPEND ARCH_FLAGS -mavx512cd)
+                    list(APPEND ARCH_FLAGS -mavx512vl)
+                    list(APPEND ARCH_FLAGS -mavx512dq)
+                    list(APPEND ARCH_FLAGS -mavx512bw)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512)
+                endif()
+                if (GGML_AVX512_VBMI)
                     list(APPEND ARCH_FLAGS -mavx512vbmi)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
                 endif()
-            endif()
-            if (GGML_AVX512_VNNI)
-                list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
-                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                if (GGML_AVX512_VNNI)
                     list(APPEND ARCH_FLAGS -mavx512vnni)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
                 endif()
-            endif()
-            if (GGML_AVX512_BF16)
-                list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
-                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                if (GGML_AVX512_BF16)
                     list(APPEND ARCH_FLAGS -mavx512bf16)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
+                endif()
+                if (GGML_AMX_TILE)
+                    list(APPEND ARCH_FLAGS -mamx-tile)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
+                endif()
+                if (GGML_AMX_INT8)
+                    list(APPEND ARCH_FLAGS -mamx-int8)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
+                endif()
+                if (GGML_AMX_BF16)
+                    list(APPEND ARCH_FLAGS -mamx-bf16)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
                 endif()
             endif()
-            if (GGML_AMX_TILE)
-                list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
-            endif()
-            if (GGML_AMX_INT8)
-                list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
-            endif()
-            if (GGML_AMX_BF16)
-                list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
-            endif()
-        elseif (GGML_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (GGML_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
         endif()
-        if (GGML_AVX_VNNI)
-            list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
-            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
-                list(APPEND ARCH_FLAGS -mavxvnni)
-            endif()
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+        message(STATUS "PowerPC detected")
+        execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+        string(FIND "${POWER10_M}" "POWER10" substring_index)
+        if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+            set(substring_index -1)
+        endif()
+
+        if (${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+        elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+        else()
+            list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+            # TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        endif()
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        message(STATUS "loongarch64 detected")
+
+        list(APPEND ARCH_FLAGS -march=loongarch64)
+        if (GGML_LASX)
+            list(APPEND ARCH_FLAGS -mlasx)
+        endif()
+        if (GGML_LSX)
+            list(APPEND ARCH_FLAGS -mlsx)
+        endif()
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
+        message(STATUS "RISC-V detected")
+        if (GGML_RVV)
+            list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
         endif()
     else()
-        if (GGML_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (GGML_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (GGML_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (GGML_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (GGML_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (GGML_AVX_VNNI)
-            list(APPEND ARCH_FLAGS -mavxvnni)
-        endif()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512dq)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (GGML_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (GGML_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-        if (GGML_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
-        if (GGML_AMX_TILE)
-            list(APPEND ARCH_FLAGS -mamx-tile)
-        endif()
-        if (GGML_AMX_INT8)
-            list(APPEND ARCH_FLAGS -mamx-int8)
-        endif()
-        if (GGML_AMX_BF16)
-            list(APPEND ARCH_FLAGS -mamx-bf16)
-        endif()
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PowerPC detected")
-    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
-    string(FIND "${POWER10_M}" "POWER10" substring_index)
-    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
-        set(substring_index -1)
+        message(STATUS "Unknown architecture")
     endif()
 
-    if (${substring_index} GREATER_EQUAL 0)
-       list(APPEND ARCH_FLAGS -mcpu=power10)
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-       list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
-    else()
-        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        # TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    if (GGML_CPU_AARCH64)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-    message(STATUS "loongarch64 detected")
 
-    list(APPEND ARCH_FLAGS -march=loongarch64)
-    if (GGML_LASX)
-        list(APPEND ARCH_FLAGS -mlasx)
+    message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
+    target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
+    target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
+    target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
+
+    if (GGML_BACKEND_DL)
+        # The feature detection code is compiled as a separate target so that
+        # it can be built without the architecture flags
+        # Since multiple variants of the CPU backend may be included in the same
+        # build, using set_source_files_properties() to set the arch flags is not possible
+        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
+        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
+        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
+        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
     endif()
-    if (GGML_LSX)
-        list(APPEND ARCH_FLAGS -mlsx)
+
+    if (EMSCRIPTEN)
+        set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
-    message(STATUS "RISC-V detected")
-    if (GGML_RVV)
-        list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
-    endif()
-else()
-    message(STATUS "Unknown architecture")
-endif()
-
-if (GGML_CPU_AARCH64)
-    message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
-    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
-endif()
-
-target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
-set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}")
-set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
-
-# the feature detection code must be compiled without any architecture flags
-target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
-# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
-
-if (EMSCRIPTEN)
-    set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
-endif()
+endfunction()
diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
index 514701ffe..e8133d411 100644
--- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp
+++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@@ -1,4 +1,3 @@
-#include "ggml-cpu.h"
 #include "ggml-backend-impl.h"
 
 #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
@@ -13,6 +12,7 @@
 #include <array>
 #include <string>
 
+// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
 struct cpuid_x86 {
     bool SSE3(void) { return f_1_ecx[0]; }
     bool PCLMULQDQ(void) { return f_1_ecx[1]; }
@@ -50,11 +50,15 @@ struct cpuid_x86 {
     bool INVPCID(void) { return f_7_ebx[10]; }
     bool RTM(void) { return is_intel && f_7_ebx[11]; }
     bool AVX512F(void) { return f_7_ebx[16]; }
+    bool AVX512DQ(void) { return f_7_ebx[17]; }
     bool RDSEED(void) { return f_7_ebx[18]; }
     bool ADX(void) { return f_7_ebx[19]; }
     bool AVX512PF(void) { return f_7_ebx[26]; }
     bool AVX512ER(void) { return f_7_ebx[27]; }
     bool AVX512CD(void) { return f_7_ebx[28]; }
+    bool AVX512BW(void) { return f_7_ebx[30]; }
+    bool AVX512VL(void) { return f_7_ebx[31]; }
+
     bool SHA(void) { return f_7_ebx[29]; }
 
     bool PREFETCHWT1(void) { return f_7_ecx[0]; }
@@ -259,36 +263,57 @@ void test_x86_is() {
 static int ggml_backend_cpu_x86_score() {
     // FIXME: this does not check for OS support
 
-    cpuid_x86 is;
-    // if the CPU backend was built with any features not supported by the current CPU, it cannot be used
-    if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
-    if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
-    if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
-    if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
-    if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
-    if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
-    if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
-    if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
-    if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
-    if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
-    if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
-    if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
-
-    // calculate a backend score based on the supported features
-    // more important features have a higher weight
     int score = 0;
-    score +=  ggml_cpu_has_fma        () * 1;
-    score +=  ggml_cpu_has_f16c       () * 1<<1;
-    score +=  ggml_cpu_has_ssse3      () * 1<<2;
-    score +=  ggml_cpu_has_sse3       () * 1<<3;
-    score +=  ggml_cpu_has_avx_vnni   () * 1<<4;
-    score +=  ggml_cpu_has_avx        () * 1<<5;
-    score +=  ggml_cpu_has_avx2       () * 1<<6;
-    score +=  ggml_cpu_has_avx512     () * 1<<7;
-    // score +=  ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
-    score +=  ggml_cpu_has_avx512_bf16() * 1<<9;
-    score +=  ggml_cpu_has_avx512_vnni() * 1<<10;
-    score +=  ggml_cpu_has_amx_int8   () * 1<<11;
+    cpuid_x86 is;
+
+#ifdef GGML_FMA
+    if (!is.FMA()) { return 0; }
+    score += 1;
+#endif
+#ifdef GGML_F16C
+    if (!is.F16C()) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef GGML_SSE42
+    if (!is.SSE42()) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef GGML_AVX
+    if (!is.AVX()) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef GGML_AVX2
+    if (!is.AVX2()) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef GGML_AVX_VNNI
+    if (!is.AVX_VNNI()) { return 0; }
+    score += 1<<6;
+#endif
+#ifdef GGML_AVX512
+    if (!is.AVX512F()) { return 0; }
+    if (!is.AVX512CD()) { return 0; }
+    if (!is.AVX512VL()) { return 0; }
+    if (!is.AVX512DQ()) { return 0; }
+    if (!is.AVX512BW()) { return 0; }
+    score += 1<<7;
+#endif
+#ifdef GGML_AVX512_VBMI
+    if (!is.AVX512_VBMI()) { return 0; }
+    score += 1<<8;
+#endif
+#ifdef GGML_AVX512_BF16
+    if (!is.AVX512_BF16()) { return 0; }
+    score += 1<<9;
+#endif
+#ifdef GGML_AVX512_VNNI
+    if (!is.AVX512_VNNI()) { return 0; }
+    score += 1<<10;
+#endif
+#ifdef GGML_AMX_INT8
+    if (!is.AMX_INT8()) { return 0; }
+    score += 1<<11;
+#endif
 
     return score;
 }
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index e4a9ca013..40ca7bb68 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -756,7 +756,7 @@ do {                                                              \
 #define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 77e5d87a8..d3b4bdb96 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_llamafile()) {
             features.push_back({ "LLAMAFILE", "1" });
         }
-        // TODO: rename this
+    #ifdef GGML_USE_ACCELERATE
+        features.push_back({ "ACCELERATE", "1" });
+    #endif
+    #ifdef GGML_USE_CPU_HBM
+        features.push_back({ "CPU_HBM", "1" });
+    #endif
+    #ifdef GGML_USE_OPENMP
+        features.push_back({ "OPENMP", "1" });
+    #endif
     #ifdef GGML_USE_CPU_AARCH64
         features.push_back({ "AARCH64_REPACK", "1" });
     #endif
diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh
deleted file mode 100755
index 4b2ad816e..000000000
--- a/scripts/build-cpu.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-name="$1"
-args="${@:2}"
-
-echo "Building $name with args: $args"
-
-rm -fr build-cpu-$1
-cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args
-cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc)
-cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so
-rm -fr build-cpu-$1